From 83752e12896a72b24845c94f000e4c51b2bc5b50 Mon Sep 17 00:00:00 2001
From: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Date: Thu, 26 Sep 2024 19:30:22 +0200
Subject: selftests/bpf: Add mptcp subflow example

Move Nicolas' patch into the bpf selftests directory. This example adds a
different mark (SO_MARK) on each subflow, and changes the TCP CC only on
the second subflow.

From the userspace, an application can do a setsockopt() on an MPTCP
socket, and typically the same value will be propagated to all subflows
(paths). If someone wants to have different values per subflow, the
recommended way is to use BPF. So it is good to add such an example here,
and make sure there are no regressions.

This example shows how it is possible to:

    Identify the parent msk of an MPTCP subflow.
    Set different socket options on each subflow of the same MPTCP connection.

Here especially, two different behaviours are implemented:

    - A socket mark (SOL_SOCKET SO_MARK) is put on each subflow of the
      same MPTCP connection. The order of creation of the current subflow
      defines its mark.
    - The TCP CC algorithm of the second subflow of an MPTCP connection
      is set to "reno".

This is just to show it is possible to identify an MPTCP connection, and
set socket options, from different SOL levels, per subflow. "reno" has
been picked because it is built-in and usually not set as default one.
It is easy to verify with 'ss' that these modifications have been
applied correctly. That's what the next patch is going to do.

Nicolas' code comes from:

    commit 4d120186e4d6 ("bpf:examples: update mptcp_set_mark_kern.c")

from the MPTCP repo https://github.com/multipath-tcp/mptcp_net-next (the
"scripts" branch), and it has been adapted by Geliang.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/76
Co-developed-by: Geliang Tang <tanggeliang@kylinos.cn>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Signed-off-by: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240926-upstream-bpf-next-20240506-mptcp-subflow-test-v7-1-d26029e15cdd@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/mptcp_subflow.c | 59 +++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcp_subflow.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
new file mode 100644
index 000000000000..2e28f4a215b5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020, Tessares SA. */
+/* Copyright (c) 2024, Kylin Software */
+
+/* vmlinux.h, bpf_helpers.h and other 'define' */
+#include "bpf_tracing_net.h"
+
+char _license[] SEC("license") = "GPL";
+
+char cc[TCP_CA_NAME_MAX] = "reno";
+
+/* Associate a subflow counter to each token */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+	__uint(max_entries, 100);
+} mptcp_sf SEC(".maps");
+
+SEC("sockops")
+int mptcp_subflow(struct bpf_sock_ops *skops)
+{
+	__u32 init = 1, key, mark, *cnt;
+	struct mptcp_sock *msk;
+	struct bpf_sock *sk;
+	int err;
+
+	if (skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB)
+		return 1;
+
+	sk = skops->sk;
+	if (!sk)
+		return 1;
+
+	msk = bpf_skc_to_mptcp_sock(sk);
+	if (!msk)
+		return 1;
+
+	key = msk->token;
+	cnt = bpf_map_lookup_elem(&mptcp_sf, &key);
+	if (cnt) {
+		/* A new subflow is added to an existing MPTCP connection */
+		__sync_fetch_and_add(cnt, 1);
+		mark = *cnt;
+	} else {
+		/* A new MPTCP connection is just initiated and this is its primary subflow */
+		bpf_map_update_elem(&mptcp_sf, &key, &init, BPF_ANY);
+		mark = init;
+	}
+
+	/* Set the mark of the subflow's socket based on appearance order */
+	err = bpf_setsockopt(skops, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
+	if (err < 0)
+		return 1;
+	if (mark == 2)
+		err = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, TCP_CA_NAME_MAX);
+
+	return 1;
+}
-- 
cgit v1.2.3


From cd19b885106e0a24c28ef72fccc4c020782e6e7e Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Thu, 26 Sep 2024 19:30:23 +0200
Subject: selftests/bpf: Add getsockopt to inspect mptcp subflow

This patch adds a "cgroup/getsockopt" way to inspect the subflows of an
MPTCP socket, and verify the modifications done by the same BPF program
in the previous commit: a different mark per subflow, and a different
TCP CC set on the second one. This new hook will be used by the next
commit to verify the socket options set on each subflow.

This extra "cgroup/getsockopt" prog walks the msk->conn_list and uses
bpf_core_cast to cast a pointer for read-only access. This allows
inspecting all the fields of a structure.

Note that on the kernel side, the MPTCP socket stores a list of subflows
under 'msk->conn_list'. They can be iterated using the generic 'list'
helpers. They have been imported here, with a small difference:
list_for_each_entry() uses 'can_loop' to limit the number of iterations,
and ease its use. Because only data need to be read here, it is enough
to use this technique. It is planned to use bpf_iter, when BPF programs
will be used to modify data from the different subflows.
The mptcp_subflow_tcp_sock() and mptcp_for_each_subflow() helpers have
also been imported.

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240926-upstream-bpf-next-20240506-mptcp-subflow-test-v7-2-d26029e15cdd@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 MAINTAINERS                                       |  2 +-
 tools/testing/selftests/bpf/progs/mptcp_bpf.h     | 42 ++++++++++++++
 tools/testing/selftests/bpf/progs/mptcp_subflow.c | 69 +++++++++++++++++++++++
 3 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf.h

(limited to 'tools/testing')

diff --git a/MAINTAINERS b/MAINTAINERS
index e71d066dc919..f02b7485b215 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16281,7 +16281,7 @@ F:	include/net/mptcp.h
 F:	include/trace/events/mptcp.h
 F:	include/uapi/linux/mptcp*.h
 F:	net/mptcp/
-F:	tools/testing/selftests/bpf/*/*mptcp*.c
+F:	tools/testing/selftests/bpf/*/*mptcp*.[ch]
 F:	tools/testing/selftests/net/mptcp/
 
 NETWORKING [TCP]
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf.h b/tools/testing/selftests/bpf/progs/mptcp_bpf.h
new file mode 100644
index 000000000000..3b188ccdcc40
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __MPTCP_BPF_H__
+#define __MPTCP_BPF_H__
+
+#include "bpf_experimental.h"
+
+/* list helpers from include/linux/list.h */
+static inline int list_is_head(const struct list_head *list,
+			       const struct list_head *head)
+{
+	return list == head;
+}
+
+#define list_entry(ptr, type, member)					\
+	container_of(ptr, type, member)
+
+#define list_first_entry(ptr, type, member)				\
+	list_entry((ptr)->next, type, member)
+
+#define list_next_entry(pos, member)					\
+	list_entry((pos)->member.next, typeof(*(pos)), member)
+
+#define list_entry_is_head(pos, head, member)				\
+	list_is_head(&pos->member, (head))
+
+/* small difference: 'can_loop' has been added in the conditions */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_first_entry(head, typeof(*pos), member);	\
+	     !list_entry_is_head(pos, head, member) && can_loop;	\
+	     pos = list_next_entry(pos, member))
+
+/* mptcp helpers from protocol.h */
+#define mptcp_for_each_subflow(__msk, __subflow)			\
+	list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+
+static __always_inline struct sock *
+mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
+{
+	return subflow->tcp_sock;
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
index 2e28f4a215b5..70302477e326 100644
--- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c
+++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
@@ -4,10 +4,12 @@
 
 /* vmlinux.h, bpf_helpers.h and other 'define' */
 #include "bpf_tracing_net.h"
+#include "mptcp_bpf.h"
 
 char _license[] SEC("license") = "GPL";
 
 char cc[TCP_CA_NAME_MAX] = "reno";
+int pid;
 
 /* Associate a subflow counter to each token */
 struct {
@@ -57,3 +59,70 @@ int mptcp_subflow(struct bpf_sock_ops *skops)
 
 	return 1;
 }
+
+static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
+{
+	struct mptcp_subflow_context *subflow;
+	int i = 0;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk;
+
+		ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
+							   struct mptcp_subflow_context));
+
+		if (ssk->sk_mark != ++i) {
+			ctx->retval = -2;
+			break;
+		}
+	}
+
+	return 1;
+}
+
+static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
+{
+	struct mptcp_subflow_context *subflow;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct inet_connection_sock *icsk;
+		struct sock *ssk;
+
+		ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
+							   struct mptcp_subflow_context));
+		icsk = bpf_core_cast(ssk, struct inet_connection_sock);
+
+		if (ssk->sk_mark == 2 &&
+		    __builtin_memcmp(icsk->icsk_ca_ops->name, cc, TCP_CA_NAME_MAX)) {
+			ctx->retval = -2;
+			break;
+		}
+	}
+
+	return 1;
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt_subflow(struct bpf_sockopt *ctx)
+{
+	struct bpf_sock *sk = ctx->sk;
+	struct mptcp_sock *msk;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	if (!sk || sk->protocol != IPPROTO_MPTCP ||
+	    (!(ctx->level == SOL_SOCKET && ctx->optname == SO_MARK) &&
+	     !(ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION)))
+		return 1;
+
+	msk = bpf_core_cast(sk, struct mptcp_sock);
+	if (msk->pm.subflows != 1) {
+		ctx->retval = -1;
+		return 1;
+	}
+
+	if (ctx->optname == SO_MARK)
+		return _check_getsockopt_subflow_mark(msk, ctx);
+	return _check_getsockopt_subflow_cc(msk, ctx);
+}
-- 
cgit v1.2.3


From 9b85f11efa02f3dc78c60961c0b9cff166516464 Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Thu, 26 Sep 2024 19:30:24 +0200
Subject: selftests/bpf: Add mptcp subflow subtest

This patch adds a subtest named test_subflow in test_mptcp to load and
verify the newly added MPTCP subflow BPF program. The goal is to make
sure it is possible to set different socket options per subflow, while
the userspace socket interface only lets the application set the same
socket options for the whole MPTCP connection and its multiple subflows.

To check that, a client and a server are started in a dedicated netns,
with veth interfaces to simulate multiple paths. They will exchange data
to allow the creation of an additional subflow.

When the different subflows are being created, the new MPTCP subflow BPF
program will set some socket options: marks and TCP CC. The validation
is done by the same program, when the userspace checks the value of the
modified socket options. On the userspace side, it will see that the
default values are still being used on the MPTCP connection, while the
BPF program will see different options set per subflow of the same MPTCP
connection.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/76
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240926-upstream-bpf-next-20240506-mptcp-subflow-test-v7-3-d26029e15cdd@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/mptcp.c | 121 +++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index d2ca32fa3b21..be3cad2aff77 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -5,12 +5,17 @@
 #include <linux/const.h>
 #include <netinet/in.h>
 #include <test_progs.h>
+#include <unistd.h>
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
 #include "mptcp_sock.skel.h"
 #include "mptcpify.skel.h"
+#include "mptcp_subflow.skel.h"
 
 #define NS_TEST "mptcp_ns"
+#define ADDR_1	"10.0.1.1"
+#define ADDR_2	"10.0.1.2"
+#define PORT_1	10001
 
 #ifndef IPPROTO_MPTCP
 #define IPPROTO_MPTCP 262
@@ -335,10 +340,126 @@ fail:
 	close(cgroup_fd);
 }
 
+static int endpoint_init(char *flags)
+{
+	SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST);
+	SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1);
+	SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST);
+	SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2);
+	SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST);
+	if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) {
+		printf("'ip mptcp' not supported, skip this test.\n");
+		test__skip();
+		goto fail;
+	}
+
+	return 0;
+fail:
+	return -1;
+}
+
+static void wait_for_new_subflows(int fd)
+{
+	socklen_t len;
+	u8 subflows;
+	int err, i;
+
+	len = sizeof(subflows);
+	/* Wait max 5 sec for new subflows to be created */
+	for (i = 0; i < 50; i++) {
+		err = getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &subflows, &len);
+		if (!err && subflows > 0)
+			break;
+
+		usleep(100000); /* 0.1s */
+	}
+}
+
+static void run_subflow(void)
+{
+	int server_fd, client_fd, err;
+	char new[TCP_CA_NAME_MAX];
+	char cc[TCP_CA_NAME_MAX];
+	unsigned int mark;
+	socklen_t len;
+
+	server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0);
+	if (!ASSERT_OK_FD(server_fd, "start_mptcp_server"))
+		return;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_OK_FD(client_fd, "connect_to_fd"))
+		goto close_server;
+
+	send_byte(client_fd);
+	wait_for_new_subflows(client_fd);
+
+	len = sizeof(mark);
+	err = getsockopt(client_fd, SOL_SOCKET, SO_MARK, &mark, &len);
+	if (ASSERT_OK(err, "getsockopt(client_fd, SO_MARK)"))
+		ASSERT_EQ(mark, 0, "mark");
+
+	len = sizeof(new);
+	err = getsockopt(client_fd, SOL_TCP, TCP_CONGESTION, new, &len);
+	if (ASSERT_OK(err, "getsockopt(client_fd, TCP_CONGESTION)")) {
+		get_msk_ca_name(cc);
+		ASSERT_STREQ(new, cc, "cc");
+	}
+
+	close(client_fd);
+close_server:
+	close(server_fd);
+}
+
+static void test_subflow(void)
+{
+	struct mptcp_subflow *skel;
+	struct nstoken *nstoken;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup("/mptcp_subflow");
+	if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_subflow"))
+		return;
+
+	skel = mptcp_subflow__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_subflow"))
+		goto close_cgroup;
+
+	skel->bss->pid = getpid();
+
+	skel->links.mptcp_subflow =
+		bpf_program__attach_cgroup(skel->progs.mptcp_subflow, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.mptcp_subflow, "attach mptcp_subflow"))
+		goto skel_destroy;
+
+	skel->links._getsockopt_subflow =
+		bpf_program__attach_cgroup(skel->progs._getsockopt_subflow, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links._getsockopt_subflow, "attach _getsockopt_subflow"))
+		goto skel_destroy;
+
+	nstoken = create_netns();
+	if (!ASSERT_OK_PTR(nstoken, "create_netns: mptcp_subflow"))
+		goto skel_destroy;
+
+	if (endpoint_init("subflow") < 0)
+		goto close_netns;
+
+	run_subflow();
+
+close_netns:
+	cleanup_netns(nstoken);
+skel_destroy:
+	mptcp_subflow__destroy(skel);
+close_cgroup:
+	close(cgroup_fd);
+}
+
 void test_mptcp(void)
 {
 	if (test__start_subtest("base"))
 		test_base();
 	if (test__start_subtest("mptcpify"))
 		test_mptcpify();
+	if (test__start_subtest("subflow"))
+		test_subflow();
 }
-- 
cgit v1.2.3


From 7c2f1c2690a5965fa0913c7b8c1b833dccddbb39 Mon Sep 17 00:00:00 2001
From: zhang jiao <zhangjiao2@cmss.chinamobile.com>
Date: Fri, 27 Sep 2024 12:00:50 +0800
Subject: selftests/net: Add missing va_end.

There is no va_end after va_copy, just add it.

Signed-off-by: zhang jiao <zhangjiao2@cmss.chinamobile.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20240927040050.7851-1-zhangjiao2@cmss.chinamobile.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/tcp_ao/lib/aolib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
index db44e77428dd..5db2f65cddc4 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h
+++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
@@ -46,6 +46,7 @@ static inline char *test_snprintf(const char *fmt, va_list vargs)
 
 	va_copy(tmp, vargs);
 	n = vsnprintf(ret, size, fmt, tmp);
+	va_end(tmp);
 	if (n < 0)
 		return NULL;
 
-- 
cgit v1.2.3


From be4e3235445725546f25f09ace04a4237c72e071 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Mon, 30 Sep 2024 17:12:50 +0200
Subject: selftests: mlxsw: rtnetlink: Use devlink_reload() API

The test runs "devlink reload" explicitly. Instead, it is better to use
devlink_reload() which waits for udev events to be processed. Do not sleep
after reload, as devlink_reload() blocks until all the netdevs are renamed.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/844509e3057b65277a7181a23c95b71ec95e8a56.1727706741.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh
index 893a693ad805..45a569618424 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh
@@ -186,10 +186,7 @@ bridge_vlan_flags_test()
 
 	# If we did not handle references correctly, then this should produce a
 	# trace
-	devlink dev reload "$DEVLINK_DEV"
-
-	# Allow netdevices to be re-created following the reload
-	sleep 20
+	devlink_reload
 
 	log_test "bridge vlan flags"
 }
@@ -923,12 +920,9 @@ devlink_reload_test()
 	# devlink reload can be performed without errors
 	RET=0
 
-	devlink dev reload "$DEVLINK_DEV"
-	check_err $? "devlink reload failed"
+	devlink_reload
 
 	log_test "devlink reload - last test"
-
-	sleep 20
 }
 
 trap cleanup EXIT
-- 
cgit v1.2.3


From d772cc25ccf772f4cbb81270970cbe1356c23d3e Mon Sep 17 00:00:00 2001
From: Sean Anderson <sean.anderson@linux.dev>
Date: Mon, 30 Sep 2024 12:29:34 -0400
Subject: selftests: net: csum: Clean up recv_verify_packet_ipv6

Rename ip_len to payload_len since the length in this case refers only
to the payload, and not the entire IP packet like for IPv4. While we're
at it, just use the variable directly when calling
recv_verify_packet_udp/tcp.

Signed-off-by: Sean Anderson <sean.anderson@linux.dev>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20240930162935.980712-1-sean.anderson@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/csum.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c
index e0a34e5e8dd5..27437590eeb5 100644
--- a/tools/testing/selftests/net/lib/csum.c
+++ b/tools/testing/selftests/net/lib/csum.c
@@ -675,22 +675,20 @@ static int recv_verify_packet_ipv6(void *nh, int len)
 {
 	struct ipv6hdr *ip6h = nh;
 	uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto;
-	uint16_t ip_len;
+	uint16_t payload_len;
 
 	if (len < sizeof(*ip6h) || ip6h->nexthdr != proto)
 		return -1;
 
-	ip_len = ntohs(ip6h->payload_len);
-	if (ip_len > len - sizeof(*ip6h))
+	payload_len = ntohs(ip6h->payload_len);
+	if (payload_len > len - sizeof(*ip6h))
 		return -1;
 
-	len = ip_len;
 	iph_addr_p = &ip6h->saddr;
-
 	if (proto == IPPROTO_TCP)
-		return recv_verify_packet_tcp(ip6h + 1, len);
+		return recv_verify_packet_tcp(ip6h + 1, payload_len);
 	else
-		return recv_verify_packet_udp(ip6h + 1, len);
+		return recv_verify_packet_udp(ip6h + 1, payload_len);
 }
 
 /* return whether auxdata includes TP_STATUS_CSUM_VALID */
-- 
cgit v1.2.3


From d002b922c4d5d695d617ec262f3e07cd62ee866e Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@pm.me>
Date: Mon, 16 Sep 2024 19:59:22 +0000
Subject: selftests/bpf: Remove test_skb_cgroup_id.sh from TEST_PROGS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_skb_cgroup_id.sh was deleted in
https://git.kernel.org/bpf/bpf-next/c/f957c230e173

It has to be removed from TEST_PROGS variable in
tools/testing/selftests/bpf/Makefile, otherwise install target fails.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/bpf/20240916195919.1872371-1-ihor.solodrai@pm.me
Link: https://lore.kernel.org/bpf/Q3BN2kW9Kgy6LkrDOwnyY4Pv7_YF8fInLCd2_QA3LimKYM3wD64kRdnwp7blwG2dI_s7UGnfUae-4_dOmuTrxpYCi32G_KTzB3PfmxIerH8=@pm.me/
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f04af11df8eb..df75f1beb731 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -132,7 +132,6 @@ TEST_PROGS := test_kmod.sh \
 	test_tunnel.sh \
 	test_lwt_seg6local.sh \
 	test_lirc_mode2.sh \
-	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
 	test_xdp_vlan_mode_generic.sh \
 	test_xdp_vlan_mode_native.sh \
-- 
cgit v1.2.3


From fd4a0e67838c1e0fc4927fae113d785aa893997d Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@pm.me>
Date: Mon, 16 Sep 2024 19:59:27 +0000
Subject: selftests/bpf: Set vpath in Makefile to search for skels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-dependencies generated for %.test.o files refer to skels using
filenames as opposed to full paths. This requires make to be able to
link this name to an actual path, because not all generated skels are
put in the working directory.

In the original patch [1], this was mitigated by this target:

$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h
	@true

This turned out to be insufficient.

First, %.lskel.h and %.subskel.h were missed, because a typical
selftests/bpf build could find these files in the working directory.
This error was detected by an out-of-tree build [2].

Second, even with missing rules added, this target causes unnecessary
rebuilds in the out-of-tree case, as X.skel.h is searched for in the
working directory, and not in the $(OUTPUT).

Using vpath directive [3] is a better solution. Instead of introducing
a separate target (X.skel.h in addition to $(TRUNNER_OUTPUT)/X.skel.h),
make is instructed to search for skels in the output, which allows make
to correctly detect that skel has already been generated.

[1]: https://lore.kernel.org/bpf/VJihUTnvtwEgv_mOnpfy7EgD9D2MPNoHO-MlANeLIzLJPGhDeyOuGKIYyKgk0O6KPjfM-MuhtvPwZcngN8WFqbTnTRyCSMc2aMZ1ODm1T_g=@pm.me/
[2]: https://lore.kernel.org/bpf/CIjrhJwoIqMc2IhuppVqh4ZtJGbx8kC8rc9PHhAIU6RccnWT4I04F_EIr4GxQwxZe89McuGJlCnUk9UbkdvWtSJjAsd7mHmnTy9F8K2TLZM=@pm.me/
[3]: https://www.gnu.org/software/make/manual/html_node/Selective-Search.html

Reported-by: Björn Töpel <bjorn@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/bpf/20240916195919.1872371-2-ihor.solodrai@pm.me
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index df75f1beb731..365740f24d2e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -622,10 +622,11 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TR
 
 # When the compiler generates a %.d file, only skel basenames (not
 # full paths) are specified as prerequisites for corresponding %.o
-# file. This target makes %.skel.h basename dependent on full paths,
-# linking generated %.d dependency with actual %.skel.h files.
-$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h
-	@true
+# file. vpath directives below instruct make to search for skel files
+# in TRUNNER_OUTPUT, if they are not present in the working directory.
+vpath %.skel.h $(TRUNNER_OUTPUT)
+vpath %.lskel.h $(TRUNNER_OUTPUT)
+vpath %.subskel.h $(TRUNNER_OUTPUT)
 
 endif
 
-- 
cgit v1.2.3


From 4b7c05598a644782b8451e415bb56f31e5c9d3ee Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 24 Sep 2024 13:07:30 +0200
Subject: selftests/bpf: Fix uprobe consumer test

With newly merged code the uprobe behaviour is slightly different
and affects uprobe consumer test.

We no longer need to check if the uprobe object is still preserved
after removing last uretprobe, because it stays as long as there's
pending/installed uretprobe instance.

This allows uretprobe consumers registered 'after' the uprobe was hit to
run, even if the previous uretprobe got unregistered before being hit.

The uprobe object will now be removed after the last uprobe ref is
released, and in such a case it is held by ri->uprobe (return instance),
which is released after the uretprobe is hit.

Reported-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Ihor Solodrai <ihor.solodrai@pm.me>
Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 844f6fc8487b..c1ac813ff9ba 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel,
 			fmt = "prog 0/1: uprobe";
 		} else {
 			/*
-			 * uprobe return is tricky ;-)
-			 *
 			 * to trigger uretprobe consumer, the uretprobe needs to be installed,
 			 * which means one of the 'return' uprobes was alive when probe was hit:
 			 *
 			 *   idxs: 2/3 uprobe return in 'installed' mask
-			 *
-			 * in addition if 'after' state removes everything that was installed in
-			 * 'before' state, then uprobe kernel object goes away and return uprobe
-			 * is not installed and we won't hit it even if it's in 'after' state.
 			 */
 			unsigned long had_uretprobes  = before & 0b1100; /* is uretprobe installed */
-			unsigned long probe_preserved = before & after;  /* did uprobe go away */
 
-			if (had_uretprobes && probe_preserved && test_bit(idx, after))
+			if (had_uretprobes && test_bit(idx, after))
 				val++;
 			fmt = "idx 2/3: uretprobe";
 		}
-- 
cgit v1.2.3


From 58dbb36930183aea41024d9c0b0ed97629473e20 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 24 Sep 2024 13:07:31 +0200
Subject: selftests/bpf: Bail out quickly from failing consumer test

Let's bail out from the consumer test after we hit the first failure,
so we don't pollute the log with many instances of possibly
the same error.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index c1ac813ff9ba..2c39902b8a09 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -836,10 +836,10 @@ uprobe_consumer_test(struct uprobe_multi_consumers *skel,
 	return 0;
 }
 
-static void consumer_test(struct uprobe_multi_consumers *skel,
-			  unsigned long before, unsigned long after)
+static int consumer_test(struct uprobe_multi_consumers *skel,
+			 unsigned long before, unsigned long after)
 {
-	int err, idx;
+	int err, idx, ret = -1;
 
 	printf("consumer_test before %lu after %lu\n", before, after);
 
@@ -881,13 +881,17 @@ static void consumer_test(struct uprobe_multi_consumers *skel,
 			fmt = "idx 2/3: uretprobe";
 		}
 
-		ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt);
+		if (!ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt))
+			goto cleanup;
 		skel->bss->uprobe_result[idx] = 0;
 	}
 
+	ret = 0;
+
 cleanup:
 	for (idx = 0; idx < 4; idx++)
 		uprobe_detach(skel, idx);
+	return ret;
 }
 
 static void test_consumers(void)
@@ -939,9 +943,11 @@ static void test_consumers(void)
 
 	for (before = 0; before < 16; before++) {
 		for (after = 0; after < 16; after++)
-			consumer_test(skel, before, after);
+			if (consumer_test(skel, before, after))
+				goto out;
 	}
 
+out:
 	uprobe_multi_consumers__destroy(skel);
 }
 
-- 
cgit v1.2.3


From a1ec23b947538520b3182c598dc2bb9930d032b1 Mon Sep 17 00:00:00 2001
From: Zhang Jiao <zhangjiao2@cmss.chinamobile.com>
Date: Tue, 24 Sep 2024 12:55:34 +0800
Subject: selftests/bpf: Add missing va_end.

There is no va_end after va_copy, just add it.

Signed-off-by: Zhang Jiao <zhangjiao2@cmss.chinamobile.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240924045534.8672-1-zhangjiao2@cmss.chinamobile.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_progs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c7a70e1a1085..7846f7f98908 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -868,6 +868,7 @@ static int libbpf_print_fn(enum libbpf_print_level level,
 
 		va_copy(args2, args);
 		vfprintf(libbpf_capture_stream, format, args2);
+		va_end(args2);
 	}
 
 	if (env.verbosity < VERBOSE_VERY && level == LIBBPF_DEBUG)
-- 
cgit v1.2.3


From 78971150660650cd22ef236c708aab3a7620e2fa Mon Sep 17 00:00:00 2001
From: Manu Bretelle <chantr4@gmail.com>
Date: Tue, 24 Sep 2024 17:22:10 -0700
Subject: selftests/bpf: vm: Add support for VIRTIO_FS

danobi/vmtest is going to migrate from using 9p to using virtio_fs to
mount the local rootfs: https://github.com/danobi/vmtest/pull/88

BPF CI uses danobi/vmtest to run bpf selftests and will need to support
VIRTIO_FS.

This change enables new kconfigs to be able to support the upcoming
danobi/vmtest.

Tested by building a new kernel with those configs and confirming it
would successfully run with 9p (currently what is used by vmtest), and
with virtio_fs (using a local build of vmtest).

  $ vmtest -k arch/x86/boot/bzImage "findmnt /"
  => bzImage
  ===> Booting
  ===> Setting up VM
  ===> Running command
  TARGET SOURCE    FSTYPE OPTIONS
  /      /dev/root 9p     rw,relatime,cache=5,access=client,msize=512000,trans=virtio
  $ /home/chantra/local/danobi-vmtest/target/debug/vmtest -k arch/x86/boot/bzImage "findmnt /"
  => bzImage
  ===> Initializing host environment
  ===> Booting
  ===> Setting up VM
  ===> Running command
  TARGET SOURCE FSTYPE   OPTIONS
  /      rootfs virtiofs rw,relatime

Changes in v2:
* Sorted configs alphabetically

Signed-off-by: Manu Bretelle <chantr4@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/bpf/20240925002210.501266-1-chantr4@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/config.vm | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/config.vm b/tools/testing/selftests/bpf/config.vm
index a9746ca78777..da543b24c144 100644
--- a/tools/testing/selftests/bpf/config.vm
+++ b/tools/testing/selftests/bpf/config.vm
@@ -1,12 +1,15 @@
-CONFIG_9P_FS=y
 CONFIG_9P_FS_POSIX_ACL=y
 CONFIG_9P_FS_SECURITY=y
+CONFIG_9P_FS=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
-CONFIG_NET_9P=y
+CONFIG_FUSE_FS=y
+CONFIG_FUSE_PASSTHROUGH=y
 CONFIG_NET_9P_VIRTIO=y
+CONFIG_NET_9P=y
 CONFIG_VIRTIO_BALLOON=y
 CONFIG_VIRTIO_BLK=y
 CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_FS=y
 CONFIG_VIRTIO_NET=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_VSOCKETS_COMMON=y
-- 
cgit v1.2.3


From c27d8235ba97139d7a085367ff57773902eb3fc5 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 26 Sep 2024 15:49:48 +0100
Subject: selftests/bpf: Fix uprobe_multi compilation error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building selftests, the following was seen:

uprobe_multi.c: In function ‘trigger_uprobe’:
uprobe_multi.c:108:40: error: ‘MADV_PAGEOUT’ undeclared (first use in this function)
  108 |                 madvise(addr, page_sz, MADV_PAGEOUT);
      |                                        ^~~~~~~~~~~~
uprobe_multi.c:108:40: note: each undeclared identifier is reported only once for each function it appears in
make: *** [Makefile:850: bpf-next/tools/testing/selftests/bpf/uprobe_multi] Error 1

...even with updated UAPI headers. It seems the above value is
defined in UAPI <linux/mman.h> but including that file triggers
other redefinition errors.  Simplest solution is to add a
guarded definition, as was done for MADV_POPULATE_READ.

Fixes: 3c217a182018 ("selftests/bpf: add build ID tests")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240926144948.172090-1-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/uprobe_multi.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c
index c7828b13e5ff..dd38dc68f635 100644
--- a/tools/testing/selftests/bpf/uprobe_multi.c
+++ b/tools/testing/selftests/bpf/uprobe_multi.c
@@ -12,6 +12,10 @@
 #define MADV_POPULATE_READ 22
 #endif
 
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+
 int __attribute__((weak)) uprobe(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 5a63c33d6f00e1739944cea2f445819951610c7d Mon Sep 17 00:00:00 2001
From: Tony Ambardar <tony.ambardar@gmail.com>
Date: Mon, 16 Sep 2024 01:37:47 -0700
Subject: selftests/bpf: Support cross-endian building

Update Makefile build rules to compile BPF programs with target endianness
rather than host byte-order. With recent changes, this allows building the
full selftests/bpf suite hosted on x86_64 and targeting s390x or mips64eb
for example.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/880ccc6342cfc4d3c48b44f581e87adfbce2876e.1726475448.git.tony.ambardar@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 365740f24d2e..e295e3df5ec6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -445,6 +445,7 @@ endef
 IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
 			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
 MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)
+BPF_TARGET_ENDIAN=$(if $(IS_LITTLE_ENDIAN),--target=bpfel,--target=bpfeb)
 
 ifneq ($(CROSS_COMPILE),)
 CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
@@ -472,17 +473,17 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h
 # $4 - binary name
 define CLANG_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v3 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2
 endef
 # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32
 define CLANG_NOALU32_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v2 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2
 endef
 # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4
 define CLANG_CPUV4_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v4 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2
 endef
 # Build BPF object using GCC
 define GCC_BPF_BUILD_RULE
-- 
cgit v1.2.3


From a5da3d65681f86f582420b5aea49c1d9a7c7e51e Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Tue, 1 Oct 2024 00:15:22 +0100
Subject: selftests/bpf: Emit top frequent code lines in veristat

Production BPF programs are increasing in number of instructions and states
to the point where optimising the verification process for them is
necessary to avoid running into the instruction limit. Authors of those BPF
programs need to analyze verifier output, for example by collecting the
most frequent source code lines, to understand which part of the program
has the biggest verification cost.

This patch introduces the `--top-src-lines` flag in veristat.
`--top-src-lines=N` makes veristat output the N most frequent source code
lines, parsed from the verification log.

An example of output:
```
sudo ./veristat  --top-src-lines=2   bpf_flow.bpf.o
Processing 'bpf_flow.bpf.o'...
Top source lines (_dissect):
    4: (bpf_helpers.h:161)	asm volatile("r1 = %[ctx]\n\t"
    4: (bpf_flow.c:155)	if (iph && iph->ihl == 5 &&
...
```

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240930231522.58650-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 129 ++++++++++++++++++++++++++++++++-
 1 file changed, 128 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 1ec5c4c47235..c8efd44590d9 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -179,6 +179,7 @@ static struct env {
 	int files_skipped;
 	int progs_processed;
 	int progs_skipped;
+	int top_src_lines;
 } env;
 
 static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
@@ -228,6 +229,7 @@ static const struct argp_option opts[] = {
 	  "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" },
 	{ "test-reg-invariants", 'r', NULL, 0,
 	  "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" },
+	{ "top-src-lines", 'S', "N", 0, "Emit N most frequent source code lines" },
 	{},
 };
 
@@ -327,6 +329,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 			return err;
 		}
 		break;
+	case 'S':
+		errno = 0;
+		env.top_src_lines = strtol(arg, NULL, 10);
+		if (errno) {
+			fprintf(stderr, "invalid top lines N specifier: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
 	case ARGP_KEY_ARG:
 		tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames));
 		if (!tmp)
@@ -854,6 +864,118 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *
 	return 0;
 }
 
+struct line_cnt {
+	char *line;
+	int cnt;
+};
+
+static int str_cmp(const void *a, const void *b)
+{
+	const char **str1 = (const char **)a;
+	const char **str2 = (const char **)b;
+
+	return strcmp(*str1, *str2);
+}
+
+/* qsort() comparator: highest cnt first, ties broken by line text. */
+static int line_cnt_cmp(const void *a, const void *b)
+{
+	const struct line_cnt *a_cnt = a, *b_cnt = b;
+
+	if (a_cnt->cnt != b_cnt->cnt)
+		return a_cnt->cnt > b_cnt->cnt ? -1 : 1;
+	return strcmp(a_cnt->line, b_cnt->line);
+}
+
+static int print_top_src_lines(char * const buf, size_t buf_sz, const char *prog_name)
+{
+	int lines_cap = 0;
+	int lines_size = 0;
+	char **lines = NULL;
+	char *line = NULL;
+	char *state;
+	struct line_cnt *freq = NULL;
+	struct line_cnt *cur;
+	int unique_lines;
+	int err = 0;
+	int i;
+
+	while ((line = strtok_r(line ? NULL : buf, "\n", &state))) {
+		if (strncmp(line, "; ", 2) != 0)
+			continue;
+		line += 2;
+
+		if (lines_size == lines_cap) {
+			char **tmp;
+
+			lines_cap = max(16, lines_cap * 2);
+			tmp = realloc(lines, lines_cap * sizeof(*tmp));
+			if (!tmp) {
+				err = -ENOMEM;
+				goto cleanup;
+			}
+			lines = tmp;
+		}
+		lines[lines_size] = line;
+		lines_size++;
+	}
+
+	if (lines_size == 0)
+		goto cleanup;
+
+	qsort(lines, lines_size, sizeof(*lines), str_cmp);
+
+	freq = calloc(lines_size, sizeof(*freq));
+	if (!freq) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+
+	cur = freq;
+	cur->line = lines[0];
+	cur->cnt = 1;
+	for (i = 1; i < lines_size; ++i) {
+		if (strcmp(lines[i], cur->line) != 0) {
+			cur++;
+			cur->line = lines[i];
+			cur->cnt = 0;
+		}
+		cur->cnt++;
+	}
+	unique_lines = cur - freq + 1;
+
+	qsort(freq, unique_lines, sizeof(struct line_cnt), line_cnt_cmp);
+
+	printf("Top source lines (%s):\n", prog_name);
+	for (i = 0; i < min(unique_lines, env.top_src_lines); ++i) {
+		const char *src_code = freq[i].line;
+		const char *src_line = NULL;
+		char *split = strrchr(freq[i].line, '@');
+
+		if (split) {
+			src_line = split + 1;
+
+			while (*src_line && isspace(*src_line))
+				src_line++;
+
+			while (split > src_code && isspace(*split))
+				split--;
+			*split = '\0';
+		}
+
+		if (src_line)
+			printf("%5d: (%s)\t%s\n", freq[i].cnt, src_line, src_code);
+		else
+			printf("%5d: %s\n", freq[i].cnt, src_code);
+	}
+	printf("\n");
+
+cleanup:
+	free(freq);
+	free(lines);
+	return err;
+}
+
 static int guess_prog_type_by_ctx_name(const char *ctx_name,
 				       enum bpf_prog_type *prog_type,
 				       enum bpf_attach_type *attach_type)
@@ -1009,13 +1131,16 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	stats = &env.prog_stats[env.prog_stat_cnt++];
 	memset(stats, 0, sizeof(*stats));
 
-	if (env.verbose) {
+	if (env.verbose || env.top_src_lines > 0) {
 		buf_sz = env.log_size ? env.log_size : 16 * 1024 * 1024;
 		buf = malloc(buf_sz);
 		if (!buf)
 			return -ENOMEM;
 		/* ensure we always request stats */
 		log_level = env.log_level | 4 | (env.log_fixed ? 8 : 0);
+		/* --top-src-lines needs verifier log */
+		if (env.top_src_lines > 0 && env.log_level == 0)
+			log_level |= 2;
 	} else {
 		buf = verif_log_buf;
 		buf_sz = sizeof(verif_log_buf);
@@ -1048,6 +1173,8 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 		       filename, prog_name, stats->stats[DURATION],
 		       err ? "failure" : "success", buf);
 	}
+	if (env.top_src_lines > 0)
+		print_top_src_lines(buf, buf_sz, stats->prog_name);
 
 	if (verif_log_buf != buf)
 		free(buf);
-- 
cgit v1.2.3


From 7a2f671db61f32de0671eeb163a7764e5a258114 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:08 +0100
Subject: kselftest/arm64: Verify the GCS hwcap

Add coverage of the GCS hwcap to the hwcap selftest, using a read of
GCSPR_EL0 to generate SIGILL without having to worry about enabling GCS.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-29-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index f2d6007a2b98..1f07772ae578 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -98,6 +98,17 @@ static void fpmr_sigill(void)
 	asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0");
 }
 
+static void gcs_sigill(void)
+{
+	unsigned long *gcspr;
+
+	asm volatile(
+		"mrs	%0, S3_3_C2_C5_1"
+	: "=r" (gcspr)
+	:
+	: "cc");
+}
+
 static void ilrcpc_sigill(void)
 {
 	/* LDAPUR W0, [SP, #8] */
@@ -534,6 +545,14 @@ static const struct hwcap_data {
 		.sigill_fn = fpmr_sigill,
 		.sigill_reliable = true,
 	},
+	{
+		.name = "GCS",
+		.at_hwcap = AT_HWCAP,
+		.hwcap_bit = HWCAP_GCS,
+		.cpuinfo = "gcs",
+		.sigill_fn = gcs_sigill,
+		.sigill_reliable = true,
+	},
 	{
 		.name = "JSCVT",
 		.at_hwcap = AT_HWCAP,
-- 
cgit v1.2.3


From b2d2f11ff5d69cd4b3585ddab4bec9f69503f680 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:09 +0100
Subject: kselftest/arm64: Add GCS as a detected feature in the signal tests

In preparation for testing GCS related signal handling add it as a feature
we check for in the signal handling support code.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-30-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/test_signals.h       | 2 ++
 tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h
index 1e6273d81575..7ada43688c02 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -35,6 +35,7 @@ enum {
 	FSME_BIT,
 	FSME_FA64_BIT,
 	FSME2_BIT,
+	FGCS_BIT,
 	FMAX_END
 };
 
@@ -43,6 +44,7 @@ enum {
 #define FEAT_SME		(1UL << FSME_BIT)
 #define FEAT_SME_FA64		(1UL << FSME_FA64_BIT)
 #define FEAT_SME2		(1UL << FSME2_BIT)
+#define FEAT_GCS		(1UL << FGCS_BIT)
 
 /*
  * A descriptor used to describe and configure a test case.
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 0dc948db3a4a..dcc49e3ce1eb 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = {
 	" SME ",
 	" FA64 ",
 	" SME2 ",
+	" GCS ",
 };
 
 #define MAX_FEATS_SZ	128
@@ -329,6 +330,8 @@ int test_init(struct tdescr *td)
 			td->feats_supported |= FEAT_SME_FA64;
 		if (getauxval(AT_HWCAP2) & HWCAP2_SME2)
 			td->feats_supported |= FEAT_SME2;
+		if (getauxval(AT_HWCAP) & HWCAP_GCS)
+			td->feats_supported |= FEAT_GCS;
 		if (feats_ok(td)) {
 			if (td->feats_required & td->feats_supported)
 				fprintf(stderr,
-- 
cgit v1.2.3


From 0d426f7dd9a0d88aa39c1dd54a6bf10f0466c6b9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:10 +0100
Subject: kselftest/arm64: Add framework support for GCS to signal handling
 tests

Teach the framework about the GCS signal context, avoiding warnings on
the unknown context.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-31-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/testcases.c | 7 +++++++
 tools/testing/selftests/arm64/signal/testcases/testcases.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index e6daa94fcd2e..0c1a6b26afac 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 				*err = "Bad size for fpmr_context";
 			new_flags |= FPMR_CTX;
 			break;
+		case GCS_MAGIC:
+			if (flags & GCS_CTX)
+				*err = "Multiple GCS_MAGIC";
+			if (head->size != sizeof(struct gcs_context))
+				*err = "Bad size for gcs_context";
+			new_flags |= GCS_CTX;
+			break;
 		case EXTRA_MAGIC:
 			if (flags & EXTRA_CTX)
 				*err = "Multiple EXTRA_MAGIC";
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index 9872b8912714..98b97efdda23 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -20,6 +20,7 @@
 #define EXTRA_CTX	(1 << 3)
 #define ZT_CTX		(1 << 4)
 #define FPMR_CTX	(1 << 5)
+#define GCS_CTX		(1 << 6)
 
 #define KSFT_BAD_MAGIC	0xdeadbeef
 
-- 
cgit v1.2.3


From 956573ac189066a32326245ebf5abf35b64a490f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:11 +0100
Subject: kselftest/arm64: Allow signals tests to specify an expected si_code

Currently we ignore si_code unless the expected signal is a SIGSEGV, in
which case we enforce it being SEGV_ACCERR. Allow test cases to specify
exactly which si_code should be generated so we can validate this, and
test for other segfault codes.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-32-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../testing/selftests/arm64/signal/test_signals.h  |  4 +++
 .../selftests/arm64/signal/test_signals_utils.c    | 29 ++++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h
index 7ada43688c02..ee75a2c25ce7 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -71,6 +71,10 @@ struct tdescr {
 	 * Zero when no signal is expected on success
 	 */
 	int			sig_ok;
+	/*
+	 * expected si_code for sig_ok, or 0 to not check
+	 */
+	int			sig_ok_code;
 	/* signum expected on unsupported CPU features. */
 	int			sig_unsupp;
 	/* a timeout in second for test completion */
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index dcc49e3ce1eb..5d3621921cfe 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -143,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td,
 			"current->token ZEROED...test is probably broken!\n");
 		abort();
 	}
-	/*
-	 * Trying to narrow down the SEGV to the ones generated by Kernel itself
-	 * via arm64_notify_segfault(). This is a best-effort check anyway, and
-	 * the si_code check may need to change if this aspect of the kernel
-	 * ABI changes.
-	 */
-	if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
-		fprintf(stdout,
-			"si_code != SEGV_ACCERR...test is probably broken!\n");
-		abort();
+	if (td->sig_ok_code) {
+		if (si->si_code != td->sig_ok_code) {
+			fprintf(stdout, "si_code is %d not %d\n",
+				si->si_code, td->sig_ok_code);
+			abort();
+		}
+	} else {
+		/*
+		 * Trying to narrow down the SEGV to the ones
+		 * generated by Kernel itself via
+		 * arm64_notify_segfault(). This is a best-effort
+		 * check anyway, and the si_code check may need to
+		 * change if this aspect of the kernel ABI changes.
+		 */
+		if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
+			fprintf(stdout,
+				"si_code != SEGV_ACCERR...test is probably broken!\n");
+			abort();
+		}
 	}
 	td->pass = 1;
 	/*
-- 
cgit v1.2.3


From 42155a8eb0f63f634a98ad17a85e9f2826bcff11 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:12 +0100
Subject: kselftest/arm64: Always run signals tests with GCS enabled

Since it is not possible to return from the function that enabled GCS
without disabling GCS it is very inconvenient to use the signal handling
tests to cover GCS when GCS is not enabled by the toolchain and runtime,
something that no current distribution does. Since none of the testcases
do anything with stacks that would cause problems with GCS we can sidestep
this issue by unconditionally enabling GCS on startup and exiting with a
call to exit() rather than a return from main().

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-33-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../testing/selftests/arm64/signal/test_signals.c  | 17 ++++++++++++-
 .../selftests/arm64/signal/test_signals_utils.h    | 29 ++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c
index 00051b40d71e..1304c8ec0f2f 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.c
+++ b/tools/testing/selftests/arm64/signal/test_signals.c
@@ -7,6 +7,10 @@
  * Each test provides its own tde struct tdescr descriptor to link with
  * this wrapper. Framework provides common helpers.
  */
+
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+
 #include <kselftest.h>
 
 #include "test_signals.h"
@@ -16,6 +20,16 @@ struct tdescr *current = &tde;
 
 int main(int argc, char *argv[])
 {
+	/*
+	 * Ensure GCS is at least enabled throughout the tests if
+	 * supported, otherwise the inability to return from the
+	 * function that enabled GCS makes it very inconvenient to set
+	 * up test cases.  The prctl() may fail if GCS was locked by
+	 * libc setup code.
+	 */
+	if (getauxval(AT_HWCAP) & HWCAP_GCS)
+		gcs_set_state(PR_SHADOW_STACK_ENABLE);
+
 	ksft_print_msg("%s :: %s\n", current->name, current->descr);
 	if (test_setup(current) && test_init(current)) {
 		test_run(current);
@@ -23,5 +37,6 @@ int main(int argc, char *argv[])
 	}
 	test_result(current);
 
-	return current->result;
+	/* Do not return in case GCS was enabled */
+	exit(current->result);
 }
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 762c8fe9c54a..1e80808ee105 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -18,6 +18,35 @@ void test_cleanup(struct tdescr *td);
 int test_run(struct tdescr *td);
 void test_result(struct tdescr *td);
 
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+/*
+ * The prctl takes 1 argument but we need to ensure that the other
+ * values passed in registers to the syscall are zero since the kernel
+ * validates them.
+ */
+#define gcs_set_state(state)					\
+	({								\
+		register long _num  __asm__ ("x8") = __NR_prctl;	\
+		register long _arg1 __asm__ ("x0") =  PR_SET_SHADOW_STACK_STATUS; \
+		register long _arg2 __asm__ ("x1") = (long)(state);	\
+		register long _arg3 __asm__ ("x2") = 0;			\
+		register long _arg4 __asm__ ("x3") = 0;			\
+		register long _arg5 __asm__ ("x4") = 0;			\
+	                                                                      \
+		__asm__  volatile (					\
+			"svc #0\n"					\
+			: "=r"(_arg1)					\
+			: "r"(_arg1), "r"(_arg2),			\
+			  "r"(_arg3), "r"(_arg4),			\
+			  "r"(_arg5), "r"(_num)				\
+			: "memory", "cc"				\
+			);						\
+		_arg1;							\
+	})
+
 static inline bool feats_ok(struct tdescr *td)
 {
 	if (td->feats_incompatible & td->feats_supported)
-- 
cgit v1.2.3


From 3d37d4307e0fc958c4461bb6973ce5573d1570c2 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:13 +0100
Subject: kselftest/arm64: Add very basic GCS test program

This test program just covers the basic GCS ABI, covering aspects of the
ABI as standalone features without attempting to integrate things.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-34-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/Makefile        |   2 +-
 tools/testing/selftests/arm64/gcs/.gitignore  |   1 +
 tools/testing/selftests/arm64/gcs/Makefile    |  18 ++
 tools/testing/selftests/arm64/gcs/basic-gcs.c | 357 ++++++++++++++++++++++++++
 tools/testing/selftests/arm64/gcs/gcs-util.h  |  90 +++++++
 5 files changed, 467 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/gcs/.gitignore
 create mode 100644 tools/testing/selftests/arm64/gcs/Makefile
 create mode 100644 tools/testing/selftests/arm64/gcs/basic-gcs.c
 create mode 100644 tools/testing/selftests/arm64/gcs/gcs-util.h

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index 28b93cab8c0d..22029e60eff3 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs
 else
 ARM64_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
new file mode 100644
index 000000000000..0e5e695ecba5
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -0,0 +1 @@
+basic-gcs
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
new file mode 100644
index 000000000000..61a30f483429
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2023 ARM Limited
+#
+# In order to avoid interaction with the toolchain and dynamic linker the
+# portions of these tests that interact with the GCS are implemented using
+# nolibc.
+#
+
+TEST_GEN_PROGS := basic-gcs
+
+include ../../lib.mk
+
+$(OUTPUT)/basic-gcs: basic-gcs.c
+	$(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
+		-static -include ../../../../include/nolibc/nolibc.h \
+		-I../../../../../usr/include \
+		-std=gnu99 -I../.. -g \
+		-ffreestanding -Wall $^ -o $@ -lgcc
diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c
new file mode 100644
index 000000000000..3fb9742342a3
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#include <limits.h>
+#include <stdbool.h>
+
+#include <linux/prctl.h>
+
+#include <sys/mman.h>
+#include <asm/mman.h>
+#include <linux/sched.h>
+
+#include "kselftest.h"
+#include "gcs-util.h"
+
+/* nolibc doesn't have sysconf(), just hard code the maximum */
+static size_t page_size = 65536;
+
+static  __attribute__((noinline)) void valid_gcs_function(void)
+{
+	/* Do something the compiler can't optimise out */
+	my_syscall1(__NR_prctl, PR_SVE_GET_VL);
+}
+
+static inline int gcs_set_status(unsigned long mode)
+{
+	bool enabling = mode & PR_SHADOW_STACK_ENABLE;
+	int ret;
+	unsigned long new_mode;
+
+	/*
+	 * The prctl takes 1 argument but we need to ensure that the
+	 * other 3 values passed in registers to the syscall are zero
+	 * since the kernel validates them.
+	 */
+	ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode,
+			  0, 0, 0);
+
+	if (ret == 0) {
+		ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+				  &new_mode, 0, 0, 0);
+		if (ret == 0) {
+			if (new_mode != mode) {
+				ksft_print_msg("Mode set to %lx not %lx\n",
+					       new_mode, mode);
+				ret = -EINVAL;
+			}
+		} else {
+			ksft_print_msg("Failed to validate mode: %d\n", ret);
+		}
+
+		if (enabling != chkfeat_gcs()) {
+			ksft_print_msg("%senabled by prctl but %senabled in CHKFEAT\n",
+				       enabling ? "" : "not ",
+				       chkfeat_gcs() ? "" : "not ");
+			ret = -EINVAL;
+		}
+	}
+
+	return ret;
+}
+
+/* Try to read the status */
+static bool read_status(void)
+{
+	unsigned long state;
+	int ret;
+
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+			  &state, 0, 0, 0);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read state: %d\n", ret);
+		return false;
+	}
+
+	return state & PR_SHADOW_STACK_ENABLE;
+}
+
+/* Just a straight enable */
+static bool base_enable(void)
+{
+	int ret;
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+	if (ret) {
+		ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret);
+		return false;
+	}
+
+	return true;
+}
+
+/* Check we can read GCSPR_EL0 when GCS is enabled */
+static bool read_gcspr_el0(void)
+{
+	unsigned long *gcspr_el0;
+
+	ksft_print_msg("GET GCSPR\n");
+	gcspr_el0 = get_gcspr();
+	ksft_print_msg("GCSPR_EL0 is %p\n", gcspr_el0);
+
+	return true;
+}
+
+/* Also allow writes to stack */
+static bool enable_writeable(void)
+{
+	int ret;
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE);
+	if (ret) {
+		ksft_print_msg("PR_SHADOW_STACK_ENABLE writeable failed: %d\n", ret);
+		return false;
+	}
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+	if (ret) {
+		ksft_print_msg("failed to restore plain enable %d\n", ret);
+		return false;
+	}
+
+	return true;
+}
+
+/* Enable with push/pop also allowed, then drop back to a plain enable */
+static bool enable_push_pop(void)
+{
+	int ret;
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH);
+	if (ret) {
+		ksft_print_msg("PR_SHADOW_STACK_ENABLE with push failed: %d\n",
+			       ret);
+		return false;
+	}
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+	if (ret) {
+		ksft_print_msg("failed to restore plain enable %d\n", ret);
+		return false;
+	}
+
+	return true;
+}
+
+/* Enable GCS and allow everything */
+static bool enable_all(void)
+{
+	int ret;
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH |
+			     PR_SHADOW_STACK_WRITE);
+	if (ret) {
+		ksft_print_msg("PR_SHADOW_STACK_ENABLE with everything failed: %d\n",
+			       ret);
+		return false;
+	}
+
+	ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+	if (ret) {
+		ksft_print_msg("failed to restore plain enable %d\n", ret);
+		return false;
+	}
+
+	return true;
+}
+
+static bool enable_invalid(void)
+{
+	int ret = gcs_set_status(ULONG_MAX);
+	if (ret == 0) {
+		ksft_print_msg("GCS_SET_STATUS %lx succeeded\n", ULONG_MAX);
+		return false;
+	}
+
+	return true;
+}
+
+/* Map a GCS and check its marker, cap token and otherwise zeroed contents */
+static bool map_guarded_stack(void)
+{
+	int ret;
+	uint64_t *buf, expected_cap;
+	int elem;
+	bool pass = true;
+
+	buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size,
+				  SHADOW_STACK_SET_MARKER |
+				  SHADOW_STACK_SET_TOKEN);
+	/* Raw syscall: failure comes back as -errno, not MAP_FAILED + errno */
+	if ((unsigned long)buf >= (unsigned long)-4095L) {
+		ksft_print_msg("Failed to map %lu byte GCS: %d\n",
+			       page_size, (int)(long)buf);
+		return false;
+	}
+	ksft_print_msg("Mapped GCS at %p-%p\n", buf,
+		       (void *)((uint64_t)buf + page_size));
+
+	/* The top of the newly allocated region should be 0 */
+	elem = (page_size / sizeof(uint64_t)) - 1;
+	if (buf[elem]) {
+		ksft_print_msg("Last entry is 0x%llx not 0x0\n", buf[elem]);
+		pass = false;
+	}
+
+	/* Then a valid cap token */
+	elem--;
+	expected_cap = ((uint64_t)buf + page_size - 16);
+	expected_cap &= GCS_CAP_ADDR_MASK;
+	expected_cap |= GCS_CAP_VALID_TOKEN;
+	if (buf[elem] != expected_cap) {
+		ksft_print_msg("Cap entry is 0x%llx not 0x%llx\n",
+			       buf[elem], expected_cap);
+		pass = false;
+	}
+	ksft_print_msg("cap token is 0x%llx\n", buf[elem]);
+
+	/* The rest should be zeros */
+	for (elem = 0; elem < page_size / sizeof(uint64_t) - 2; elem++) {
+		if (!buf[elem])
+			continue;
+		ksft_print_msg("GCS slot %d is 0x%llx not 0x0\n",
+			       elem, buf[elem]);
+		pass = false;
+	}
+
+	ret = munmap(buf, page_size);
+	if (ret != 0) {
+		ksft_print_msg("Failed to unmap %lu byte GCS: %d\n",
+			       page_size, errno);
+		pass = false;
+	}
+
+	return pass;
+}
+
+/* A fork()ed process can run */
+static bool test_fork(void)
+{
+	unsigned long child_mode;
+	int ret, status;
+	pid_t pid;
+	bool pass = true;
+
+	pid = fork();
+	if (pid == -1) {
+		ksft_print_msg("fork() failed: %d\n", errno);
+		pass = false;
+		goto out;
+	}
+	if (pid == 0) {
+		/* In child, make sure we can call a function, read
+		 * the GCS pointer and status and then exit */
+		valid_gcs_function();
+		get_gcspr();
+
+		ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+				  &child_mode, 0, 0, 0);
+		if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) {
+			ksft_print_msg("GCS not enabled in child\n");
+			ret = -EINVAL;
+		}
+
+		exit(ret);
+	}
+
+	/*
+	 * In parent, check we can still do function calls then block
+	 * for the child.
+	 */
+	valid_gcs_function();
+
+	ksft_print_msg("Waiting for child %d\n", pid);
+
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		ksft_print_msg("Failed to wait for child: %d\n",
+			       errno);
+		return false;
+	}
+
+	if (!WIFEXITED(status)) {
+		ksft_print_msg("Child exited due to signal %d\n",
+			       WTERMSIG(status));
+		pass = false;
+	} else {
+		if (WEXITSTATUS(status)) {
+			ksft_print_msg("Child exited with status %d\n",
+				       WEXITSTATUS(status));
+			pass = false;
+		}
+	}
+
+out:
+
+	return pass;
+}
+
+typedef bool (*gcs_test)(void);
+
+static struct {
+	char *name;
+	gcs_test test;
+	bool needs_enable;
+} tests[] = {
+	{ "read_status", read_status },
+	{ "base_enable", base_enable, true },
+	{ "read_gcspr_el0", read_gcspr_el0 },
+	{ "enable_writeable", enable_writeable, true },
+	{ "enable_push_pop", enable_push_pop, true },
+	{ "enable_all", enable_all, true },
+	{ "enable_invalid", enable_invalid, true },
+	{ "map_guarded_stack", map_guarded_stack },
+	{ "fork", test_fork },
+};
+
+/* Make sure GCS is on, run every entry in tests[], then try to turn it off */
+int main(void)
+{
+	int i, ret;
+	unsigned long gcs_mode;
+
+	ksft_print_header();
+
+	/*
+	 * We don't have getauxval() with nolibc so treat a failure to
+	 * read GCS state as a lack of support and skip.
+	 */
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+			  &gcs_mode, 0, 0, 0);
+	if (ret != 0)
+		ksft_exit_skip("Failed to read GCS state: %d\n", ret);
+
+	if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
+		gcs_mode = PR_SHADOW_STACK_ENABLE;
+		ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+				  gcs_mode, 0, 0, 0);
+		if (ret != 0)
+			ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret);
+	}
+
+	ksft_set_plan(ARRAY_SIZE(tests));
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++)
+		ksft_test_result((*tests[i].test)(), "%s\n", tests[i].name);
+
+	/* One last test: disable GCS, we can do this one time */
+	ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
+	if (ret != 0)
+		ksft_print_msg("Failed to disable GCS: %d\n", ret);
+
+	ksft_finished();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h
new file mode 100644
index 000000000000..1ae6864d3f86
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-util.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#ifndef GCS_UTIL_H
+#define GCS_UTIL_H
+
+#include <stdbool.h>
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS	74
+#define PR_SET_SHADOW_STACK_STATUS      75
+#define PR_LOCK_SHADOW_STACK_STATUS     76
+
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+# define PR_SHADOW_STACK_WRITE		(1UL << 1)
+# define PR_SHADOW_STACK_PUSH		(1UL << 2)
+
+#define PR_SHADOW_STACK_ALL_MODES \
+	(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH)
+
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0)     /* Set up a restore token in the shadow stack */
+#define SHADOW_STACK_SET_MARKER (1ULL << 1)     /* Set up a top of stack marker in the shadow stack */
+
+#define GCS_CAP_ADDR_MASK		(0xfffffffffffff000UL)
+#define GCS_CAP_TOKEN_MASK		(0x0000000000000fffUL)
+#define GCS_CAP_VALID_TOKEN		1
+#define GCS_CAP_IN_PROGRESS_TOKEN	5
+
+#define GCS_CAP(x) (((unsigned long)(x) & GCS_CAP_ADDR_MASK) | \
+		    GCS_CAP_VALID_TOKEN)
+
+static inline unsigned long *get_gcspr(void)
+{
+	unsigned long *gcspr;
+
+	asm volatile(
+		"mrs	%0, S3_3_C2_C5_1"
+	: "=r" (gcspr)
+	:
+	: "cc");
+
+	return gcspr;
+}
+
+static inline void __attribute__((always_inline)) gcsss1(unsigned long *Xt)
+{
+	asm volatile (
+		"sys #3, C7, C7, #2, %0\n"
+		:
+		: "rZ" (Xt)
+		: "memory");
+}
+
+static inline unsigned long __attribute__((always_inline)) *gcsss2(void)
+{
+	unsigned long *Xt;
+
+	asm volatile(
+		"SYSL %0, #3, C7, C7, #3\n"
+		: "=r" (Xt)
+		:
+		: "memory");
+
+	return Xt;
+}
+
+static inline bool chkfeat_gcs(void)
+{
+	register long val __asm__ ("x16") = 1;
+
+	/* CHKFEAT x16 */
+	asm volatile(
+		"hint #0x28\n"
+		: "=r" (val)
+		: "r" (val));
+
+	return val != 1;
+}
+
+#endif
-- 
cgit v1.2.3


From a505a52b4e292f5e031a01eb3d4e203eb18acb7d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:14 +0100
Subject: kselftest/arm64: Add a GCS test program built with the system libc

There are things like threads which nolibc struggles with which we want
to add coverage for, and the ABI allows us to test most of these even if
libc itself does not understand GCS so add a test application built
using the system libc.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-35-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/.gitignore |   1 +
 tools/testing/selftests/arm64/gcs/Makefile   |   4 +-
 tools/testing/selftests/arm64/gcs/gcs-util.h |  10 +
 tools/testing/selftests/arm64/gcs/libc-gcs.c | 728 +++++++++++++++++++++++++++
 4 files changed, 742 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/gcs/libc-gcs.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
index 0e5e695ecba5..5810c4a163d4 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1 +1,2 @@
 basic-gcs
+libc-gcs
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index 61a30f483429..a8fdf21e9a47 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,9 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs
+TEST_GEN_PROGS := basic-gcs libc-gcs
+
+LDLIBS+=-lpthread
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h
index 1ae6864d3f86..c99a6b39ac14 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-util.h
+++ b/tools/testing/selftests/arm64/gcs/gcs-util.h
@@ -16,6 +16,16 @@
 #define __NR_prctl 167
 #endif
 
+#ifndef NT_ARM_GCS
+#define NT_ARM_GCS 0x410
+
+struct user_gcs {
+	__u64 features_enabled;
+	__u64 features_locked;
+	__u64 gcspr_el0;
+};
+#endif
+
 /* Shadow Stack/Guarded Control Stack interface */
 #define PR_GET_SHADOW_STACK_STATUS	74
 #define PR_SET_SHADOW_STACK_STATUS      75
diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c b/tools/testing/selftests/arm64/gcs/libc-gcs.c
new file mode 100644
index 000000000000..17b2fabfec38
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c
@@ -0,0 +1,728 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+
+#include <pthread.h>
+#include <stdbool.h>
+
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/uio.h>
+
+#include <asm/hwcap.h>
+#include <asm/mman.h>
+
+#include <linux/compiler.h>
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2)                                          \
+({                                                                            \
+	register long _num  __asm__ ("x8") = (num);                           \
+	register long _arg1 __asm__ ("x0") = (long)(arg1);                    \
+	register long _arg2 __asm__ ("x1") = (long)(arg2);                    \
+	register long _arg3 __asm__ ("x2") = 0;                               \
+	register long _arg4 __asm__ ("x3") = 0;                               \
+	register long _arg5 __asm__ ("x4") = 0;                               \
+	                                                                      \
+	__asm__  volatile (                                                   \
+		"svc #0\n"                                                    \
+		: "=r"(_arg1)                                                 \
+		: "r"(_arg1), "r"(_arg2),                                     \
+		  "r"(_arg3), "r"(_arg4),                                     \
+		  "r"(_arg5), "r"(_num)					      \
+		: "memory", "cc"                                              \
+	);                                                                    \
+	_arg1;                                                                \
+})
+
+static noinline void gcs_recurse(int depth)
+{
+	if (depth)
+		gcs_recurse(depth - 1);
+
+	/* Prevent tail call optimization so we actually recurse */
+	asm volatile("dsb sy" : : : "memory");
+}
+
+/* Smoke test that a function call and return works */
+TEST(can_call_function)
+{
+	gcs_recurse(0);
+}
+
+static void *gcs_test_thread(void *arg)
+{
+	int ret;
+	unsigned long mode;
+
+	/*
+	 * Some libcs don't seem to fill unused arguments with 0 but
+	 * the kernel validates this so we supply all 5 arguments.
+	 */
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	if (ret != 0) {
+		ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret);
+		return NULL;
+	}
+
+	if (!(mode & PR_SHADOW_STACK_ENABLE)) {
+		ksft_print_msg("GCS not enabled in thread, mode is %lu\n",
+			       mode);
+		return NULL;
+	}
+
+	/* Just in case... */
+	gcs_recurse(0);
+
+	/* Use a non-NULL value to indicate a pass */
+	return &gcs_test_thread;
+}
+
+/* Verify that if we start a new thread it has GCS enabled */
+TEST(gcs_enabled_thread)
+{
+	pthread_t thread;
+	void *thread_ret;
+	int ret;
+
+	ret = pthread_create(&thread, NULL, gcs_test_thread, NULL);
+	ASSERT_TRUE(ret == 0);
+	if (ret != 0)
+		return;
+
+	ret = pthread_join(thread, &thread_ret);
+	ASSERT_TRUE(ret == 0);
+	if (ret != 0)
+		return;
+
+	ASSERT_TRUE(thread_ret != NULL);
+}
+
+/* Read the GCS until we find the terminator */
+TEST(gcs_find_terminator)
+{
+	unsigned long *gcs, *cur;
+
+	gcs = get_gcspr();
+	cur = gcs;
+	while (*cur)
+		cur++;
+
+	ksft_print_msg("GCS in use from %p-%p\n", gcs, cur);
+
+	/*
+	 * We should have at least whatever called into this test so
+	 * the two pointers should differ.
+	 */
+	ASSERT_TRUE(gcs != cur);
+}
+
+/*
+ * We can access a GCS via ptrace
+ *
+ * This could usefully have a fixture but note that each test is
+ * fork()ed into a new child which causes issues.  Might be better to
+ * lift at least some of this out into a separate, non-harness, test
+ * program.
+ */
+TEST(ptrace_read_write)
+{
+	pid_t child, pid;
+	int ret, status;
+	siginfo_t si;
+	uint64_t val, rval, gcspr;
+	struct user_gcs child_gcs;
+	struct iovec iov, local_iov, remote_iov;
+
+	child = fork();
+	if (child == -1) {
+		ksft_print_msg("fork() failed: %d (%s)\n",
+			       errno, strerror(errno));
+		ASSERT_NE(child, -1);
+	}
+
+	if (child == 0) {
+		/*
+		 * In child, make sure there's something on the stack and
+		 * ask to be traced.
+		 */
+		gcs_recurse(0);
+		if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
+			ksft_exit_fail_msg("PTRACE_TRACEME %s",
+					   strerror(errno));
+
+		if (raise(SIGSTOP))
+			ksft_exit_fail_msg("raise(SIGSTOP) %s",
+					   strerror(errno));
+
+		return;
+	}
+
+	ksft_print_msg("Child: %d\n", child);
+
+	/* Attach to the child */
+	while (1) {
+		int sig;
+
+		pid = wait(&status);
+		if (pid == -1) {
+			ksft_print_msg("wait() failed: %s",
+				       strerror(errno));
+			goto error;
+		}
+
+		/*
+		 * This should never happen but it's hard to flag in
+		 * the framework.
+		 */
+		if (pid != child)
+			continue;
+
+		if (WIFEXITED(status) || WIFSIGNALED(status))
+			ksft_exit_fail_msg("Child died unexpectedly\n");
+
+		if (!WIFSTOPPED(status))
+			goto error;
+
+		sig = WSTOPSIG(status);
+
+		if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) {
+			if (errno == ESRCH) {
+				ASSERT_NE(errno, ESRCH);
+				return;
+			}
+
+			if (errno == EINVAL) {
+				sig = 0; /* bust group-stop */
+				goto cont;
+			}
+
+			ksft_print_msg("PTRACE_GETSIGINFO: %s\n",
+				       strerror(errno));
+			goto error;
+		}
+
+		if (sig == SIGSTOP && si.si_code == SI_TKILL &&
+		    si.si_pid == pid)
+			break;
+
+	cont:
+		if (ptrace(PTRACE_CONT, pid, NULL, sig)) {
+			if (errno == ESRCH) {
+				ASSERT_NE(errno, ESRCH);
+				return;
+			}
+
+			ksft_print_msg("PTRACE_CONT: %s\n", strerror(errno));
+			goto error;
+		}
+	}
+
+	/* Where is the child GCS? */
+	iov.iov_base = &child_gcs;
+	iov.iov_len = sizeof(child_gcs);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_GCS, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read child GCS state: %s (%d)\n",
+			       strerror(errno), errno);
+		goto error;
+	}
+
+	/* We should have inherited GCS over fork(), confirm */
+	if (!(child_gcs.features_enabled & PR_SHADOW_STACK_ENABLE)) {
+		ASSERT_TRUE(child_gcs.features_enabled &
+			    PR_SHADOW_STACK_ENABLE);
+		goto error;
+	}
+
+	gcspr = child_gcs.gcspr_el0;
+	ksft_print_msg("Child GCSPR 0x%lx, flags %llx, locked %llx\n",
+		       gcspr, child_gcs.features_enabled,
+		       child_gcs.features_locked);
+
+	/* Ideally we'd cross check with the child memory map */
+
+	errno = 0;
+	val = ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL);
+	ret = errno;
+	if (ret != 0)
+		ksft_print_msg("PTRACE_PEEKDATA failed: %s (%d)\n",
+			       strerror(ret), ret);
+	EXPECT_EQ(ret, 0);
+
+	/* The child should be in a function, the GCSPR shouldn't be 0 */
+	EXPECT_NE(val, 0);
+
+	/* Same thing via process_vm_readv() */
+	local_iov.iov_base = &rval;
+	local_iov.iov_len = sizeof(rval);
+	remote_iov.iov_base = (void *)gcspr;
+	remote_iov.iov_len = sizeof(rval);
+	ret = process_vm_readv(child, &local_iov, 1, &remote_iov, 1, 0);
+	if (ret == -1)
+		ksft_print_msg("process_vm_readv() failed: %s (%d)\n",
+			       strerror(errno), errno);
+	EXPECT_EQ(ret, sizeof(rval));
+	EXPECT_EQ(val, rval);
+
+	/* Write data via a poke */
+	ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, NULL);
+	if (ret == -1)
+		ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n",
+			       strerror(errno), errno);
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(0, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL));
+
+	/* Restore what we had before */
+	ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, val);
+	if (ret == -1)
+		ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n",
+			       strerror(errno), errno);
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(val, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL));
+
+	/* That's all, folks */
+	kill(child, SIGKILL);
+	return;
+
+error:
+	kill(child, SIGKILL);
+	ASSERT_FALSE(true);
+}
+
+FIXTURE(map_gcs)
+{
+	unsigned long *stack;
+};
+
+FIXTURE_VARIANT(map_gcs)
+{
+	size_t stack_size;
+	unsigned long flags;
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_cap_marker)
+{
+	.stack_size = 2 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_cap)
+{
+	.stack_size = 2 * 1024,
+	.flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_marker)
+{
+	.stack_size = 2 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k)
+{
+	.stack_size = 2 * 1024,
+	.flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k_cap_marker)
+{
+	.stack_size = 4 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k_cap)
+{
+	.stack_size = 4 * 1024,
+	.flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k_marker)
+{
+	.stack_size = 4 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k)
+{
+	.stack_size = 4 * 1024,
+	.flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_cap_marker)
+{
+	.stack_size = 16 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_cap)
+{
+	.stack_size = 16 * 1024,
+	.flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_marker)
+{
+	.stack_size = 16 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k)
+{
+	.stack_size = 16 * 1024,
+	.flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_cap_marker)
+{
+	.stack_size = 64 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_cap)
+{
+	.stack_size = 64 * 1024,
+	.flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_marker)
+{
+	.stack_size = 64 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k)
+{
+	.stack_size = 64 * 1024,
+	.flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_cap_marker)
+{
+	.stack_size = 128 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_cap)
+{
+	.stack_size = 128 * 1024,
+	.flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_marker)
+{
+	.stack_size = 128 * 1024,
+	.flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k)
+{
+	.stack_size = 128 * 1024,
+	.flags = 0,
+};
+
+FIXTURE_SETUP(map_gcs)
+{
+	self->stack = (void *)syscall(__NR_map_shadow_stack, 0,
+				      variant->stack_size, variant->flags);
+	ASSERT_FALSE(self->stack == MAP_FAILED);
+	/* stack_size is in bytes so compute the end address in bytes */
+	ksft_print_msg("Allocated stack from %p-%p\n", self->stack,
+		       (char *)self->stack + variant->stack_size);
+}
+
+FIXTURE_TEARDOWN(map_gcs)
+{
+	int ret;
+
+	if (self->stack != MAP_FAILED) {
+		ret = munmap(self->stack, variant->stack_size);
+		ASSERT_EQ(ret, 0);
+	}
+}
+
+/* The stack has a cap token */
+TEST_F(map_gcs, stack_capped)
+{
+	unsigned long *stack = self->stack;
+	size_t cap_index;
+
+	cap_index = (variant->stack_size / sizeof(unsigned long));
+
+	switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+	case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+		cap_index -= 2;
+		break;
+	case SHADOW_STACK_SET_TOKEN:
+		cap_index -= 1;
+		break;
+	case SHADOW_STACK_SET_MARKER:
+	case 0:
+		/* No cap, no test */
+		return;
+	}
+
+	ASSERT_EQ(stack[cap_index], GCS_CAP(&stack[cap_index]));
+}
+
+/* The top of the stack is 0 */
+TEST_F(map_gcs, stack_terminated)
+{
+	unsigned long *stack = self->stack;
+	size_t term_index;
+
+	if (!(variant->flags & SHADOW_STACK_SET_MARKER))
+		return;
+
+	term_index = (variant->stack_size / sizeof(unsigned long)) - 1;
+
+	ASSERT_EQ(stack[term_index], 0);
+}
+
+/* Writes should fault */
+TEST_F_SIGNAL(map_gcs, not_writeable, SIGSEGV)
+{
+	self->stack[0] = 0;
+}
+
+/* Put it all together, we can safely switch to and from the stack */
+TEST_F(map_gcs, stack_switch)
+{
+	size_t cap_index = variant->stack_size / sizeof(unsigned long);
+	unsigned long *orig_gcspr_el0, *pivot_gcspr_el0;
+
+	/* Skip over the stack terminator and point at the cap */
+	switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+	case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+		cap_index -= 2;
+		break;
+	case SHADOW_STACK_SET_TOKEN:
+		cap_index -= 1;
+		break;
+	case SHADOW_STACK_SET_MARKER:
+	case 0:
+		/* No cap, no test */
+		return;
+	}
+	pivot_gcspr_el0 = &self->stack[cap_index];
+
+	/* Pivot to the new GCS */
+	ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n",
+		       pivot_gcspr_el0, get_gcspr(),
+		       *pivot_gcspr_el0);
+	gcsss1(pivot_gcspr_el0);
+	orig_gcspr_el0 = gcsss2();
+	ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n",
+		       get_gcspr(), orig_gcspr_el0,
+		       *pivot_gcspr_el0);
+
+	ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr());
+
+	/* New GCS must be in the new buffer */
+	ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack);
+	ASSERT_TRUE((unsigned long)get_gcspr() <=
+		    (unsigned long)self->stack + variant->stack_size);
+
+	/* We should be able to use all but 2 slots of the new stack */
+	ksft_print_msg("Recursing %zu levels\n", cap_index - 1);
+	gcs_recurse(cap_index - 1);
+
+	/* Pivot back to the original GCS */
+	gcsss1(orig_gcspr_el0);
+	pivot_gcspr_el0 = gcsss2();
+
+	/* Make a call to check the original GCS is usable again */
+	gcs_recurse(0);
+	ksft_print_msg("Pivoted back to GCSPR_EL0 %p\n", get_gcspr());
+}
+
+/* We fault if we try to go beyond the end of the stack */
+TEST_F_SIGNAL(map_gcs, stack_overflow, SIGSEGV)
+{
+	size_t cap_index = variant->stack_size / sizeof(unsigned long);
+	unsigned long *orig_gcspr_el0, *pivot_gcspr_el0;
+
+	/* Skip over the stack terminator and point at the cap */
+	switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+	case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+		cap_index -= 2;
+		break;
+	case SHADOW_STACK_SET_TOKEN:
+		cap_index -= 1;
+		break;
+	case SHADOW_STACK_SET_MARKER:
+	case 0:
+		/* No cap, no test but we need to SEGV to avoid a false fail */
+		orig_gcspr_el0 = get_gcspr();
+		*orig_gcspr_el0 = 0;
+		return;
+	}
+	pivot_gcspr_el0 = &self->stack[cap_index];
+
+	/* Pivot to the new GCS */
+	ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n",
+		       pivot_gcspr_el0, get_gcspr(),
+		       *pivot_gcspr_el0);
+	gcsss1(pivot_gcspr_el0);
+	orig_gcspr_el0 = gcsss2();
+	ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n",
+		       pivot_gcspr_el0, orig_gcspr_el0,
+		       *pivot_gcspr_el0);
+
+	ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr());
+
+	/* New GCS must be in the new buffer */
+	ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack);
+	ASSERT_TRUE((unsigned long)get_gcspr() <=
+		    (unsigned long)self->stack + variant->stack_size);
+
+	/* Now try to recurse, we should fault doing this. */
+	ksft_print_msg("Recursing %zu levels...\n", cap_index + 1);
+	gcs_recurse(cap_index + 1);
+	/* Only reached if the recursion did not fault as expected */
+	ksft_print_msg("...done\n");
+
+	/* Clean up properly to try to guard against spurious passes. */
+	gcsss1(orig_gcspr_el0);
+	pivot_gcspr_el0 = gcsss2();
+	ksft_print_msg("Pivoted back to GCSPR_EL0 %p\n", get_gcspr());
+}
+
+FIXTURE(map_invalid_gcs)
+{
+};
+
+FIXTURE_VARIANT(map_invalid_gcs)
+{
+	size_t stack_size;
+};
+
+FIXTURE_SETUP(map_invalid_gcs)
+{
+}
+
+FIXTURE_TEARDOWN(map_invalid_gcs)
+{
+}
+
+/* GCS must be larger than 16 bytes */
+FIXTURE_VARIANT_ADD(map_invalid_gcs, too_small)
+{
+	.stack_size = 8,
+};
+
+/* GCS size must be 16 byte aligned */
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_1)  { .stack_size = 1024 + 1  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_2)  { .stack_size = 1024 + 2  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_3)  { .stack_size = 1024 + 3  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_4)  { .stack_size = 1024 + 4  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_5)  { .stack_size = 1024 + 5  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_6)  { .stack_size = 1024 + 6  };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unaligned_7)  { .stack_size = 1024 + 7  };
+
+TEST_F(map_invalid_gcs, do_map)
+{
+	void *stack;
+
+	stack = (void *)syscall(__NR_map_shadow_stack, 0,
+				variant->stack_size, 0);
+	ASSERT_TRUE(stack == MAP_FAILED);
+	if (stack != MAP_FAILED)
+		munmap(stack, variant->stack_size);
+}
+
+FIXTURE(invalid_mprotect)
+{
+	unsigned long *stack;
+	size_t stack_size;
+};
+
+FIXTURE_VARIANT(invalid_mprotect)
+{
+	unsigned long flags;
+};
+
+FIXTURE_SETUP(invalid_mprotect)
+{
+	self->stack_size = sysconf(_SC_PAGE_SIZE);
+	self->stack = (void *)syscall(__NR_map_shadow_stack, 0, self->stack_size, 0);
+	ASSERT_FALSE(self->stack == MAP_FAILED);
+	/* stack_size is in bytes so compute the end address in bytes */
+	ksft_print_msg("Allocated stack from %p-%p\n", self->stack,
+		       (char *)self->stack + self->stack_size);
+}
+
+FIXTURE_TEARDOWN(invalid_mprotect)
+{
+	int ret;
+
+	if (self->stack != MAP_FAILED) {
+		ret = munmap(self->stack, self->stack_size);
+		ASSERT_EQ(ret, 0);
+	}
+}
+
+FIXTURE_VARIANT_ADD(invalid_mprotect, exec)
+{
+	.flags = PROT_EXEC,
+};
+
+TEST_F(invalid_mprotect, do_map)
+{
+	int ret;
+
+	ret = mprotect(self->stack, self->stack_size, variant->flags);
+	ASSERT_EQ(ret, -1);
+}
+
+TEST_F(invalid_mprotect, do_map_read)
+{
+	int ret;
+
+	ret = mprotect(self->stack, self->stack_size,
+		       variant->flags | PROT_READ);
+	ASSERT_EQ(ret, -1);
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long gcs_mode;
+	int ret;
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
+		ksft_exit_skip("SKIP GCS not supported\n");
+
+	/* 
+	 * Force shadow stacks on, our tests *should* be fine with or
+	 * without libc support and with or without this having ended
+	 * up tagged for GCS and enabled by the dynamic linker.  We
+	 * can't use the libc prctl() function since we can't return
+	 * from enabling the stack.
+	 */
+	ret = my_syscall2(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode);
+	if (ret) {
+		ksft_print_msg("Failed to read GCS state: %d\n", ret);
+		return EXIT_FAILURE;
+	}
+	
+	if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
+		gcs_mode = PR_SHADOW_STACK_ENABLE;
+		ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+				  gcs_mode);
+		if (ret) {
+			ksft_print_msg("Failed to configure GCS: %d\n", ret);
+			return EXIT_FAILURE;
+		}
+	}
+
+	/* Avoid returning in case libc doesn't understand GCS */
+	exit(test_harness_run(argc, argv));
+}
-- 
cgit v1.2.3


From 58d69a3e35825698b5daddc1a074e9ea19cb0c51 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:15 +0100
Subject: kselftest/arm64: Add test coverage for GCS mode locking

Verify that we can lock individual GCS mode bits, that other modes
aren't affected and as a side effect also that every combination of
modes can be enabled.

Normally the inability to reenable GCS after disabling it would be an
issue with testing but fortunately the kselftest_harness runs each test
within a fork()ed child.  This can be inconvenient for some kinds of
testing but here it means that each test is in a separate process and
therefore won't be affected by other tests in the suite.

Once we get toolchains with support for enabling GCS by default we will
need to take care to not do that in the build system but there are no
such toolchains yet so it is not yet an issue.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-36-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/.gitignore    |   1 +
 tools/testing/selftests/arm64/gcs/Makefile      |   2 +-
 tools/testing/selftests/arm64/gcs/gcs-locking.c | 200 ++++++++++++++++++++++++
 3 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/gcs/gcs-locking.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
index 5810c4a163d4..0c86f53f68ad 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1,2 +1,3 @@
 basic-gcs
 libc-gcs
+gcs-locking
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index a8fdf21e9a47..2173d6275956 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,7 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs libc-gcs
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking
 
 LDLIBS+=-lpthread
 
diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c
new file mode 100644
index 000000000000..989f75a491b7
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ *
+ * Tests for GCS mode locking.  These tests rely on both having GCS
+ * unconfigured on entry and on the kselftest harness running each
+ * test in a fork()ed process which will have its own mode.
+ */
+
+#include <limits.h>
+
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+
+#include <asm/hwcap.h>
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+/*
+ * Issue a raw two-argument system call with "svc #0": the syscall
+ * number goes in x8, the arguments in x0 and x1, and x2-x4 are passed
+ * as zero.  The expression evaluates to whatever is in x0 afterwards.
+ *
+ * NOTE(review): presumably used instead of the libc prctl() wrapper so
+ * that the GCS mode is not changed from inside a libc function which
+ * then has to return - confirm.
+ */
+#define my_syscall2(num, arg1, arg2)                                          \
+({                                                                            \
+	register long _num  __asm__ ("x8") = (num);                           \
+	register long _arg1 __asm__ ("x0") = (long)(arg1);                    \
+	register long _arg2 __asm__ ("x1") = (long)(arg2);                    \
+	register long _arg3 __asm__ ("x2") = 0;                               \
+	register long _arg4 __asm__ ("x3") = 0;                               \
+	register long _arg5 __asm__ ("x4") = 0;                               \
+	                                                                      \
+	__asm__  volatile (                                                   \
+		"svc #0\n"                                                    \
+		: "=r"(_arg1)                                                 \
+		: "r"(_arg1), "r"(_arg2),                                     \
+		  "r"(_arg3), "r"(_arg4),                                     \
+		  "r"(_arg5), "r"(_num)					      \
+		: "memory", "cc"                                              \
+	);                                                                    \
+	_arg1;                                                                \
+})
+
+/*
+ * No mode bits are rejected for locking: a request to lock every bit
+ * (ULONG_MAX) is expected to succeed.
+ */
+TEST(lock_all_modes)
+{
+	int ret;
+
+	ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+}
+
+/* The fixture itself carries no state, everything comes from the variant */
+FIXTURE(valid_modes)
+{
+};
+
+/* Each variant supplies the set of GCS mode bits the tests set and lock */
+FIXTURE_VARIANT(valid_modes)
+{
+	unsigned long mode;
+};
+
+/* GCS enabled with no additional modes */
+FIXTURE_VARIANT_ADD(valid_modes, enable)
+{
+	.mode = PR_SHADOW_STACK_ENABLE,
+};
+
+/* GCS enabled together with the WRITE mode */
+FIXTURE_VARIANT_ADD(valid_modes, enable_write)
+{
+	.mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE,
+};
+
+/* GCS enabled together with the PUSH mode */
+FIXTURE_VARIANT_ADD(valid_modes, enable_push)
+{
+	.mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH,
+};
+
+/* GCS enabled together with both the WRITE and PUSH modes */
+FIXTURE_VARIANT_ADD(valid_modes, enable_write_push)
+{
+	.mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE |
+		PR_SHADOW_STACK_PUSH,
+};
+
+/* Nothing to set up: each test configures GCS itself */
+FIXTURE_SETUP(valid_modes)
+{
+}
+
+/* Nothing to tear down */
+FIXTURE_TEARDOWN(valid_modes)
+{
+}
+
+/*
+ * We can set the mode at all: PR_SET_SHADOW_STACK_STATUS accepts the
+ * variant's mode.
+ */
+TEST_F(valid_modes, set)
+{
+	int ret;
+
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  variant->mode);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * NOTE(review): presumably we _exit() rather than return because
+	 * GCS was enabled inside this function, so returning into frames
+	 * entered before that point would not be safe - confirm.
+	 */
+	_exit(0);
+}
+
+/*
+ * Enabling, locking then disabling is rejected: once the variant's mode
+ * bits are set and locked, an attempt to clear them all must fail with
+ * -EBUSY.
+ */
+TEST_F(valid_modes, enable_lock_disable)
+{
+	unsigned long mode;
+	int ret;
+
+	/* Enable the variant's mode */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  variant->mode);
+	ASSERT_EQ(ret, 0);
+
+	/* Read it back to check that it took effect exactly as requested */
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(mode, variant->mode);
+
+	/* Lock just the bits that were enabled */
+	ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Clearing the locked bits must now be refused */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0);
+	ASSERT_EQ(ret, -EBUSY);
+
+	_exit(0);
+}
+
+/*
+ * Locking then enabling is rejected: with the variant's bits locked
+ * while still clear, setting them must fail with -EBUSY and the
+ * reported status must remain 0.
+ */
+TEST_F(valid_modes, lock_enable)
+{
+	unsigned long mode;
+	int ret;
+
+	/* Lock the bits before any of them have been set */
+	ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Trying to set the locked bits must be refused */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  variant->mode);
+	ASSERT_EQ(ret, -EBUSY);
+
+	/* The failed attempt must not have changed the status */
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(mode, 0);
+
+	_exit(0);
+}
+
+/*
+ * Locking then changing other modes is fine: with the variant's bits
+ * set and locked we can still switch to all modes and back again, since
+ * neither change alters the value of a locked bit.
+ */
+TEST_F(valid_modes, lock_enable_disable_others)
+{
+	unsigned long mode;
+	int ret;
+
+	/* Enable the variant's mode and check it was applied */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  variant->mode);
+	ASSERT_EQ(ret, 0);
+
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(mode, variant->mode);
+
+	/* Lock only the bits in the variant's mode */
+	ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Turning on every mode must succeed and be reported back */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  PR_SHADOW_STACK_ALL_MODES);
+	ASSERT_EQ(ret, 0);
+
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES);
+
+
+	/* Dropping back to just the variant's mode must also succeed */
+	ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+			  variant->mode);
+	ASSERT_EQ(ret, 0);
+
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(mode, variant->mode);
+
+	_exit(0);
+}
+
+/*
+ * Check that the system advertises GCS and that it is not already
+ * enabled for this process, then hand over to the kselftest harness.
+ */
+int main(int argc, char **argv)
+{
+	unsigned long gcs_status;
+	int err;
+
+	/* Nothing to test on systems that do not advertise GCS */
+	if ((getauxval(AT_HWCAP) & HWCAP_GCS) == 0)
+		ksft_exit_skip("SKIP GCS not supported\n");
+
+	err = prctl(PR_GET_SHADOW_STACK_STATUS, &gcs_status, 0, 0, 0);
+	if (err != 0) {
+		ksft_print_msg("Failed to read GCS state: %d\n", err);
+		return EXIT_FAILURE;
+	}
+
+	/* The tests rely on GCS being unconfigured on entry */
+	if ((gcs_status & PR_SHADOW_STACK_ENABLE) != 0) {
+		ksft_print_msg("GCS was enabled, test unsupported\n");
+		return KSFT_SKIP;
+	}
+
+	return test_harness_run(argc, argv);
+}
-- 
cgit v1.2.3


From 794b64ca5665323f36e5fc92dfca02a3797b6523 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:16 +0100
Subject: kselftest/arm64: Add GCS signal tests

Do some testing of the signal handling for GCS, checking that a GCS
frame has the expected information in it and that the expected signals
are delivered with invalid operations.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-37-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/.gitignore    |  1 +
 .../selftests/arm64/signal/test_signals_utils.h    | 10 +++
 .../arm64/signal/testcases/gcs_exception_fault.c   | 62 +++++++++++++++
 .../selftests/arm64/signal/testcases/gcs_frame.c   | 88 ++++++++++++++++++++++
 .../arm64/signal/testcases/gcs_write_fault.c       | 67 ++++++++++++++++
 5 files changed, 228 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
index b2f2bfd5c6aa..b257db665a35 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -3,6 +3,7 @@ mangle_*
 fake_sigreturn_*
 fpmr_*
 poe_*
+gcs_*
 sme_*
 ssve_*
 sve_*
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 1e80808ee105..36fc12b3cd60 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -6,6 +6,7 @@
 
 #include <assert.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <string.h>
 
 #include <linux/compiler.h>
@@ -47,6 +48,15 @@ void test_result(struct tdescr *td);
 		_arg1;							\
 	})
 
+/* Read the system register with the raw encoding S3_3_C2_C5_1 */
+static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void)
+{
+	uint64_t gcspr;
+
+	asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (gcspr));
+	return gcspr;
+}
+
 static inline bool feats_ok(struct tdescr *td)
 {
 	if (td->feats_incompatible & td->feats_supported)
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
new file mode 100644
index 000000000000..6228448b2ae7
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * We should get this from asm/siginfo.h but the testsuite is being
+ * clever with redefining siginfo_t.
+ */
+#ifndef SEGV_CPERR
+#define SEGV_CPERR 10
+#endif
+
+/*
+ * Execute the system instruction "sys #3, C7, C7, #2" with Xt as its
+ * register operand.
+ *
+ * NOTE(review): going by the function name this is the raw encoding of
+ * GCSSS1 - confirm against the architecture documentation.
+ */
+static inline void gcsss1(uint64_t Xt)
+{
+	asm volatile (
+		"sys #3, C7, C7, #2, %0\n"
+		:
+		: "rZ" (Xt)
+		: "memory");
+}
+
+/* Provoke the fault by running gcsss1() on a slot with no valid cap */
+static int gcs_op_fault_trigger(struct tdescr *td)
+{
+	/*
+	 * Aim at the slot 8 bytes below the current GCS pointer: it
+	 * should be within a valid GCS but must not have a valid cap
+	 * in it.
+	 */
+	uint64_t target = get_gcspr_el0() - 8;
+
+	gcsss1(target);
+
+	return 0;
+}
+
+/*
+ * Signal side of the test: check the delivered context with
+ * ASSERT_GOOD_CONTEXT() and return 1.
+ */
+static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si,
+				  ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	return 1;
+}
+
+/*
+ * Test description: requires GCS, triggers an invalid GCS operation and
+ * names SIGSEGV with code SEGV_CPERR as the signal to accept.
+ */
+struct tdescr tde = {
+	.name = "Invalid GCS operation",
+	.descr = "An invalid GCS operation generates the expected signal",
+	.feats_required = FEAT_GCS,
+	.timeout = 3,
+	.sig_ok = SIGSEGV,
+	.sig_ok_code = SEGV_CPERR,
+	.sanity_disabled = true,
+	.trigger = gcs_op_fault_trigger,
+	.run = gcs_op_fault_signal,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
new file mode 100644
index 000000000000..b405d82321da
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/* Storage for the captured context: a ucontext_t overlaid on a 64KiB buffer */
+static union {
+	ucontext_t uc;
+	char buf[1024 * 64];
+} context;
+
+/*
+ * Capture the current context and validate the GCS record in it: its
+ * features_enabled must match what PR_GET_SHADOW_STACK_STATUS reports
+ * and its gcspr must equal GCSPR_EL0 as read before the capture, less
+ * 8.  Sets td->pass and returns 0 on success; returns 1 on any failure.
+ */
+static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	size_t offset;
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct gcs_context *gcs;
+	unsigned long expected, gcspr;
+	uint64_t *u64_val;
+	int ret;
+
+	/* The frame should report the features currently enabled */
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0);
+	if (ret != 0) {
+		fprintf(stderr, "Unable to query GCS status\n");
+		return 1;
+	}
+
+	/* We expect a cap to be added to the GCS in the signal frame */
+	gcspr = get_gcspr_el0();
+	gcspr -= 8;
+	fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr);
+
+	if (!get_current_context(td, &context.uc, sizeof(context))) {
+		fprintf(stderr, "Failed getting context\n");
+		return 1;
+	}
+
+	/* Ensure that the signal restore token was consumed */
+	u64_val = (uint64_t *)get_gcspr_el0() + 1;
+	if (*u64_val) {
+		fprintf(stderr, "GCS value at %p is %lx not 0\n",
+			u64_val, *u64_val);
+		return 1;
+	}
+
+	fprintf(stderr, "Got context\n");
+
+	/* Locate the GCS record among the records in the captured context */
+	head = get_header(head, GCS_MAGIC, GET_BUF_RESV_SIZE(context),
+			  &offset);
+	if (!head) {
+		fprintf(stderr, "No GCS context\n");
+		return 1;
+	}
+
+	gcs = (struct gcs_context *)head;
+
+	/* Basic size validation is done in get_current_context() */
+
+	if (gcs->features_enabled != expected) {
+		fprintf(stderr, "Features enabled %llx but expected %lx\n",
+			gcs->features_enabled, expected);
+		return 1;
+	}
+
+	if (gcs->gcspr != gcspr) {
+		fprintf(stderr, "Got GCSPR %llx but expected %lx\n",
+			gcs->gcspr, gcspr);
+		return 1;
+	}
+
+	fprintf(stderr, "GCS context validated\n");
+	td->pass = 1;
+
+	return 0;
+}
+
+/* Test description: requires GCS and runs gcs_regs() with no trigger set */
+struct tdescr tde = {
+	.name = "GCS basics",
+	.descr = "Validate a GCS signal context",
+	.feats_required = FEAT_GCS,
+	.timeout = 3,
+	.run = gcs_regs,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c
new file mode 100644
index 000000000000..faeabb18c4b2
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static uint64_t *gcs_page;
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+/*
+ * Map one page with the map_shadow_stack syscall and remember it in
+ * gcs_page.  Returns false, after reporting errno, if the mapping fails.
+ */
+static bool alloc_gcs(struct tdescr *td)
+{
+	long gcs_size = sysconf(_SC_PAGE_SIZE);
+	long addr = syscall(__NR_map_shadow_stack, 0, gcs_size, 0);
+
+	gcs_page = (void *)addr;
+	if (gcs_page != MAP_FAILED)
+		return true;
+
+	fprintf(stderr, "Failed to map %ld byte GCS: %d\n",
+		gcs_size, errno);
+	return false;
+}
+
+/* Read from the GCS page, then attempt the ordinary store under test */
+static int gcs_write_fault_trigger(struct tdescr *td)
+{
+	/* Verify that the page is readable (ie, not completely unmapped) */
+	fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]);
+
+	/* A regular write should trigger a fault */
+	gcs_page[0] = EINVAL;
+
+	return 0;
+}
+
+/*
+ * Signal side of the test: check the delivered context with
+ * ASSERT_GOOD_CONTEXT() and return 1.
+ */
+static int gcs_write_fault_signal(struct tdescr *td, siginfo_t *si,
+				  ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	return 1;
+}
+
+
+/*
+ * Test description: requires GCS, maps a GCS page in init, writes to it
+ * in the trigger and names SIGSEGV as the signal to accept.
+ */
+struct tdescr tde = {
+	.name = "GCS write fault",
+	.descr = "Normal writes to a GCS segfault",
+	.feats_required = FEAT_GCS,
+	.timeout = 3,
+	.sig_ok = SIGSEGV,
+	.sanity_disabled = true,
+	.init = alloc_gcs,
+	.trigger = gcs_write_fault_trigger,
+	.run = gcs_write_fault_signal,
+};
-- 
cgit v1.2.3


From 05e6cfff58c481bfe0ada24ebe1c205e2817dacd Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:17 +0100
Subject: kselftest/arm64: Add a GCS stress test

Add a stress test which runs one more process than we have CPUs spinning
through a very recursive function with frequent syscalls immediately prior
to return and signals being injected every 100ms. The goal is to flag up
any scheduling related issues, for example failure to ensure that barriers
are inserted when moving a GCS using task to another CPU. The test runs for
a configurable amount of time, defaulting to 10 seconds.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-38-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/.gitignore       |   2 +
 tools/testing/selftests/arm64/gcs/Makefile         |   6 +-
 tools/testing/selftests/arm64/gcs/asm-offsets.h    |   0
 .../selftests/arm64/gcs/gcs-stress-thread.S        | 311 ++++++++++++
 tools/testing/selftests/arm64/gcs/gcs-stress.c     | 530 +++++++++++++++++++++
 5 files changed, 848 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/gcs/asm-offsets.h
 create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
 create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
index 0c86f53f68ad..1e8d1f6b27f2 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1,3 +1,5 @@
 basic-gcs
 libc-gcs
 gcs-locking
+gcs-stress
+gcs-stress-thread
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index 2173d6275956..d8b06ca51e22 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,8 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress
+TEST_GEN_PROGS_EXTENDED := gcs-stress-thread
 
 LDLIBS+=-lpthread
 
@@ -18,3 +19,6 @@ $(OUTPUT)/basic-gcs: basic-gcs.c
 		-I../../../../../usr/include \
 		-std=gnu99 -I../.. -g \
 		-ffreestanding -Wall $^ -o $@ -lgcc
+
+$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
+	$(CC) -nostdlib $^ -o $@
diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h b/tools/testing/selftests/arm64/gcs/asm-offsets.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
new file mode 100644
index 000000000000..b88b25217da5
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
@@ -0,0 +1,311 @@
+// Program that loops for ever doing lots of recursions and system calls,
+// intended to be used as part of a stress test for GCS context switching.
+//
+// Copyright 2015-2023 Arm Ltd
+
+#include <asm/unistd.h>
+
+#define sa_sz 32
+#define sa_flags 8
+#define sa_handler 0
+#define sa_mask_sz 8
+
+#define si_code 8
+
+#define SIGINT 2
+#define SIGABRT 6
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGTERM 15
+#define SEGV_CPERR 10
+
+#define SA_NODEFER 1073741824
+#define SA_SIGINFO 4
+#define ucontext_regs 184
+
+#define PR_SET_SHADOW_STACK_STATUS      75
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+
+#define	GCSPR_EL0 S3_3_C2_C5_1
+
+// Start a function called \name: emit its label and define a one-shot
+// "endfunction" macro which marks \name as a function symbol and then
+// purges itself so that the next "function" can define it again.
+.macro function name
+	.macro endfunction
+		.type \name, @function
+		.purgem endfunction
+	.endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+	// Spill the character to the stack so write() has a buffer to read
+	str	x0, [sp, #-16]!
+
+	mov	x0, #1			// STDOUT_FILENO
+	mov	x1, sp
+	mov	x2, #1
+	mov	x8, #__NR_write
+	svc	#0
+
+	// Drop the temporary buffer again
+	add	sp, sp, #16
+	ret
+endfunction
+.globl	putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+	// x1 keeps the start of the string for the write() below
+	mov	x1, x0
+
+	// Count the bytes before the NUL terminator into x2
+	mov	x2, #0
+0:	ldrb	w3, [x0], #1
+	cbz	w3, 1f
+	add	x2, x2, #1
+	b	0b
+
+1:	mov	w0, #1			// STDOUT_FILENO
+	mov	x8, #__NR_write
+	svc	#0
+
+	ret
+endfunction
+.globl	puts
+
+// Utility macro to print a literal string: the string is placed in a
+// mergeable read-only string section and its address passed to puts.
+// Uses bl, so x30 is overwritten as well.
+// Clobbers x0-x4,x8
+.macro puts string
+	.pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+	.popsection
+
+	ldr	x0, =.L__puts_literal\@
+	bl	puts
+.endm
+
+// Print an unsigned decimal number x0 to stdout
+// Clobbers x0-x4,x8
+function putdec
+	// x1 starts at the caller's sp and walks downwards as the string
+	// is built inside the 32 bytes reserved by the store of x30 below
+	mov	x1, sp
+	str	x30, [sp, #-32]!	// Result can't be > 20 digits
+
+	mov	x2, #0
+	strb	w2, [x1, #-1]!		// Write the NUL terminator
+
+	mov	x2, #10
+0:	udiv	x3, x0, x2		// div-mod loop to generate the digits
+	msub	x0, x3, x2, x0
+	add	w0, w0, #'0'
+	strb	w0, [x1, #-1]!
+	mov	x0, x3
+	cbnz	x3, 0b
+
+	ldrb	w0, [x1]
+	cbnz	w0, 1f
+	mov	w0, #'0'		// Print "0" for 0, not ""
+	strb	w0, [x1, #-1]!
+
+	// x1 now points at the most significant digit
+1:	mov	x0, x1
+	bl	puts
+
+	ldr	x30, [sp], #32
+	ret
+endfunction
+.globl	putdec
+
+// Print an unsigned decimal number x0 to stdout, followed by a newline
+// Clobbers x0-x5,x8
+function putdecn
+	// Keep the return address in x5 since the bl calls overwrite x30
+	mov	x5, x30
+
+	bl	putdec
+	mov	x0, #'\n'
+	bl	putc
+
+	ret	x5
+endfunction
+.globl	putdecn
+
+// Fill x1 bytes starting at x0 with 0.
+// Clobbers x0-x2 (x0 is advanced past the filled region by memfill).
+function memclr
+	mov	w2, #0
+endfunction
+.globl	memclr
+	// fall through to memfill
+
+// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2
+// Clobbers x0, x1 (the post-indexed store advances x0 past the region)
+function memfill
+	// Nothing to do for a zero length
+	cmp	x1, #0
+	b.eq	1f
+
+0:	strb	w2, [x0], #1
+	subs	x1, x1, #1
+	b.ne	0b
+
+1:	ret
+endfunction
+.globl	memfill
+
+// Install a signal handler with rt_sigaction, branching to abort after
+// printing a message if the syscall fails.
+// w0: signal number
+// x1: sa_action
+// w2: sa_flags
+// Clobbers x0-x6,x8
+function setsignal
+	// Frame: x30 at [sp], the sigaction structure from sp + 16 upwards
+	str	x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
+
+	// Preserve the arguments across the memclr call
+	mov	w4, w0
+	mov	x5, x1
+	mov	w6, w2
+
+	// Zero the whole structure before filling in the fields we use
+	add	x0, sp, #16
+	mov	x1, #sa_sz
+	bl	memclr
+
+	mov	w0, w4
+	add	x1, sp, #16
+	str	w6, [x1, #sa_flags]
+	str	x5, [x1, #sa_handler]
+	mov	x2, #0
+	mov	x3, #sa_mask_sz
+	mov	x8, #__NR_rt_sigaction
+	svc	#0
+
+	cbz	w0, 1f
+
+	puts	"sigaction failure\n"
+	b	abort
+
+1:	ldr	x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
+	ret
+endfunction
+
+
+// SIGUSR1 handler (installed by _start): returns immediately
+function tickle_handler
+	// Perhaps collect GCSPR_EL0 here in future?
+	ret
+endfunction
+
+// SIGTERM handler (installed by _start): print the signal number and
+// exit with status 0
+function terminate_handler
+	// Stash the signal number (w0), puts clobbers x0; the third
+	// argument is stashed in x20 but not used afterwards
+	mov	w21, w0
+	mov	x20, x2
+
+	puts	"Terminated by signal "
+	mov	w0, w21
+	bl	putdec
+	puts	", no error\n"
+
+	mov	x0, #0
+	mov	x8, #__NR_exit
+	svc	#0
+endfunction
+
+// SIGSEGV handler (installed by _start): turn GCS off, print the
+// si_code of the fault and leave through abort
+function segv_handler
+	// stash the siginfo_t *
+	mov	x20, x1
+
+	// Disable GCS, we don't want additional faults logging things
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, xzr
+	mov	x2, xzr
+	mov	x3, xzr
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x8, #__NR_prctl
+	svc	#0
+
+	puts	"Got SIGSEGV code "
+
+	// NOTE(review): this is a 64-bit load from the si_code offset;
+	// confirm the bytes following si_code are always zero
+	ldr	x21, [x20, #si_code]
+	mov	x0, x21
+	bl	putdec
+
+	// GCS faults should have si_code SEGV_CPERR
+	cmp	x21, #SEGV_CPERR
+	bne	1f
+
+	puts	" (GCS violation)"
+1:
+	mov	x0, '\n'
+	bl	putc
+	b	abort
+endfunction
+
+// Recurse x20 times
+// Defines recurse\id, which calls itself until x20 reaches zero and
+// makes a getpid syscall just before every return.
+.macro recurse id
+function recurse\id
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+
+	// x20 is the remaining depth, decremented before each nested call
+	cmp	x20, 0
+	beq	1f
+	sub	x20, x20, 1
+	bl	recurse\id
+
+1:
+	ldp	x29, x30, [sp], #16
+
+	// Do a syscall immediately prior to returning to try to provoke
+	// scheduling and migration at a point where coherency issues
+	// might trigger.
+	mov	x8, #__NR_getpid
+	svc	#0
+
+	ret
+endfunction
+.endm
+
+// Generate and use two copies so we're changing the GCS contents
+recurse 1
+recurse 2
+
+// Program entry point: enable GCS, install the SIGTERM, SIGUSR1 and
+// SIGSEGV handlers, announce that we are running and then alternate
+// between the two recursors for ever.
+.globl _start
+function _start
+	// Run with GCS
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, PR_SHADOW_STACK_ENABLE
+	mov	x2, xzr
+	mov	x3, xzr
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x8, #__NR_prctl
+	svc	#0
+	cbz	x0, 1f
+	puts	"Failed to enable GCS\n"
+	b	abort
+1:
+
+	mov	w0, #SIGTERM
+	adr	x1, terminate_handler
+	mov	w2, #SA_SIGINFO
+	bl	setsignal
+
+	// SIGUSR1 and SIGSEGV are additionally installed with SA_NODEFER
+	mov	w0, #SIGUSR1
+	adr	x1, tickle_handler
+	mov	w2, #SA_SIGINFO
+	orr	w2, w2, #SA_NODEFER
+	bl	setsignal
+
+	mov	w0, #SIGSEGV
+	adr	x1, segv_handler
+	mov	w2, #SA_SIGINFO
+	orr	w2, w2, #SA_NODEFER
+	bl	setsignal
+
+	puts	"Running\n"
+
+loop:
+	// Small recursion depth so we're frequently flipping between
+	// the two recursors and changing what's on the stack
+	mov	x20, #5
+	bl	recurse1
+	mov	x20, #5
+	bl	recurse2
+	b	loop
+endfunction
+
+// Exit with status 255; branched to from the failure paths above
+abort:
+	mov	x0, #255
+	mov	x8, #__NR_exit
+	svc	#0
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c
new file mode 100644
index 000000000000..bdec7ee8cfd5
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-3 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+#define _POSIX_C_SOURCE 199309L
+
+#include <errno.h>
+#include <getopt.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/epoll.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <asm/hwcap.h>
+
+#include "../../kselftest.h"
+
+/* Book-keeping for one forked gcs-stress-thread child */
+struct child_data {
+	/* Name used in log messages; buffered partial line of output */
+	char *name, *output;
+	/* PID returned by fork() */
+	pid_t pid;
+	/* Read side of the pipe attached to the child's stdout, -1 once closed */
+	int stdout;
+	/* Set once a read from the pipe has returned without error */
+	bool output_seen;
+	/* Set once the child is known to have exited */
+	bool exited;
+	/* Exit code from waitpid(), or si_status from SIGCHLD */
+	int exit_status;
+	/* Signal which terminated the child, if it was killed by one */
+	int exit_signal;
+};
+
+/* epoll instance the children's stdout pipes are added to */
+static int epoll_fd;
+/* Array of per-child state, indexed up to num_children */
+static struct child_data *children;
+/* Event buffer handed to epoll_wait() */
+static struct epoll_event *evs;
+/* Number of tests planned, also used as the size of the evs buffer */
+static int tests;
+/* Number of children started so far, bumped by start_thread() */
+static int num_children;
+/* Set by handle_exit_signal() to ask the main loop to stop */
+static bool terminate;
+
+/* Pipe the children block on until the parent closes it */
+static int startup_pipe[2];
+
+/*
+ * Return the number of configured processors, exiting the program with
+ * EXIT_FAILURE if it cannot be determined.
+ */
+static int num_processors(void)
+{
+	long nproc = sysconf(_SC_NPROCESSORS_CONF);
+
+	if (nproc < 0) {
+		/* perror() appends ": <error>\n" itself, so no newline here */
+		perror("Unable to read number of processors");
+		exit(EXIT_FAILURE);
+	}
+
+	return nproc;
+}
+
+/*
+ * Fork one stress child and register its output with the epoll set.
+ *
+ * The child redirects its stdout into a fresh pipe, moves the read side
+ * of startup_pipe to FD 3, closes FDs 4-8191, blocks reading FD 3 and
+ * then execs gcs-stress-thread.  The parent keeps the read side of the
+ * stdout pipe in child->stdout, names the child after its PID and adds
+ * the pipe to epoll_fd.  Failures in the parent abort the whole test
+ * through ksft_exit_fail_msg().
+ */
+static void start_thread(struct child_data *child)
+{
+	int ret, pipefd[2], i;
+	struct epoll_event ev;
+
+	ret = pipe(pipefd);
+	if (ret != 0)
+		ksft_exit_fail_msg("Failed to create stdout pipe: %s (%d)\n",
+				   strerror(errno), errno);
+
+	child->pid = fork();
+	if (child->pid == -1)
+		ksft_exit_fail_msg("fork() failed: %s (%d)\n",
+				   strerror(errno), errno);
+
+	if (!child->pid) {
+		/*
+		 * In child, replace stdout with the pipe, errors to
+		 * stderr from here as kselftest prints to stdout.
+		 */
+		ret = dup2(pipefd[1], 1);
+		if (ret == -1) {
+			fprintf(stderr, "dup2() %d\n", errno);
+			exit(EXIT_FAILURE);
+		}
+
+		/*
+		 * Duplicate the read side of the startup pipe to
+		 * FD 3 so we can close everything else.
+		 */
+		ret = dup2(startup_pipe[0], 3);
+		if (ret == -1) {
+			fprintf(stderr, "dup2() %d\n", errno);
+			exit(EXIT_FAILURE);
+		}
+
+		/*
+		 * Very dumb mechanism to clean open FDs other than
+		 * stdio. We don't want O_CLOEXEC for the pipes...
+		 */
+		for (i = 4; i < 8192; i++)
+			close(i);
+
+		/*
+		 * Read from the startup pipe, there should be no data
+		 * and we should block until it is closed.  We just
+		 * carry on on error since this isn't super critical.
+		 */
+		ret = read(3, &i, sizeof(i));
+		if (ret < 0)
+			fprintf(stderr, "read(startup pipe) failed: %s (%d)\n",
+				strerror(errno), errno);
+		if (ret > 0)
+			fprintf(stderr, "%d bytes of data on startup pipe\n",
+				ret);
+		close(3);
+
+		/* execl() only returns if it failed */
+		ret = execl("gcs-stress-thread", "gcs-stress-thread", NULL);
+		fprintf(stderr, "execl(gcs-stress-thread) failed: %d (%s)\n",
+			errno, strerror(errno));
+
+		exit(EXIT_FAILURE);
+	} else {
+		/*
+		 * In parent, remember the child and close our copy of the
+		 * write side of stdout.
+		 */
+		close(pipefd[1]);
+		child->stdout = pipefd[0];
+		child->output = NULL;
+		child->exited = false;
+		child->output_seen = false;
+
+		ev.events = EPOLLIN | EPOLLHUP;
+		ev.data.ptr = child;
+
+		ret = asprintf(&child->name, "Thread-%d", child->pid);
+		if (ret == -1)
+			ksft_exit_fail_msg("asprintf() failed\n");
+
+		ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, child->stdout, &ev);
+		if (ret < 0) {
+			ksft_exit_fail_msg("%s EPOLL_CTL_ADD failed: %s (%d)\n",
+					   child->name, strerror(errno), errno);
+		}
+	}
+
+	/* Only the parent gets here, the child either execs or exits */
+	ksft_print_msg("Started %s\n", child->name);
+	num_children++;
+}
+
+/*
+ * Read one chunk of output from a child and log every complete line.
+ *
+ * A trailing partial line is stashed in child->output and prepended to
+ * the data from the next read.  A line too long for the work buffer is
+ * logged in pieces rather than being allowed to overrun it.
+ *
+ * Returns true only when the read was interrupted (EINTR) and should be
+ * retried, false otherwise.
+ */
+static bool child_output_read(struct child_data *child)
+{
+	char read_data[1024];
+	char work[1024];
+	int ret, len, cur_work, cur_read;
+
+	ret = read(child->stdout, read_data, sizeof(read_data));
+	if (ret < 0) {
+		if (errno == EINTR)
+			return true;
+
+		ksft_print_msg("%s: read() failed: %s (%d)\n",
+			       child->name, strerror(errno),
+			       errno);
+		return false;
+	}
+	len = ret;
+
+	child->output_seen = true;
+
+	/* Pick up any partial read, always leaving work NUL terminated */
+	cur_work = 0;
+	if (child->output) {
+		snprintf(work, sizeof(work), "%s", child->output);
+		cur_work = strlen(work);
+		free(child->output);
+		child->output = NULL;
+	}
+
+	for (cur_read = 0; cur_read < len; cur_read++) {
+		char c = read_data[cur_read];
+
+		/* Keep one byte of work free for the terminator */
+		if (c != '\n' && cur_work < (int)sizeof(work) - 1) {
+			work[cur_work++] = c;
+			continue;
+		}
+
+		/* End of line, or the buffer is full: log what we have */
+		work[cur_work] = '\0';
+		ksft_print_msg("%s: %s\n", child->name, work);
+		cur_work = 0;
+
+		/* A character that did not fit starts the next piece */
+		if (c != '\n')
+			work[cur_work++] = c;
+	}
+
+	/* Stash any incomplete line for the next read */
+	if (cur_work) {
+		work[cur_work] = '\0';
+		ret = asprintf(&child->output, "%s", work);
+		if (ret == -1)
+			ksft_exit_fail_msg("Out of memory\n");
+	}
+
+	return false;
+}
+
+/*
+ * Act on the epoll events reported for a child's stdout pipe: readable
+ * data is consumed, a hangup closes our end of the pipe, and a buffered
+ * partial line is printed when a flush was requested or the pipe hung
+ * up.
+ */
+static void child_output(struct child_data *child, uint32_t events,
+			 bool flush)
+{
+	if (events & EPOLLIN) {
+		/* Keep going for as long as the reader asks for a retry */
+		while (child_output_read(child))
+			;
+	}
+
+	if (events & EPOLLHUP) {
+		close(child->stdout);
+		child->stdout = -1;
+		flush = true;
+	}
+
+	if (!flush || !child->output)
+		return;
+
+	ksft_print_msg("%s: %s<EOF>\n", child->name, child->output);
+	free(child->output);
+	child->output = NULL;
+}
+
+/* Send SIGUSR1 to a child which has produced output and not yet exited */
+static void child_tickle(struct child_data *child)
+{
+	if (!child->output_seen || child->exited)
+		return;
+
+	kill(child->pid, SIGUSR1);
+}
+
+/* Send SIGTERM to a child which has not yet exited */
+static void child_stop(struct child_data *child)
+{
+	if (child->exited)
+		return;
+
+	kill(child->pid, SIGTERM);
+}
+
+/*
+ * Reap a child if that has not already happened and report its test
+ * result.  The test fails if waitpid() fails, the child was killed by a
+ * signal, no output was ever seen from it, or it exited with a non-zero
+ * status.
+ */
+static void child_cleanup(struct child_data *child)
+{
+	pid_t ret;
+	int status;
+	bool fail = false;
+
+	if (!child->exited) {
+		do {
+			ret = waitpid(child->pid, &status, 0);
+			if (ret == -1 && errno == EINTR)
+				continue;
+
+			if (ret == -1) {
+				ksft_print_msg("waitpid(%d) failed: %s (%d)\n",
+					       child->pid, strerror(errno),
+					       errno);
+				fail = true;
+				break;
+			}
+
+			if (WIFEXITED(status)) {
+				child->exit_status = WEXITSTATUS(status);
+				child->exited = true;
+			}
+
+			if (WIFSIGNALED(status)) {
+				child->exit_signal = WTERMSIG(status);
+				/* Pass the signal number the format asks for */
+				ksft_print_msg("%s: Exited due to signal %d\n",
+					       child->name, child->exit_signal);
+				fail = true;
+				child->exited = true;
+			}
+		} while (!child->exited);
+	}
+
+	if (!child->output_seen) {
+		ksft_print_msg("%s no output seen\n", child->name);
+		fail = true;
+	}
+
+	if (child->exit_status != 0) {
+		ksft_print_msg("%s exited with error code %d\n",
+			       child->name, child->exit_status);
+		fail = true;
+	}
+
+	ksft_test_result(!fail, "%s\n", child->name);
+}
+
+/*
+ * SIGCHLD handler: record the exit of the child identified by si_pid,
+ * or log a message if the PID is not one we started.
+ */
+static void handle_child_signal(int sig, siginfo_t *info, void *context)
+{
+	struct child_data *child;
+	int idx;
+
+	for (idx = 0; idx < num_children; idx++) {
+		child = &children[idx];
+		if (child->pid != info->si_pid)
+			continue;
+
+		child->exited = true;
+		child->exit_status = info->si_status;
+		return;
+	}
+
+	ksft_print_msg("SIGCHLD for unknown PID %d with status %d\n",
+		       info->si_pid, info->si_status);
+}
+
+/*
+ * Handler for signals asking us to exit: flag termination for the main
+ * loop and tell every child started so far to stop.
+ */
+static void handle_exit_signal(int sig, siginfo_t *info, void *context)
+{
+	int idx = 0;
+
+	/* Only the first request does anything, later ones are ignored */
+	if (terminate)
+		return;
+
+	ksft_print_msg("Got signal, exiting...\n");
+	terminate = true;
+
+	/*
+	 * The main loop should clean up after us, making this redundant,
+	 * but stop whatever we can from here to be on the safe side.
+	 */
+	while (idx < num_children)
+		child_stop(&children[idx++]);
+}
+
+/*
+ * Handle any pending output without blocking: poll the epoll set with a
+ * zero timeout until it reports no more events.  An interrupted wait is
+ * retried; any other error is logged and ends the drain.
+ */
+static void drain_output(bool flush)
+{
+	int ret;
+	int i;
+
+	for (;;) {
+		ret = epoll_wait(epoll_fd, evs, tests, 0);
+		if (ret < 0) {
+			/* Retry so that pending output is not dropped */
+			if (errno == EINTR)
+				continue;
+			ksft_print_msg("epoll_wait() failed: %s (%d)\n",
+				       strerror(errno), errno);
+			break;
+		}
+
+		/* Nothing more is pending */
+		if (ret == 0)
+			break;
+
+		for (i = 0; i < ret; i++)
+			child_output(evs[i].data.ptr, evs[i].events, flush);
+	}
+}
+
+/* Long options: --timeout takes an argument and is reported as 't' */
+static const struct option options[] = {
+	{ "timeout",	required_argument, NULL, 't' },
+	{ }
+};
+
+int main(int argc, char **argv)
+{
+	int seen_children;
+	bool all_children_started = false;
+	int gcs_threads;
+	int timeout = 10;
+	int ret, cpus, i, c;
+	struct sigaction sa;
+
+	while ((c = getopt_long(argc, argv, "t:", options, NULL)) != -1) {
+		switch (c) {
+		case 't':
+			ret = sscanf(optarg, "%d", &timeout);
+			if (ret != 1)
+				ksft_exit_fail_msg("Failed to parse timeout %s\n",
+						   optarg);
+			break;
+		default:
+			ksft_exit_fail_msg("Unknown argument\n");
+		}
+	}
+
+	cpus = num_processors();
+	tests = 0;
+
+	if (getauxval(AT_HWCAP) & HWCAP_GCS) {
+		/* One extra thread, trying to trigger migrations */
+		gcs_threads = cpus + 1;
+		tests += gcs_threads;
+	} else {
+		gcs_threads = 0;
+	}
+
+	ksft_print_header();
+	ksft_set_plan(tests);
+
+	ksft_print_msg("%d CPUs, %d GCS threads\n",
+		       cpus, gcs_threads);
+
+	if (!tests)
+		ksft_exit_skip("No tests scheduled\n");
+
+	if (timeout > 0)
+		ksft_print_msg("Will run for %ds\n", timeout);
+	else
+		ksft_print_msg("Will run until terminated\n");
+
+	children = calloc(sizeof(*children), tests);
+	if (!children)
+		ksft_exit_fail_msg("Unable to allocate child data\n");
+
+	ret = epoll_create1(EPOLL_CLOEXEC);
+	if (ret < 0)
+		ksft_exit_fail_msg("epoll_create1() failed: %s (%d)\n",
+				   strerror(errno), ret);
+	epoll_fd = ret;
+
+	/* Create a pipe which children will block on before execing */
+	ret = pipe(startup_pipe);
+	if (ret != 0)
+		ksft_exit_fail_msg("Failed to create startup pipe: %s (%d)\n",
+				   strerror(errno), errno);
+
+	/* Get signal handers ready before we start any children */
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handle_exit_signal;
+	sa.sa_flags = SA_RESTART | SA_SIGINFO;
+	sigemptyset(&sa.sa_mask);
+	ret = sigaction(SIGINT, &sa, NULL);
+	if (ret < 0)
+		ksft_print_msg("Failed to install SIGINT handler: %s (%d)\n",
+			       strerror(errno), errno);
+	ret = sigaction(SIGTERM, &sa, NULL);
+	if (ret < 0)
+		ksft_print_msg("Failed to install SIGTERM handler: %s (%d)\n",
+			       strerror(errno), errno);
+	sa.sa_sigaction = handle_child_signal;
+	ret = sigaction(SIGCHLD, &sa, NULL);
+	if (ret < 0)
+		ksft_print_msg("Failed to install SIGCHLD handler: %s (%d)\n",
+			       strerror(errno), errno);
+
+	evs = calloc(tests, sizeof(*evs));
+	if (!evs)
+		ksft_exit_fail_msg("Failed to allocated %d epoll events\n",
+				   tests);
+
+	for (i = 0; i < gcs_threads; i++)
+		start_thread(&children[i]);
+
+	/*
+	 * All children started, close the startup pipe and let them
+	 * run.
+	 */
+	close(startup_pipe[0]);
+	close(startup_pipe[1]);
+
+	timeout *= 10;
+	for (;;) {
+		/* Did we get a signal asking us to exit? */
+		if (terminate)
+			break;
+
+		/*
+		 * Timeout is counted in 100ms with no output, the
+		 * tests print during startup then are silent when
+		 * running so this should ensure they all ran enough
+		 * to install the signal handler, this is especially
+		 * useful in emulation where we will both be slow and
+		 * likely to have a large set of VLs.
+		 */
+		ret = epoll_wait(epoll_fd, evs, tests, 100);
+		if (ret < 0) {
+			if (errno == EINTR)
+				continue;
+			ksft_exit_fail_msg("epoll_wait() failed: %s (%d)\n",
+					   strerror(errno), errno);
+		}
+
+		/* Output? */
+		if (ret > 0) {
+			for (i = 0; i < ret; i++) {
+				child_output(evs[i].data.ptr, evs[i].events,
+					     false);
+			}
+			continue;
+		}
+
+		/* Otherwise epoll_wait() timed out */
+
+		/*
+		 * If the child processes have not produced output they
+		 * aren't actually running the tests yet.
+		 */
+		if (!all_children_started) {
+			seen_children = 0;
+
+			for (i = 0; i < num_children; i++)
+				if (children[i].output_seen ||
+				    children[i].exited)
+					seen_children++;
+
+			if (seen_children != num_children) {
+				ksft_print_msg("Waiting for %d children\n",
+					       num_children - seen_children);
+				continue;
+			}
+
+			all_children_started = true;
+		}
+
+		ksft_print_msg("Sending signals, timeout remaining: %d00ms\n",
+			       timeout);
+
+		for (i = 0; i < num_children; i++)
+			child_tickle(&children[i]);
+
+		/* Negative timeout means run indefinitely */
+		if (timeout < 0)
+			continue;
+		if (--timeout == 0)
+			break;
+	}
+
+	ksft_print_msg("Finishing up...\n");
+	terminate = true;
+
+	for (i = 0; i < tests; i++)
+		child_stop(&children[i]);
+
+	drain_output(false);
+
+	for (i = 0; i < tests; i++)
+		child_cleanup(&children[i]);
+
+	drain_output(true);
+
+	ksft_finished();
+}
-- 
cgit v1.2.3


From bb9ae1a66c85eeb626864efd812c62026e126ec0 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:18 +0100
Subject: kselftest/arm64: Enable GCS for the FP stress tests

While it's a bit off topic for them, the floating point stress tests do
give us some coverage of context thrashing cases, and also of active signal
delivery separate from the relatively complicated framework in the actual
signals tests. Have the tests enable GCS on startup, ignoring failures so
they continue to work as before on systems without GCS.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-39-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/assembler.h   | 15 +++++++++++++++
 tools/testing/selftests/arm64/fp/fpsimd-test.S |  2 ++
 tools/testing/selftests/arm64/fp/sve-test.S    |  2 ++
 tools/testing/selftests/arm64/fp/za-test.S     |  2 ++
 tools/testing/selftests/arm64/fp/zt-test.S     |  2 ++
 5 files changed, 23 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h
index 9b38a0da407d..1fc46a5642c2 100644
--- a/tools/testing/selftests/arm64/fp/assembler.h
+++ b/tools/testing/selftests/arm64/fp/assembler.h
@@ -65,4 +65,19 @@ endfunction
 	bl	puts
 .endm
 
+#define PR_SET_SHADOW_STACK_STATUS      75
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+
+.macro enable_gcs
+	// Run with GCS
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, PR_SHADOW_STACK_ENABLE
+	mov	x2, xzr
+	mov	x3, xzr
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x8, #__NR_prctl
+	svc	#0
+.endm
+
 #endif /* ! ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index 8b960d01ed2e..b16fb7f42e3e 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -215,6 +215,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+	enable_gcs
+
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index fff60e2a25ad..2fb4f0b84476 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -378,6 +378,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+	enable_gcs
+
 	mov	x23, #0		// Irritation signal count
 
 	mov	w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index 095b45531640..b2603aba99de 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -231,6 +231,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+	enable_gcs
+
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index b5c81e81a379..8d9609a49008 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -200,6 +200,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+	enable_gcs
+
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
-- 
cgit v1.2.3


From 0b838d768ccdbdfbcaed5f4b18b4bf63e53a0e0d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:09 +0200
Subject: selftests/hid: add dependency on hid_common.h

This allows the C tests to be recompiled whenever hid_common.h changes.

Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-5-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 tools/testing/selftests/hid/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile
index 72be55ac4bdf..9399fa3f2f9d 100644
--- a/tools/testing/selftests/hid/Makefile
+++ b/tools/testing/selftests/hid/Makefile
@@ -229,7 +229,7 @@ $(BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(OUTPUT)
 	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
 	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked1.o) name $(notdir $(<:.bpf.o=)) > $@
 
-$(OUTPUT)/%.o: %.c $(BPF_SKELS)
+$(OUTPUT)/%.o: %.c $(BPF_SKELS) hid_common.h
 	$(call msg,CC,,$@)
 	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
 
-- 
cgit v1.2.3


From 4fb41dfde0699796b955eb94e7b8037a67b4b3a5 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:10 +0200
Subject: selftests/hid: cleanup C tests by adding a common struct uhid_device

This gives us an abstract class, struct uhid_device, which handles all of
the uhid part without having to mess with individual fds.

struct attach_prog_args is now never used in hid_bpf.c, so drop it as well

Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-6-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 tools/testing/selftests/hid/hid_bpf.c    | 77 +++++++++++---------------------
 tools/testing/selftests/hid/hid_common.h | 74 ++++++++++++++++++++----------
 tools/testing/selftests/hid/hidraw.c     | 36 +++++----------
 3 files changed, 87 insertions(+), 100 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 86f4d66379f7..31ff92e0debd 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -4,13 +4,6 @@
 #include "hid_common.h"
 #include <bpf/bpf.h>
 
-struct attach_prog_args {
-	int prog_fd;
-	unsigned int hid;
-	int retval;
-	int insert_head;
-};
-
 struct hid_hw_request_syscall_args {
 	__u8 data[10];
 	unsigned int hid;
@@ -21,11 +14,8 @@ struct hid_hw_request_syscall_args {
 };
 
 FIXTURE(hid_bpf) {
-	int dev_id;
-	int uhid_fd;
+	struct uhid_device hid;
 	int hidraw_fd;
-	int hid_id;
-	pthread_t tid;
 	struct hid *skel;
 	struct bpf_link *hid_links[3]; /* max number of programs loaded in a single test */
 };
@@ -54,10 +44,10 @@ static void detach_bpf(FIXTURE_DATA(hid_bpf) * self)
 FIXTURE_TEARDOWN(hid_bpf) {
 	void *uhid_err;
 
-	uhid_destroy(_metadata, self->uhid_fd);
+	uhid_destroy(_metadata, &self->hid);
 
 	detach_bpf(self);
-	pthread_join(self->tid, &uhid_err);
+	pthread_join(self->hid.tid, &uhid_err);
 }
 #define TEARDOWN_LOG(fmt, ...) do { \
 	TH_LOG(fmt, ##__VA_ARGS__); \
@@ -66,23 +56,10 @@ FIXTURE_TEARDOWN(hid_bpf) {
 
 FIXTURE_SETUP(hid_bpf)
 {
-	time_t t;
 	int err;
 
-	/* initialize random number generator */
-	srand((unsigned int)time(&t));
-
-	self->dev_id = rand() % 1024;
-
-	self->uhid_fd = setup_uhid(_metadata, self->dev_id);
-
-	/* locate the uev, self, variant);ent file of the created device */
-	self->hid_id = get_hid_id(self->dev_id);
-	ASSERT_GT(self->hid_id, 0)
-		TEARDOWN_LOG("Could not locate uhid device id: %d", self->hid_id);
-
-	err = uhid_start_listener(_metadata, &self->tid, self->uhid_fd);
-	ASSERT_EQ(0, err) TEARDOWN_LOG("could not start udev listener: %d", err);
+	err = setup_uhid(_metadata, &self->hid);
+	ASSERT_OK(err);
 }
 
 struct test_program {
@@ -129,7 +106,7 @@ static void load_programs(const struct test_program programs[],
 		ops_hid_id = bpf_map__initial_value(map, NULL);
 		ASSERT_OK_PTR(ops_hid_id) TH_LOG("unable to retrieve struct_ops data");
 
-		*ops_hid_id = self->hid_id;
+		*ops_hid_id = self->hid.hid_id;
 	}
 
 	/* we disable the auto-attach feature of all maps because we
@@ -157,7 +134,7 @@ static void load_programs(const struct test_program programs[],
 
 	hid__attach(self->skel);
 
-	self->hidraw_fd = open_hidraw(self->dev_id);
+	self->hidraw_fd = open_hidraw(&self->hid);
 	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
 }
 
@@ -192,7 +169,7 @@ TEST_F(hid_bpf, raw_event)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* check that hid_first_event() was executed */
 	ASSERT_EQ(self->skel->data->callback_check, 42) TH_LOG("callback_check1");
@@ -208,7 +185,7 @@ TEST_F(hid_bpf, raw_event)
 	memset(buf, 0, sizeof(buf));
 	buf[0] = 1;
 	buf[1] = 47;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* check that hid_first_event() was executed */
 	ASSERT_EQ(self->skel->data->callback_check, 47) TH_LOG("callback_check1");
@@ -239,7 +216,7 @@ TEST_F(hid_bpf, subprog_raw_event)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -252,7 +229,7 @@ TEST_F(hid_bpf, subprog_raw_event)
 	memset(buf, 0, sizeof(buf));
 	buf[0] = 1;
 	buf[1] = 47;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -303,7 +280,7 @@ TEST_F(hid_bpf, test_attach_detach)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -326,14 +303,14 @@ TEST_F(hid_bpf, test_attach_detach)
 	/* detach the program */
 	detach_bpf(self);
 
-	self->hidraw_fd = open_hidraw(self->dev_id);
+	self->hidraw_fd = open_hidraw(&self->hid);
 	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
 
 	/* inject another event */
 	memset(buf, 0, sizeof(buf));
 	buf[0] = 1;
 	buf[1] = 47;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -352,7 +329,7 @@ TEST_F(hid_bpf, test_attach_detach)
 	memset(buf, 0, sizeof(buf));
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -382,7 +359,7 @@ TEST_F(hid_bpf, test_hid_change_report)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -412,7 +389,7 @@ TEST_F(hid_bpf, test_hid_user_input_report_call)
 
 	LOAD_BPF;
 
-	args.hid = self->hid_id;
+	args.hid = self->hid.hid_id;
 	args.data[0] = 1; /* report ID */
 	args.data[1] = 2; /* report ID */
 	args.data[2] = 42; /* report ID */
@@ -458,7 +435,7 @@ TEST_F(hid_bpf, test_hid_user_output_report_call)
 
 	LOAD_BPF;
 
-	args.hid = self->hid_id;
+	args.hid = self->hid.hid_id;
 	args.data[0] = 1; /* report ID */
 	args.data[1] = 2; /* report ID */
 	args.data[2] = 42; /* report ID */
@@ -506,7 +483,7 @@ TEST_F(hid_bpf, test_hid_user_raw_request_call)
 
 	LOAD_BPF;
 
-	args.hid = self->hid_id;
+	args.hid = self->hid.hid_id;
 	args.data[0] = 1; /* report ID */
 
 	prog_fd = bpf_program__fd(self->skel->progs.hid_user_raw_request);
@@ -539,7 +516,7 @@ TEST_F(hid_bpf, test_hid_filter_raw_request_call)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -565,7 +542,7 @@ TEST_F(hid_bpf, test_hid_filter_raw_request_call)
 	/* detach the program */
 	detach_bpf(self);
 
-	self->hidraw_fd = open_hidraw(self->dev_id);
+	self->hidraw_fd = open_hidraw(&self->hid);
 	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
 
 	err = ioctl(self->hidraw_fd, HIDIOCGFEATURE(sizeof(buf)), buf);
@@ -641,7 +618,7 @@ TEST_F(hid_bpf, test_hid_filter_output_report_call)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -667,7 +644,7 @@ TEST_F(hid_bpf, test_hid_filter_output_report_call)
 	/* detach the program */
 	detach_bpf(self);
 
-	self->hidraw_fd = open_hidraw(self->dev_id);
+	self->hidraw_fd = open_hidraw(&self->hid);
 	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
 
 	err = write(self->hidraw_fd, buf, 3);
@@ -742,7 +719,7 @@ TEST_F(hid_bpf, test_multiply_events_wq)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -780,7 +757,7 @@ TEST_F(hid_bpf, test_multiply_events)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -816,7 +793,7 @@ TEST_F(hid_bpf, test_hid_infinite_loop_input_report_call)
 	buf[1] = 2;
 	buf[2] = 42;
 
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -867,7 +844,7 @@ TEST_F(hid_bpf, test_hid_attach_flags)
 
 	/* inject one event */
 	buf[0] = 1;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
diff --git a/tools/testing/selftests/hid/hid_common.h b/tools/testing/selftests/hid/hid_common.h
index f151f151a1ed..a7d836a35bb1 100644
--- a/tools/testing/selftests/hid/hid_common.h
+++ b/tools/testing/selftests/hid/hid_common.h
@@ -19,6 +19,13 @@
 	__typeof__(b) _b = (b); \
 	_a < _b ? _a : _b; })
 
+struct uhid_device {
+	int dev_id;		/* uniq (random) number to identify the device */
+	int uhid_fd;
+	int hid_id;		/* HID device id in the system */
+	pthread_t tid;		/* thread for reading uhid events */
+};
+
 static unsigned char rdesc[] = {
 	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1) */
 	0x09, 0x21,		/* Usage (Vendor Usage 0x21) */
@@ -146,14 +153,14 @@ static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb)
 	return uhid_write(_metadata, fd, &ev);
 }
 
-static void uhid_destroy(struct __test_metadata *_metadata, int fd)
+static void uhid_destroy(struct __test_metadata *_metadata, struct uhid_device *hid)
 {
 	struct uhid_event ev;
 
 	memset(&ev, 0, sizeof(ev));
 	ev.type = UHID_DESTROY;
 
-	uhid_write(_metadata, fd, &ev);
+	uhid_write(_metadata, hid->uhid_fd, &ev);
 }
 
 static int uhid_event(struct __test_metadata *_metadata, int fd)
@@ -281,7 +288,8 @@ static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid
 	return 0;
 }
 
-static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size)
+static int uhid_send_event(struct __test_metadata *_metadata, struct uhid_device *hid,
+			   __u8 *buf, size_t size)
 {
 	struct uhid_event ev;
 
@@ -294,25 +302,7 @@ static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf,
 
 	memcpy(ev.u.input2.data, buf, size);
 
-	return uhid_write(_metadata, fd, &ev);
-}
-
-static int setup_uhid(struct __test_metadata *_metadata, int rand_nb)
-{
-	int fd;
-	const char *path = "/dev/uhid";
-	int ret;
-
-	fd = open(path, O_RDWR | O_CLOEXEC);
-	ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed; %d", fd);
-
-	ret = uhid_create(_metadata, fd, rand_nb);
-	ASSERT_EQ(0, ret) {
-		TH_LOG("create uhid device failed: %d", ret);
-		close(fd);
-	}
-
-	return fd;
+	return uhid_write(_metadata, hid->uhid_fd, &ev);
 }
 
 static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir)
@@ -421,12 +411,12 @@ static int get_hidraw(int dev_id)
 	return found;
 }
 
-static int open_hidraw(int dev_id)
+static int open_hidraw(struct uhid_device *hid)
 {
 	int hidraw_number;
 	char hidraw_path[64] = { 0 };
 
-	hidraw_number = get_hidraw(dev_id);
+	hidraw_number = get_hidraw(hid->dev_id);
 	if (hidraw_number < 0)
 		return hidraw_number;
 
@@ -434,3 +424,39 @@ static int open_hidraw(int dev_id)
 	sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number);
 	return open(hidraw_path, O_RDWR | O_NONBLOCK);
 }
+
+static int setup_uhid(struct __test_metadata *_metadata, struct uhid_device *hid)
+{
+	const char *path = "/dev/uhid";
+	time_t t;
+	int ret;
+
+	/* initialize random number generator */
+	srand((unsigned int)time(&t));
+
+	hid->dev_id = rand() % 1024;
+
+	hid->uhid_fd = open(path, O_RDWR | O_CLOEXEC);
+	ASSERT_GE(hid->uhid_fd, 0) TH_LOG("open uhid-cdev failed; %d", hid->uhid_fd);
+
+	ret = uhid_create(_metadata, hid->uhid_fd, hid->dev_id);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("create uhid device failed: %d", ret);
+		close(hid->uhid_fd);
+		return ret;
+	}
+
+	/* locate the uevent file of the created device */
+	hid->hid_id = get_hid_id(hid->dev_id);
+	ASSERT_GT(hid->hid_id, 0)
+		TH_LOG("Could not locate uhid device id: %d", hid->hid_id);
+
+	ret = uhid_start_listener(_metadata, &hid->tid, hid->uhid_fd);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("could not start udev listener: %d", ret);
+		close(hid->uhid_fd);
+		return ret;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c
index f8b4f7ff292c..5934818b2036 100644
--- a/tools/testing/selftests/hid/hidraw.c
+++ b/tools/testing/selftests/hid/hidraw.c
@@ -9,11 +9,8 @@
 #endif /* HIDIOCREVOKE */
 
 FIXTURE(hidraw) {
-	int dev_id;
-	int uhid_fd;
+	struct uhid_device hid;
 	int hidraw_fd;
-	int hid_id;
-	pthread_t tid;
 };
 static void close_hidraw(FIXTURE_DATA(hidraw) * self)
 {
@@ -25,10 +22,10 @@ static void close_hidraw(FIXTURE_DATA(hidraw) * self)
 FIXTURE_TEARDOWN(hidraw) {
 	void *uhid_err;
 
-	uhid_destroy(_metadata, self->uhid_fd);
+	uhid_destroy(_metadata, &self->hid);
 
 	close_hidraw(self);
-	pthread_join(self->tid, &uhid_err);
+	pthread_join(self->hid.tid, &uhid_err);
 }
 #define TEARDOWN_LOG(fmt, ...) do { \
 	TH_LOG(fmt, ##__VA_ARGS__); \
@@ -37,25 +34,12 @@ FIXTURE_TEARDOWN(hidraw) {
 
 FIXTURE_SETUP(hidraw)
 {
-	time_t t;
 	int err;
 
-	/* initialize random number generator */
-	srand((unsigned int)time(&t));
+	err = setup_uhid(_metadata, &self->hid);
+	ASSERT_OK(err);
 
-	self->dev_id = rand() % 1024;
-
-	self->uhid_fd = setup_uhid(_metadata, self->dev_id);
-
-	/* locate the uev, self, variant);ent file of the created device */
-	self->hid_id = get_hid_id(self->dev_id);
-	ASSERT_GT(self->hid_id, 0)
-		TEARDOWN_LOG("Could not locate uhid device id: %d", self->hid_id);
-
-	err = uhid_start_listener(_metadata, &self->tid, self->uhid_fd);
-	ASSERT_EQ(0, err) TEARDOWN_LOG("could not start udev listener: %d", err);
-
-	self->hidraw_fd = open_hidraw(self->dev_id);
+	self->hidraw_fd = open_hidraw(&self->hid);
 	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
 }
 
@@ -79,7 +63,7 @@ TEST_F(hidraw, raw_event)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -101,7 +85,7 @@ TEST_F(hidraw, raw_event_revoked)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -117,7 +101,7 @@ TEST_F(hidraw, raw_event_revoked)
 	/* inject one other event */
 	buf[0] = 1;
 	buf[1] = 43;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	/* read the data from hidraw */
 	memset(buf, 0, sizeof(buf));
@@ -161,7 +145,7 @@ TEST_F(hidraw, poll_revoked)
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
-	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+	uhid_send_event(_metadata, &self->hid, buf, 6);
 
 	while (true) {
 		ready = poll(pfds, 1, 5000);
-- 
cgit v1.2.3


From 72c55473fc8c82b06df79473f04c5231d51580d7 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:11 +0200
Subject: selftests/hid: allow to parametrize bus/vid/pid/rdesc on the test
 device

This will be useful to introduce variants in tests to test the
interactions between HID-BPF and some kernel modules.

Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-7-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 tools/testing/selftests/hid/hid_bpf.c    |  2 +-
 tools/testing/selftests/hid/hid_common.h | 46 ++++++++++++++++++++------------
 tools/testing/selftests/hid/hidraw.c     |  2 +-
 3 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 31ff92e0debd..1e979fb3542b 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -58,7 +58,7 @@ FIXTURE_SETUP(hid_bpf)
 {
 	int err;
 
-	err = setup_uhid(_metadata, &self->hid);
+	err = setup_uhid(_metadata, &self->hid, BUS_USB, 0x0001, 0x0a36, rdesc, sizeof(rdesc));
 	ASSERT_OK(err);
 }
 
diff --git a/tools/testing/selftests/hid/hid_common.h b/tools/testing/selftests/hid/hid_common.h
index a7d836a35bb1..f77f69c6657d 100644
--- a/tools/testing/selftests/hid/hid_common.h
+++ b/tools/testing/selftests/hid/hid_common.h
@@ -23,6 +23,9 @@ struct uhid_device {
 	int dev_id;		/* uniq (random) number to identify the device */
 	int uhid_fd;
 	int hid_id;		/* HID device id in the system */
+	__u16 bus;
+	__u32 vid;
+	__u32 pid;
 	pthread_t tid;		/* thread for reading uhid events */
 };
 
@@ -129,7 +132,9 @@ static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uh
 	}
 }
 
-static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb)
+static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb,
+		       __u16 bus, __u32 vid, __u32 pid, __u8 *rdesc,
+		       size_t rdesc_size)
 {
 	struct uhid_event ev;
 	char buf[25];
@@ -140,10 +145,10 @@ static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb)
 	ev.type = UHID_CREATE;
 	strcpy((char *)ev.u.create.name, buf);
 	ev.u.create.rd_data = rdesc;
-	ev.u.create.rd_size = sizeof(rdesc);
-	ev.u.create.bus = BUS_USB;
-	ev.u.create.vendor = 0x0001;
-	ev.u.create.product = 0x0a37;
+	ev.u.create.rd_size = rdesc_size;
+	ev.u.create.bus = bus;
+	ev.u.create.vendor = vid;
+	ev.u.create.product = pid;
 	ev.u.create.version = 0;
 	ev.u.create.country = 0;
 
@@ -305,15 +310,17 @@ static int uhid_send_event(struct __test_metadata *_metadata, struct uhid_device
 	return uhid_write(_metadata, hid->uhid_fd, &ev);
 }
 
-static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir)
+static bool match_sysfs_device(struct uhid_device *hid, const char *workdir, struct dirent *dir)
 {
-	const char *target = "0003:0001:0A37.*";
+	char target[20] = "";
 	char phys[512];
 	char uevent[1024];
 	char temp[512];
 	int fd, nread;
 	bool found = false;
 
+	snprintf(target, sizeof(target), "%04X:%04X:%04X.*", hid->bus, hid->vid, hid->pid);
+
 	if (fnmatch(target, dir->d_name, 0))
 		return false;
 
@@ -324,7 +331,7 @@ static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *d
 	if (fd < 0)
 		return false;
 
-	sprintf(phys, "PHYS=%d", dev_id);
+	sprintf(phys, "PHYS=%d", hid->dev_id);
 
 	nread = read(fd, temp, ARRAY_SIZE(temp));
 	if (nread > 0 && (strstr(temp, phys)) != NULL)
@@ -335,7 +342,7 @@ static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *d
 	return found;
 }
 
-static int get_hid_id(int dev_id)
+static int get_hid_id(struct uhid_device *hid)
 {
 	const char *workdir = "/sys/devices/virtual/misc/uhid";
 	const char *str_id;
@@ -350,10 +357,10 @@ static int get_hid_id(int dev_id)
 		d = opendir(workdir);
 		if (d) {
 			while ((dir = readdir(d)) != NULL) {
-				if (!match_sysfs_device(dev_id, workdir, dir))
+				if (!match_sysfs_device(hid, workdir, dir))
 					continue;
 
-				str_id = dir->d_name + sizeof("0003:0001:0A37.");
+				str_id = dir->d_name + sizeof("0000:0000:0000.");
 				found = (int)strtol(str_id, NULL, 16);
 
 				break;
@@ -367,7 +374,7 @@ static int get_hid_id(int dev_id)
 	return found;
 }
 
-static int get_hidraw(int dev_id)
+static int get_hidraw(struct uhid_device *hid)
 {
 	const char *workdir = "/sys/devices/virtual/misc/uhid";
 	char sysfs[1024];
@@ -384,7 +391,7 @@ static int get_hidraw(int dev_id)
 			continue;
 
 		while ((dir = readdir(d)) != NULL) {
-			if (!match_sysfs_device(dev_id, workdir, dir))
+			if (!match_sysfs_device(hid, workdir, dir))
 				continue;
 
 			sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name);
@@ -416,7 +423,7 @@ static int open_hidraw(struct uhid_device *hid)
 	int hidraw_number;
 	char hidraw_path[64] = { 0 };
 
-	hidraw_number = get_hidraw(hid->dev_id);
+	hidraw_number = get_hidraw(hid);
 	if (hidraw_number < 0)
 		return hidraw_number;
 
@@ -425,7 +432,8 @@ static int open_hidraw(struct uhid_device *hid)
 	return open(hidraw_path, O_RDWR | O_NONBLOCK);
 }
 
-static int setup_uhid(struct __test_metadata *_metadata, struct uhid_device *hid)
+static int setup_uhid(struct __test_metadata *_metadata, struct uhid_device *hid,
+		      __u16 bus, __u32 vid, __u32 pid, const __u8 *rdesc, size_t rdesc_size)
 {
 	const char *path = "/dev/uhid";
 	time_t t;
@@ -435,11 +443,15 @@ static int setup_uhid(struct __test_metadata *_metadata, struct uhid_device *hid
 	srand((unsigned int)time(&t));
 
 	hid->dev_id = rand() % 1024;
+	hid->bus = bus;
+	hid->vid = vid;
+	hid->pid = pid;
 
 	hid->uhid_fd = open(path, O_RDWR | O_CLOEXEC);
 	ASSERT_GE(hid->uhid_fd, 0) TH_LOG("open uhid-cdev failed; %d", hid->uhid_fd);
 
-	ret = uhid_create(_metadata, hid->uhid_fd, hid->dev_id);
+	ret = uhid_create(_metadata, hid->uhid_fd, hid->dev_id, bus, vid, pid,
+			  (__u8 *)rdesc, rdesc_size);
 	ASSERT_EQ(0, ret) {
 		TH_LOG("create uhid device failed: %d", ret);
 		close(hid->uhid_fd);
@@ -447,7 +459,7 @@ static int setup_uhid(struct __test_metadata *_metadata, struct uhid_device *hid
 	}
 
 	/* locate the uevent file of the created device */
-	hid->hid_id = get_hid_id(hid->dev_id);
+	hid->hid_id = get_hid_id(hid);
 	ASSERT_GT(hid->hid_id, 0)
 		TH_LOG("Could not locate uhid device id: %d", hid->hid_id);
 
diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c
index 5934818b2036..821db37ba4bb 100644
--- a/tools/testing/selftests/hid/hidraw.c
+++ b/tools/testing/selftests/hid/hidraw.c
@@ -36,7 +36,7 @@ FIXTURE_SETUP(hidraw)
 {
 	int err;
 
-	err = setup_uhid(_metadata, &self->hid);
+	err = setup_uhid(_metadata, &self->hid, BUS_USB, 0x0001, 0x0a37, rdesc, sizeof(rdesc));
 	ASSERT_OK(err);
 
 	self->hidraw_fd = open_hidraw(&self->hid);
-- 
cgit v1.2.3


From e14e0eaeb040899f7cb363cdfdf8fbee84a45f08 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Tue, 1 Oct 2024 16:30:13 +0200
Subject: selftests/hid: add test for assigning a given device to hid-generic

We use a well-known VID/PID handled by a driver that doesn't need to talk
to the device, ensure the device was created against the target driver,
then load our program and ensure the device has been unbound from this
driver and uses hid-generic instead.

Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-9-2ef1019468df@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 tools/testing/selftests/hid/hid_bpf.c              | 80 +++++++++++++++++++++-
 tools/testing/selftests/hid/progs/hid.c            | 12 ++++
 .../testing/selftests/hid/progs/hid_bpf_helpers.h  |  6 +-
 3 files changed, 96 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 1e979fb3542b..ca58bfa3ca65 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -54,11 +54,41 @@ FIXTURE_TEARDOWN(hid_bpf) {
 	hid_bpf_teardown(_metadata, self, variant); \
 } while (0)
 
+struct specific_device {
+	const char test_name[64];
+	__u16 bus;
+	__u32 vid;
+	__u32 pid;
+};
+
 FIXTURE_SETUP(hid_bpf)
 {
+	const struct specific_device *match = NULL;
 	int err;
 
-	err = setup_uhid(_metadata, &self->hid, BUS_USB, 0x0001, 0x0a36, rdesc, sizeof(rdesc));
+	const struct specific_device devices[] = {
+	{
+		.test_name = "test_hid_driver_probe",
+		.bus = BUS_BLUETOOTH,
+		.vid = 0x05ac,  /* USB_VENDOR_ID_APPLE */
+		.pid = 0x022c,  /* USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI */
+	}, {
+		.test_name = "*",
+		.bus = BUS_USB,
+		.vid = 0x0001,
+		.pid = 0x0a36,
+	}};
+
+	for (int i = 0; i < ARRAY_SIZE(devices); i++) {
+		match = &devices[i];
+		if (!strncmp(_metadata->name, devices[i].test_name, sizeof(devices[i].test_name)))
+			break;
+	}
+
+	ASSERT_OK_PTR(match);
+
+	err = setup_uhid(_metadata, &self->hid, match->bus, match->vid, match->pid,
+			 rdesc, sizeof(rdesc));
 	ASSERT_OK(err);
 }
 
@@ -855,6 +885,54 @@ TEST_F(hid_bpf, test_hid_attach_flags)
 	ASSERT_EQ(buf[3], 3);
 }
 
+static bool is_using_driver(struct __test_metadata *_metadata, struct uhid_device *hid,
+			    const char *driver)
+{
+	char driver_line[512];
+	char uevent[1024];
+	char temp[512];
+	int fd, nread;
+	bool found = false;
+
+	sprintf(uevent, "/sys/bus/hid/devices/%04X:%04X:%04X.%04X/uevent",
+		hid->bus, hid->vid, hid->pid, hid->hid_id);
+
+	fd = open(uevent, O_RDONLY | O_NONBLOCK);
+	if (fd < 0) {
+		TH_LOG("couldn't open '%s': %d, %d", uevent, fd, errno);
+		return false;
+	}
+
+	sprintf(driver_line, "DRIVER=%s", driver);
+
+	nread = read(fd, temp, ARRAY_SIZE(temp));
+	if (nread > 0 && (strstr(temp, driver_line)) != NULL)
+		found = true;
+
+	close(fd);
+
+	return found;
+}
+
+/*
+ * Attach hid_driver_probe to the given uhid device,
+ * check that the device is now using hid-generic.
+ */
+TEST_F(hid_bpf, test_hid_driver_probe)
+{
+	const struct test_program progs[] = {
+		{
+			.name = "hid_test_driver_probe",
+		},
+	};
+
+	ASSERT_TRUE(is_using_driver(_metadata, &self->hid, "apple"));
+
+	LOAD_PROGRAMS(progs);
+
+	ASSERT_TRUE(is_using_driver(_metadata, &self->hid, "hid-generic"));
+}
+
 /*
  * Attach hid_rdesc_fixup to the given uhid device,
  * retrieve and open the matching hidraw node,
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 5ecc845ef792..9b22e9a0e658 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -598,3 +598,15 @@ SEC(".struct_ops.link")
 struct hid_bpf_ops test_infinite_loop_input_report = {
 	.hid_device_event = (void *)hid_test_infinite_loop_input_report,
 };
+
+SEC("?struct_ops.s/hid_rdesc_fixup")
+int BPF_PROG(hid_test_driver_probe, struct hid_bpf_ctx *hid_ctx)
+{
+	hid_ctx->hid->quirks |= HID_QUIRK_IGNORE_SPECIAL_DRIVER;
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct hid_bpf_ops test_driver_probe = {
+	.hid_rdesc_fixup = (void *)hid_test_driver_probe,
+};
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index e5db897586bb..1a645684a117 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -84,10 +84,14 @@ struct hid_bpf_ops {
 	struct hid_device *hdev;
 };
 
+#define BIT(n) (1U << n)
+
 #ifndef BPF_F_BEFORE
-#define BPF_F_BEFORE (1U << 3)
+#define BPF_F_BEFORE BIT(3)
 #endif
 
+#define HID_QUIRK_IGNORE_SPECIAL_DRIVER		BIT(22)
+
 /* following are kfuncs exported by HID for HID-BPF */
 extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx,
 			      unsigned int offset,
-- 
cgit v1.2.3


From a89568e9be75845bdbba496f40e8cd0ea29c7af1 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadfed@meta.com>
Date: Tue, 1 Oct 2024 05:57:16 -0700
Subject: selftests: txtimestamp: add SCM_TS_OPT_ID test

Extend the txtimestamp test to run with a fixed tskey passed via the
SCM_TS_OPT_ID control message. The script exercises this for UDP and
raw sockets; the test program rejects the option for TCP sockets.

Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
Link: https://patch.msgid.link/20241001125716.2832769-4-vadfed@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/include/uapi/asm-generic/socket.h    |  2 ++
 tools/testing/selftests/net/txtimestamp.c  | 44 +++++++++++++++++++++++-------
 tools/testing/selftests/net/txtimestamp.sh | 12 ++++----
 3 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h
index 54d9c8bf7c55..281df9139d2b 100644
--- a/tools/include/uapi/asm-generic/socket.h
+++ b/tools/include/uapi/asm-generic/socket.h
@@ -124,6 +124,8 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SCM_TS_OPT_ID		78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
index d626f22f9550..dae91eb97d69 100644
--- a/tools/testing/selftests/net/txtimestamp.c
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -77,6 +77,8 @@ static bool cfg_epollet;
 static bool cfg_do_listen;
 static uint16_t dest_port = 9000;
 static bool cfg_print_nsec;
+static uint32_t ts_opt_id;
+static bool cfg_use_cmsg_opt_id;
 
 static struct sockaddr_in daddr;
 static struct sockaddr_in6 daddr6;
@@ -136,12 +138,13 @@ static void validate_key(int tskey, int tstype)
 	/* compare key for each subsequent request
 	 * must only test for one type, the first one requested
 	 */
-	if (saved_tskey == -1)
+	if (saved_tskey == -1 || cfg_use_cmsg_opt_id)
 		saved_tskey_type = tstype;
 	else if (saved_tskey_type != tstype)
 		return;
 
 	stepsize = cfg_proto == SOCK_STREAM ? cfg_payload_len : 1;
+	stepsize = cfg_use_cmsg_opt_id ? 0 : stepsize;
 	if (tskey != saved_tskey + stepsize) {
 		fprintf(stderr, "ERROR: key %d, expected %d\n",
 				tskey, saved_tskey + stepsize);
@@ -484,7 +487,7 @@ static void fill_header_udp(void *p, bool is_ipv4)
 
 static void do_test(int family, unsigned int report_opt)
 {
-	char control[CMSG_SPACE(sizeof(uint32_t))];
+	char control[2 * CMSG_SPACE(sizeof(uint32_t))];
 	struct sockaddr_ll laddr;
 	unsigned int sock_opt;
 	struct cmsghdr *cmsg;
@@ -624,18 +627,32 @@ static void do_test(int family, unsigned int report_opt)
 		msg.msg_iov = &iov;
 		msg.msg_iovlen = 1;
 
-		if (cfg_use_cmsg) {
+		if (cfg_use_cmsg || cfg_use_cmsg_opt_id) {
 			memset(control, 0, sizeof(control));
 
 			msg.msg_control = control;
-			msg.msg_controllen = sizeof(control);
+			msg.msg_controllen = cfg_use_cmsg * CMSG_SPACE(sizeof(uint32_t));
+			msg.msg_controllen += cfg_use_cmsg_opt_id * CMSG_SPACE(sizeof(uint32_t));
 
-			cmsg = CMSG_FIRSTHDR(&msg);
-			cmsg->cmsg_level = SOL_SOCKET;
-			cmsg->cmsg_type = SO_TIMESTAMPING;
-			cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+			cmsg = NULL;
+			if (cfg_use_cmsg) {
+				cmsg = CMSG_FIRSTHDR(&msg);
+				cmsg->cmsg_level = SOL_SOCKET;
+				cmsg->cmsg_type = SO_TIMESTAMPING;
+				cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+
+				*((uint32_t *)CMSG_DATA(cmsg)) = report_opt;
+			}
+			if (cfg_use_cmsg_opt_id) {
+				cmsg = cmsg ? CMSG_NXTHDR(&msg, cmsg) : CMSG_FIRSTHDR(&msg);
+				cmsg->cmsg_level = SOL_SOCKET;
+				cmsg->cmsg_type = SCM_TS_OPT_ID;
+				cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+
+				*((uint32_t *)CMSG_DATA(cmsg)) = ts_opt_id;
+				saved_tskey = ts_opt_id;
+			}
 
-			*((uint32_t *) CMSG_DATA(cmsg)) = report_opt;
 		}
 
 		val = sendmsg(fd, &msg, 0);
@@ -685,6 +702,7 @@ static void __attribute__((noreturn)) usage(const char *filepath)
 			"  -L    listen on hostname and port\n"
 			"  -n:   set no-payload option\n"
 			"  -N:   print timestamps and durations in nsec (instead of usec)\n"
+			"  -o N: use SCM_TS_OPT_ID control message to provide N as tskey\n"
 			"  -p N: connect to port N\n"
 			"  -P:   use PF_PACKET\n"
 			"  -r:   use raw\n"
@@ -705,7 +723,7 @@ static void parse_opt(int argc, char **argv)
 	int c;
 
 	while ((c = getopt(argc, argv,
-				"46bc:CeEFhIl:LnNp:PrRS:t:uv:V:x")) != -1) {
+				"46bc:CeEFhIl:LnNo:p:PrRS:t:uv:V:x")) != -1) {
 		switch (c) {
 		case '4':
 			do_ipv6 = 0;
@@ -746,6 +764,10 @@ static void parse_opt(int argc, char **argv)
 		case 'N':
 			cfg_print_nsec = true;
 			break;
+		case 'o':
+			ts_opt_id = strtoul(optarg, NULL, 10);
+			cfg_use_cmsg_opt_id = true;
+			break;
 		case 'p':
 			dest_port = strtoul(optarg, NULL, 10);
 			break;
@@ -803,6 +825,8 @@ static void parse_opt(int argc, char **argv)
 		error(1, 0, "cannot ask for pktinfo over pf_packet");
 	if (cfg_busy_poll && cfg_use_epoll)
 		error(1, 0, "pass epoll or busy_poll, not both");
+	if (cfg_proto == SOCK_STREAM && cfg_use_cmsg_opt_id)
+		error(1, 0, "TCP sockets don't support SCM_TS_OPT_ID");
 
 	if (optind != argc - 1)
 		error(1, 0, "missing required hostname argument");
diff --git a/tools/testing/selftests/net/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh
index 25baca4b148e..fe4649bb8786 100755
--- a/tools/testing/selftests/net/txtimestamp.sh
+++ b/tools/testing/selftests/net/txtimestamp.sh
@@ -37,11 +37,13 @@ run_test_v4v6() {
 run_test_tcpudpraw() {
 	local -r args=$@
 
-	run_test_v4v6 ${args}		# tcp
-	run_test_v4v6 ${args} -u	# udp
-	run_test_v4v6 ${args} -r	# raw
-	run_test_v4v6 ${args} -R	# raw (IPPROTO_RAW)
-	run_test_v4v6 ${args} -P	# pf_packet
+	run_test_v4v6 ${args}		  # tcp
+	run_test_v4v6 ${args} -u	  # udp
+	run_test_v4v6 ${args} -u -o 42	  # udp with fixed tskey
+	run_test_v4v6 ${args} -r	  # raw
+	run_test_v4v6 ${args} -r -o 42	  # raw
+	run_test_v4v6 ${args} -R	  # raw (IPPROTO_RAW)
+	run_test_v4v6 ${args} -P	  # pf_packet
 }
 
 run_test_all() {
-- 
cgit v1.2.3


From 897408d5e2248b8f139d57fe8f8bab651f80e6e6 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Wed, 2 Oct 2024 18:27:30 +0200
Subject: selftests: net: remove ioam tests

This patch entirely removes the ioam selftests to prepare for the next
patch in this series, which re-adds the new ioam selftests for better
readability.

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Link: https://patch.msgid.link/20241002162731.19847-2-justin.iurman@uliege.be
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile       |   2 -
 tools/testing/selftests/net/config         |   1 -
 tools/testing/selftests/net/ioam6.sh       | 771 -----------------------------
 tools/testing/selftests/net/ioam6_parser.c | 674 -------------------------
 4 files changed, 1448 deletions(-)
 delete mode 100755 tools/testing/selftests/net/ioam6.sh
 delete mode 100644 tools/testing/selftests/net/ioam6_parser.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 649f1fe0dc46..ef40d099aa1c 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -28,7 +28,6 @@ TEST_PROGS += unicast_extensions.sh
 TEST_PROGS += udpgro_fwd.sh
 TEST_PROGS += udpgro_frglist.sh
 TEST_PROGS += veth.sh
-TEST_PROGS += ioam6.sh
 TEST_PROGS += gro.sh
 TEST_PROGS += gre_gso.sh
 TEST_PROGS += cmsg_so_mark.sh
@@ -67,7 +66,6 @@ TEST_GEN_FILES += fin_ack_lat
 TEST_GEN_FILES += reuseaddr_ports_exhausted
 TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp
 TEST_GEN_FILES += ipsec
-TEST_GEN_FILES += ioam6_parser
 TEST_GEN_FILES += gro
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 5b9baf708950..3f0b02835e78 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -95,7 +95,6 @@ CONFIG_NET_CLS_FLOWER=m
 CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_BAREUDP=m
-CONFIG_IPV6_IOAM6_LWTUNNEL=y
 CONFIG_CRYPTO_SM4_GENERIC=y
 CONFIG_AMT=m
 CONFIG_TUN=y
diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh
deleted file mode 100755
index 12491850ae98..000000000000
--- a/tools/testing/selftests/net/ioam6.sh
+++ /dev/null
@@ -1,771 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0+
-#
-# Author: Justin Iurman <justin.iurman@uliege.be>
-#
-# This script evaluates the IOAM insertion for IPv6 by checking the IOAM data
-# consistency directly inside packets on the receiver side. Tests are divided
-# into three categories: OUTPUT (evaluates the IOAM processing by the sender),
-# INPUT (evaluates the IOAM processing by a receiver) and GLOBAL (evaluates
-# wider use cases that do not fall into the other two categories). Both OUTPUT
-# and INPUT tests only use a two-node topology (alpha and beta), while GLOBAL
-# tests use the entire three-node topology (alpha, beta, gamma). Each test is
-# documented inside its own handler in the code below.
-#
-# An IOAM domain is configured from Alpha to Gamma but not on the reverse path.
-# When either Beta or Gamma is the destination (depending on the test category),
-# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-hop.
-#
-#
-#            +-------------------+            +-------------------+
-#            |                   |            |                   |
-#            |    Alpha netns    |            |    Gamma netns    |
-#            |                   |            |                   |
-#            |  +-------------+  |            |  +-------------+  |
-#            |  |    veth0    |  |            |  |    veth0    |  |
-#            |  |  db01::2/64 |  |            |  |  db02::2/64 |  |
-#            |  +-------------+  |            |  +-------------+  |
-#            |         .         |            |         .         |
-#            +-------------------+            +-------------------+
-#                      .                                .
-#                      .                                .
-#                      .                                .
-#            +----------------------------------------------------+
-#            |         .                                .         |
-#            |  +-------------+                  +-------------+  |
-#            |  |    veth0    |                  |    veth1    |  |
-#            |  |  db01::1/64 | ................ |  db02::1/64 |  |
-#            |  +-------------+                  +-------------+  |
-#            |                                                    |
-#            |                      Beta netns                    |
-#            |                                                    |
-#            +----------------------------------------------------+
-#
-#
-#
-#        =============================================================
-#        |                Alpha - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 1                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 11111111                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 0xffff (default value)              |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 0xffffffff (default value)          |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 101                                 |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 101101                              |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee0                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf00dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 777                                 |
-#        +-----------------------------------------------------------+
-#        | Schema Data         | something that will be 4n-aligned   |
-#        +-----------------------------------------------------------+
-#
-#
-#        =============================================================
-#        |                 Beta - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 2                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 22222222                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 201                                 |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 201201                              |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 202                                 |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 202202                              |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee1                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf11dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 666                                 |
-#        +-----------------------------------------------------------+
-#        | Schema Data         | Hello there -Obi                    |
-#        +-----------------------------------------------------------+
-#
-#
-#        =============================================================
-#        |                Gamma - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 3                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 33333333                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 301                                 |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 301301                              |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 0xffff (default value)              |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 0xffffffff (default value)          |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee2                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf22dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 0xffffff (= None)                   |
-#        +-----------------------------------------------------------+
-#        | Schema Data         |                                     |
-#        +-----------------------------------------------------------+
-
-source lib.sh
-
-################################################################################
-#                                                                              #
-# WARNING: Be careful if you modify the block below - it MUST be kept          #
-#          synchronized with configurations inside ioam6_parser.c and always   #
-#          reflect the same.                                                   #
-#                                                                              #
-################################################################################
-
-ALPHA=(
-	1					# ID
-	11111111				# Wide ID
-	0xffff					# Ingress ID
-	0xffffffff				# Ingress Wide ID
-	101					# Egress ID
-	101101					# Egress Wide ID
-	0xdeadbee0				# Namespace Data
-	0xcafec0caf00dc0de			# Namespace Wide Data
-	777					# Schema ID (0xffffff = None)
-	"something that will be 4n-aligned"	# Schema Data
-)
-
-BETA=(
-	2
-	22222222
-	201
-	201201
-	202
-	202202
-	0xdeadbee1
-	0xcafec0caf11dc0de
-	666
-	"Hello there -Obi"
-)
-
-GAMMA=(
-	3
-	33333333
-	301
-	301301
-	0xffff
-	0xffffffff
-	0xdeadbee2
-	0xcafec0caf22dc0de
-	0xffffff
-	""
-)
-
-TESTS_OUTPUT="
-	out_undef_ns
-	out_no_room
-	out_bits
-	out_full_supp_trace
-"
-
-TESTS_INPUT="
-	in_undef_ns
-	in_no_room
-	in_oflag
-	in_bits
-	in_full_supp_trace
-"
-
-TESTS_GLOBAL="
-	fwd_full_supp_trace
-"
-
-
-################################################################################
-#                                                                              #
-#                                   LIBRARY                                    #
-#                                                                              #
-################################################################################
-
-check_kernel_compatibility()
-{
-  setup_ns ioam_tmp_node
-  ip link add name veth0 netns $ioam_tmp_node type veth \
-         peer name veth1 netns $ioam_tmp_node
-
-  ip -netns $ioam_tmp_node link set veth0 up
-  ip -netns $ioam_tmp_node link set veth1 up
-
-  ip -netns $ioam_tmp_node ioam namespace add 0
-  ns_ad=$?
-
-  ip -netns $ioam_tmp_node ioam namespace show | grep -q "namespace 0"
-  ns_sh=$?
-
-  if [[ $ns_ad != 0 || $ns_sh != 0 ]]
-  then
-    echo "SKIP: kernel version probably too old, missing ioam support"
-    ip link del veth0 2>/dev/null || true
-    cleanup_ns $ioam_tmp_node || true
-    exit $ksft_skip
-  fi
-
-  ip -netns $ioam_tmp_node route add db02::/64 encap ioam6 mode inline \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
-  tr_ad=$?
-
-  ip -netns $ioam_tmp_node -6 route | grep -q "encap ioam6"
-  tr_sh=$?
-
-  if [[ $tr_ad != 0 || $tr_sh != 0 ]]
-  then
-    echo "SKIP: cannot attach an ioam trace to a route, did you compile" \
-         "without CONFIG_IPV6_IOAM6_LWTUNNEL?"
-    ip link del veth0 2>/dev/null || true
-    cleanup_ns $ioam_tmp_node || true
-    exit $ksft_skip
-  fi
-
-  ip link del veth0 2>/dev/null || true
-  cleanup_ns $ioam_tmp_node || true
-
-  lsmod | grep -q "ip6_tunnel"
-  ip6tnl_loaded=$?
-
-  if [ $ip6tnl_loaded = 0 ]
-  then
-    encap_tests=0
-  else
-    modprobe ip6_tunnel &>/dev/null
-    lsmod | grep -q "ip6_tunnel"
-    encap_tests=$?
-
-    if [ $encap_tests != 0 ]
-    then
-      ip a | grep -q "ip6tnl0"
-      encap_tests=$?
-
-      if [ $encap_tests != 0 ]
-      then
-        echo "Note: ip6_tunnel not found neither as a module nor inside the" \
-             "kernel, tests that require it (encap mode) will be omitted"
-      fi
-    fi
-  fi
-}
-
-cleanup()
-{
-  ip link del ioam-veth-alpha 2>/dev/null || true
-  ip link del ioam-veth-gamma 2>/dev/null || true
-
-  cleanup_ns $ioam_node_alpha $ioam_node_beta $ioam_node_gamma || true
-
-  if [ $ip6tnl_loaded != 0 ]
-  then
-    modprobe -r ip6_tunnel 2>/dev/null || true
-  fi
-}
-
-setup()
-{
-  setup_ns ioam_node_alpha ioam_node_beta ioam_node_gamma
-
-  ip link add name ioam-veth-alpha netns $ioam_node_alpha type veth \
-         peer name ioam-veth-betaL netns $ioam_node_beta
-  ip link add name ioam-veth-betaR netns $ioam_node_beta type veth \
-         peer name ioam-veth-gamma netns $ioam_node_gamma
-
-  ip -netns $ioam_node_alpha link set ioam-veth-alpha name veth0
-  ip -netns $ioam_node_beta link set ioam-veth-betaL name veth0
-  ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1
-  ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0
-
-  ip -netns $ioam_node_alpha addr add db01::2/64 dev veth0
-  ip -netns $ioam_node_alpha link set veth0 up
-  ip -netns $ioam_node_alpha link set lo up
-  ip -netns $ioam_node_alpha route add db02::/64 via db01::1 dev veth0
-  ip -netns $ioam_node_alpha route del db01::/64
-  ip -netns $ioam_node_alpha route add db01::/64 dev veth0
-
-  ip -netns $ioam_node_beta addr add db01::1/64 dev veth0
-  ip -netns $ioam_node_beta addr add db02::1/64 dev veth1
-  ip -netns $ioam_node_beta link set veth0 up
-  ip -netns $ioam_node_beta link set veth1 up
-  ip -netns $ioam_node_beta link set lo up
-
-  ip -netns $ioam_node_gamma addr add db02::2/64 dev veth0
-  ip -netns $ioam_node_gamma link set veth0 up
-  ip -netns $ioam_node_gamma link set lo up
-  ip -netns $ioam_node_gamma route add db01::/64 via db02::1 dev veth0
-
-  # - IOAM config -
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]}
-  ip -netns $ioam_node_alpha ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]}
-  ip -netns $ioam_node_alpha ioam schema add ${ALPHA[8]} "${ALPHA[9]}"
-  ip -netns $ioam_node_alpha ioam namespace set 123 schema ${ALPHA[8]}
-
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.all.forwarding=1
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.ioam6_id=${BETA[0]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]}
-  ip -netns $ioam_node_beta ioam namespace add 123 data ${BETA[6]} wide ${BETA[7]}
-  ip -netns $ioam_node_beta ioam schema add ${BETA[8]} "${BETA[9]}"
-  ip -netns $ioam_node_beta ioam namespace set 123 schema ${BETA[8]}
-
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.ioam6_id=${GAMMA[0]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.ioam6_id_wide=${GAMMA[1]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id=${GAMMA[2]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${GAMMA[3]}
-  ip -netns $ioam_node_gamma ioam namespace add 123 data ${GAMMA[6]} wide ${GAMMA[7]}
-
-  sleep 1
-
-  ip netns exec $ioam_node_alpha ping6 -c 5 -W 1 db02::2 &>/dev/null
-  if [ $? != 0 ]
-  then
-    echo "Setup FAILED"
-    cleanup &>/dev/null
-    exit 0
-  fi
-}
-
-log_test_passed()
-{
-  local desc=$1
-  printf "TEST: %-60s  [ OK ]\n" "${desc}"
-}
-
-log_test_failed()
-{
-  local desc=$1
-  printf "TEST: %-60s  [FAIL]\n" "${desc}"
-}
-
-log_results()
-{
-  echo "- Tests passed: ${npassed}"
-  echo "- Tests failed: ${nfailed}"
-}
-
-run_test()
-{
-  local name=$1
-  local desc=$2
-  local node_src=$3
-  local node_dst=$4
-  local ip6_dst=$5
-  local trace_type=$6
-  local ioam_ns=$7
-  local type=$8
-
-  ip netns exec $node_dst ./ioam6_parser $name $trace_type $ioam_ns $type &
-  local spid=$!
-  sleep 0.1
-
-  ip netns exec $node_src ping6 -t 64 -c 1 -W 1 $ip6_dst &>/dev/null
-  if [ $? != 0 ]
-  then
-    nfailed=$((nfailed+1))
-    log_test_failed "${desc}"
-    kill -2 $spid &>/dev/null
-  else
-    wait $spid
-    if [ $? = 0 ]
-    then
-      npassed=$((npassed+1))
-      log_test_passed "${desc}"
-    else
-      nfailed=$((nfailed+1))
-      log_test_failed "${desc}"
-    fi
-  fi
-}
-
-run()
-{
-  echo
-  printf "%0.s-" {1..74}
-  echo
-  echo "OUTPUT tests"
-  printf "%0.s-" {1..74}
-  echo
-
-  # set OUTPUT settings
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0
-
-  for t in $TESTS_OUTPUT
-  do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
-  done
-
-  # clean OUTPUT settings
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip -netns $ioam_node_alpha route change db01::/64 dev veth0
-
-
-  echo
-  printf "%0.s-" {1..74}
-  echo
-  echo "INPUT tests"
-  printf "%0.s-" {1..74}
-  echo
-
-  # set INPUT settings
-  ip -netns $ioam_node_alpha ioam namespace del 123
-
-  for t in $TESTS_INPUT
-  do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
-  done
-
-  # clean INPUT settings
-  ip -netns $ioam_node_alpha ioam namespace add 123 \
-         data ${ALPHA[6]} wide ${ALPHA[7]}
-  ip -netns $ioam_node_alpha ioam namespace set 123 schema ${ALPHA[8]}
-  ip -netns $ioam_node_alpha route change db01::/64 dev veth0
-
-  echo
-  printf "%0.s-" {1..74}
-  echo
-  echo "GLOBAL tests"
-  printf "%0.s-" {1..74}
-  echo
-
-  for t in $TESTS_GLOBAL
-  do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
-  done
-
-  echo
-  log_results
-}
-
-bit2type=(
-  0x800000 0x400000 0x200000 0x100000 0x080000 0x040000 0x020000 0x010000
-  0x008000 0x004000 0x002000 0x001000 0x000800 0x000400 0x000200 0x000100
-  0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002
-)
-bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 )
-
-
-################################################################################
-#                                                                              #
-#                              OUTPUT tests                                    #
-#                                                                              #
-#   Two nodes (sender/receiver), IOAM disabled on ingress for the receiver.    #
-################################################################################
-
-out_undef_ns()
-{
-  ##############################################################################
-  # Make sure that the encap node won't fill the trace if the chosen IOAM      #
-  # namespace is not configured locally.                                       #
-  ##############################################################################
-  local desc="Unknown IOAM namespace"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0x800000 0 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-out_no_room()
-{
-  ##############################################################################
-  # Make sure that the encap node won't fill the trace and will set the        #
-  # Overflow flag since there is no room enough for its data.                  #
-  ##############################################################################
-  local desc="Missing trace room"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-out_bits()
-{
-  ##############################################################################
-  # Make sure that, for each trace type bit, the encap node will either:       #
-  #  (i)  fill the trace with its data when it is a supported bit              #
-  #  (ii) not fill the trace with its data when it is an unsupported bit       #
-  ##############################################################################
-  local desc="Trace type with bit <n> only"
-
-  local tmp=${bit2size[22]}
-  bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  for i in {0..22}
-  do
-    ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-           trace prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \
-           dev veth0 &>/dev/null
-
-    local cmd_res=$?
-    local descr="${desc/<n>/$i}"
-
-    if [[ $i -ge 12 && $i -le 21 ]]
-    then
-      if [ $cmd_res != 0 ]
-      then
-        npassed=$((npassed+1))
-        log_test_passed "$descr ($1 mode)"
-      else
-        nfailed=$((nfailed+1))
-        log_test_failed "$descr ($1 mode)"
-      fi
-    else
-	run_test "out_bit$i" "$descr ($1 mode)" $ioam_node_alpha \
-           $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1
-    fi
-  done
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-
-  bit2size[22]=$tmp
-}
-
-out_full_supp_trace()
-{
-  ##############################################################################
-  # Make sure that the encap node will correctly fill a full trace. Be careful,#
-  # "full trace" here does NOT mean all bits (only supported ones).            #
-  ##############################################################################
-  local desc="Full supported trace"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 100 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xfff002 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-
-################################################################################
-#                                                                              #
-#                               INPUT tests                                    #
-#                                                                              #
-#     Two nodes (sender/receiver), the sender MUST NOT fill the trace upon     #
-#     insertion -> the IOAM namespace configured on the sender is removed      #
-#     and is used in the inserted trace to force the sender not to fill it.    #
-################################################################################
-
-in_undef_ns()
-{
-  ##############################################################################
-  # Make sure that the receiving node won't fill the trace if the related IOAM #
-  # namespace is not configured locally.                                       #
-  ##############################################################################
-  local desc="Unknown IOAM namespace"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0x800000 0 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-in_no_room()
-{
-  ##############################################################################
-  # Make sure that the receiving node won't fill the trace and will set the    #
-  # Overflow flag if there is no room enough for its data.                     #
-  ##############################################################################
-  local desc="Missing trace room"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-in_bits()
-{
-  ##############################################################################
-  # Make sure that, for each trace type bit, the receiving node will either:   #
-  #  (i)  fill the trace with its data when it is a supported bit              #
-  #  (ii) not fill the trace with its data when it is an unsupported bit       #
-  ##############################################################################
-  local desc="Trace type with bit <n> only"
-
-  local tmp=${bit2size[22]}
-  bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  for i in {0..11} {22..22}
-  do
-    ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-           trace prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \
-           dev veth0
-
-    run_test "in_bit$i" "${desc/<n>/$i} ($1 mode)" $ioam_node_alpha \
-           $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1
-  done
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-
-  bit2size[22]=$tmp
-}
-
-in_oflag()
-{
-  ##############################################################################
-  # Make sure that the receiving node won't fill the trace since the Overflow  #
-  # flag is set.                                                               #
-  ##############################################################################
-  local desc="Overflow flag is set"
-
-  # Exception:
-  #   Here, we need the sender to set the Overflow flag. For that, we will add
-  #   back the IOAM namespace that was previously configured on the sender.
-  ip -netns $ioam_node_alpha ioam namespace add 123
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-
-  # And we clean the exception for this test to get things back to normal for
-  # other INPUT tests
-  ip -netns $ioam_node_alpha ioam namespace del 123
-}
-
-in_full_supp_trace()
-{
-  ##############################################################################
-  # Make sure that the receiving node will correctly fill a full trace. Be     #
-  # careful, "full trace" here does NOT mean all bits (only supported ones).   #
-  ##############################################################################
-  local desc="Full supported trace"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 80 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xfff002 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
-}
-
-
-################################################################################
-#                                                                              #
-#                              GLOBAL tests                                    #
-#                                                                              #
-#   Three nodes (sender/router/receiver), IOAM fully enabled on every node.    #
-################################################################################
-
-fwd_full_supp_trace()
-{
-  ##############################################################################
-  # Make sure that all three nodes correctly filled the full supported trace   #
-  # by checking that the trace data is consistent with the predefined config.  #
-  ##############################################################################
-  local desc="Forward - Full supported trace"
-
-  [ "$1" = "encap" ] && mode="$1 tundst db02::2" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_gamma link set ip6tnl0 up
-
-  ip -netns $ioam_node_alpha route change db02::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 244 via db01::1 dev veth0
-
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_gamma \
-         db02::2 0xfff002 123 $1
-
-  [ "$1" = "encap" ] && ip -netns $ioam_node_gamma link set ip6tnl0 down
-}
-
-
-################################################################################
-#                                                                              #
-#                                     MAIN                                     #
-#                                                                              #
-################################################################################
-
-npassed=0
-nfailed=0
-
-if [ "$(id -u)" -ne 0 ]
-then
-  echo "SKIP: Need root privileges"
-  exit $ksft_skip
-fi
-
-if [ ! -x "$(command -v ip)" ]
-then
-  echo "SKIP: Could not run test without ip tool"
-  exit $ksft_skip
-fi
-
-ip ioam &>/dev/null
-if [ $? = 1 ]
-then
-  echo "SKIP: iproute2 too old, missing ioam command"
-  exit $ksft_skip
-fi
-
-check_kernel_compatibility
-
-cleanup &>/dev/null
-setup
-run
-cleanup &>/dev/null
diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c
deleted file mode 100644
index 895e5bb5044b..000000000000
--- a/tools/testing/selftests/net/ioam6_parser.c
+++ /dev/null
@@ -1,674 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Author: Justin Iurman (justin.iurman@uliege.be)
- *
- * IOAM tester for IPv6, see ioam6.sh for details on each test case.
- */
-#include <arpa/inet.h>
-#include <errno.h>
-#include <limits.h>
-#include <linux/const.h>
-#include <linux/ioam6.h>
-#include <linux/ipv6.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-struct ioam_config {
-	__u32 id;
-	__u64 wide;
-	__u16 ingr_id;
-	__u16 egr_id;
-	__u32 ingr_wide;
-	__u32 egr_wide;
-	__u32 ns_data;
-	__u64 ns_wide;
-	__u32 sc_id;
-	__u8 hlim;
-	char *sc_data;
-};
-
-/*
- * Be careful if you modify structs below - everything MUST be kept synchronized
- * with configurations inside ioam6.sh and always reflect the same.
- */
-
-static struct ioam_config node1 = {
-	.id = 1,
-	.wide = 11111111,
-	.ingr_id = 0xffff, /* default value */
-	.egr_id = 101,
-	.ingr_wide = 0xffffffff, /* default value */
-	.egr_wide = 101101,
-	.ns_data = 0xdeadbee0,
-	.ns_wide = 0xcafec0caf00dc0de,
-	.sc_id = 777,
-	.sc_data = "something that will be 4n-aligned",
-	.hlim = 64,
-};
-
-static struct ioam_config node2 = {
-	.id = 2,
-	.wide = 22222222,
-	.ingr_id = 201,
-	.egr_id = 202,
-	.ingr_wide = 201201,
-	.egr_wide = 202202,
-	.ns_data = 0xdeadbee1,
-	.ns_wide = 0xcafec0caf11dc0de,
-	.sc_id = 666,
-	.sc_data = "Hello there -Obi",
-	.hlim = 63,
-};
-
-static struct ioam_config node3 = {
-	.id = 3,
-	.wide = 33333333,
-	.ingr_id = 301,
-	.egr_id = 0xffff, /* default value */
-	.ingr_wide = 301301,
-	.egr_wide = 0xffffffff, /* default value */
-	.ns_data = 0xdeadbee2,
-	.ns_wide = 0xcafec0caf22dc0de,
-	.sc_id = 0xffffff, /* default value */
-	.sc_data = NULL,
-	.hlim = 62,
-};
-
-enum {
-	/**********
-	 * OUTPUT *
-	 **********/
-	TEST_OUT_UNDEF_NS,
-	TEST_OUT_NO_ROOM,
-	TEST_OUT_BIT0,
-	TEST_OUT_BIT1,
-	TEST_OUT_BIT2,
-	TEST_OUT_BIT3,
-	TEST_OUT_BIT4,
-	TEST_OUT_BIT5,
-	TEST_OUT_BIT6,
-	TEST_OUT_BIT7,
-	TEST_OUT_BIT8,
-	TEST_OUT_BIT9,
-	TEST_OUT_BIT10,
-	TEST_OUT_BIT11,
-	TEST_OUT_BIT22,
-	TEST_OUT_FULL_SUPP_TRACE,
-
-	/*********
-	 * INPUT *
-	 *********/
-	TEST_IN_UNDEF_NS,
-	TEST_IN_NO_ROOM,
-	TEST_IN_OFLAG,
-	TEST_IN_BIT0,
-	TEST_IN_BIT1,
-	TEST_IN_BIT2,
-	TEST_IN_BIT3,
-	TEST_IN_BIT4,
-	TEST_IN_BIT5,
-	TEST_IN_BIT6,
-	TEST_IN_BIT7,
-	TEST_IN_BIT8,
-	TEST_IN_BIT9,
-	TEST_IN_BIT10,
-	TEST_IN_BIT11,
-	TEST_IN_BIT22,
-	TEST_IN_FULL_SUPP_TRACE,
-
-	/**********
-	 * GLOBAL *
-	 **********/
-	TEST_FWD_FULL_SUPP_TRACE,
-
-	__TEST_MAX,
-};
-
-static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h,
-			     __u32 trace_type, __u16 ioam_ns)
-{
-	if (__be16_to_cpu(ioam6h->namespace_id) != ioam_ns ||
-	    __be32_to_cpu(ioam6h->type_be32) != (trace_type << 8))
-		return 1;
-
-	switch (tid) {
-	case TEST_OUT_UNDEF_NS:
-	case TEST_IN_UNDEF_NS:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 1 ||
-		       ioam6h->remlen != 1;
-
-	case TEST_OUT_NO_ROOM:
-	case TEST_IN_NO_ROOM:
-	case TEST_IN_OFLAG:
-		return !ioam6h->overflow ||
-		       ioam6h->nodelen != 2 ||
-		       ioam6h->remlen != 1;
-
-	case TEST_OUT_BIT0:
-	case TEST_IN_BIT0:
-	case TEST_OUT_BIT1:
-	case TEST_IN_BIT1:
-	case TEST_OUT_BIT2:
-	case TEST_IN_BIT2:
-	case TEST_OUT_BIT3:
-	case TEST_IN_BIT3:
-	case TEST_OUT_BIT4:
-	case TEST_IN_BIT4:
-	case TEST_OUT_BIT5:
-	case TEST_IN_BIT5:
-	case TEST_OUT_BIT6:
-	case TEST_IN_BIT6:
-	case TEST_OUT_BIT7:
-	case TEST_IN_BIT7:
-	case TEST_OUT_BIT11:
-	case TEST_IN_BIT11:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 1 ||
-		       ioam6h->remlen;
-
-	case TEST_OUT_BIT8:
-	case TEST_IN_BIT8:
-	case TEST_OUT_BIT9:
-	case TEST_IN_BIT9:
-	case TEST_OUT_BIT10:
-	case TEST_IN_BIT10:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 2 ||
-		       ioam6h->remlen;
-
-	case TEST_OUT_BIT22:
-	case TEST_IN_BIT22:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen ||
-		       ioam6h->remlen;
-
-	case TEST_OUT_FULL_SUPP_TRACE:
-	case TEST_IN_FULL_SUPP_TRACE:
-	case TEST_FWD_FULL_SUPP_TRACE:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 15 ||
-		       ioam6h->remlen;
-
-	default:
-		break;
-	}
-
-	return 1;
-}
-
-static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h,
-			    const struct ioam_config cnf)
-{
-	unsigned int len;
-	__u8 aligned;
-	__u64 raw64;
-	__u32 raw32;
-
-	if (ioam6h->type.bit0) {
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
-		if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff))
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit1) {
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
-		if (cnf.ingr_id != (raw32 >> 16) ||
-		    cnf.egr_id != (raw32 & 0xffff))
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit2)
-		*p += sizeof(__u32);
-
-	if (ioam6h->type.bit3)
-		*p += sizeof(__u32);
-
-	if (ioam6h->type.bit4) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit5) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ns_data)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit6)
-		*p += sizeof(__u32);
-
-	if (ioam6h->type.bit7) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit8) {
-		raw64 = __be64_to_cpu(*((__u64 *)*p));
-		if (cnf.hlim != (raw64 >> 56) ||
-		    cnf.wide != (raw64 & 0xffffffffffffff))
-			return 1;
-		*p += sizeof(__u64);
-	}
-
-	if (ioam6h->type.bit9) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ingr_wide)
-			return 1;
-		*p += sizeof(__u32);
-
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.egr_wide)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit10) {
-		if (__be64_to_cpu(*((__u64 *)*p)) != cnf.ns_wide)
-			return 1;
-		*p += sizeof(__u64);
-	}
-
-	if (ioam6h->type.bit11) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit12) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit13) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit14) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit15) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit16) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit17) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit18) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit19) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit20) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit21) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit22) {
-		len = cnf.sc_data ? strlen(cnf.sc_data) : 0;
-		aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0;
-
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
-		if (aligned != (raw32 >> 24) * 4 ||
-		    cnf.sc_id != (raw32 & 0xffffff))
-			return 1;
-		*p += sizeof(__u32);
-
-		if (cnf.sc_data) {
-			if (strncmp((char *)*p, cnf.sc_data, len))
-				return 1;
-
-			*p += len;
-			aligned -= len;
-
-			while (aligned--) {
-				if (**p != '\0')
-					return 1;
-				*p += sizeof(__u8);
-			}
-		}
-	}
-
-	return 0;
-}
-
-static int check_ioam_header_and_data(int tid, struct ioam6_trace_hdr *ioam6h,
-				      __u32 trace_type, __u16 ioam_ns)
-{
-	__u8 *p;
-
-	if (check_ioam_header(tid, ioam6h, trace_type, ioam_ns))
-		return 1;
-
-	p = ioam6h->data + ioam6h->remlen * 4;
-
-	switch (tid) {
-	case TEST_OUT_BIT0:
-	case TEST_OUT_BIT1:
-	case TEST_OUT_BIT2:
-	case TEST_OUT_BIT3:
-	case TEST_OUT_BIT4:
-	case TEST_OUT_BIT5:
-	case TEST_OUT_BIT6:
-	case TEST_OUT_BIT7:
-	case TEST_OUT_BIT8:
-	case TEST_OUT_BIT9:
-	case TEST_OUT_BIT10:
-	case TEST_OUT_BIT11:
-	case TEST_OUT_BIT22:
-	case TEST_OUT_FULL_SUPP_TRACE:
-		return check_ioam6_data(&p, ioam6h, node1);
-
-	case TEST_IN_BIT0:
-	case TEST_IN_BIT1:
-	case TEST_IN_BIT2:
-	case TEST_IN_BIT3:
-	case TEST_IN_BIT4:
-	case TEST_IN_BIT5:
-	case TEST_IN_BIT6:
-	case TEST_IN_BIT7:
-	case TEST_IN_BIT8:
-	case TEST_IN_BIT9:
-	case TEST_IN_BIT10:
-	case TEST_IN_BIT11:
-	case TEST_IN_BIT22:
-	case TEST_IN_FULL_SUPP_TRACE:
-	{
-		__u32 tmp32 = node2.egr_wide;
-		__u16 tmp16 = node2.egr_id;
-		int res;
-
-		node2.egr_id = 0xffff;
-		node2.egr_wide = 0xffffffff;
-
-		res = check_ioam6_data(&p, ioam6h, node2);
-
-		node2.egr_id = tmp16;
-		node2.egr_wide = tmp32;
-
-		return res;
-	}
-
-	case TEST_FWD_FULL_SUPP_TRACE:
-		if (check_ioam6_data(&p, ioam6h, node3))
-			return 1;
-		if (check_ioam6_data(&p, ioam6h, node2))
-			return 1;
-		return check_ioam6_data(&p, ioam6h, node1);
-
-	default:
-		break;
-	}
-
-	return 1;
-}
-
-static int str2id(const char *tname)
-{
-	if (!strcmp("out_undef_ns", tname))
-		return TEST_OUT_UNDEF_NS;
-	if (!strcmp("out_no_room", tname))
-		return TEST_OUT_NO_ROOM;
-	if (!strcmp("out_bit0", tname))
-		return TEST_OUT_BIT0;
-	if (!strcmp("out_bit1", tname))
-		return TEST_OUT_BIT1;
-	if (!strcmp("out_bit2", tname))
-		return TEST_OUT_BIT2;
-	if (!strcmp("out_bit3", tname))
-		return TEST_OUT_BIT3;
-	if (!strcmp("out_bit4", tname))
-		return TEST_OUT_BIT4;
-	if (!strcmp("out_bit5", tname))
-		return TEST_OUT_BIT5;
-	if (!strcmp("out_bit6", tname))
-		return TEST_OUT_BIT6;
-	if (!strcmp("out_bit7", tname))
-		return TEST_OUT_BIT7;
-	if (!strcmp("out_bit8", tname))
-		return TEST_OUT_BIT8;
-	if (!strcmp("out_bit9", tname))
-		return TEST_OUT_BIT9;
-	if (!strcmp("out_bit10", tname))
-		return TEST_OUT_BIT10;
-	if (!strcmp("out_bit11", tname))
-		return TEST_OUT_BIT11;
-	if (!strcmp("out_bit22", tname))
-		return TEST_OUT_BIT22;
-	if (!strcmp("out_full_supp_trace", tname))
-		return TEST_OUT_FULL_SUPP_TRACE;
-	if (!strcmp("in_undef_ns", tname))
-		return TEST_IN_UNDEF_NS;
-	if (!strcmp("in_no_room", tname))
-		return TEST_IN_NO_ROOM;
-	if (!strcmp("in_oflag", tname))
-		return TEST_IN_OFLAG;
-	if (!strcmp("in_bit0", tname))
-		return TEST_IN_BIT0;
-	if (!strcmp("in_bit1", tname))
-		return TEST_IN_BIT1;
-	if (!strcmp("in_bit2", tname))
-		return TEST_IN_BIT2;
-	if (!strcmp("in_bit3", tname))
-		return TEST_IN_BIT3;
-	if (!strcmp("in_bit4", tname))
-		return TEST_IN_BIT4;
-	if (!strcmp("in_bit5", tname))
-		return TEST_IN_BIT5;
-	if (!strcmp("in_bit6", tname))
-		return TEST_IN_BIT6;
-	if (!strcmp("in_bit7", tname))
-		return TEST_IN_BIT7;
-	if (!strcmp("in_bit8", tname))
-		return TEST_IN_BIT8;
-	if (!strcmp("in_bit9", tname))
-		return TEST_IN_BIT9;
-	if (!strcmp("in_bit10", tname))
-		return TEST_IN_BIT10;
-	if (!strcmp("in_bit11", tname))
-		return TEST_IN_BIT11;
-	if (!strcmp("in_bit22", tname))
-		return TEST_IN_BIT22;
-	if (!strcmp("in_full_supp_trace", tname))
-		return TEST_IN_FULL_SUPP_TRACE;
-	if (!strcmp("fwd_full_supp_trace", tname))
-		return TEST_FWD_FULL_SUPP_TRACE;
-
-	return -1;
-}
-
-static int get_u32(__u32 *val, const char *arg, int base)
-{
-	unsigned long res;
-	char *ptr;
-
-	if (!arg || !*arg)
-		return -1;
-	res = strtoul(arg, &ptr, base);
-
-	if (!ptr || ptr == arg || *ptr)
-		return -1;
-
-	if (res == ULONG_MAX && errno == ERANGE)
-		return -1;
-
-	if (res > 0xFFFFFFFFUL)
-		return -1;
-
-	*val = res;
-	return 0;
-}
-
-static int get_u16(__u16 *val, const char *arg, int base)
-{
-	unsigned long res;
-	char *ptr;
-
-	if (!arg || !*arg)
-		return -1;
-	res = strtoul(arg, &ptr, base);
-
-	if (!ptr || ptr == arg || *ptr)
-		return -1;
-
-	if (res == ULONG_MAX && errno == ERANGE)
-		return -1;
-
-	if (res > 0xFFFFUL)
-		return -1;
-
-	*val = res;
-	return 0;
-}
-
-static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = {
-	[TEST_OUT_UNDEF_NS]		= check_ioam_header,
-	[TEST_OUT_NO_ROOM]		= check_ioam_header,
-	[TEST_OUT_BIT0]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT1]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT2]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT3]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT4]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT5]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT6]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT7]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT8]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT9]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT10]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT11]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT22]		= check_ioam_header_and_data,
-	[TEST_OUT_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-	[TEST_IN_UNDEF_NS]		= check_ioam_header,
-	[TEST_IN_NO_ROOM]		= check_ioam_header,
-	[TEST_IN_OFLAG]		= check_ioam_header,
-	[TEST_IN_BIT0]			= check_ioam_header_and_data,
-	[TEST_IN_BIT1]			= check_ioam_header_and_data,
-	[TEST_IN_BIT2]			= check_ioam_header_and_data,
-	[TEST_IN_BIT3]			= check_ioam_header_and_data,
-	[TEST_IN_BIT4]			= check_ioam_header_and_data,
-	[TEST_IN_BIT5]			= check_ioam_header_and_data,
-	[TEST_IN_BIT6]			= check_ioam_header_and_data,
-	[TEST_IN_BIT7]			= check_ioam_header_and_data,
-	[TEST_IN_BIT8]			= check_ioam_header_and_data,
-	[TEST_IN_BIT9]			= check_ioam_header_and_data,
-	[TEST_IN_BIT10]		= check_ioam_header_and_data,
-	[TEST_IN_BIT11]		= check_ioam_header_and_data,
-	[TEST_IN_BIT22]		= check_ioam_header_and_data,
-	[TEST_IN_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-	[TEST_FWD_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-};
-
-int main(int argc, char **argv)
-{
-	int fd, size, hoplen, tid, ret = 1, on = 1;
-	struct ioam6_hdr *opt;
-	struct cmsghdr *cmsg;
-	struct msghdr msg;
-	struct iovec iov;
-	__u8 buffer[512];
-	__u32 tr_type;
-	__u16 ioam_ns;
-	__u8 *ptr;
-
-	if (argc != 5)
-		goto out;
-
-	tid = str2id(argv[1]);
-	if (tid < 0 || !func[tid])
-		goto out;
-
-	if (get_u32(&tr_type, argv[2], 16) ||
-	    get_u16(&ioam_ns, argv[3], 0))
-		goto out;
-
-	fd = socket(PF_INET6, SOCK_RAW,
-		    !strcmp(argv[4], "encap") ? IPPROTO_IPV6 : IPPROTO_ICMPV6);
-	if (fd < 0)
-		goto out;
-
-	setsockopt(fd, IPPROTO_IPV6, IPV6_RECVHOPOPTS,  &on, sizeof(on));
-
-	iov.iov_len = 1;
-	iov.iov_base = malloc(CMSG_SPACE(sizeof(buffer)));
-	if (!iov.iov_base)
-		goto close;
-recv:
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
-	msg.msg_control = buffer;
-	msg.msg_controllen = CMSG_SPACE(sizeof(buffer));
-
-	size = recvmsg(fd, &msg, 0);
-	if (size <= 0)
-		goto close;
-
-	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
-		if (cmsg->cmsg_level != IPPROTO_IPV6 ||
-		    cmsg->cmsg_type != IPV6_HOPOPTS ||
-		    cmsg->cmsg_len < sizeof(struct ipv6_hopopt_hdr))
-			continue;
-
-		ptr = (__u8 *)CMSG_DATA(cmsg);
-
-		hoplen = (ptr[1] + 1) << 3;
-		ptr += sizeof(struct ipv6_hopopt_hdr);
-
-		while (hoplen > 0) {
-			opt = (struct ioam6_hdr *)ptr;
-
-			if (opt->opt_type == IPV6_TLV_IOAM &&
-			    opt->type == IOAM6_TYPE_PREALLOC) {
-				ptr += sizeof(*opt);
-				ret = func[tid](tid,
-						(struct ioam6_trace_hdr *)ptr,
-						tr_type, ioam_ns);
-				goto close;
-			}
-
-			ptr += opt->opt_len + 2;
-			hoplen -= opt->opt_len + 2;
-		}
-	}
-
-	goto recv;
-close:
-	free(iov.iov_base);
-	close(fd);
-out:
-	return ret;
-}
-- 
cgit v1.2.3


From 2d2b5028b4abfb312c3fe964ce724ad8873ac574 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Wed, 2 Oct 2024 18:27:31 +0200
Subject: selftests: net: add new ioam tests

This patch re-adds the (updated) ioam selftests with support for the
tunsrc feature.

Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Link: https://patch.msgid.link/20241002162731.19847-3-justin.iurman@uliege.be
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile       |    2 +
 tools/testing/selftests/net/config         |    1 +
 tools/testing/selftests/net/ioam6.sh       | 1683 ++++++++++++++++++++++++++++
 tools/testing/selftests/net/ioam6_parser.c | 1101 ++++++++++++++++++
 4 files changed, 2787 insertions(+)
 create mode 100755 tools/testing/selftests/net/ioam6.sh
 create mode 100644 tools/testing/selftests/net/ioam6_parser.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ef40d099aa1c..649f1fe0dc46 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -28,6 +28,7 @@ TEST_PROGS += unicast_extensions.sh
 TEST_PROGS += udpgro_fwd.sh
 TEST_PROGS += udpgro_frglist.sh
 TEST_PROGS += veth.sh
+TEST_PROGS += ioam6.sh
 TEST_PROGS += gro.sh
 TEST_PROGS += gre_gso.sh
 TEST_PROGS += cmsg_so_mark.sh
@@ -66,6 +67,7 @@ TEST_GEN_FILES += fin_ack_lat
 TEST_GEN_FILES += reuseaddr_ports_exhausted
 TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp
 TEST_GEN_FILES += ipsec
+TEST_GEN_FILES += ioam6_parser
 TEST_GEN_FILES += gro
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 3f0b02835e78..5b9baf708950 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -95,6 +95,7 @@ CONFIG_NET_CLS_FLOWER=m
 CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_BAREUDP=m
+CONFIG_IPV6_IOAM6_LWTUNNEL=y
 CONFIG_CRYPTO_SM4_GENERIC=y
 CONFIG_AMT=m
 CONFIG_TUN=y
diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh
new file mode 100755
index 000000000000..845c26dd01a9
--- /dev/null
+++ b/tools/testing/selftests/net/ioam6.sh
@@ -0,0 +1,1683 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Author: Justin Iurman <justin.iurman@uliege.be>
+#
+# This script evaluates IOAM for IPv6 by checking local IOAM configurations and
+# IOAM data inside packets. There are three categories of tests: LOCAL, OUTPUT,
+# and INPUT. The first (LOCAL) checks all IOAM related configurations locally
+# without sending packets. OUTPUT tests verify the processing of an IOAM
+# encapsulating node, while INPUT tests verify the processing of an IOAM transit
+# node. Both OUTPUT and INPUT tests send packets. Each test is documented inside
+# its own handler.
+#
+# The topology used for OUTPUT and INPUT tests is made of three nodes:
+# - Alpha (the IOAM encapsulating node)
+# - Beta  (the IOAM transit node)
+# - Gamma (the receiver) **
+#
+# An IOAM domain is configured from Alpha to Beta, but not on the reverse path.
+# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-hop.
+#
+# ** Gamma is required because ioam6_parser.c uses a packet socket and we need
+#    to see IOAM data inserted by the very last node (Beta), which would happen
+#    _after_ we get a copy of the packet on Beta. Note that using an
+#    IPv6 raw socket with IPV6_RECVHOPOPTS on Beta would not be enough: we also
+#    need to access the IPv6 header to check some fields (e.g., source and
+#    destination addresses), which is not possible in that case. As a
+#    consequence, we need Gamma as a receiver to run ioam6_parser.c which uses a
+#    packet socket.
+#
+#
+#         +-----------------------+          +-----------------------+
+#         |                       |          |                       |
+#         |      Alpha netns      |          |      Gamma netns      |
+#         |                       |          |                       |
+#         | +-------------------+ |          | +-------------------+ |
+#         | |       veth0       | |          | |       veth0       | |
+#         | | 2001:db8:1::2/64  | |          | | 2001:db8:2::2/64  | |
+#         | +-------------------+ |          | +-------------------+ |
+#         |           .           |          |           .           |
+#         +-----------.-----------+          +-----------.-----------+
+#                     .                                  .
+#                     .                                  .
+#                     .                                  .
+#         +-----------.----------------------------------.-----------+
+#         |           .                                  .           |
+#         | +-------------------+              +-------------------+ |
+#         | |       veth0       |              |       veth1       | |
+#         | | 2001:db8:1::1/64  | ............ | 2001:db8:2::1/64  | |
+#         | +-------------------+              +-------------------+ |
+#         |                                                          |
+#         |                        Beta netns                        |
+#         |                                                          |
+#         +----------------------------------------------------------+
+#
+#
+#
+#         +==========================================================+
+#         |                Alpha - IOAM configuration                |
+#         +=====================+====================================+
+#         | Node ID             | 1                                  |
+#         +---------------------+------------------------------------+
+#         | Node Wide ID        | 11111111                           |
+#         +---------------------+------------------------------------+
+#         | Ingress ID          | 0xffff (default value)             |
+#         +---------------------+------------------------------------+
+#         | Ingress Wide ID     | 0xffffffff (default value)         |
+#         +---------------------+------------------------------------+
+#         | Egress ID           | 101                                |
+#         +---------------------+------------------------------------+
+#         | Egress Wide ID      | 101101                             |
+#         +---------------------+------------------------------------+
+#         | Namespace Data      | 0xdeadbeef                         |
+#         +---------------------+------------------------------------+
+#         | Namespace Wide Data | 0xcafec0caf00dc0de                 |
+#         +---------------------+------------------------------------+
+#         | Schema ID           | 777                                |
+#         +---------------------+------------------------------------+
+#         | Schema Data         | something that will be 4n-aligned  |
+#         +---------------------+------------------------------------+
+#
+#
+#         +==========================================================+
+#         |                 Beta - IOAM configuration                |
+#         +=====================+====================================+
+#         | Node ID             | 2                                  |
+#         +---------------------+------------------------------------+
+#         | Node Wide ID        | 22222222                           |
+#         +---------------------+------------------------------------+
+#         | Ingress ID          | 201                                |
+#         +---------------------+------------------------------------+
+#         | Ingress Wide ID     | 201201                             |
+#         +---------------------+------------------------------------+
+#         | Egress ID           | 202                                |
+#         +---------------------+------------------------------------+
+#         | Egress Wide ID      | 202202                             |
+#         +---------------------+------------------------------------+
+#         | Namespace Data      | 0xffffffff (default value)         |
+#         +---------------------+------------------------------------+
+#         | Namespace Wide Data | 0xffffffffffffffff (default value) |
+#         +---------------------+------------------------------------+
+#         | Schema ID           | 0xffffff (= None)                  |
+#         +---------------------+------------------------------------+
+#         | Schema Data         |                                    |
+#         +---------------------+------------------------------------+
+
+source lib.sh
+
+################################################################################
+#                                                                              #
+# WARNING: Be careful if you modify the block below - it MUST be kept          #
+#          synchronized with configurations inside ioam6_parser.c and always   #
+#          reflect the same.                                                   #
+#                                                                              #
+################################################################################
+
+ALPHA=(
+  1                                    # [0] Node ID
+  11111111                             # [1] Node Wide ID
+  0xffff                               # [2] Ingress ID (default value)
+  0xffffffff                           # [3] Ingress Wide ID (default value)
+  101                                  # [4] Egress ID
+  101101                               # [5] Egress Wide ID
+  0xdeadbeef                           # [6] Namespace Data
+  0xcafec0caf00dc0de                   # [7] Namespace Wide Data
+  777                                  # [8] Schema ID
+  "something that will be 4n-aligned"  # [9] Schema Data
+)
+
+BETA=(
+  2                                    # [0] Node ID
+  22222222                             # [1] Node Wide ID
+  201                                  # [2] Ingress ID
+  201201                               # [3] Ingress Wide ID
+  202                                  # [4] Egress ID
+  202202                               # [5] Egress Wide ID
+  0xffffffff                           # [6] Namespace Data (default value)
+  0xffffffffffffffff                   # [7] Namespace Wide Data (default value)
+  0xffffff                             # [8] Schema ID (= None)
+  ""                                   # [9] Schema Data (none)
+)
+
+TESTS_LOCAL="
+  local_sysctl_ioam_id
+  local_sysctl_ioam_id_wide
+  local_sysctl_ioam_intf_id
+  local_sysctl_ioam_intf_id_wide
+  local_sysctl_ioam_intf_enabled
+  local_ioam_namespace
+  local_ioam_schema
+  local_ioam_schema_namespace
+  local_route_ns
+  local_route_tunsrc
+  local_route_tundst
+  local_route_trace_type
+  local_route_trace_size
+  local_route_trace_type_bits
+  local_route_trace_size_values
+"
+
+TESTS_OUTPUT="
+  output_undef_ns
+  output_no_room
+  output_no_room_oss
+  output_bits
+  output_sizes
+  output_full_supp_trace
+"
+
+TESTS_INPUT="
+  input_undef_ns
+  input_no_room
+  input_no_room_oss
+  input_disabled
+  input_oflag
+  input_bits
+  input_sizes
+  input_full_supp_trace
+"
+
+################################################################################
+#                                                                              #
+#                                   LIBRARY                                    #
+#                                                                              #
+################################################################################
+
+check_kernel_compatibility()
+{
+  # Probe, in a throw-away netns, that the kernel and iproute2 can attach an
+  # IOAM trace to a route; exit with $ksft_skip otherwise. Then set the
+  # globals ip6tnl_loaded (0 = ip6_tunnel was already loaded) and encap_tests
+  # (0 = encap-mode tests can run).
+
+  setup_ns ioam_tmp_node &>/dev/null
+  local status=$?
+
+  ip link add name veth0 netns $ioam_tmp_node type veth \
+    peer name veth1 netns $ioam_tmp_node &>/dev/null || status=1
+  ip -netns $ioam_tmp_node link set veth0 up &>/dev/null || status=1
+  ip -netns $ioam_tmp_node link set veth1 up &>/dev/null || status=1
+
+  if [ $status -ne 0 ]
+  then
+    echo "SKIP: Setup failed."
+    cleanup_ns $ioam_tmp_node
+    exit $ksft_skip
+  fi
+
+  # The route must be accepted by the kernel *and* show up with its encap in
+  # the route dump.
+  status=0
+  ip -netns $ioam_tmp_node route add 2001:db8:2::/64 \
+    encap ioam6 trace prealloc type 0x800000 ns 0 size 4 dev veth0 \
+    &>/dev/null || status=1
+  ip -netns $ioam_tmp_node -6 route 2>/dev/null | grep -q "encap ioam6" \
+    || status=1
+
+  if [ $status -ne 0 ]
+  then
+    echo "SKIP: Cannot attach an IOAM trace to a route. Was your kernel" \
+         "compiled without CONFIG_IPV6_IOAM6_LWTUNNEL? Are you running an" \
+         "old kernel? Are you using an old version of iproute2?"
+    cleanup_ns $ioam_tmp_node
+    exit $ksft_skip
+  fi
+
+  cleanup_ns $ioam_tmp_node
+
+  # Remember whether ip6_tunnel was loaded beforehand: cleanup() only unloads
+  # it when ip6tnl_loaded is non-zero.
+  lsmod 2>/dev/null | grep -q "ip6_tunnel"
+  ip6tnl_loaded=$?
+  encap_tests=$ip6tnl_loaded
+  [ $encap_tests == 0 ] && return 0
+
+  # Not loaded yet: try to load it, then look again.
+  modprobe ip6_tunnel &>/dev/null
+  lsmod 2>/dev/null | grep -q "ip6_tunnel"
+  encap_tests=$?
+  [ $encap_tests == 0 ] && return 0
+
+  # Still not listed by lsmod: fall back to looking for an ip6tnl0 device
+  # (presumably present when the support is built into the kernel).
+  ip a 2>/dev/null | grep -q "ip6tnl0"
+  encap_tests=$?
+  [ $encap_tests == 0 ] && return 0
+
+  echo "Note: ip6_tunnel not found neither as a module nor inside the" \
+       "kernel. Any tests that require it will be skipped."
+}
+
+cleanup()
+{
+  # Hand the three test namespaces to cleanup_ns, then unload ip6_tunnel
+  # unless it was already loaded when check_kernel_compatibility() ran.
+  cleanup_ns $ioam_node_alpha $ioam_node_beta $ioam_node_gamma
+  if [ $ip6tnl_loaded != 0 ]; then
+    modprobe -r ip6_tunnel &>/dev/null
+  fi
+}
+
+setup()
+{
+  # Build the Alpha <-> Beta <-> Gamma topology drawn at the top of this file,
+  # apply the IOAM configuration from ALPHA[] and BETA[], and exit with
+  # $ksft_skip if Alpha cannot ping Gamma afterwards.
+  setup_ns ioam_node_alpha ioam_node_beta ioam_node_gamma &>/dev/null
+
+  local alpha=$ioam_node_alpha
+  local beta=$ioam_node_beta
+  local gamma=$ioam_node_gamma
+
+  # Output of every configuration command below is discarded as a group.
+  {
+    # Links are created under temporary names, then renamed inside each netns.
+    ip link add name ioam-veth-alpha netns $alpha type veth \
+      peer name ioam-veth-betaL netns $beta
+    ip link add name ioam-veth-betaR netns $beta type veth \
+      peer name ioam-veth-gamma netns $gamma
+
+    ip -netns $alpha link set ioam-veth-alpha name veth0
+    ip -netns $beta link set ioam-veth-betaL name veth0
+    ip -netns $beta link set ioam-veth-betaR name veth1
+    ip -netns $gamma link set ioam-veth-gamma name veth0
+
+    # - Alpha: addresses (::50 is the one the tests use as tunsrc) and route -
+    ip -netns $alpha addr add 2001:db8:1::50/64 dev veth0
+    ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0
+    ip -netns $alpha link set veth0 up
+    ip -netns $alpha link set lo up
+    ip -netns $alpha route add 2001:db8:2::/64 \
+      via 2001:db8:1::1 dev veth0
+
+    # - Beta: addresses -
+    ip -netns $beta addr add 2001:db8:1::1/64 dev veth0
+    ip -netns $beta addr add 2001:db8:2::1/64 dev veth1
+    ip -netns $beta link set veth0 up
+    ip -netns $beta link set veth1 up
+    ip -netns $beta link set lo up
+
+    # - Gamma: address and return route -
+    ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0
+    ip -netns $gamma link set veth0 up
+    ip -netns $gamma link set lo up
+    ip -netns $gamma route add 2001:db8:1::/64 \
+      via 2001:db8:2::1 dev veth0
+
+    # - Alpha: IOAM config -
+    ip netns exec $alpha sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]}
+    ip netns exec $alpha sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]}
+    ip netns exec $alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]}
+    ip netns exec $alpha \
+      sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]}
+    ip -netns $alpha ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]}
+    ip -netns $alpha ioam schema add ${ALPHA[8]} "${ALPHA[9]}"
+    ip -netns $alpha ioam namespace set 123 schema ${ALPHA[8]}
+
+    # - Beta: IOAM config -
+    ip netns exec $beta sysctl -wq net.ipv6.conf.all.forwarding=1
+    ip netns exec $beta sysctl -wq net.ipv6.ioam6_id=${BETA[0]}
+    ip netns exec $beta sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]}
+    ip netns exec $beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
+    ip netns exec $beta sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]}
+    ip netns exec $beta sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]}
+    ip netns exec $beta sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]}
+    ip netns exec $beta sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]}
+    ip -netns $beta ioam namespace add 123
+  } &>/dev/null
+
+  sleep 1
+
+  # End-to-end sanity check of the topology before any test runs.
+  if ! ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null
+  then
+    echo "SKIP: Setup failed."
+    cleanup
+    exit $ksft_skip
+  fi
+}
+
+log_test_passed()
+{
+  printf ' - TEST: %-57s  [ OK ]\n' "$1"   # $1: test description
+  : $((npassed += 1))                      # ':' keeps the status 0 for '&& ||'
+}
+
+log_test_skipped()
+{
+  printf ' - TEST: %-57s  [SKIP]\n' "$1"   # $1: test description
+  : $((nskipped += 1))                     # ':' keeps the status 0
+}
+
+log_test_failed()
+{
+  printf ' - TEST: %-57s  [FAIL]\n' "$1"   # $1: test description
+  : $((nfailed += 1))                      # ':' keeps the status 0
+}
+
+run_test()
+{
+  # Run ioam6_parser on Gamma in the background, ping Gamma once from Alpha,
+  # then log pass/fail for "$2" according to the parser's exit status.
+  local name=$1 desc=$2 ip6_src=$3
+  local trace_type=$4 trace_size=$5 ioam_ns=$6 type=$7
+
+  ip netns exec $ioam_node_gamma \
+    ./ioam6_parser veth0 $name $ip6_src 2001:db8:2::2 \
+                   $trace_type $trace_size $ioam_ns $type &
+  local parser_pid=$!
+  sleep 0.1
+
+  if ! ip netns exec $ioam_node_alpha \
+         ping6 -t 64 -c 1 -W 1 2001:db8:2::2 &>/dev/null
+  then
+    # The ping itself failed: report a failure and interrupt the parser.
+    log_test_failed "${desc}"
+    kill -2 $parser_pid &>/dev/null
+  elif wait $parser_pid
+  then
+    log_test_passed "${desc}"
+  else
+    log_test_failed "${desc}"
+  fi
+}
+
+run()
+{
+  # Run the LOCAL, then OUTPUT, then INPUT test lists and print the counters.
+  local test
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}   # 72 dashes: the format is reused per argument
+  printf "+"
+  echo
+  printf "| %-28s LOCAL tests %-29s |"   # no arguments: %s only pads
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+
+  echo
+  echo "Global config"
+  for test in $TESTS_LOCAL
+  do
+    $test
+  done
+
+  echo
+  echo "Inline mode"
+  for test in $TESTS_LOCAL
+  do
+    $test "inline"
+  done
+
+  echo
+  echo "Encap mode"
+  for test in $TESTS_LOCAL
+  do
+    $test "encap"
+  done
+
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+  printf "| %-28s OUTPUT tests %-28s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+
+  # set OUTPUT settings: ioam6_enabled is turned off on Beta's veth0
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0 &>/dev/null
+
+  echo
+  echo "Inline mode"
+  for test in $TESTS_OUTPUT
+  do
+    $test "inline"
+  done
+
+  echo
+  echo "Encap mode"
+  for test in $TESTS_OUTPUT
+  do
+    $test "encap"
+  done
+
+  echo
+  echo "Encap mode (with tunsrc)"
+  for test in $TESTS_OUTPUT
+  do
+    $test "encap" "tunsrc"
+  done
+
+  # clean OUTPUT settings: ioam6_enabled is turned back on for Beta's veth0
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 &>/dev/null
+
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+  printf "| %-28s INPUT tests %-29s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+
+  # set INPUT settings: IOAM namespace 123 is removed from Alpha
+  ip -netns $ioam_node_alpha ioam namespace del 123 &>/dev/null
+
+  echo
+  echo "Inline mode"
+  for test in $TESTS_INPUT
+  do
+    $test "inline"
+  done
+
+  echo
+  echo "Encap mode"
+  for test in $TESTS_INPUT
+  do
+    $test "encap"
+  done
+
+  # clean INPUT settings: namespace 123 is recreated on Alpha, schema rebound
+  ip -netns $ioam_node_alpha \
+    ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]} &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam namespace set 123 schema ${ALPHA[8]} &>/dev/null
+
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+  printf "| %-30s Results %-31s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+
+  echo
+  echo "- Passed:  ${npassed}"
+  echo "- Skipped: ${nskipped}"
+  echo "- Failed:  ${nfailed}"
+  echo
+}
+
+# Index = Trace Type bit (0 = MSB); bit2size presumably in bytes - TODO confirm.
+bit2type=(
+  0x800000 0x400000 0x200000 0x100000 0x080000 0x040000 0x020000 0x010000
+  0x008000 0x004000 0x002000 0x001000 0x000800 0x000400 0x000200 0x000100
+  0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002 0x000001
+)
+bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 0 )
+
+################################################################################
+#                                                                              #
+#                                 LOCAL tests                                  #
+#                                                                              #
+################################################################################
+
+local_sysctl_ioam_id()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.ioam6_id" works as expected.                #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.ioam6_id"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.ioam6_id 2>/dev/null \
+    | grep -wq "${ALPHA[0]}"
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_sysctl_ioam_id_wide()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.ioam6_id_wide" works as expected.           #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.ioam6_id_wide"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.ioam6_id_wide 2>/dev/null \
+    | grep -wq "${ALPHA[1]}"
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_sysctl_ioam_intf_id()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_id" works as expected.        #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_id"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.conf.veth0.ioam6_id 2>/dev/null \
+    | grep -wq "${ALPHA[4]}"
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_sysctl_ioam_intf_id_wide()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_id_wide" works as expected.   #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_id_wide"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.conf.veth0.ioam6_id_wide 2>/dev/null \
+    | grep -wq "${ALPHA[5]}"
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_sysctl_ioam_intf_enabled()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_enabled" works as expected.   #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_enabled"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip netns exec $ioam_node_beta \
+    sysctl net.ipv6.conf.veth0.ioam6_enabled 2>/dev/null \
+    | grep -wq 1
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_ioam_namespace()
+{
+  ##############################################################################
+  # Make sure the creation of an IOAM Namespace works as expected.             #
+  ##############################################################################
+  local desc="Create an IOAM Namespace"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  # Dump the namespaces once; the ID and both data values must all be listed.
+  local dump
+  dump=$(ip -netns $ioam_node_alpha ioam namespace show 2>/dev/null)
+
+  # ret stays 0 only if every expected word is found in the dump.
+  local ret=0
+  local needle
+  for needle in 123 "${ALPHA[6]}" "${ALPHA[7]}"
+  do
+    grep -wq -- "$needle" <<< "$dump" || ret=1
+  done
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_ioam_schema()
+{
+  ##############################################################################
+  # Make sure the creation of an IOAM Schema works as expected.                #
+  ##############################################################################
+  local desc="Create an IOAM Schema"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -wq "${ALPHA[8]}"
+  local ret=$?
+
+  # Schema data as space-separated hex bytes ("73 6f 6d ... "), grepped below.
+  local schema=${ALPHA[9]} sc_data="" hex i
+  for (( i = 0; i < ${#schema}; i++ ))
+  do
+    printf -v hex "%x " "'${schema:i:1}"
+    sc_data+=$hex
+  done
+
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -q "$sc_data"
+  ret=$((ret + $?))
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_ioam_schema_namespace()
+{
+  ##############################################################################
+  # Make sure the binding of a Schema to a Namespace works as expected.        #
+  ##############################################################################
+  local desc="Bind an IOAM Schema to an IOAM Namespace"
+
+  [ -n "$1" ] && return   # global-config test only: skip when a mode is given
+
+  # Checked on both sides: schema ID in the namespace dump, and the reverse.
+  ip -netns $ioam_node_alpha \
+    ioam namespace show 2>/dev/null | grep -wq "${ALPHA[8]}"
+  local ret=$?
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -wq 123
+  ret=$((ret + $?))
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_route_ns()
+{
+  ##############################################################################
+  # Make sure the Namespace-ID is always provided, whatever the mode.          #
+  ##############################################################################
+  local desc="Mandatory Namespace-ID"
+  local mode="$1"
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && mode+=" tundst 2001:db8:2::2"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?   # no "ns": must be rejected
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?   # with "ns 0": must be accepted
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_tunsrc()
+{
+  ##############################################################################
+  # Make sure the Tunnel Source is only (and possibly) used with encap mode.   #
+  ##############################################################################
+  local desc
+  local mode
+  local mode_tunsrc
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  if [ "$1" == "encap" ]
+  then
+    desc="Optional Tunnel Source"
+    mode="$1 tundst 2001:db8:2::2"
+    mode_tunsrc="$1 tunsrc 2001:db8:1::50 tundst 2001:db8:2::2"
+  else
+    desc="Unneeded Tunnel Source"
+    mode="$1"
+    mode_tunsrc="$1 tunsrc 2001:db8:1::50"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?   # without tunsrc: must be accepted in both modes
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode_tunsrc trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?   # with tunsrc: must be accepted for encap only
+
+  if [ "$1" == "encap" ]
+  then
+    [[ $ret1 != 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  else
+    [[ $ret1 != 0 || $ret2 == 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_tundst()
+{
+  ##############################################################################
+  # Make sure the Tunnel Destination is only (and always) used with encap mode.#
+  ##############################################################################
+  local desc
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && desc="Mandatory Tunnel Destination" \
+                     || desc="Unneeded Tunnel Destination"
+
+  local mode="$1"
+  local mode_tundst="$1 tundst 2001:db8:2::2"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?   # without tundst: must be accepted for inline only
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode_tundst trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?   # with tundst: must be accepted for encap only
+
+  if [ "$1" == "encap" ]
+  then
+    [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  else
+    [[ $ret1 != 0 || $ret2 == 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_type()
+{
+  ##############################################################################
+  # Make sure the Trace Type is always provided, whatever the mode.            #
+  ##############################################################################
+  local desc="Mandatory Trace Type"
+  local mode="$1"
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && mode+=" tundst 2001:db8:2::2"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?   # no "type": must be rejected
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?   # with "type": must be accepted
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_size()
+{
+  ##############################################################################
+  # Make sure the Trace Size is always provided, whatever the mode.            #
+  ##############################################################################
+  local desc="Mandatory Trace Size"
+  local mode="$1"
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && mode+=" tundst 2001:db8:2::2"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?   # no "size": must be rejected
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?   # with "size 4": must be accepted
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_type_bits()
+{
+  ##############################################################################
+  # Make sure only allowed bits (0-11 and 22) are accepted.                    #
+  ##############################################################################
+  local desc="Trace Type bits"
+  local mode="$1" err="" i rc   # err is local from the start, never inherited
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && mode+=" tundst 2001:db8:2::2"
+
+  for i in {0..23}
+  do
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type ${bit2type[$i]} ns 0 size 4 \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+    rc=$?
+
+    # Error when "bit is allowed" and "route was accepted" disagree.
+    if (( (i <= 11 || i == 22) != (rc == 0) ))
+    then
+      err=1
+      break
+    fi
+  done
+
+  [ -z "$err" ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_size_values()
+{
+  ##############################################################################
+  # Make sure only allowed sizes (multiples of four in [4,244]) are accepted.  #
+  ##############################################################################
+  local desc="Trace Size values"
+  local mode="$1" err="" i rc   # err is local from the start, never inherited
+
+  [ -z "$1" ] && return   # per-mode test only: skip for the global config
+
+  [ "$1" == "encap" ] && mode+=" tundst 2001:db8:2::2"
+
+  # we also try the next multiple of four after the MAX to check it's refused
+  for i in {0..248}
+  do
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+    rc=$?
+
+    # Error when "size is valid" and "route was accepted" disagree.
+    if (( (i != 0 && i != 248 && i % 4 == 0) != (rc == 0) ))
+    then
+      err=1
+      break
+    fi
+  done
+
+  [ -z "$err" ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+
+################################################################################
+#                                                                              #
+#                                 OUTPUT tests                                 #
+#                                                                              #
+################################################################################
+
+output_undef_ns()  # $1: ioam6 mode; $2: "tunsrc" (optional, only read for encap)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node does NOT fill the trace when the      #
+  # corresponding IOAM Namespace-ID is not configured locally.                 #
+  ##############################################################################
+  local desc="Unknown IOAM Namespace-ID"
+  local ns=0
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+output_no_room()  # $1: ioam6 mode; $2: "tunsrc" (optional, only read for encap)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node does NOT fill the trace AND sets the  #
+  # Overflow flag when there is not enough room for its data.                  #
+  ##############################################################################
+  local desc="Missing room for data"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+output_no_room_oss()  # $1: ioam6 mode; $2: "tunsrc" (optional, encap only)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node does NOT fill the trace AND sets the  #
+  # Overflow flag when there is not enough room for the Opaque State Snapshot. #
+  ##############################################################################
+  local desc="Missing room for Opaque State Snapshot"
+  local ns=123
+  local tr_type=0x000002
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+output_bits()  # $1: ioam6 mode; $2: "tunsrc" (optional, only read for encap)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node implements all supported bits by      #
+  # checking it correctly fills the trace with its data.                       #
+  ##############################################################################
+  local desc="Trace Type with supported bit <n> only"
+  local ns=123
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+  # Grow bit2size[22] by len(ALPHA[9]) rounded up to 4; restored at the end
+  local tmp=${bit2size[22]}
+  bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
+
+  local i
+  for i in {0..11} {22..22}  # bits 0..11 and bit 22
+  do
+    local descr="${desc/<n>/$i}"  # per-bit description: <n> becomes $i
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]  # skipped per bit
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
+
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc \
+      type ${bit2type[$i]} ns $ns size ${bit2size[$i]} \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]  # only run the test if the route change was accepted
+    then
+      run_test "output_bit$i" "${descr}" $saddr \
+        ${bit2type[$i]} ${bit2size[$i]} $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+
+  bit2size[22]=$tmp  # put back the value saved before the loop
+}
+
+output_sizes()  # $1: ioam6 mode; $2: "tunsrc" (optional, only read for encap)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node allocates supported sizes correctly.  #
+  ##############################################################################
+  local desc="Trace Size of <n> bytes"
+  local ns=0
+  local tr_type=0x800000
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  local i
+  for i in $(seq 4 4 244)  # 4, 8, ..., 244
+  do
+    local descr="${desc/<n>/$i}"  # per-size description: <n> becomes $i
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]  # skipped per size
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
+
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]  # only run the test if the route change was accepted
+    then
+      run_test "output_size$i" "${descr}" $saddr $tr_type $i $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+output_full_supp_trace()  # $1: ioam6 mode; $2: "tunsrc" (optional, encap only)
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node correctly fills a trace when all      #
+  # supported bits are set.                                                    #
+  ##############################################################################
+  local desc="Full supported trace"
+  local ns=123
+  local tr_type=0xfff002
+  local tr_size  # computed below from ALPHA[9] and bit2size[]
+  local mode="$1"
+  local saddr="2001:db8:1::2"  # handed to run_test; replaced for "tunsrc"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+  # tr_size: len(ALPHA[9]) rounded up to 4, plus bit2size[] of bits 0-11, 22
+  local i
+  tr_size=$(( ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
+  for i in {0..11} {22..22}
+  do
+    tr_size=$((tr_size + bit2size[$i]))
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+
+################################################################################
+#                                                                              #
+#                                 INPUT tests                                  #
+#                                                                              #
+################################################################################
+
+input_undef_ns()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when the corresponding IOAM #
+  # Namespace-ID is not configured locally.                                    #
+  ##############################################################################
+  local desc="Unknown IOAM Namespace-ID"
+  local ns=0
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_no_room()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace AND sets the Overflow flag  #
+  # when there is not enough room for its data.                                #
+  ##############################################################################
+  local desc="Missing room for data"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_no_room_oss()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace AND sets the Overflow flag  #
+  # when there is not enough room for the Opaque State Snapshot.               #
+  ##############################################################################
+  local desc="Missing room for Opaque State Snapshot"
+  local ns=123
+  local tr_type=0x000002
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_disabled()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when IOAM is not enabled on #
+  # the corresponding (ingress) interface.                                     #
+  ##############################################################################
+  local desc="IOAM disabled on ingress interface"
+  local ns=123
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  # Exception: disable IOAM on ingress interface
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0 &>/dev/null
+  local ret=$?  # exit status of the sysctl write
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  ret=$((ret + $?))  # stays 0 only if both commands succeeded
+
+  if [ $ret == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  # Clean Exception
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 &>/dev/null
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_oflag()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when the Overflow flag is   #
+  # set.                                                                       #
+  ##############################################################################
+  local desc="Overflow flag is set"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  # Exception:
+  #   Here, we need the sender to set the Overflow flag. For that, we will add
+  #   back the IOAM namespace that was previously configured on the sender.
+  ip -netns $ioam_node_alpha ioam namespace add 123 &>/dev/null
+  local ret=$?  # exit status of the namespace add
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  ret=$((ret + $?))  # stays 0 only if both commands succeeded
+
+  if [ $ret == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  # Clean Exception
+  ip -netns $ioam_node_alpha ioam namespace del 123 &>/dev/null
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_bits()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node implements all supported bits by checking it        #
+  # correctly fills the trace with its data.                                   #
+  ##############################################################################
+  local desc="Trace Type with supported bit <n> only"
+  local ns=123
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+  # Grow bit2size[22] by len(BETA[9]) rounded up to 4; restored at the end
+  local tmp=${bit2size[22]}
+  bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
+
+  local i
+  for i in {0..11} {22..22}  # bits 0..11 and bit 22
+  do
+    local descr="${desc/<n>/$i}"  # per-bit description: <n> becomes $i
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]  # skipped per bit
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
+
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc \
+      type ${bit2type[$i]} ns $ns size ${bit2size[$i]} \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]  # only run the test if the route change was accepted
+    then
+      run_test "input_bit$i" "${descr}" 2001:db8:1::2 \
+        ${bit2type[$i]} ${bit2size[$i]} $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+
+  bit2size[22]=$tmp  # put back the value saved before the loop
+}
+
+input_sizes()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node handles all supported sizes correctly.              #
+  ##############################################################################
+  local desc="Trace Size of <n> bytes"
+  local ns=123
+  local tr_type=0x800000
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  local i
+  for i in $(seq 4 4 244)  # 4, 8, ..., 244
+  do
+    local descr="${desc/<n>/$i}"  # per-size description: <n> becomes $i
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]  # skipped per size
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
+
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]  # only run the test if the route change was accepted
+    then
+      run_test "input_size$i" "${descr}" 2001:db8:1::2 $tr_type $i $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_full_supp_trace()  # $1: ioam6 mode; "encap" gets extra tunnel handling
+{
+  ##############################################################################
+  # Make sure an IOAM node correctly fills a trace when all supported bits are #
+  # set.                                                                       #
+  ##############################################################################
+  local desc="Full supported trace"
+  local ns=123
+  local tr_type=0xfff002
+  local tr_size  # computed below from BETA[9] and bit2size[]
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]  # non-zero: report as skipped and bail out
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+  # tr_size: len(BETA[9]) rounded up to 4, plus bit2size[] of bits 0-11, 22
+  local i
+  tr_size=$(( ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
+  for i in {0..11} {22..22}
+  do
+    tr_size=$((tr_size + bit2size[$i]))
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]  # only run the test if the route change was accepted
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+  # Cleanup: route without ioam6 encap; for encap, ip6tnl0 down again
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+
+################################################################################
+#                                                                              #
+#                                     MAIN                                     #
+#                                                                              #
+################################################################################
+
+npassed=0  # NOTE(review): presumably bumped by the log_test_* helpers - confirm
+nskipped=0
+nfailed=0  # decides the exit status at the bottom of the script
+
+if [ "$(id -u)" -ne 0 ]  # the user ID printed by "id -u" must be 0
+then
+  echo "SKIP: Need root privileges."
+  exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v ip)" ]  # "ip" must resolve to an executable file
+then
+  echo "SKIP: Could not run test without ip tool."
+  exit $ksft_skip
+fi
+
+check_kernel_compatibility
+setup
+run
+cleanup
+
+if [ $nfailed != 0 ]  # any recorded failure fails the whole run
+then
+  exit $ksft_fail
+fi
+
+exit $ksft_pass
diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c
new file mode 100644
index 000000000000..de4b5c9e8a74
--- /dev/null
+++ b/tools/testing/selftests/net/ioam6_parser.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Author: Justin Iurman (justin.iurman@uliege.be)
+ *
+ * IOAM tester for IPv6, see ioam6.sh for details on each test case.
+ */
+#include <arpa/inet.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/const.h>
+#include <linux/if_ether.h>
+#include <linux/ioam6.h>
+#include <linux/ipv6.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+struct ioam_config {	/* per-node values check_data() compares trace data with */
+	__u32 id;		/* bit0 word: low 24 bits */
+	__u64 wide;		/* bit8 word: low 56 bits */
+	__u16 ingr_id;		/* bit1 word: high 16 bits */
+	__u16 egr_id;		/* bit1 word: low 16 bits */
+	__u32 ingr_wide;	/* bit9: first 32-bit word */
+	__u32 egr_wide;		/* bit9: second 32-bit word */
+	__u32 ns_data;		/* bit5 word */
+	__u64 ns_wide;		/* bit10 word */
+	__u32 sc_id;		/* NOTE(review): presumably checked under bit22 - confirm */
+	__u8 hlim;		/* top 8 bits of the bit0 and bit8 words */
+	char *sc_data;		/* bit22: its 4-aligned strlen is checked; may be NULL */
+};
+
+/*
+ * Be careful if you modify the structs below - they MUST be kept synchronized
+ * with the configurations inside ioam6.sh and always hold the same values.
+ */
+
+static struct ioam_config node1 = {	/* NOTE(review): presumably ALPHA in ioam6.sh - confirm */
+	.id = 1,
+	.wide = 11111111,
+	.ingr_id = 0xffff, /* default value */
+	.egr_id = 101,
+	.ingr_wide = 0xffffffff, /* default value */
+	.egr_wide = 101101,
+	.ns_data = 0xdeadbeef,
+	.ns_wide = 0xcafec0caf00dc0de,
+	.sc_id = 777,
+	.sc_data = "something that will be 4n-aligned", /* check_data() aligns strlen to 4 */
+	.hlim = 64,
+};
+
+static struct ioam_config node2 = {	/* NOTE(review): presumably BETA in ioam6.sh - confirm */
+	.id = 2,
+	.wide = 22222222,
+	.ingr_id = 201,
+	.egr_id = 202,
+	.ingr_wide = 201201,
+	.egr_wide = 202202,
+	.ns_data = 0xffffffff, /* default value */
+	.ns_wide = 0xffffffffffffffff, /* default value */
+	.sc_id = 0xffffff, /* default value */
+	.sc_data = NULL, /* check_data() treats NULL as zero-length schema data */
+	.hlim = 63,
+};
+
+enum {	/* test ids; check_header() switches on these values */
+	/**********
+	 * OUTPUT *
+	 **********/
+	__TEST_OUT_MIN,	/* NOTE(review): looks like an exclusive lower bound - confirm */
+
+	TEST_OUT_UNDEF_NS,
+	TEST_OUT_NO_ROOM,
+	TEST_OUT_NO_ROOM_OSS,
+	TEST_OUT_BIT0,
+	TEST_OUT_BIT1,
+	TEST_OUT_BIT2,
+	TEST_OUT_BIT3,
+	TEST_OUT_BIT4,
+	TEST_OUT_BIT5,
+	TEST_OUT_BIT6,
+	TEST_OUT_BIT7,
+	TEST_OUT_BIT8,
+	TEST_OUT_BIT9,
+	TEST_OUT_BIT10,
+	TEST_OUT_BIT11,
+	TEST_OUT_BIT22,
+	TEST_OUT_SIZE4,
+	TEST_OUT_SIZE8,
+	TEST_OUT_SIZE12,
+	TEST_OUT_SIZE16,
+	TEST_OUT_SIZE20,
+	TEST_OUT_SIZE24,
+	TEST_OUT_SIZE28,
+	TEST_OUT_SIZE32,
+	TEST_OUT_SIZE36,
+	TEST_OUT_SIZE40,
+	TEST_OUT_SIZE44,
+	TEST_OUT_SIZE48,
+	TEST_OUT_SIZE52,
+	TEST_OUT_SIZE56,
+	TEST_OUT_SIZE60,
+	TEST_OUT_SIZE64,
+	TEST_OUT_SIZE68,
+	TEST_OUT_SIZE72,
+	TEST_OUT_SIZE76,
+	TEST_OUT_SIZE80,
+	TEST_OUT_SIZE84,
+	TEST_OUT_SIZE88,
+	TEST_OUT_SIZE92,
+	TEST_OUT_SIZE96,
+	TEST_OUT_SIZE100,
+	TEST_OUT_SIZE104,
+	TEST_OUT_SIZE108,
+	TEST_OUT_SIZE112,
+	TEST_OUT_SIZE116,
+	TEST_OUT_SIZE120,
+	TEST_OUT_SIZE124,
+	TEST_OUT_SIZE128,
+	TEST_OUT_SIZE132,
+	TEST_OUT_SIZE136,
+	TEST_OUT_SIZE140,
+	TEST_OUT_SIZE144,
+	TEST_OUT_SIZE148,
+	TEST_OUT_SIZE152,
+	TEST_OUT_SIZE156,
+	TEST_OUT_SIZE160,
+	TEST_OUT_SIZE164,
+	TEST_OUT_SIZE168,
+	TEST_OUT_SIZE172,
+	TEST_OUT_SIZE176,
+	TEST_OUT_SIZE180,
+	TEST_OUT_SIZE184,
+	TEST_OUT_SIZE188,
+	TEST_OUT_SIZE192,
+	TEST_OUT_SIZE196,
+	TEST_OUT_SIZE200,
+	TEST_OUT_SIZE204,
+	TEST_OUT_SIZE208,
+	TEST_OUT_SIZE212,
+	TEST_OUT_SIZE216,
+	TEST_OUT_SIZE220,
+	TEST_OUT_SIZE224,
+	TEST_OUT_SIZE228,
+	TEST_OUT_SIZE232,
+	TEST_OUT_SIZE236,
+	TEST_OUT_SIZE240,
+	TEST_OUT_SIZE244,
+	TEST_OUT_FULL_SUPP_TRACE,
+
+	__TEST_OUT_MAX,	/* NOTE(review): looks like an exclusive upper bound - confirm */
+
+	/*********
+	 * INPUT *
+	 *********/
+	__TEST_IN_MIN,	/* NOTE(review): looks like an exclusive lower bound - confirm */
+
+	TEST_IN_UNDEF_NS,
+	TEST_IN_NO_ROOM,
+	TEST_IN_NO_ROOM_OSS,
+	TEST_IN_DISABLED,
+	TEST_IN_OFLAG,
+	TEST_IN_BIT0,
+	TEST_IN_BIT1,
+	TEST_IN_BIT2,
+	TEST_IN_BIT3,
+	TEST_IN_BIT4,
+	TEST_IN_BIT5,
+	TEST_IN_BIT6,
+	TEST_IN_BIT7,
+	TEST_IN_BIT8,
+	TEST_IN_BIT9,
+	TEST_IN_BIT10,
+	TEST_IN_BIT11,
+	TEST_IN_BIT22,
+	TEST_IN_SIZE4,
+	TEST_IN_SIZE8,
+	TEST_IN_SIZE12,
+	TEST_IN_SIZE16,
+	TEST_IN_SIZE20,
+	TEST_IN_SIZE24,
+	TEST_IN_SIZE28,
+	TEST_IN_SIZE32,
+	TEST_IN_SIZE36,
+	TEST_IN_SIZE40,
+	TEST_IN_SIZE44,
+	TEST_IN_SIZE48,
+	TEST_IN_SIZE52,
+	TEST_IN_SIZE56,
+	TEST_IN_SIZE60,
+	TEST_IN_SIZE64,
+	TEST_IN_SIZE68,
+	TEST_IN_SIZE72,
+	TEST_IN_SIZE76,
+	TEST_IN_SIZE80,
+	TEST_IN_SIZE84,
+	TEST_IN_SIZE88,
+	TEST_IN_SIZE92,
+	TEST_IN_SIZE96,
+	TEST_IN_SIZE100,
+	TEST_IN_SIZE104,
+	TEST_IN_SIZE108,
+	TEST_IN_SIZE112,
+	TEST_IN_SIZE116,
+	TEST_IN_SIZE120,
+	TEST_IN_SIZE124,
+	TEST_IN_SIZE128,
+	TEST_IN_SIZE132,
+	TEST_IN_SIZE136,
+	TEST_IN_SIZE140,
+	TEST_IN_SIZE144,
+	TEST_IN_SIZE148,
+	TEST_IN_SIZE152,
+	TEST_IN_SIZE156,
+	TEST_IN_SIZE160,
+	TEST_IN_SIZE164,
+	TEST_IN_SIZE168,
+	TEST_IN_SIZE172,
+	TEST_IN_SIZE176,
+	TEST_IN_SIZE180,
+	TEST_IN_SIZE184,
+	TEST_IN_SIZE188,
+	TEST_IN_SIZE192,
+	TEST_IN_SIZE196,
+	TEST_IN_SIZE200,
+	TEST_IN_SIZE204,
+	TEST_IN_SIZE208,
+	TEST_IN_SIZE212,
+	TEST_IN_SIZE216,
+	TEST_IN_SIZE220,
+	TEST_IN_SIZE224,
+	TEST_IN_SIZE228,
+	TEST_IN_SIZE232,
+	TEST_IN_SIZE236,
+	TEST_IN_SIZE240,
+	TEST_IN_SIZE244,
+	TEST_IN_FULL_SUPP_TRACE,
+
+	__TEST_IN_MAX,	/* NOTE(review): looks like an exclusive upper bound - confirm */
+
+	__TEST_MAX,	/* last enumerator; NOTE(review): presumably a count/limit - confirm */
+};
+
+static int check_header(int tid, struct ioam6_trace_hdr *trace,
+			__u32 trace_type, __u8 trace_size, __u16 ioam_ns)
+{	/* returns 0 when the header is what test 'tid' expects, 1 otherwise */
+	if (__be16_to_cpu(trace->namespace_id) != ioam_ns ||
+	    __be32_to_cpu(trace->type_be32) != (trace_type << 8))
+		return 1;	/* namespace or type differs from the arguments */
+
+	switch (tid) {	/* per-test expectations on overflow, nodelen and remlen */
+	case TEST_OUT_UNDEF_NS:
+	case TEST_IN_UNDEF_NS:
+	case TEST_IN_DISABLED:
+		return trace->overflow == 1 ||	/* overflow must not be set */
+		       trace->nodelen != 1 ||
+		       trace->remlen != 1;
+
+	case TEST_OUT_NO_ROOM:
+	case TEST_IN_NO_ROOM:
+	case TEST_IN_OFLAG:
+		return trace->overflow == 0 ||	/* overflow must be set */
+		       trace->nodelen != 2 ||
+		       trace->remlen != 1;
+
+	case TEST_OUT_NO_ROOM_OSS:
+		return trace->overflow == 0 ||	/* overflow must be set */
+		       trace->nodelen != 0 ||
+		       trace->remlen != 1;
+
+	case TEST_IN_NO_ROOM_OSS:
+	case TEST_OUT_BIT22:
+	case TEST_IN_BIT22:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 0 ||
+		       trace->remlen != 0;
+
+	case TEST_OUT_BIT0:
+	case TEST_IN_BIT0:
+	case TEST_OUT_BIT1:
+	case TEST_IN_BIT1:
+	case TEST_OUT_BIT2:
+	case TEST_IN_BIT2:
+	case TEST_OUT_BIT3:
+	case TEST_IN_BIT3:
+	case TEST_OUT_BIT4:
+	case TEST_IN_BIT4:
+	case TEST_OUT_BIT5:
+	case TEST_IN_BIT5:
+	case TEST_OUT_BIT6:
+	case TEST_IN_BIT6:
+	case TEST_OUT_BIT7:
+	case TEST_IN_BIT7:
+	case TEST_OUT_BIT11:
+	case TEST_IN_BIT11:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != 0;
+
+	case TEST_OUT_BIT8:
+	case TEST_IN_BIT8:
+	case TEST_OUT_BIT9:
+	case TEST_IN_BIT9:
+	case TEST_OUT_BIT10:
+	case TEST_IN_BIT10:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 2 ||
+		       trace->remlen != 0;
+
+	case TEST_OUT_SIZE4:
+	case TEST_OUT_SIZE8:
+	case TEST_OUT_SIZE12:
+	case TEST_OUT_SIZE16:
+	case TEST_OUT_SIZE20:
+	case TEST_OUT_SIZE24:
+	case TEST_OUT_SIZE28:
+	case TEST_OUT_SIZE32:
+	case TEST_OUT_SIZE36:
+	case TEST_OUT_SIZE40:
+	case TEST_OUT_SIZE44:
+	case TEST_OUT_SIZE48:
+	case TEST_OUT_SIZE52:
+	case TEST_OUT_SIZE56:
+	case TEST_OUT_SIZE60:
+	case TEST_OUT_SIZE64:
+	case TEST_OUT_SIZE68:
+	case TEST_OUT_SIZE72:
+	case TEST_OUT_SIZE76:
+	case TEST_OUT_SIZE80:
+	case TEST_OUT_SIZE84:
+	case TEST_OUT_SIZE88:
+	case TEST_OUT_SIZE92:
+	case TEST_OUT_SIZE96:
+	case TEST_OUT_SIZE100:
+	case TEST_OUT_SIZE104:
+	case TEST_OUT_SIZE108:
+	case TEST_OUT_SIZE112:
+	case TEST_OUT_SIZE116:
+	case TEST_OUT_SIZE120:
+	case TEST_OUT_SIZE124:
+	case TEST_OUT_SIZE128:
+	case TEST_OUT_SIZE132:
+	case TEST_OUT_SIZE136:
+	case TEST_OUT_SIZE140:
+	case TEST_OUT_SIZE144:
+	case TEST_OUT_SIZE148:
+	case TEST_OUT_SIZE152:
+	case TEST_OUT_SIZE156:
+	case TEST_OUT_SIZE160:
+	case TEST_OUT_SIZE164:
+	case TEST_OUT_SIZE168:
+	case TEST_OUT_SIZE172:
+	case TEST_OUT_SIZE176:
+	case TEST_OUT_SIZE180:
+	case TEST_OUT_SIZE184:
+	case TEST_OUT_SIZE188:
+	case TEST_OUT_SIZE192:
+	case TEST_OUT_SIZE196:
+	case TEST_OUT_SIZE200:
+	case TEST_OUT_SIZE204:
+	case TEST_OUT_SIZE208:
+	case TEST_OUT_SIZE212:
+	case TEST_OUT_SIZE216:
+	case TEST_OUT_SIZE220:
+	case TEST_OUT_SIZE224:
+	case TEST_OUT_SIZE228:
+	case TEST_OUT_SIZE232:
+	case TEST_OUT_SIZE236:
+	case TEST_OUT_SIZE240:
+	case TEST_OUT_SIZE244:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != trace_size / 4; /* remlen: 4-byte units, all free */
+
+	case TEST_IN_SIZE4:
+	case TEST_IN_SIZE8:
+	case TEST_IN_SIZE12:
+	case TEST_IN_SIZE16:
+	case TEST_IN_SIZE20:
+	case TEST_IN_SIZE24:
+	case TEST_IN_SIZE28:
+	case TEST_IN_SIZE32:
+	case TEST_IN_SIZE36:
+	case TEST_IN_SIZE40:
+	case TEST_IN_SIZE44:
+	case TEST_IN_SIZE48:
+	case TEST_IN_SIZE52:
+	case TEST_IN_SIZE56:
+	case TEST_IN_SIZE60:
+	case TEST_IN_SIZE64:
+	case TEST_IN_SIZE68:
+	case TEST_IN_SIZE72:
+	case TEST_IN_SIZE76:
+	case TEST_IN_SIZE80:
+	case TEST_IN_SIZE84:
+	case TEST_IN_SIZE88:
+	case TEST_IN_SIZE92:
+	case TEST_IN_SIZE96:
+	case TEST_IN_SIZE100:
+	case TEST_IN_SIZE104:
+	case TEST_IN_SIZE108:
+	case TEST_IN_SIZE112:
+	case TEST_IN_SIZE116:
+	case TEST_IN_SIZE120:
+	case TEST_IN_SIZE124:
+	case TEST_IN_SIZE128:
+	case TEST_IN_SIZE132:
+	case TEST_IN_SIZE136:
+	case TEST_IN_SIZE140:
+	case TEST_IN_SIZE144:
+	case TEST_IN_SIZE148:
+	case TEST_IN_SIZE152:
+	case TEST_IN_SIZE156:
+	case TEST_IN_SIZE160:
+	case TEST_IN_SIZE164:
+	case TEST_IN_SIZE168:
+	case TEST_IN_SIZE172:
+	case TEST_IN_SIZE176:
+	case TEST_IN_SIZE180:
+	case TEST_IN_SIZE184:
+	case TEST_IN_SIZE188:
+	case TEST_IN_SIZE192:
+	case TEST_IN_SIZE196:
+	case TEST_IN_SIZE200:
+	case TEST_IN_SIZE204:
+	case TEST_IN_SIZE208:
+	case TEST_IN_SIZE212:
+	case TEST_IN_SIZE216:
+	case TEST_IN_SIZE220:
+	case TEST_IN_SIZE224:
+	case TEST_IN_SIZE228:
+	case TEST_IN_SIZE232:
+	case TEST_IN_SIZE236:
+	case TEST_IN_SIZE240:
+	case TEST_IN_SIZE244:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != (trace_size / 4) - trace->nodelen; /* nodelen units used */
+
+	case TEST_OUT_FULL_SUPP_TRACE:
+	case TEST_IN_FULL_SUPP_TRACE:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 15 ||
+		       trace->remlen != 0;
+
+	default:
+		break;
+	}
+
+	return 1;	/* a test id without a case above counts as a mismatch */
+}
+
+static int check_data(struct ioam6_trace_hdr *trace, __u8 trace_size,
+		      const struct ioam_config cnf, bool is_output)
+{
+	unsigned int len, i;
+	__u8 aligned;
+	__u64 raw64;
+	__u32 raw32;
+	__u8 *p;
+	/* Returns 0 when every checked field matches cnf, 1 on a mismatch. */
+	if (trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+	    trace->type.bit21 | trace->type.bit23)
+		return 1; /* type bits 12-21 and 23 must all be clear */
+	/* The first remlen * 4 bytes of data must all be zero. */
+	for (i = 0; i < trace->remlen * 4; i++) {
+		if (trace->data[i] != 0)
+			return 1;
+	}
+	/* When those zero bytes span all of trace_size nothing is left to check. */
+	if (trace->remlen * 4 == trace_size)
+		return 0;
+	/* p walks the remaining data, one field per set type bit, in bit order. */
+	p = trace->data + trace->remlen * 4;
+	/* bit0: 32-bit word, top 8 bits vs cnf.hlim, low 24 bits vs cnf.id */
+	if (trace->type.bit0) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff))
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit1: 32-bit word, high 16 bits vs cnf.ingr_id, low 16 vs cnf.egr_id */
+	if (trace->type.bit1) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if (cnf.ingr_id != (raw32 >> 16) ||
+		    cnf.egr_id != (raw32 & 0xffff))
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit2: all ones on output; on input neither zero nor all ones */
+	if (trace->type.bit2) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if ((is_output && raw32 != 0xffffffff) ||
+		    (!is_output && (raw32 == 0 || raw32 == 0xffffffff)))
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit3: same rule as bit2 */
+	if (trace->type.bit3) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if ((is_output && raw32 != 0xffffffff) ||
+		    (!is_output && (raw32 == 0 || raw32 == 0xffffffff)))
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit4: 32-bit word that must be all ones */
+	if (trace->type.bit4) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit5: 32-bit word that must equal cnf.ns_data */
+	if (trace->type.bit5) {
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.ns_data)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit6: 32-bit word that must not be all ones */
+	if (trace->type.bit6) {
+		if (__be32_to_cpu(*((__u32 *)p)) == 0xffffffff)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit7: 32-bit word that must be all ones */
+	if (trace->type.bit7) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit8: 64-bit word, top 8 bits vs cnf.hlim, low 56 bits vs cnf.wide */
+	if (trace->type.bit8) {
+		raw64 = __be64_to_cpu(*((__u64 *)p));
+		if (cnf.hlim != (raw64 >> 56) ||
+		    cnf.wide != (raw64 & 0xffffffffffffff))
+			return 1;
+		p += sizeof(__u64);
+	}
+	/* bit9: two 32-bit words, cnf.ingr_wide then cnf.egr_wide */
+	if (trace->type.bit9) {
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.ingr_wide)
+			return 1;
+		p += sizeof(__u32);
+
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.egr_wide)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit10: 64-bit word that must equal cnf.ns_wide */
+	if (trace->type.bit10) {
+		if (__be64_to_cpu(*((__u64 *)p)) != cnf.ns_wide)
+			return 1;
+		p += sizeof(__u64);
+	}
+	/* bit11: 32-bit word that must be all ones */
+	if (trace->type.bit11) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
+			return 1;
+		p += sizeof(__u32);
+	}
+	/* bit22: length/id word, then cnf.sc_data zero-padded to 4 bytes */
+	if (trace->type.bit22) {
+		len = cnf.sc_data ? strlen(cnf.sc_data) : 0;
+		aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0;
+
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if (aligned != (raw32 >> 24) * 4 ||
+		    cnf.sc_id != (raw32 & 0xffffff))
+			return 1;
+		p += sizeof(__u32);
+
+		if (cnf.sc_data) {
+			if (strncmp((char *)p, cnf.sc_data, len))
+				return 1;
+
+			p += len;
+			aligned -= len;
+
+			while (aligned--) {
+				if (*p != '\0')
+					return 1;
+				p += sizeof(__u8);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int check_ioam_trace(int tid, struct ioam6_trace_hdr *trace,
+			    __u32 trace_type, __u8 trace_size, __u16 ioam_ns)
+{
+	bool is_out = tid > __TEST_OUT_MIN && tid < __TEST_OUT_MAX;
+	bool is_in = tid > __TEST_IN_MIN && tid < __TEST_IN_MAX;
+
+	/* A bad header, or a test id in neither range, is a failure (1). */
+	if (check_header(tid, trace, trace_type, trace_size, ioam_ns))
+		return 1;
+	if (!is_out && !is_in)
+		return 1;
+
+	return check_data(trace, trace_size, is_out ? node1 : node2, is_out);
+}
+
+static int str2id(const char *tname)
+{
+	if (!strcmp("output_undef_ns", tname))
+		return TEST_OUT_UNDEF_NS;
+	if (!strcmp("output_no_room", tname))
+		return TEST_OUT_NO_ROOM;
+	if (!strcmp("output_no_room_oss", tname))
+		return TEST_OUT_NO_ROOM_OSS;
+	if (!strcmp("output_bit0", tname))
+		return TEST_OUT_BIT0;
+	if (!strcmp("output_bit1", tname))
+		return TEST_OUT_BIT1;
+	if (!strcmp("output_bit2", tname))
+		return TEST_OUT_BIT2;
+	if (!strcmp("output_bit3", tname))
+		return TEST_OUT_BIT3;
+	if (!strcmp("output_bit4", tname))
+		return TEST_OUT_BIT4;
+	if (!strcmp("output_bit5", tname))
+		return TEST_OUT_BIT5;
+	if (!strcmp("output_bit6", tname))
+		return TEST_OUT_BIT6;
+	if (!strcmp("output_bit7", tname))
+		return TEST_OUT_BIT7;
+	if (!strcmp("output_bit8", tname))
+		return TEST_OUT_BIT8;
+	if (!strcmp("output_bit9", tname))
+		return TEST_OUT_BIT9;
+	if (!strcmp("output_bit10", tname))
+		return TEST_OUT_BIT10;
+	if (!strcmp("output_bit11", tname))
+		return TEST_OUT_BIT11;
+	if (!strcmp("output_bit22", tname))
+		return TEST_OUT_BIT22;
+	if (!strcmp("output_size4", tname))
+		return TEST_OUT_SIZE4;
+	if (!strcmp("output_size8", tname))
+		return TEST_OUT_SIZE8;
+	if (!strcmp("output_size12", tname))
+		return TEST_OUT_SIZE12;
+	if (!strcmp("output_size16", tname))
+		return TEST_OUT_SIZE16;
+	if (!strcmp("output_size20", tname))
+		return TEST_OUT_SIZE20;
+	if (!strcmp("output_size24", tname))
+		return TEST_OUT_SIZE24;
+	if (!strcmp("output_size28", tname))
+		return TEST_OUT_SIZE28;
+	if (!strcmp("output_size32", tname))
+		return TEST_OUT_SIZE32;
+	if (!strcmp("output_size36", tname))
+		return TEST_OUT_SIZE36;
+	if (!strcmp("output_size40", tname))
+		return TEST_OUT_SIZE40;
+	if (!strcmp("output_size44", tname))
+		return TEST_OUT_SIZE44;
+	if (!strcmp("output_size48", tname))
+		return TEST_OUT_SIZE48;
+	if (!strcmp("output_size52", tname))
+		return TEST_OUT_SIZE52;
+	if (!strcmp("output_size56", tname))
+		return TEST_OUT_SIZE56;
+	if (!strcmp("output_size60", tname))
+		return TEST_OUT_SIZE60;
+	if (!strcmp("output_size64", tname))
+		return TEST_OUT_SIZE64;
+	if (!strcmp("output_size68", tname))
+		return TEST_OUT_SIZE68;
+	if (!strcmp("output_size72", tname))
+		return TEST_OUT_SIZE72;
+	if (!strcmp("output_size76", tname))
+		return TEST_OUT_SIZE76;
+	if (!strcmp("output_size80", tname))
+		return TEST_OUT_SIZE80;
+	if (!strcmp("output_size84", tname))
+		return TEST_OUT_SIZE84;
+	if (!strcmp("output_size88", tname))
+		return TEST_OUT_SIZE88;
+	if (!strcmp("output_size92", tname))
+		return TEST_OUT_SIZE92;
+	if (!strcmp("output_size96", tname))
+		return TEST_OUT_SIZE96;
+	if (!strcmp("output_size100", tname))
+		return TEST_OUT_SIZE100;
+	if (!strcmp("output_size104", tname))
+		return TEST_OUT_SIZE104;
+	if (!strcmp("output_size108", tname))
+		return TEST_OUT_SIZE108;
+	if (!strcmp("output_size112", tname))
+		return TEST_OUT_SIZE112;
+	if (!strcmp("output_size116", tname))
+		return TEST_OUT_SIZE116;
+	if (!strcmp("output_size120", tname))
+		return TEST_OUT_SIZE120;
+	if (!strcmp("output_size124", tname))
+		return TEST_OUT_SIZE124;
+	if (!strcmp("output_size128", tname))
+		return TEST_OUT_SIZE128;
+	if (!strcmp("output_size132", tname))
+		return TEST_OUT_SIZE132;
+	if (!strcmp("output_size136", tname))
+		return TEST_OUT_SIZE136;
+	if (!strcmp("output_size140", tname))
+		return TEST_OUT_SIZE140;
+	if (!strcmp("output_size144", tname))
+		return TEST_OUT_SIZE144;
+	if (!strcmp("output_size148", tname))
+		return TEST_OUT_SIZE148;
+	if (!strcmp("output_size152", tname))
+		return TEST_OUT_SIZE152;
+	if (!strcmp("output_size156", tname))
+		return TEST_OUT_SIZE156;
+	if (!strcmp("output_size160", tname))
+		return TEST_OUT_SIZE160;
+	if (!strcmp("output_size164", tname))
+		return TEST_OUT_SIZE164;
+	if (!strcmp("output_size168", tname))
+		return TEST_OUT_SIZE168;
+	if (!strcmp("output_size172", tname))
+		return TEST_OUT_SIZE172;
+	if (!strcmp("output_size176", tname))
+		return TEST_OUT_SIZE176;
+	if (!strcmp("output_size180", tname))
+		return TEST_OUT_SIZE180;
+	if (!strcmp("output_size184", tname))
+		return TEST_OUT_SIZE184;
+	if (!strcmp("output_size188", tname))
+		return TEST_OUT_SIZE188;
+	if (!strcmp("output_size192", tname))
+		return TEST_OUT_SIZE192;
+	if (!strcmp("output_size196", tname))
+		return TEST_OUT_SIZE196;
+	if (!strcmp("output_size200", tname))
+		return TEST_OUT_SIZE200;
+	if (!strcmp("output_size204", tname))
+		return TEST_OUT_SIZE204;
+	if (!strcmp("output_size208", tname))
+		return TEST_OUT_SIZE208;
+	if (!strcmp("output_size212", tname))
+		return TEST_OUT_SIZE212;
+	if (!strcmp("output_size216", tname))
+		return TEST_OUT_SIZE216;
+	if (!strcmp("output_size220", tname))
+		return TEST_OUT_SIZE220;
+	if (!strcmp("output_size224", tname))
+		return TEST_OUT_SIZE224;
+	if (!strcmp("output_size228", tname))
+		return TEST_OUT_SIZE228;
+	if (!strcmp("output_size232", tname))
+		return TEST_OUT_SIZE232;
+	if (!strcmp("output_size236", tname))
+		return TEST_OUT_SIZE236;
+	if (!strcmp("output_size240", tname))
+		return TEST_OUT_SIZE240;
+	if (!strcmp("output_size244", tname))
+		return TEST_OUT_SIZE244;
+	if (!strcmp("output_full_supp_trace", tname))
+		return TEST_OUT_FULL_SUPP_TRACE;
+	if (!strcmp("input_undef_ns", tname))
+		return TEST_IN_UNDEF_NS;
+	if (!strcmp("input_no_room", tname))
+		return TEST_IN_NO_ROOM;
+	if (!strcmp("input_no_room_oss", tname))
+		return TEST_IN_NO_ROOM_OSS;
+	if (!strcmp("input_disabled", tname))
+		return TEST_IN_DISABLED;
+	if (!strcmp("input_oflag", tname))
+		return TEST_IN_OFLAG;
+	if (!strcmp("input_bit0", tname))
+		return TEST_IN_BIT0;
+	if (!strcmp("input_bit1", tname))
+		return TEST_IN_BIT1;
+	if (!strcmp("input_bit2", tname))
+		return TEST_IN_BIT2;
+	if (!strcmp("input_bit3", tname))
+		return TEST_IN_BIT3;
+	if (!strcmp("input_bit4", tname))
+		return TEST_IN_BIT4;
+	if (!strcmp("input_bit5", tname))
+		return TEST_IN_BIT5;
+	if (!strcmp("input_bit6", tname))
+		return TEST_IN_BIT6;
+	if (!strcmp("input_bit7", tname))
+		return TEST_IN_BIT7;
+	if (!strcmp("input_bit8", tname))
+		return TEST_IN_BIT8;
+	if (!strcmp("input_bit9", tname))
+		return TEST_IN_BIT9;
+	if (!strcmp("input_bit10", tname))
+		return TEST_IN_BIT10;
+	if (!strcmp("input_bit11", tname))
+		return TEST_IN_BIT11;
+	if (!strcmp("input_bit22", tname))
+		return TEST_IN_BIT22;
+	if (!strcmp("input_size4", tname))
+		return TEST_IN_SIZE4;
+	if (!strcmp("input_size8", tname))
+		return TEST_IN_SIZE8;
+	if (!strcmp("input_size12", tname))
+		return TEST_IN_SIZE12;
+	if (!strcmp("input_size16", tname))
+		return TEST_IN_SIZE16;
+	if (!strcmp("input_size20", tname))
+		return TEST_IN_SIZE20;
+	if (!strcmp("input_size24", tname))
+		return TEST_IN_SIZE24;
+	if (!strcmp("input_size28", tname))
+		return TEST_IN_SIZE28;
+	if (!strcmp("input_size32", tname))
+		return TEST_IN_SIZE32;
+	if (!strcmp("input_size36", tname))
+		return TEST_IN_SIZE36;
+	if (!strcmp("input_size40", tname))
+		return TEST_IN_SIZE40;
+	if (!strcmp("input_size44", tname))
+		return TEST_IN_SIZE44;
+	if (!strcmp("input_size48", tname))
+		return TEST_IN_SIZE48;
+	if (!strcmp("input_size52", tname))
+		return TEST_IN_SIZE52;
+	if (!strcmp("input_size56", tname))
+		return TEST_IN_SIZE56;
+	if (!strcmp("input_size60", tname))
+		return TEST_IN_SIZE60;
+	if (!strcmp("input_size64", tname))
+		return TEST_IN_SIZE64;
+	if (!strcmp("input_size68", tname))
+		return TEST_IN_SIZE68;
+	if (!strcmp("input_size72", tname))
+		return TEST_IN_SIZE72;
+	if (!strcmp("input_size76", tname))
+		return TEST_IN_SIZE76;
+	if (!strcmp("input_size80", tname))
+		return TEST_IN_SIZE80;
+	if (!strcmp("input_size84", tname))
+		return TEST_IN_SIZE84;
+	if (!strcmp("input_size88", tname))
+		return TEST_IN_SIZE88;
+	if (!strcmp("input_size92", tname))
+		return TEST_IN_SIZE92;
+	if (!strcmp("input_size96", tname))
+		return TEST_IN_SIZE96;
+	if (!strcmp("input_size100", tname))
+		return TEST_IN_SIZE100;
+	if (!strcmp("input_size104", tname))
+		return TEST_IN_SIZE104;
+	if (!strcmp("input_size108", tname))
+		return TEST_IN_SIZE108;
+	if (!strcmp("input_size112", tname))
+		return TEST_IN_SIZE112;
+	if (!strcmp("input_size116", tname))
+		return TEST_IN_SIZE116;
+	if (!strcmp("input_size120", tname))
+		return TEST_IN_SIZE120;
+	if (!strcmp("input_size124", tname))
+		return TEST_IN_SIZE124;
+	if (!strcmp("input_size128", tname))
+		return TEST_IN_SIZE128;
+	if (!strcmp("input_size132", tname))
+		return TEST_IN_SIZE132;
+	if (!strcmp("input_size136", tname))
+		return TEST_IN_SIZE136;
+	if (!strcmp("input_size140", tname))
+		return TEST_IN_SIZE140;
+	if (!strcmp("input_size144", tname))
+		return TEST_IN_SIZE144;
+	if (!strcmp("input_size148", tname))
+		return TEST_IN_SIZE148;
+	if (!strcmp("input_size152", tname))
+		return TEST_IN_SIZE152;
+	if (!strcmp("input_size156", tname))
+		return TEST_IN_SIZE156;
+	if (!strcmp("input_size160", tname))
+		return TEST_IN_SIZE160;
+	if (!strcmp("input_size164", tname))
+		return TEST_IN_SIZE164;
+	if (!strcmp("input_size168", tname))
+		return TEST_IN_SIZE168;
+	if (!strcmp("input_size172", tname))
+		return TEST_IN_SIZE172;
+	if (!strcmp("input_size176", tname))
+		return TEST_IN_SIZE176;
+	if (!strcmp("input_size180", tname))
+		return TEST_IN_SIZE180;
+	if (!strcmp("input_size184", tname))
+		return TEST_IN_SIZE184;
+	if (!strcmp("input_size188", tname))
+		return TEST_IN_SIZE188;
+	if (!strcmp("input_size192", tname))
+		return TEST_IN_SIZE192;
+	if (!strcmp("input_size196", tname))
+		return TEST_IN_SIZE196;
+	if (!strcmp("input_size200", tname))
+		return TEST_IN_SIZE200;
+	if (!strcmp("input_size204", tname))
+		return TEST_IN_SIZE204;
+	if (!strcmp("input_size208", tname))
+		return TEST_IN_SIZE208;
+	if (!strcmp("input_size212", tname))
+		return TEST_IN_SIZE212;
+	if (!strcmp("input_size216", tname))
+		return TEST_IN_SIZE216;
+	if (!strcmp("input_size220", tname))
+		return TEST_IN_SIZE220;
+	if (!strcmp("input_size224", tname))
+		return TEST_IN_SIZE224;
+	if (!strcmp("input_size228", tname))
+		return TEST_IN_SIZE228;
+	if (!strcmp("input_size232", tname))
+		return TEST_IN_SIZE232;
+	if (!strcmp("input_size236", tname))
+		return TEST_IN_SIZE236;
+	if (!strcmp("input_size240", tname))
+		return TEST_IN_SIZE240;
+	if (!strcmp("input_size244", tname))
+		return TEST_IN_SIZE244;
+	if (!strcmp("input_full_supp_trace", tname))
+		return TEST_IN_FULL_SUPP_TRACE;
+	/* Each known test name maps to its TEST_* id above; any other is -1. */
+	return -1;
+}
+
+static int ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2)
+{
+	/* Byte-wise comparison of the two addresses: 1 when identical,
+	 * 0 otherwise.
+	 */
+	return memcmp(a1, a2, sizeof(*a1)) == 0;
+}
+
+static int get_u32(__u32 *val, const char *arg, int base)
+{
+	unsigned long res;
+	char *ptr;
+
+	if (!arg || !*arg)
+		return -1;
+	errno = 0; /* strtoul() never clears errno: drop any stale ERANGE */
+	res = strtoul(arg, &ptr, base);
+	if (!ptr || ptr == arg || *ptr) /* no digits, or trailing characters */
+		return -1;
+
+	if (res == ULONG_MAX && errno == ERANGE) /* overflowed unsigned long */
+		return -1;
+
+	if (res > 0xFFFFFFFFUL) /* does not fit in 32 bits */
+		return -1;
+	/* Whole string parsed and in range: store it in *val and return 0. */
+	*val = res;
+	return 0;
+}
+
+static int get_u16(__u16 *val, const char *arg, int base)
+{
+	unsigned long res;
+	char *ptr;
+
+	if (!arg || !*arg)
+		return -1;
+	errno = 0; /* strtoul() never clears errno: drop any stale ERANGE */
+	res = strtoul(arg, &ptr, base);
+	if (!ptr || ptr == arg || *ptr) /* no digits, or trailing characters */
+		return -1;
+
+	if (res == ULONG_MAX && errno == ERANGE) /* overflowed unsigned long */
+		return -1;
+
+	if (res > 0xFFFFUL) /* does not fit in 16 bits */
+		return -1;
+	/* Whole string parsed and in range: store it in *val and return 0. */
+	*val = res;
+	return 0;
+}
+
+static int get_u8(__u8 *val, const char *arg, int base)
+{
+	unsigned long res;
+	char *ptr;
+
+	if (!arg || !*arg)
+		return -1;
+	errno = 0; /* strtoul() never clears errno: drop any stale ERANGE */
+	res = strtoul(arg, &ptr, base);
+	if (!ptr || ptr == arg || *ptr) /* no digits, or trailing characters */
+		return -1;
+
+	if (res == ULONG_MAX && errno == ERANGE) /* overflowed unsigned long */
+		return -1;
+
+	if (res > 0xFFUL) /* does not fit in 8 bits */
+		return -1;
+	/* Whole string parsed and in range: store it in *val and return 0. */
+	*val = res;
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	__u8 buffer[512], *ptr, nexthdr, tr_size;
+	struct ioam6_trace_hdr *trace;
+	unsigned int hoplen, ret = 1;
+	struct ipv6_hopopt_hdr *hbh;
+	int fd, size, testname_id;
+	struct in6_addr src, dst;
+	struct ioam6_hdr *ioam6;
+	struct timeval timeout;
+	struct ipv6hdr *ipv6;
+	__u32 tr_type;
+	__u16 ioam_ns;
+
+	if (argc != 9)
+		goto out;
+	/* argv[1..8]: dev, test, src, dst, type (hex), size, ns, mode */
+	testname_id = str2id(argv[2]);
+
+	if (testname_id < 0 ||
+	    inet_pton(AF_INET6, argv[3], &src) != 1 ||
+	    inet_pton(AF_INET6, argv[4], &dst) != 1 ||
+	    get_u32(&tr_type, argv[5], 16) ||
+	    get_u8(&tr_size, argv[6], 0) ||
+	    get_u16(&ioam_ns, argv[7], 0))
+		goto out;
+
+	nexthdr = (!strcmp(argv[8], "encap") ? IPPROTO_IPV6 : IPPROTO_ICMPV6);
+	/* Expected Hop-by-Hop size in bytes, checked below via hdrlen. */
+	hoplen = sizeof(*hbh);
+	hoplen += 2; // 2-byte padding for alignment
+	hoplen += sizeof(*ioam6); // IOAM option header
+	hoplen += sizeof(*trace); // IOAM trace header
+	hoplen += tr_size; // IOAM trace size
+	hoplen += (tr_size % 8); // optional padding
+
+	fd = socket(AF_PACKET, SOCK_DGRAM, __cpu_to_be16(ETH_P_IPV6));
+	if (fd < 0)
+		goto out;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+		       argv[1], strlen(argv[1])))
+		goto close;
+	/* Give up (recv() fails, status stays 1) after 1 second without data. */
+	timeout.tv_sec = 1;
+	timeout.tv_usec = 0;
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
+		       (const char *)&timeout, sizeof(timeout)))
+		goto close;
+recv:
+	size = recv(fd, buffer, sizeof(buffer), 0);
+	if (size <= 0)
+		goto close;
+
+	ipv6 = (struct ipv6hdr *)buffer;
+
+	/* Skip packets that do not have the expected src/dst address or that
+	 * do not have a Hop-by-hop.
+	 */
+	if (!ipv6_addr_equal(&ipv6->saddr, &src) ||
+	    !ipv6_addr_equal(&ipv6->daddr, &dst) ||
+	    ipv6->nexthdr != IPPROTO_HOPOPTS)
+		goto recv;
+
+	/* Check Hbh's Next Header and Size. */
+	hbh = (struct ipv6_hopopt_hdr *)(buffer + sizeof(*ipv6));
+	if (hbh->nexthdr != nexthdr || hbh->hdrlen != (hoplen >> 3) - 1)
+		goto close;
+
+	/* Check we have a 2-byte padding for alignment (PadN, length 0). */
+	ptr = (__u8 *)hbh + sizeof(*hbh);
+	if (ptr[0] != IPV6_TLV_PADN || ptr[1] != 0)
+		goto close;
+
+	/* Check we now have the IOAM option. */
+	ptr += 2;
+	if (ptr[0] != IPV6_TLV_IOAM)
+		goto close;
+
+	/* Check its size and the IOAM option type. */
+	ioam6 = (struct ioam6_hdr *)ptr;
+	if (ioam6->opt_len != sizeof(*ioam6) - 2 + sizeof(*trace) + tr_size ||
+	    ioam6->type != IOAM6_TYPE_PREALLOC)
+		goto close;
+
+	trace = (struct ioam6_trace_hdr *)(ptr + sizeof(*ioam6));
+
+	/* Check the trailing 4-byte padding (PadN, length 2), if expected. */
+	ptr = (__u8 *)trace + sizeof(*trace) + tr_size;
+	if (tr_size % 8 && (ptr[0] != IPV6_TLV_PADN || ptr[1] != 2 ||
+			    ptr[2] != 0 || ptr[3] != 0))
+		goto close;
+
+	/* Check the IOAM header and data: the only path that can yield 0. */
+	ret = check_ioam_trace(testname_id, trace, tr_type, tr_size, ioam_ns);
+close:
+	close(fd);
+out:
+	return ret;
+}
-- 
cgit v1.2.3


From 3d07b691ee707c00afaf365440975e81bb96cd9b Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 3 Oct 2024 03:15:06 -0700
Subject: selftest/ptp: update ptp selftest to exercise the gettimex options

With the inclusion of commit c259acab839e ("ptp/ioctl: support
MONOTONIC{,_RAW} timestamps for PTP_SYS_OFFSET_EXTENDED") clock_gettime()
now allows retrieval of pre/post timestamps for CLOCK_MONOTONIC and
CLOCK_MONOTONIC_RAW timebases along with the previously supported
CLOCK_REALTIME.

This patch adds a command line option 'y' to the testptp program to
choose one of the allowed timebases (realtime aka system, monotonic,
and monotonic-raw).

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Link: https://patch.msgid.link/20241003101506.769418-1-maheshb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/ptp/testptp.c | 62 ++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c
index 011252fe238c..58064151f2c8 100644
--- a/tools/testing/selftests/ptp/testptp.c
+++ b/tools/testing/selftests/ptp/testptp.c
@@ -146,6 +146,7 @@ static void usage(char *progname)
 		" -T val     set the ptp clock time to 'val' seconds\n"
 		" -x val     get an extended ptp clock time with the desired number of samples (up to %d)\n"
 		" -X         get a ptp clock cross timestamp\n"
+		" -y val     pre/post tstamp timebase to use {realtime|monotonic|monotonic-raw}\n"
 		" -z         test combinations of rising/falling external time stamp flags\n",
 		progname, PTP_MAX_SAMPLES);
 }
@@ -189,6 +190,7 @@ int main(int argc, char *argv[])
 	int seconds = 0;
 	int settime = 0;
 	int channel = -1;
+	clockid_t ext_clockid = CLOCK_REALTIME;
 
 	int64_t t1, t2, tp;
 	int64_t interval, offset;
@@ -198,7 +200,7 @@ int main(int argc, char *argv[])
 
 	progname = strrchr(argv[0], '/');
 	progname = progname ? 1+progname : argv[0];
-	while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:sSt:T:w:x:Xz"))) {
+	while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:sSt:T:w:x:Xy:z"))) {
 		switch (c) {
 		case 'c':
 			capabilities = 1;
@@ -278,6 +280,21 @@ int main(int argc, char *argv[])
 		case 'X':
 			getcross = 1;
 			break;
+		case 'y':
+			if (!strcasecmp(optarg, "realtime"))
+				ext_clockid = CLOCK_REALTIME;
+			else if (!strcasecmp(optarg, "monotonic"))
+				ext_clockid = CLOCK_MONOTONIC;
+			else if (!strcasecmp(optarg, "monotonic-raw"))
+				ext_clockid = CLOCK_MONOTONIC_RAW;
+			else {
+				fprintf(stderr,
+					"type needs to be realtime, monotonic or monotonic-raw; was given %s\n",
+					optarg);
+				return -1;
+			}
+			break;
+
 		case 'z':
 			flagtest = 1;
 			break;
@@ -566,6 +583,7 @@ int main(int argc, char *argv[])
 		}
 
 		soe->n_samples = getextended;
+		soe->clockid = ext_clockid;
 
 		if (ioctl(fd, PTP_SYS_OFFSET_EXTENDED, soe)) {
 			perror("PTP_SYS_OFFSET_EXTENDED");
@@ -574,12 +592,46 @@ int main(int argc, char *argv[])
 			       getextended);
 
 			for (i = 0; i < getextended; i++) {
-				printf("sample #%2d: system time before: %lld.%09u\n",
-				       i, soe->ts[i][0].sec, soe->ts[i][0].nsec);
+				switch (ext_clockid) {
+				case CLOCK_REALTIME:
+					printf("sample #%2d: real time before: %lld.%09u\n",
+					       i, soe->ts[i][0].sec,
+					       soe->ts[i][0].nsec);
+					break;
+				case CLOCK_MONOTONIC:
+					printf("sample #%2d: monotonic time before: %lld.%09u\n",
+					       i, soe->ts[i][0].sec,
+					       soe->ts[i][0].nsec);
+					break;
+				case CLOCK_MONOTONIC_RAW:
+					printf("sample #%2d: monotonic-raw time before: %lld.%09u\n",
+					       i, soe->ts[i][0].sec,
+					       soe->ts[i][0].nsec);
+					break;
+				default:
+					break;
+				}
 				printf("            phc time: %lld.%09u\n",
 				       soe->ts[i][1].sec, soe->ts[i][1].nsec);
-				printf("            system time after: %lld.%09u\n",
-				       soe->ts[i][2].sec, soe->ts[i][2].nsec);
+				switch (ext_clockid) {
+				case CLOCK_REALTIME:
+					printf("            real time after: %lld.%09u\n",
+					       soe->ts[i][2].sec,
+					       soe->ts[i][2].nsec);
+					break;
+				case CLOCK_MONOTONIC:
+					printf("            monotonic time after: %lld.%09u\n",
+					       soe->ts[i][2].sec,
+					       soe->ts[i][2].nsec);
+					break;
+				case CLOCK_MONOTONIC_RAW:
+					printf("            monotonic-raw time after: %lld.%09u\n",
+					       soe->ts[i][2].sec,
+					       soe->ts[i][2].nsec);
+					break;
+				default:
+					break;
+				}
 			}
 		}
 
-- 
cgit v1.2.3


From 21e92806d39c68af2accd1fb238c2daecfcf9fbd Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Sat, 14 Sep 2024 20:29:12 -0700
Subject: function_graph: Support recording and printing the function return
 address

When using function_graph tracer to analyze the flow of kernel function
execution, it is often necessary to quickly locate the exact line of code
where the call occurs. While this may be easy at times, it can be more
time-consuming when some functions are inlined or the flow is too long.

This feature aims to simplify the process by recording the return address
of traced functions and printing it when outputting trace logs.

To enhance human readability, the prefix 'ret=' is used for the kernel return
value, while '<-' serves as the prefix for the return address in trace logs to
make it look more like the function tracer.

A new trace option named 'funcgraph-retaddr' has been introduced, and the
existing option 'sym-addr' can be used to control the format of the return
address.

See below logs with both funcgraph-retval and funcgraph-retaddr enabled.

0)             | load_elf_binary() { /* <-bprm_execve+0x249/0x600 */
0)             |   load_elf_phdrs() { /* <-load_elf_binary+0x84/0x1730 */
0)             |     __kmalloc_noprof() { /* <-load_elf_phdrs+0x4a/0xb0 */
0)   3.657 us  |       __cond_resched(); /* <-__kmalloc_noprof+0x28c/0x390 ret=0x0 */
0) + 24.335 us |     } /* __kmalloc_noprof ret=0xffff8882007f3000 */
0)             |     kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */
0)             |       rw_verify_area() { /* <-kernel_read+0x2b/0x50 */
0)             |         security_file_permission() { /* <-kernel_read+0x2b/0x50 */
0)             |           selinux_file_permission() { /* <-security_file_permission+0x26/0x40 */
0)             |             __inode_security_revalidate() { /* <-selinux_file_permission+0x6d/0x140 */
0)   2.034 us  |               __cond_resched(); /* <-__inode_security_revalidate+0x5f/0x80 ret=0x0 */
0)   6.602 us  |             } /* __inode_security_revalidate ret=0x0 */
0)   2.214 us  |             avc_policy_seqno(); /* <-selinux_file_permission+0x107/0x140 ret=0x0 */
0) + 16.670 us |           } /* selinux_file_permission ret=0x0 */
0) + 20.809 us |         } /* security_file_permission ret=0x0 */
0) + 25.217 us |       } /* rw_verify_area ret=0x0 */
0)             |       __kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */
0)             |         ext4_file_read_iter() { /* <-__kernel_read+0x160/0x2e0 */

Then, we can use the faddr2line to locate the source code, for example:

$ ./scripts/faddr2line ./vmlinux load_elf_phdrs+0x6c/0xb0
load_elf_phdrs+0x6c/0xb0:
elf_read at fs/binfmt_elf.c:471
(inlined by) load_elf_phdrs at fs/binfmt_elf.c:531

Link: https://lore.kernel.org/20240915032912.1118397-1-dolinux.peng@gmail.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202409150605.HgUmU8ea-lkp@intel.com/
Signed-off-by: Donglin Peng <dolinux.peng@gmail.com>
[ Rebased to handle text_delta offsets ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h                             |  27 ++-
 kernel/trace/Kconfig                               |  10 +
 kernel/trace/fgraph.c                              |  22 ++-
 kernel/trace/ftrace.c                              |   3 +-
 kernel/trace/trace.h                               |  11 +-
 kernel/trace/trace_entries.h                       |  29 ++-
 kernel/trace/trace_functions_graph.c               | 216 ++++++++++++++++-----
 kernel/trace/trace_irqsoff.c                       |   3 +-
 kernel/trace/trace_sched_wakeup.c                  |   3 +-
 kernel/trace/trace_selftest.c                      |   9 +-
 .../ftrace/test.d/ftrace/fgraph-retval.tc          |   2 +-
 11 files changed, 274 insertions(+), 61 deletions(-)

(limited to 'tools/testing')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e684addf6508..2ac3b3b53cd0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1040,6 +1040,17 @@ struct ftrace_graph_ent {
 	int depth;
 } __packed;
 
+/*
+ * Structure that defines an entry function trace with retaddr.
+ * It's already packed but the attribute "packed" is needed
+ * to remove extra padding at the end.
+ */
+struct fgraph_retaddr_ent {
+	unsigned long func; /* Current function */
+	int depth;
+	unsigned long retaddr;  /* Return address */
+} __packed;
+
 /*
  * Structure that defines a return function trace.
  * It's already packed but the attribute "packed" is needed
@@ -1057,19 +1068,29 @@ struct ftrace_graph_ret {
 	unsigned long long rettime;
 } __packed;
 
+struct fgraph_extras;
 struct fgraph_ops;
 
 /* Type of the callback handlers for tracing function graph*/
 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *,
 				       struct fgraph_ops *); /* return */
 typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
-				      struct fgraph_ops *); /* entry */
+				      struct fgraph_ops *,
+				      struct fgraph_extras *); /* entry */
 
-extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+				   struct fgraph_ops *gops,
+				   struct fgraph_extras *extras);
 bool ftrace_pids_enabled(struct ftrace_ops *ops);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+/* Used to convey some extra data when creating a graph entry */
+struct fgraph_extras {
+	u32 flags;
+	unsigned long retaddr;
+};
+
 struct fgraph_ops {
 	trace_func_graph_ent_t		entryfunc;
 	trace_func_graph_ret_t		retfunc;
@@ -1115,6 +1136,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 				    unsigned long ret, unsigned long *retp);
 unsigned long *fgraph_get_task_var(struct fgraph_ops *gops);
 
+u32 graph_tracer_flags_get(u32 flags);
+
 /*
  * Sometimes we don't want to trace a function with the function
  * graph tracer but we want them to keep traced by the usual function
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 721c3b221048..74c2b1d43bb9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -242,6 +242,16 @@ config FUNCTION_GRAPH_RETVAL
 	  enable it via the trace option funcgraph-retval.
 	  See Documentation/trace/ftrace.rst
 
+config FUNCTION_GRAPH_RETADDR
+	bool "Kernel Function Graph Return Address"
+	depends on FUNCTION_GRAPH_TRACER
+	default n
+	help
+	  Support recording and printing the function return address when
+	  using the function graph tracer. It helps locate the line of code from
+	  which a function is called. This feature is off by default, and you
+	  can enable it via the trace option funcgraph-retaddr.
+
 config DYNAMIC_FTRACE
 	bool "enable/disable function tracing dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 58a28ec35dab..875aefe60a13 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -290,7 +290,8 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset)
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+		     struct fgraph_extras *extras)
 {
 	return 0;
 }
@@ -518,7 +519,8 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 #endif
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
-			    struct fgraph_ops *gops)
+			    struct fgraph_ops *gops,
+			    struct fgraph_extras *extras)
 {
 	return 0;
 }
@@ -646,13 +648,20 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 			 unsigned long frame_pointer, unsigned long *retp)
 {
 	struct ftrace_graph_ent trace;
+	struct fgraph_extras extras;
 	unsigned long bitmap = 0;
 	int offset;
 	int i;
+	int idx = 0;
 
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
+	extras.flags = graph_tracer_flags_get(TRACE_GRAPH_PRINT_RETADDR);
+	if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR)
+		&& extras.flags & TRACE_GRAPH_PRINT_RETADDR)
+		extras.retaddr = ftrace_graph_ret_addr(current, &idx, ret, retp);
+
 	offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
 	if (offset < 0)
 		goto out;
@@ -661,7 +670,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 	if (static_branch_likely(&fgraph_do_direct)) {
 		int save_curr_ret_stack = current->curr_ret_stack;
 
-		if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+		if (static_call(fgraph_func)(&trace, fgraph_direct_gops, &extras))
 			bitmap |= BIT(fgraph_direct_gops->idx);
 		else
 			/* Clear out any saved storage */
@@ -679,7 +688,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 
 			save_curr_ret_stack = current->curr_ret_stack;
 			if (ftrace_ops_test(&gops->ops, func, NULL) &&
-			    gops->entryfunc(&trace, gops))
+			    gops->entryfunc(&trace, gops, &extras))
 				bitmap |= BIT(i);
 			else
 				/* Clear out any saved storage */
@@ -1136,7 +1145,8 @@ void ftrace_graph_exit_task(struct task_struct *t)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 static int fgraph_pid_func(struct ftrace_graph_ent *trace,
-			   struct fgraph_ops *gops)
+			   struct fgraph_ops *gops,
+			   struct fgraph_extras *extras)
 {
 	struct trace_array *tr = gops->ops.private;
 	int pid;
@@ -1150,7 +1160,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace,
 			return 0;
 	}
 
-	return gops->saved_func(trace, gops);
+	return gops->saved_func(trace, gops, NULL);
 }
 
 void fgraph_update_pid_func(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cae388122ca8..5d87dac83b80 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -827,7 +827,8 @@ struct profile_fgraph_data {
 };
 
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops)
+			       struct fgraph_ops *gops,
+			       struct fgraph_extras *extras)
 {
 	struct profile_fgraph_data *profile_data;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2f8017f8d34d..13f08f257c0b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -46,6 +46,7 @@ enum trace_type {
 	TRACE_BRANCH,
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
+	TRACE_GRAPH_RETADDR_ENT,
 	TRACE_USER_STACK,
 	TRACE_BLK,
 	TRACE_BPUTS,
@@ -512,6 +513,8 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
 			  TRACE_GRAPH_ENT);		\
+		IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\
+			  TRACE_GRAPH_RETADDR_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct func_repeats_entry,		\
@@ -692,7 +695,8 @@ void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
-int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+		      struct fgraph_extras *extras);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -879,6 +883,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
 #define TRACE_GRAPH_GRAPH_TIME          0x400
 #define TRACE_GRAPH_PRINT_RETVAL        0x800
 #define TRACE_GRAPH_PRINT_RETVAL_HEX    0x1000
+#define TRACE_GRAPH_PRINT_RETADDR       0x2000
 #define TRACE_GRAPH_PRINT_FILL_SHIFT	28
 #define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
 
@@ -900,6 +905,10 @@ extern void graph_trace_close(struct trace_iterator *iter);
 extern int __trace_graph_entry(struct trace_array *tr,
 			       struct ftrace_graph_ent *trace,
 			       unsigned int trace_ctx);
+extern int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned int trace_ctx);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c47422b20908..82fd174ebbe0 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
 	F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
 );
 
-/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+
+/* Function call entry with a return address */
+FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
+
+	TRACE_GRAPH_RETADDR_ENT,
+
+	F_STRUCT(
+		__field_struct(	struct fgraph_retaddr_ent,	graph_ent	)
+		__field_packed(	unsigned long,	graph_ent,	func		)
+		__field_packed(	int,		graph_ent,	depth		)
+		__field_packed(	unsigned long,	graph_ent,	retaddr		)
+	),
+
+	F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+		(void *)__entry->retaddr)
+);
+
+#else
+
+#ifndef fgraph_retaddr_ent_entry
+#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry
+#endif
+
+#endif
+
 #ifdef CONFIG_FUNCTION_GRAPH_RETVAL
 
+/* Function return entry */
 FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
@@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 #else
 
+/* Function return entry */
 FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 5c1b150fbba3..3dd63ae2afe8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -31,7 +31,10 @@ struct fgraph_data {
 	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
-	struct ftrace_graph_ent_entry	ent;
+	union {
+		struct ftrace_graph_ent_entry	ent;
+		struct fgraph_retaddr_ent_entry	rent;
+	} ent;
 	struct ftrace_graph_ret_entry	ret;
 	int				failed;
 	int				cpu;
@@ -63,6 +66,10 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) },
 	/* Display function return value in hexadecimal format ? */
 	{ TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	/* Display function return address ? */
+	{ TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
 #endif
 	/* Include sleep time (scheduled out) between entry and return */
 	{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -83,6 +90,11 @@ static struct tracer_flags tracer_flags = {
 	.opts = trace_opts
 };
 
+u32 graph_tracer_flags_get(u32 flags)
+{
+	return tracer_flags.val & flags;
+}
+
 /*
  * DURATION column is being also used to display IRQ signs,
  * following values are used by print_graph_irq and others
@@ -119,6 +131,40 @@ int __trace_graph_entry(struct trace_array *tr,
 	return 1;
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr)
+{
+	struct trace_event_call *call = &event_fgraph_retaddr_entry;
+	struct ring_buffer_event *event;
+	struct trace_buffer *buffer = tr->array_buffer.buffer;
+	struct fgraph_retaddr_ent_entry *entry;
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
+					  sizeof(*entry), trace_ctx);
+	if (!event)
+		return 0;
+	entry	= ring_buffer_event_data(event);
+	entry->graph_ent.func = trace->func;
+	entry->graph_ent.depth = trace->depth;
+	entry->graph_ent.retaddr = retaddr;
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit_nostack(buffer, event);
+
+	return 1;
+}
+#else
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+				struct ftrace_graph_ent *trace,
+				unsigned int trace_ctx,
+				unsigned long retaddr)
+{
+	return 1;
+}
+#endif
+
 static inline int ftrace_graph_ignore_irqs(void)
 {
 	if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
@@ -133,7 +179,8 @@ struct fgraph_times {
 };
 
 int trace_graph_entry(struct ftrace_graph_ent *trace,
-		      struct fgraph_ops *gops)
+		      struct fgraph_ops *gops,
+		      struct fgraph_extras *extras)
 {
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
@@ -199,7 +246,12 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		trace_ctx = tracing_gen_ctx_flags(flags);
-		ret = __trace_graph_entry(tr, trace, trace_ctx);
+		if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && extras
+				&& (extras->flags & TRACE_GRAPH_PRINT_RETADDR)))
+			ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+							  extras->retaddr);
+		else
+			ret = __trace_graph_entry(tr, trace, trace_ctx);
 	} else {
 		ret = 0;
 	}
@@ -507,7 +559,7 @@ get_return_for_leaf(struct trace_iterator *iter,
 	 * then we just reuse the data from before.
 	 */
 	if (data && data->failed) {
-		curr = &data->ent;
+		curr = &data->ent.ent;
 		next = &data->ret;
 	} else {
 
@@ -537,7 +589,10 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * Save current and next entries for later reference
 			 * if the output fails.
 			 */
-			data->ent = *curr;
+			if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
+				data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
+			else
+				data->ent.ent = *curr;
 			/*
 			 * If the next event is not a return type, then
 			 * we only care about what type it is. Otherwise we can
@@ -701,52 +756,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_RETVAL
-
 #define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+#else
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+#endif
 
-static void print_graph_retval(struct trace_seq *s, unsigned long retval,
-				bool leaf, void *func, bool hex_format)
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR
+static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry,
+				u32 trace_flags, bool comment)
+{
+	if (comment)
+		trace_seq_puts(s, " /*");
+
+	trace_seq_puts(s, " <-");
+	seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+
+	if (comment)
+		trace_seq_puts(s, " */");
+}
+#else
+#define __TRACE_GRAPH_PRINT_RETADDR 0
+#define print_graph_retaddr(_seq, _entry, _tflags, _comment)		do { } while (0)
+#endif
+
+#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR)
+
+static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
+				struct ftrace_graph_ret *graph_ret, void *func,
+				u32 opt_flags, u32 trace_flags)
 {
 	unsigned long err_code = 0;
+	unsigned long retval = 0;
+	bool print_retaddr = false;
+	bool print_retval = false;
+	bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX);
 
-	if (retval == 0 || hex_format)
-		goto done;
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+	retval = graph_ret->retval;
+	print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL);
+#endif
 
-	/* Check if the return value matches the negative format */
-	if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
-		(((u64)retval) >> 32) == 0) {
-		/* sign extension */
-		err_code = (unsigned long)(s32)retval;
-	} else {
-		err_code = retval;
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR);
+#endif
+
+	if (print_retval && retval && !hex_format) {
+		/* Check if the return value matches the negative format */
+		if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+			(((u64)retval) >> 32) == 0) {
+			err_code = sign_extend64(retval, 31);
+		} else {
+			err_code = retval;
+		}
+
+		if (!IS_ERR_VALUE(err_code))
+			err_code = 0;
 	}
 
-	if (!IS_ERR_VALUE(err_code))
-		err_code = 0;
+	if (entry) {
+		if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
+			print_retaddr = false;
 
-done:
-	if (leaf) {
-		if (hex_format || (err_code == 0))
-			trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
-					func, retval);
+		trace_seq_printf(s, "%ps();", func);
+		if (print_retval || print_retaddr)
+			trace_seq_puts(s, " /*");
 		else
-			trace_seq_printf(s, "%ps(); /* = %ld */\n",
-					func, err_code);
+			trace_seq_putc(s, '\n');
 	} else {
+		print_retaddr = false;
+		trace_seq_printf(s, "} /* %ps", func);
+	}
+
+	if (print_retaddr)
+		print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+				    trace_flags, false);
+
+	if (print_retval) {
 		if (hex_format || (err_code == 0))
-			trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
-					func, retval);
+			trace_seq_printf(s, " ret=0x%lx", retval);
 		else
-			trace_seq_printf(s, "} /* %ps = %ld */\n",
-					func, err_code);
+			trace_seq_printf(s, " ret=%ld", err_code);
 	}
+
+	if (!entry || print_retval || print_retaddr)
+		trace_seq_puts(s, " */\n");
 }
 
 #else
 
-#define __TRACE_GRAPH_PRINT_RETVAL 0
-
-#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
 
 #endif
 
@@ -798,14 +897,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 		trace_seq_putc(s, ' ');
 
 	/*
-	 * Write out the function return value if the option function-retval is
-	 * enabled.
+	 * Write out the function return value or return address
 	 */
-	if (flags & __TRACE_GRAPH_PRINT_RETVAL)
-		print_graph_retval(s, graph_ret->retval, true, (void *)func,
-				!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
-	else
+	if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
+		print_graph_retval(s, entry, graph_ret,
+				   (void *)graph_ret->func + iter->tr->text_delta,
+				   flags, tr->trace_flags);
+	} else {
 		trace_seq_printf(s, "%ps();\n", (void *)func);
+	}
 
 	print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
 			cpu, iter->ent->pid, flags);
@@ -846,7 +946,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
 
 	func = call->func + iter->tr->text_delta;
 
-	trace_seq_printf(s, "%ps() {\n", (void *)func);
+	trace_seq_printf(s, "%ps() {", (void *)func);
+	if (flags & __TRACE_GRAPH_PRINT_RETADDR  &&
+		entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
+		print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+			tr->trace_flags, true);
+	trace_seq_putc(s, '\n');
 
 	if (trace_seq_has_overflowed(s))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -1093,11 +1198,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 
 	/*
 	 * Always write out the function name and its return value if the
-	 * function-retval option is enabled.
+	 * funcgraph-retval option is enabled.
 	 */
 	if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
-		print_graph_retval(s, trace->retval, false, (void *)func,
-			!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+		print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
 	} else {
 		/*
 		 * If the return function does not have a matching entry,
@@ -1212,7 +1316,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 	 * to print out the missing entry which would never go out.
 	 */
 	if (data && data->failed) {
-		field = &data->ent;
+		field = &data->ent.ent;
 		iter->cpu = data->cpu;
 		ret = print_graph_entry(field, s, iter, flags);
 		if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
@@ -1236,6 +1340,16 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 		saved = *field;
 		return print_graph_entry(&saved, s, iter, flags);
 	}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	case TRACE_GRAPH_RETADDR_ENT: {
+		struct fgraph_retaddr_ent_entry saved;
+		struct fgraph_retaddr_ent_entry *rfield;
+
+		trace_assign_type(rfield, entry);
+		saved = *rfield;
+		return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+	}
+#endif
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
@@ -1430,6 +1544,13 @@ static struct trace_event graph_trace_entry_event = {
 	.funcs		= &graph_functions,
 };
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+static struct trace_event graph_trace_retaddr_entry_event = {
+	.type		= TRACE_GRAPH_RETADDR_ENT,
+	.funcs		= &graph_functions,
+};
+#endif
+
 static struct trace_event graph_trace_ret_event = {
 	.type		= TRACE_GRAPH_RET,
 	.funcs		= &graph_functions
@@ -1516,6 +1637,13 @@ static __init int init_graph_trace(void)
 		return 1;
 	}
 
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+	if (!register_trace_event(&graph_trace_retaddr_entry_event)) {
+		pr_warn("Warning: could not register graph trace retaddr events\n");
+		return 1;
+	}
+#endif
+
 	if (!register_trace_event(&graph_trace_ret_event)) {
 		pr_warn("Warning: could not register graph trace events\n");
 		return 1;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index fce064e20570..eb3aa36cf10f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -176,7 +176,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
 }
 
 static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
-			       struct fgraph_ops *gops)
+			       struct fgraph_ops *gops,
+			       struct fgraph_extras *extras)
 {
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ae2ace5e515a..155de2551507 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -113,7 +113,8 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
 }
 
 static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops)
+			      struct fgraph_ops *gops,
+			      struct fgraph_extras *extras)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd7e778..fbb99f8c8062 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_PRINT:
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
+	case TRACE_GRAPH_RETADDR_ENT:
 	case TRACE_GRAPH_RET:
 		return 1;
 	}
@@ -773,7 +774,8 @@ struct fgraph_fixture {
 };
 
 static __init int store_entry(struct ftrace_graph_ent *trace,
-			      struct fgraph_ops *gops)
+			      struct fgraph_ops *gops,
+			      struct fgraph_extras *extras)
 {
 	struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
 	const char *type = fixture->store_type_name;
@@ -1024,7 +1026,8 @@ static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
 static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
-				      struct fgraph_ops *gops)
+				      struct fgraph_ops *gops,
+				      struct fgraph_extras *extras)
 {
 	/* This is harmlessly racy, we want to approximately detect a hang */
 	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -1038,7 +1041,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
 		return 0;
 	}
 
-	return trace_graph_entry(trace, gops);
+	return trace_graph_entry(trace, gops, NULL);
 }
 
 static struct fgraph_ops fgraph_ops __initdata  = {
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
index e34c0bdef3ed..e8e46378b88d 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
@@ -29,7 +29,7 @@ set -e
 
 : "Test printing the error code in signed decimal format"
 echo 0 > options/funcgraph-retval-hex
-count=`cat trace | grep 'proc_reg_write' | grep '= -5' | wc -l`
+count=`cat trace | grep 'proc_reg_write' | grep '=-5' | wc -l`
 if [ $count -eq 0 ]; then
     fail "Return value can not be printed in signed decimal format"
 fi
-- 
cgit v1.2.3


From 2688d6814193f81b0b4f9704a44963ebd755182f Mon Sep 17 00:00:00 2001
From: Hariharan Mari <hari55@linux.ibm.com>
Date: Fri, 23 Aug 2024 15:05:04 +0200
Subject: KVM: s390: selftests: Add regression tests for SORTL and DFLTCC CPU
 subfunctions

Introduce new regression tests to verify the ASM inline block in the SORTL
and DFLTCC CPU subfunctions for the s390x architecture. These tests ensure
that future changes to the ASM code are properly validated.

The test procedure:

1. Create a VM and request the KVM_S390_VM_CPU_MACHINE_SUBFUNC attribute
   from the KVM_S390_VM_CPU_MODEL group for this VM. This SUBFUNC attribute
   contains the results of all CPU subfunction instructions.
2. For each tested subfunction (SORTL and DFLTCC), execute the
   corresponding ASM instruction and capture the result array.
3. Perform a memory comparison between the results stored in the SUBFUNC
   attribute (obtained in step 1) and the ASM instruction results (obtained
   in step 2) for each tested subfunction.

This process ensures that the KVM implementation accurately reflects the
behavior of the actual CPU instructions for the tested subfunctions.

Suggested-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: Hariharan Mari <hari55@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20240823130947.38323-2-hari55@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20240823130947.38323-2-hari55@linux.ibm.com>
---
 tools/testing/selftests/kvm/Makefile               |   2 +
 .../testing/selftests/kvm/include/s390x/facility.h |  50 ++++++++++
 tools/testing/selftests/kvm/lib/s390x/facility.c   |  14 +++
 .../selftests/kvm/s390x/cpumodel_subfuncs_test.c   | 105 +++++++++++++++++++++
 4 files changed, 171 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/include/s390x/facility.h
 create mode 100644 tools/testing/selftests/kvm/lib/s390x/facility.c
 create mode 100644 tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 960cf6a77198..46b647b6b976 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -55,6 +55,7 @@ LIBKVM_aarch64 += lib/aarch64/vgic.c
 LIBKVM_s390x += lib/s390x/diag318_test_handler.c
 LIBKVM_s390x += lib/s390x/processor.c
 LIBKVM_s390x += lib/s390x/ucall.c
+LIBKVM_s390x += lib/s390x/facility.c
 
 LIBKVM_riscv += lib/riscv/handlers.S
 LIBKVM_riscv += lib/riscv/processor.c
@@ -189,6 +190,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += s390x/tprot
 TEST_GEN_PROGS_s390x += s390x/cmma_test
 TEST_GEN_PROGS_s390x += s390x/debug_test
+TEST_GEN_PROGS_s390x += s390x/cpumodel_subfuncs_test
 TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test
 TEST_GEN_PROGS_s390x += s390x/ucontrol_test
 TEST_GEN_PROGS_s390x += demand_paging_test
diff --git a/tools/testing/selftests/kvm/include/s390x/facility.h b/tools/testing/selftests/kvm/include/s390x/facility.h
new file mode 100644
index 000000000000..00a1ced6538b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/facility.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright IBM Corp. 2024
+ *
+ * Authors:
+ *  Hariharan Mari <hari55@linux.ibm.com>
+ *
+ * Get the facility bits with the STFLE instruction
+ */
+
+#ifndef SELFTEST_KVM_FACILITY_H
+#define SELFTEST_KVM_FACILITY_H
+
+#include <linux/bitops.h>
+
+/* alt_stfle_fac_list[16] + stfle_fac_list[16] */
+#define NB_STFL_DOUBLEWORDS 32
+
+extern uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS];
+extern bool stfle_flag;
+
+static inline bool test_bit_inv(unsigned long nr, const unsigned long *ptr)
+{
+	return test_bit(nr ^ (BITS_PER_LONG - 1), ptr);
+}
+
+static inline void stfle(uint64_t *fac, unsigned int nb_doublewords)
+{
+	register unsigned long r0 asm("0") = nb_doublewords - 1;
+
+	asm volatile("	.insn	s,0xb2b00000,0(%1)\n"
+			: "+d" (r0)
+			: "a" (fac)
+			: "memory", "cc");
+}
+
+static inline void setup_facilities(void)
+{
+	stfle(stfl_doublewords, NB_STFL_DOUBLEWORDS);
+	stfle_flag = true;
+}
+
+static inline bool test_facility(int nr)
+{
+	if (!stfle_flag)
+		setup_facilities();
+	return test_bit_inv(nr, stfl_doublewords);
+}
+
+#endif
diff --git a/tools/testing/selftests/kvm/lib/s390x/facility.c b/tools/testing/selftests/kvm/lib/s390x/facility.c
new file mode 100644
index 000000000000..d540812d911a
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/s390x/facility.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright IBM Corp. 2024
+ *
+ * Authors:
+ *  Hariharan Mari <hari55@linux.ibm.com>
+ *
+ * Contains the definitions of the global variables backing the test facility feature.
+ */
+
+#include "facility.h"
+
+uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS];
+bool stfle_flag;
diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
new file mode 100644
index 000000000000..ee525c841767
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright IBM Corp. 2024
+ *
+ * Authors:
+ *  Hariharan Mari <hari55@linux.ibm.com>
+ *
+ * The tests compare the results of the KVM ioctl for obtaining CPU subfunction data with those
+ * from an ASM block performing the same CPU subfunction. Currently KVM doesn't mask instruction
+ * query data reported via the CPU Model, allowing us to directly compare it with the data
+ * acquired through executing the queries in the test.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include "facility.h"
+
+#include "kvm_util.h"
+
+/* Query available CPU subfunctions */
+struct kvm_s390_vm_cpu_subfunc cpu_subfunc;
+
+static void get_cpu_machine_subfuntions(struct kvm_vm *vm,
+					struct kvm_s390_vm_cpu_subfunc *cpu_subfunc)
+{
+	int r;
+
+	r = __kvm_device_attr_get(vm->fd, KVM_S390_VM_CPU_MODEL,
+				  KVM_S390_VM_CPU_MACHINE_SUBFUNC, cpu_subfunc);
+
+	TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno);
+}
+
+/* Testing Sort Lists (SORTL) CPU subfunction's ASM block */
+static void test_sortl_asm_block(u8 (*query)[32])
+{
+	asm volatile("	lghi	0,0\n"
+			"	la	1,%[query]\n"
+			"	.insn	rre,0xb9380000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "0", "1");
+}
+
+/* Testing Deflate Conversion Call (DFLTCC) CPU subfunction's ASM block */
+static void test_dfltcc_asm_block(u8 (*query)[32])
+{
+	asm volatile("	lghi	0,0\n"
+			"	la	1,%[query]\n"
+			"	.insn	rrf,0xb9390000,2,4,6,0\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "0", "1");
+}
+
+typedef void (*testfunc_t)(u8 (*array)[]);
+
+struct testdef {
+	const char *subfunc_name;
+	u8 *subfunc_array;
+	size_t array_size;
+	testfunc_t test;
+	int facility_bit;
+} testlist[] = {
+	/* SORTL - Facility bit 150 */
+	{ "SORTL", cpu_subfunc.sortl, sizeof(cpu_subfunc.sortl), test_sortl_asm_block, 150 },
+	/* DFLTCC - Facility bit 151 */
+	{ "DFLTCC", cpu_subfunc.dfltcc, sizeof(cpu_subfunc.dfltcc), test_dfltcc_asm_block, 151 },
+};
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	int idx;
+
+	ksft_print_header();
+
+	vm = vm_create(1);
+
+	memset(&cpu_subfunc, 0, sizeof(cpu_subfunc));
+	get_cpu_machine_subfuntions(vm, &cpu_subfunc);
+
+	ksft_set_plan(ARRAY_SIZE(testlist));
+	for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) {
+		if (test_facility(testlist[idx].facility_bit)) {
+			u8 *array = malloc(testlist[idx].array_size);
+
+			testlist[idx].test((u8 (*)[testlist[idx].array_size])array);
+
+			TEST_ASSERT_EQ(memcmp(testlist[idx].subfunc_array,
+					      array, testlist[idx].array_size), 0);
+
+			ksft_test_result_pass("%s\n", testlist[idx].subfunc_name);
+			free(array);
+		} else {
+			ksft_test_result_skip("%s feature is not avaialable\n",
+					      testlist[idx].subfunc_name);
+		}
+	}
+
+	kvm_vm_free(vm);
+	ksft_finished();
+}
-- 
cgit v1.2.3


From ff4cafc585e7554063ae2f301da208559ff9418f Mon Sep 17 00:00:00 2001
From: Hariharan Mari <hari55@linux.ibm.com>
Date: Fri, 23 Aug 2024 15:05:05 +0200
Subject: KVM: s390: selftests: Add regression tests for PRNO, KDSA and KMA
 crypto subfunctions

Extend the existing regression test framework for s390x CPU subfunctions
to include tests for the PRNO (Perform Random Number Operation), KDSA
(Compute Digital Signature Authentication) and KMA (Cipher Message with
Authentication) crypto functions.

The test procedure follows the established pattern:

1. Obtain KVM_S390_VM_CPU_MACHINE_SUBFUNC attribute for the VM.
2. Execute PRNO, KDSA and KMA instructions.
3. Compare KVM-reported results with direct instruction execution results.

Suggested-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: Hariharan Mari <hari55@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20240823130947.38323-3-hari55@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20240823130947.38323-3-hari55@linux.ibm.com>
---
 .../selftests/kvm/s390x/cpumodel_subfuncs_test.c   | 39 ++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
index ee525c841767..96e7ca07220f 100644
--- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -33,6 +33,39 @@ static void get_cpu_machine_subfuntions(struct kvm_vm *vm,
 	TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno);
 }
 
+/* Testing Crypto Perform Random Number Operation (PRNO) CPU subfunction's ASM block */
+static void test_prno_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb93c0000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Cipher Message with Authentication (KMA) CPU subfunction's ASM block */
+static void test_kma_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rrf,0xb9290000,2,4,6,0\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Compute Digital Signature Authentication (KDSA) CPU subfunction's ASM block */
+static void test_kdsa_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb93a0000,0,2\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
 /* Testing Sort Lists (SORTL) CPU subfunction's ASM block */
 static void test_sortl_asm_block(u8 (*query)[32])
 {
@@ -64,6 +97,12 @@ struct testdef {
 	testfunc_t test;
 	int facility_bit;
 } testlist[] = {
+	/* MSA5 - Facility bit 57 */
+	{ "PPNO", cpu_subfunc.ppno, sizeof(cpu_subfunc.ppno), test_prno_asm_block, 57 },
+	/* MSA8 - Facility bit 146 */
+	{ "KMA", cpu_subfunc.kma, sizeof(cpu_subfunc.kma), test_kma_asm_block, 146 },
+	/* MSA9 - Facility bit 155 */
+	{ "KDSA", cpu_subfunc.kdsa, sizeof(cpu_subfunc.kdsa), test_kdsa_asm_block, 155 },
 	/* SORTL - Facility bit 150 */
 	{ "SORTL", cpu_subfunc.sortl, sizeof(cpu_subfunc.sortl), test_sortl_asm_block, 150 },
 	/* DFLTCC - Facility bit 151 */
-- 
cgit v1.2.3


From 7f269dd22d51fc21c4a2a561cb93652c29f9863c Mon Sep 17 00:00:00 2001
From: Hariharan Mari <hari55@linux.ibm.com>
Date: Fri, 23 Aug 2024 15:05:06 +0200
Subject: KVM: s390: selftests: Add regression tests for KMCTR, KMF, KMO and
 PCC crypto subfunctions

Extend the existing regression test framework for s390x CPU subfunctions
to include tests for the KMCTR (Cipher Message with Counter), KMO
(Cipher Message with Output Feedback), KMF (Cipher Message with Cipher
Feedback) and PCC (Perform Cryptographic Computation) crypto functions.

The test procedure follows the established pattern.

Suggested-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: Hariharan Mari <hari55@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20240823130947.38323-4-hari55@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20240823130947.38323-4-hari55@linux.ibm.com>
---
 .../selftests/kvm/s390x/cpumodel_subfuncs_test.c   | 49 ++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
index 96e7ca07220f..28faceeaf089 100644
--- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -33,6 +33,50 @@ static void get_cpu_machine_subfuntions(struct kvm_vm *vm,
 	TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno);
 }
 
+/* Testing Crypto Cipher Message with Counter (KMCTR) CPU subfunction's ASM block */
+static void test_kmctr_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rrf,0xb92d0000,2,4,6,0\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Cipher Message with Cipher Feedback (KMF) CPU subfunction's ASM block */
+static void test_kmf_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb92a0000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Cipher Message with Output Feedback (KMO) CPU subfunction's ASM block */
+static void test_kmo_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb92b0000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Perform Cryptographic Computation (PCC) CPU subfunction's ASM block */
+static void test_pcc_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb92c0000,0,0\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
 /* Testing Crypto Perform Random Number Operation (PRNO) CPU subfunction's ASM block */
 static void test_prno_asm_block(u8 (*query)[16])
 {
@@ -97,6 +141,11 @@ struct testdef {
 	testfunc_t test;
 	int facility_bit;
 } testlist[] = {
+	/* MSA - Facility bit 77 */
+	{ "KMCTR", cpu_subfunc.kmctr, sizeof(cpu_subfunc.kmctr), test_kmctr_asm_block, 77 },
+	{ "KMF", cpu_subfunc.kmf, sizeof(cpu_subfunc.kmf), test_kmf_asm_block, 77 },
+	{ "KMO", cpu_subfunc.kmo, sizeof(cpu_subfunc.kmo), test_kmo_asm_block, 77 },
+	{ "PCC", cpu_subfunc.pcc, sizeof(cpu_subfunc.pcc), test_pcc_asm_block, 77 },
 	/* MSA5 - Facility bit 57 */
 	{ "PPNO", cpu_subfunc.ppno, sizeof(cpu_subfunc.ppno), test_prno_asm_block, 57 },
 	/* MSA8 - Facility bit 146 */
-- 
cgit v1.2.3


From d1dbab52ebc2447c7aa623b8d677135a6b23e406 Mon Sep 17 00:00:00 2001
From: Hariharan Mari <hari55@linux.ibm.com>
Date: Fri, 23 Aug 2024 15:05:07 +0200
Subject: KVM: s390: selftests: Add regression tests for KMAC, KMC, KM, KIMD
 and KLMD crypto subfunctions

Extend the existing regression test framework for s390x CPU subfunctions
to include tests for the KMAC (Compute Message Authentication Code),
KMC (Cipher Message with Chaining), KM (Cipher Message), KIMD (Compute
Intermediate Message Digest) and KLMD (Compute Last Message Digest)
crypto functions.

The test procedure follows the established pattern.

Suggested-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: Hariharan Mari <hari55@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20240823130947.38323-5-hari55@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20240823130947.38323-5-hari55@linux.ibm.com>
---
 .../selftests/kvm/s390x/cpumodel_subfuncs_test.c   | 61 ++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
index 28faceeaf089..fe45fb131583 100644
--- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -33,6 +33,61 @@ static void get_cpu_machine_subfuntions(struct kvm_vm *vm,
 	TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno);
 }
 
+/* Testing Crypto Compute Message Authentication Code (KMAC) CPU subfunction's ASM block */
+static void test_kmac_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb91e0000,0,2\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Cipher Message with Chaining (KMC) CPU subfunction's ASM block */
+static void test_kmc_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb92f0000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Cipher Message (KM) CPU subfunction's ASM block */
+static void test_km_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb92e0000,2,4\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Compute Intermediate Message Digest (KIMD) CPU subfunction's ASM block */
+static void test_kimd_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb93e0000,0,2\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
+/* Testing Crypto Compute Last Message Digest (KLMD) CPU subfunction's ASM block */
+static void test_klmd_asm_block(u8 (*query)[16])
+{
+	asm volatile("	la	%%r1,%[query]\n"
+			"	xgr	%%r0,%%r0\n"
+			"	.insn	rre,0xb93f0000,0,2\n"
+			: [query] "=R" (*query)
+			:
+			: "cc", "r0", "r1");
+}
+
 /* Testing Crypto Cipher Message with Counter (KMCTR) CPU subfunction's ASM block */
 static void test_kmctr_asm_block(u8 (*query)[16])
 {
@@ -141,6 +196,12 @@ struct testdef {
 	testfunc_t test;
 	int facility_bit;
 } testlist[] = {
+	/* MSA - Facility bit 17 */
+	{ "KMAC", cpu_subfunc.kmac, sizeof(cpu_subfunc.kmac), test_kmac_asm_block, 17 },
+	{ "KMC", cpu_subfunc.kmc, sizeof(cpu_subfunc.kmc), test_kmc_asm_block, 17 },
+	{ "KM", cpu_subfunc.km, sizeof(cpu_subfunc.km), test_km_asm_block, 17 },
+	{ "KIMD", cpu_subfunc.kimd, sizeof(cpu_subfunc.kimd), test_kimd_asm_block, 17 },
+	{ "KLMD", cpu_subfunc.klmd, sizeof(cpu_subfunc.klmd), test_klmd_asm_block, 17 },
 	/* MSA - Facility bit 77 */
 	{ "KMCTR", cpu_subfunc.kmctr, sizeof(cpu_subfunc.kmctr), test_kmctr_asm_block, 77 },
 	{ "KMF", cpu_subfunc.kmf, sizeof(cpu_subfunc.kmf), test_kmf_asm_block, 77 },
-- 
cgit v1.2.3


From 75ec613efa257a43663232b2aa46d1737395bb73 Mon Sep 17 00:00:00 2001
From: Hariharan Mari <hari55@linux.ibm.com>
Date: Fri, 23 Aug 2024 15:05:08 +0200
Subject: KVM: s390: selftests: Add regression tests for PLO subfunctions

Extend the existing regression test framework for s390x CPU subfunctions
to include tests for the Perform Locked Operation (PLO) subfunctions.

PLO was introduced in the very first 64-bit machine generation.
Hence it is assumed PLO is always installed in the Z Arch.
The test procedure follows the established pattern.

Suggested-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: Hariharan Mari <hari55@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20240823130947.38323-6-hari55@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20240823130947.38323-6-hari55@linux.ibm.com>
---
 .../selftests/kvm/s390x/cpumodel_subfuncs_test.c   | 32 ++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
index fe45fb131583..222ba1cc3cac 100644
--- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -19,6 +19,8 @@
 
 #include "kvm_util.h"
 
+#define PLO_FUNCTION_MAX 256
+
 /* Query available CPU subfunctions */
 struct kvm_s390_vm_cpu_subfunc cpu_subfunc;
 
@@ -33,6 +35,31 @@ static void get_cpu_machine_subfuntions(struct kvm_vm *vm,
 	TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno);
 }
 
+static inline int plo_test_bit(unsigned char nr)
+{
+	unsigned long function = nr | 0x100;
+	int cc;
+
+	asm volatile("	lgr	0,%[function]\n"
+			/* Parameter registers are ignored for "test bit" */
+			"	plo	0,0,0,0(0)\n"
+			"	ipm	%0\n"
+			"	srl	%0,28\n"
+			: "=d" (cc)
+			: [function] "d" (function)
+			: "cc", "0");
+	return cc == 0;
+}
+
+/* Testing Perform Locked Operation (PLO) CPU subfunction's ASM block */
+static void test_plo_asm_block(u8 (*query)[32])
+{
+	for (int i = 0; i < PLO_FUNCTION_MAX; ++i) {
+		if (plo_test_bit(i))
+			(*query)[i >> 3] |= 0x80 >> (i & 7);
+	}
+}
+
 /* Testing Crypto Compute Message Authentication Code (KMAC) CPU subfunction's ASM block */
 static void test_kmac_asm_block(u8 (*query)[16])
 {
@@ -196,6 +223,11 @@ struct testdef {
 	testfunc_t test;
 	int facility_bit;
 } testlist[] = {
+	/*
+	 * PLO was introduced in the very first 64-bit machine generation.
+	 * Hence it is assumed PLO is always installed in Z Arch.
+	 */
+	{ "PLO", cpu_subfunc.plo, sizeof(cpu_subfunc.plo), test_plo_asm_block, 1 },
 	/* MSA - Facility bit 17 */
 	{ "KMAC", cpu_subfunc.kmac, sizeof(cpu_subfunc.kmac), test_kmac_asm_block, 17 },
 	{ "KMC", cpu_subfunc.kmc, sizeof(cpu_subfunc.kmc), test_kmc_asm_block, 17 },
-- 
cgit v1.2.3


From ad0558f3883130954ca724697f2d19aef93967b3 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 7 Oct 2024 10:10:55 +0200
Subject: selftests/nolibc: start qemu with 1 GiB of memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recently the loongarch defconfig stopped working with the default 128 MiB
of memory. The VM just spins indefinitely.
Increasing the available memory to 1 GiB, similar to s390, fixes the
issue. To avoid having to do this for each architecture on its own,
proactively apply it to all architectures.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://lore.kernel.org/r/20241007-nolibc-qemu-mem-v1-1-c1c2f9acd0f8@linutronix.de
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/testing/selftests/nolibc/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 8de98ea7af80..e92e0b885861 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -130,9 +130,9 @@ QEMU_ARGS_ppc        = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIB
 QEMU_ARGS_ppc64      = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_ppc64le    = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_riscv      = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS_s390       = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_s390       = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_loongarch  = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS            = $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA)
+QEMU_ARGS            = -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA)
 
 # OUTPUT is only set when run from the main makefile, otherwise
 # it defaults to this nolibc directory.
-- 
cgit v1.2.3


From 716fa7dadf116ec4a27f56558b2a5bdd7e8decab Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Oct 2024 12:13:35 +0200
Subject: selftests/bpf: Extend netkit tests to validate skb meta data

Add a small netkit test to validate skb mark and priority under the
default scrubbing as well as with mark and priority scrubbing off.

  # ./vmtest.sh -- ./test_progs -t netkit
  [...]
  ./test_progs -t netkit
  [    1.419662] tsc: Refined TSC clocksource calibration: 3407.993 MHz
  [    1.420151] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fcd52370, max_idle_ns: 440795242006 ns
  [    1.420897] clocksource: Switched to clocksource tsc
  [    1.447996] bpf_testmod: loading out-of-tree module taints kernel.
  [    1.448447] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
  #357     tc_netkit_basic:OK
  #358     tc_netkit_device:OK
  #359     tc_netkit_multi_links:OK
  #360     tc_netkit_multi_opts:OK
  #361     tc_netkit_neigh_links:OK
  #362     tc_netkit_pkt_type:OK
  #363     tc_netkit_scrub:OK
  Summary: 7/0 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20241004101335.117711-5-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 94 +++++++++++++++++++---
 tools/testing/selftests/bpf/progs/test_tc_link.c   | 12 +++
 2 files changed, 97 insertions(+), 9 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
index b9135720024c..151a4210028f 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
@@ -14,7 +14,9 @@
 #include "netlink_helpers.h"
 #include "tc_helpers.h"
 
-#define ICMP_ECHO 8
+#define MARK		42
+#define PRIO		0xeb9f
+#define ICMP_ECHO	8
 
 struct icmphdr {
 	__u8		type;
@@ -33,7 +35,7 @@ struct iplink_req {
 };
 
 static int create_netkit(int mode, int policy, int peer_policy, int *ifindex,
-			 bool same_netns)
+			 bool same_netns, int scrub, int peer_scrub)
 {
 	struct rtnl_handle rth = { .fd = -1 };
 	struct iplink_req req = {};
@@ -58,6 +60,8 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex,
 	data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA);
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_POLICY, policy);
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_POLICY, peer_policy);
+	addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub);
+	addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub);
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
 	addattr_nest_end(&req.n, data);
 	addattr_nest_end(&req.n, linkinfo);
@@ -118,9 +122,9 @@ static void destroy_netkit(void)
 
 static int __send_icmp(__u32 dest)
 {
+	int sock, ret, mark = MARK, prio = PRIO;
 	struct sockaddr_in addr;
 	struct icmphdr icmp;
-	int sock, ret;
 
 	ret = write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0");
 	if (!ASSERT_OK(ret, "write_sysctl(net.ipv4.ping_group_range)"))
@@ -135,6 +139,15 @@ static int __send_icmp(__u32 dest)
 	if (!ASSERT_OK(ret, "setsockopt(SO_BINDTODEVICE)"))
 		goto out;
 
+	ret = setsockopt(sock, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
+	if (!ASSERT_OK(ret, "setsockopt(SO_MARK)"))
+		goto out;
+
+	ret = setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
+			 &prio, sizeof(prio));
+	if (!ASSERT_OK(ret, "setsockopt(SO_PRIORITY)"))
+		goto out;
+
 	memset(&addr, 0, sizeof(addr));
 	addr.sin_family = AF_INET;
 	addr.sin_addr.s_addr = htonl(dest);
@@ -171,7 +184,8 @@ void serial_test_tc_netkit_basic(void)
 	int err, ifindex;
 
 	err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false);
+			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -285,7 +299,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false);
+			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -413,7 +428,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false);
+			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -527,7 +543,8 @@ void serial_test_tc_netkit_device(void)
 	int err, ifindex, ifindex2;
 
 	err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, true);
+			    &ifindex, true, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -638,7 +655,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false);
+			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -715,7 +733,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode)
 	struct bpf_link *link;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, true);
+			    &ifindex, true, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT);
 	if (err)
 		return;
 
@@ -779,3 +798,60 @@ void serial_test_tc_netkit_pkt_type(void)
 	serial_test_tc_netkit_pkt_type_mode(NETKIT_L2);
 	serial_test_tc_netkit_pkt_type_mode(NETKIT_L3);
 }
+
+static void serial_test_tc_netkit_scrub_type(int scrub)
+{
+	LIBBPF_OPTS(bpf_netkit_opts, optl);
+	struct test_tc_link *skel;
+	struct bpf_link *link;
+	int err, ifindex;
+
+	err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS,
+			    &ifindex, false, scrub, scrub);
+	if (err)
+		return;
+
+	skel = test_tc_link__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc8,
+		  BPF_NETKIT_PRIMARY), 0, "tc8_attach_type");
+
+	err = test_tc_link__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0);
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0);
+
+	ASSERT_EQ(skel->bss->seen_tc8, false, "seen_tc8");
+
+	link = bpf_program__attach_netkit(skel->progs.tc8, ifindex, &optl);
+	if (!ASSERT_OK_PTR(link, "link_attach"))
+		goto cleanup;
+
+	skel->links.tc8 = link;
+
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1);
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0);
+
+	tc_skel_reset_all_seen(skel);
+	ASSERT_EQ(send_icmp(), 0, "icmp_pkt");
+
+	ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8");
+	ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark");
+	ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio");
+cleanup:
+	test_tc_link__destroy(skel);
+
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0);
+	assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0);
+	destroy_netkit();
+}
+
+void serial_test_tc_netkit_scrub(void)
+{
+	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT);
+	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c
index ab3eae3d6af8..10d825928499 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_link.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_link.c
@@ -18,6 +18,7 @@ bool seen_tc4;
 bool seen_tc5;
 bool seen_tc6;
 bool seen_tc7;
+bool seen_tc8;
 
 bool set_type;
 
@@ -25,6 +26,8 @@ bool seen_eth;
 bool seen_host;
 bool seen_mcast;
 
+int mark, prio;
+
 SEC("tc/ingress")
 int tc1(struct __sk_buff *skb)
 {
@@ -100,3 +103,12 @@ out:
 	seen_tc7 = true;
 	return TCX_PASS;
 }
+
+SEC("tc/egress")
+int tc8(struct __sk_buff *skb)
+{
+	seen_tc8 = true;
+	mark = skb->mark;
+	prio = skb->priority;
+	return TCX_PASS;
+}
-- 
cgit v1.2.3


From 19090f0306f1748980596c6c71f1c4b128639cff Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn@rivosinc.com>
Date: Fri, 27 Sep 2024 15:13:53 +0200
Subject: selftests: bpf: Add missing per-arch include path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prog_tests programs do not include the per-arch tools include
path, e.g. tools/arch/riscv/include. Some architectures depend on those
files to build properly.

Include tools/arch/$(SRCARCH)/include in the selftests bpf build.

Fixes: 6d74d178fe6e ("tools: Add riscv barrier implementation")
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240927131355.350918-2-bjorn@kernel.org
---
 tools/testing/selftests/bpf/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e295e3df5ec6..28a76baa854d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -10,6 +10,7 @@ TOOLSDIR := $(abspath ../../..)
 LIBDIR := $(TOOLSDIR)/lib
 BPFDIR := $(LIBDIR)/bpf
 TOOLSINCDIR := $(TOOLSDIR)/include
+TOOLSARCHINCDIR := $(TOOLSDIR)/arch/$(SRCARCH)/include
 BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
 APIDIR := $(TOOLSINCDIR)/uapi
 ifneq ($(O),)
@@ -44,7 +45,7 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic					\
 	  -Wall -Werror -fno-omit-frame-pointer				\
 	  $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS)			\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
-	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
+	  -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT)
 LDFLAGS += $(SAN_LDFLAGS)
 LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread
 
-- 
cgit v1.2.3


From 3c591de2854381e313ec149bc1bbd8360f9ed53b Mon Sep 17 00:00:00 2001
From: Eric Long <i@hack3r.moe>
Date: Wed, 2 Oct 2024 14:25:07 +0800
Subject: selftests/bpf: Test linking with duplicate extern functions

Previously, when multiple BPF object files referencing the same extern
function (usually a kfunc) were statically linked using `bpftool gen
object`, libbpf tried to get the nonexistent size of BTF_KIND_FUNC_PROTO
and failed. This test ensures it is fixed.

Signed-off-by: Eric Long <i@hack3r.moe>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241002-libbpf-dup-extern-funcs-v4-2-560eb460ff90@hack3r.moe
---
 tools/testing/selftests/bpf/progs/linked_funcs1.c | 8 ++++++++
 tools/testing/selftests/bpf/progs/linked_funcs2.c | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c
index cc79dddac182..049a1f78de3f 100644
--- a/tools/testing/selftests/bpf/progs/linked_funcs1.c
+++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c
@@ -63,6 +63,8 @@ extern int set_output_val2(int x);
 /* here we'll force set_output_ctx2() to be __hidden in the final obj file */
 __hidden extern void set_output_ctx2(__u64 *ctx);
 
+void *bpf_cast_to_kern_ctx(void *obj) __ksym;
+
 SEC("?raw_tp/sys_enter")
 int BPF_PROG(handler1, struct pt_regs *regs, long id)
 {
@@ -86,4 +88,10 @@ int BPF_PROG(handler1, struct pt_regs *regs, long id)
 	return 0;
 }
 
+/* Generate BTF FUNC record and test linking with duplicate extern functions */
+void kfunc_gen1(void)
+{
+	bpf_cast_to_kern_ctx(0);
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c
index 942cc5526ddf..96850759fd8d 100644
--- a/tools/testing/selftests/bpf/progs/linked_funcs2.c
+++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c
@@ -63,6 +63,8 @@ extern int set_output_val1(int x);
 /* here we'll force set_output_ctx1() to be __hidden in the final obj file */
 __hidden extern void set_output_ctx1(__u64 *ctx);
 
+void *bpf_cast_to_kern_ctx(void *obj) __ksym;
+
 SEC("?raw_tp/sys_enter")
 int BPF_PROG(handler2, struct pt_regs *regs, long id)
 {
@@ -86,4 +88,10 @@ int BPF_PROG(handler2, struct pt_regs *regs, long id)
 	return 0;
 }
 
+/* Generate BTF FUNC record and test linking with duplicate extern functions */
+void kfunc_gen2(void)
+{
+	bpf_cast_to_kern_ctx(0);
+}
+
 char LICENSE[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 5bf1557e3d6a69113649d831276ea2f97585fc33 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Thu, 3 Oct 2024 14:03:07 -0700
Subject: selftests/bpf: Fix backtrace printing for selftests crashes

test_progs uses glibc specific functions backtrace() and
backtrace_symbols_fd() to print backtrace in case of SIGSEGV.

Recent commit (see fixes) updated test_progs.c to define stub versions
of the same functions with attribute "weak" in order to allow linking
test_progs against musl libc. Unfortunately this broke the backtrace
handling for glibc builds.

As it turns out, glibc defines backtrace() and backtrace_symbols_fd()
as weak:

  $ llvm-readelf --symbols /lib64/libc.so.6 \
     | grep -P '( backtrace_symbols_fd| backtrace)$'
  4910: 0000000000126b40   161 FUNC    WEAK   DEFAULT    16 backtrace
  6843: 0000000000126f90   852 FUNC    WEAK   DEFAULT    16 backtrace_symbols_fd

So does test_progs:

 $ llvm-readelf --symbols test_progs \
    | grep -P '( backtrace_symbols_fd| backtrace)$'
  2891: 00000000006ad190    15 FUNC    WEAK   DEFAULT    13 backtrace
 11215: 00000000006ad1a0    41 FUNC    WEAK   DEFAULT    13 backtrace_symbols_fd

In such a situation the dynamic linker is not obliged to favour the glibc
implementation over the one defined in test_progs.

Compiling with the following simple modification to test_progs.c
demonstrates the issue:

  $ git diff
  ...
  \--- a/tools/testing/selftests/bpf/test_progs.c
  \+++ b/tools/testing/selftests/bpf/test_progs.c
  \@@ -1817,6 +1817,7 @@ int main(int argc, char **argv)
          if (err)
                  return err;

  +       *(int *)0xdeadbeef  = 42;
          err = cd_flavor_subdir(argv[0]);
          if (err)
                  return err;

  $ ./test_progs
  [0]: Caught signal #11!
  Stack trace:
  <backtrace not supported>
  Segmentation fault (core dumped)

Resolve this by hiding stub definitions behind __GLIBC__ macro check
instead of using "weak" attribute.

Fixes: c9a83e76b5a9 ("selftests/bpf: Fix compile if backtrace support missing in libc")
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Tony Ambardar <tony.ambardar@gmail.com>
Reviewed-by: Tony Ambardar <tony.ambardar@gmail.com>
Acked-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/bpf/20241003210307.3847907-1-eddyz87@gmail.com
---
 tools/testing/selftests/bpf/test_progs.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 7846f7f98908..7421874380c2 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -20,11 +20,13 @@
 
 #include "network_helpers.h"
 
+/* backtrace() and backtrace_symbols_fd() are glibc specific,
+ * use header file when glibc is available and provide stub
+ * implementations when another libc implementation is used.
+ */
 #ifdef __GLIBC__
 #include <execinfo.h> /* backtrace */
-#endif
-
-/* Default backtrace funcs if missing at link */
+#else
 __weak int backtrace(void **buffer, int size)
 {
 	return 0;
@@ -34,6 +36,7 @@ __weak void backtrace_symbols_fd(void *const *buffer, int size, int fd)
 {
 	dprintf(fd, "<backtrace not supported>\n");
 }
+#endif /*__GLIBC__ */
 
 int env_verbosity = 0;
 
-- 
cgit v1.2.3


From bc9b3fb827fceec4e05564d6e668280f4470ab5b Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Tue, 8 Oct 2024 16:50:57 +0200
Subject: selftests/bpf: add missing header include for htons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Including the network_helpers.h header in tests can lead to the following
build error:

./network_helpers.h: In function ‘csum_tcpudp_magic’:
./network_helpers.h:116:14: error: implicit declaration of function \
  ‘htons’ [-Werror=implicit-function-declaration]
  116 |         s += htons(proto + len);

The error is avoided in many cases thanks to some other headers included
earlier and bringing in arpa/inet.h (ie: test_progs.h).

Make sure that test_progs build success does not depend on header ordering
by adding the missing header include in network_helpers.h

Fixes: f6642de0c3e9 ("selftests/bpf: Add csum helpers")
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241008-network_helpers_fix-v1-1-2c2ae03df7ef@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/network_helpers.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index c72c16e1aff8..5764155b6d25 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __NETWORK_HELPERS_H
 #define __NETWORK_HELPERS_H
+#include <arpa/inet.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <linux/types.h>
-- 
cgit v1.2.3


From 2e82c0d4562a4b8292af83577b70af888a93d16d Mon Sep 17 00:00:00 2001
From: Joshua Hahn <joshua.hahn6@gmail.com>
Date: Wed, 2 Oct 2024 11:47:17 -0700
Subject: cgroup/rstat: Selftests for niced CPU statistics

Creates a cgroup with a single nice CPU hog process running.
fork() is called to generate the nice process because un-nicing is
not possible (see man nice(3)). If fork() was not used to generate
the CPU hog, we would run the rest of the cgroup selftest suite as a
nice process.

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_cpu.c | 75 +++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index dad2ed82f3ef..201ce14cb422 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -8,6 +8,7 @@
 #include <pthread.h>
 #include <stdio.h>
 #include <time.h>
+#include <unistd.h>
 
 #include "../kselftest.h"
 #include "cgroup_util.h"
@@ -229,6 +230,79 @@ cleanup:
 	return ret;
 }
 
+/*
+ * Creates a nice process that consumes CPU and checks that the elapsed
+ * usertime in the cgroup is close to the expected time.
+ */
+static int test_cpucg_nice(const char *root)
+{
+	int ret = KSFT_FAIL;
+	int status;
+	long user_usec, nice_usec;
+	long usage_seconds = 2;
+	long expected_nice_usec = usage_seconds * USEC_PER_SEC;
+	char *cpucg;
+	pid_t pid;
+
+	cpucg = cg_name(root, "cpucg_test");
+	if (!cpucg)
+		goto cleanup;
+
+	if (cg_create(cpucg))
+		goto cleanup;
+
+	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+	nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
+	if (nice_usec == -1)
+		ret = KSFT_SKIP;
+	if (user_usec != 0 || nice_usec != 0)
+		goto cleanup;
+
+	/*
+	 * We fork here to create a new process that can be niced without
+	 * polluting the nice value of other selftests
+	 */
+	pid = fork();
+	if (pid < 0) {
+		goto cleanup;
+	} else if (pid == 0) {
+		struct cpu_hog_func_param param = {
+			.nprocs = 1,
+			.ts = {
+				.tv_sec = usage_seconds,
+				.tv_nsec = 0,
+			},
+			.clock_type = CPU_HOG_CLOCK_PROCESS,
+		};
+		char buf[64];
+		snprintf(buf, sizeof(buf), "%d", getpid());
+		if (cg_write(cpucg, "cgroup.procs", buf))
+			goto cleanup;
+
+		/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
+		nice(1);
+		hog_cpus_timed(cpucg, param);
+		exit(0);
+	} else {
+		waitpid(pid, &status, 0);
+		if (!WIFEXITED(status))
+			goto cleanup;
+
+		user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+		nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
+		if (!values_close(nice_usec, expected_nice_usec, 1))
+			goto cleanup;
+
+		ret = KSFT_PASS;
+	}
+
+cleanup:
+	cg_destroy(cpucg);
+	free(cpucg);
+
+	return ret;
+}
+
 static int
 run_cpucg_weight_test(
 		const char *root,
@@ -686,6 +760,7 @@ struct cpucg_test {
 } tests[] = {
 	T(test_cpucg_subtree_control),
 	T(test_cpucg_stats),
+	T(test_cpucg_nice),
 	T(test_cpucg_weight_overprovisioned),
 	T(test_cpucg_weight_underprovisioned),
 	T(test_cpucg_nested_weight_overprovisioned),
-- 
cgit v1.2.3


From 693fe954d61d4696aa06f631fd0bce0b3b3e8027 Mon Sep 17 00:00:00 2001
From: Mahe Tardy <mahe.tardy@gmail.com>
Date: Mon, 7 Oct 2024 09:59:58 +0000
Subject: selftests/bpf: add tcx netns cookie tests

Add a netns cookie test that verifies the helper is now supported and works
in the context of tc programs.

Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
Link: https://lore.kernel.org/r/20241007095958.97442-2-mahe.tardy@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/netns_cookie.c        | 29 ++++++++++++++++------
 .../selftests/bpf/progs/netns_cookie_prog.c        | 10 ++++++++
 2 files changed, 32 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c
index 71d8f3ba7d6b..ac3c3c097c0e 100644
--- a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c
@@ -8,12 +8,16 @@
 #define SO_NETNS_COOKIE 71
 #endif
 
+#define loopback 1
+
 static int duration;
 
 void test_netns_cookie(void)
 {
+	LIBBPF_OPTS(bpf_prog_attach_opts, opta);
+	LIBBPF_OPTS(bpf_prog_detach_opts, optd);
 	int server_fd = -1, client_fd = -1, cgroup_fd = -1;
-	int err, val, ret, map, verdict;
+	int err, val, ret, map, verdict, tc_fd;
 	struct netns_cookie_prog *skel;
 	uint64_t cookie_expected_value;
 	socklen_t vallen = sizeof(cookie_expected_value);
@@ -38,36 +42,47 @@ void test_netns_cookie(void)
 	if (!ASSERT_OK(err, "prog_attach"))
 		goto done;
 
+	tc_fd = bpf_program__fd(skel->progs.get_netns_cookie_tcx);
+	err = bpf_prog_attach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &opta);
+	if (!ASSERT_OK(err, "prog_attach"))
+		goto done;
+
 	server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
 	if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno))
-		goto done;
+		goto cleanup_tc;
 
 	client_fd = connect_to_fd(server_fd, 0);
 	if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno))
-		goto done;
+		goto cleanup_tc;
 
 	ret = send(client_fd, send_msg, sizeof(send_msg), 0);
 	if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret))
-		goto done;
+		goto cleanup_tc;
 
 	err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies),
 				  &client_fd, &val);
 	if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)"))
-		goto done;
+		goto cleanup_tc;
 
 	err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE,
 			 &cookie_expected_value, &vallen);
 	if (!ASSERT_OK(err, "getsockopt"))
-		goto done;
+		goto cleanup_tc;
 
 	ASSERT_EQ(val, cookie_expected_value, "cookie_value");
 
 	err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies),
 				  &client_fd, &val);
 	if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)"))
-		goto done;
+		goto cleanup_tc;
 
 	ASSERT_EQ(val, cookie_expected_value, "cookie_value");
+	ASSERT_EQ(skel->bss->tcx_init_netns_cookie, cookie_expected_value, "cookie_value");
+	ASSERT_EQ(skel->bss->tcx_netns_cookie, cookie_expected_value, "cookie_value");
+
+cleanup_tc:
+	err = bpf_prog_detach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &optd);
+	ASSERT_OK(err, "prog_detach");
 
 done:
 	if (server_fd != -1)
diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c
index aeff3a4f9287..c6edf8dbefeb 100644
--- a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c
+++ b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c
@@ -27,6 +27,8 @@ struct {
 	__type(value, __u64);
 } sock_map SEC(".maps");
 
+int tcx_init_netns_cookie, tcx_netns_cookie;
+
 SEC("sockops")
 int get_netns_cookie_sockops(struct bpf_sock_ops *ctx)
 {
@@ -81,4 +83,12 @@ int get_netns_cookie_sk_msg(struct sk_msg_md *msg)
 	return 1;
 }
 
+SEC("tcx/ingress")
+int get_netns_cookie_tcx(struct __sk_buff *skb)
+{
+	tcx_init_netns_cookie = bpf_get_netns_cookie(NULL);
+	tcx_netns_cookie = bpf_get_netns_cookie(skb);
+	return TCX_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 870dd51117cb901f560ea5a85b9876956e6f35b8 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 7 Oct 2024 18:26:05 +0200
Subject: selftests: mlxsw: sch_red_ets: Increase required backlog

Backlog fluctuates on Spectrum-4 much more than on <4. Increasing the
desired backlog seems to help, as the constant fluctuations do not overlap
into the territory where packets are marked.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Link: https://patch.msgid.link/0821fb3aa8bb6a6c0d3000baab04995517c9a0cc.1728316370.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
index 8ecddafa79b3..576067b207a8 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
@@ -20,8 +20,8 @@ source sch_red_core.sh
 # $BACKLOG2 are far enough not to overlap, so that we can assume that if we do
 # see (do not see) marking, it is actually due to the configuration of that one
 # TC, and not due to configuration of the other TC leaking over.
-BACKLOG1=200000
-BACKLOG2=500000
+BACKLOG1=400000
+BACKLOG2=1000000
 
 install_root_qdisc()
 {
@@ -35,7 +35,7 @@ install_qdisc_tc0()
 
 	tc qdisc add dev $swp3 parent 10:8 handle 108: red \
 	   limit 1000000 min $BACKLOG1 max $((BACKLOG1 + 1)) \
-	   probability 1.0 avpkt 8000 burst 38 "${args[@]}"
+	   probability 1.0 avpkt 8000 burst 51 "${args[@]}"
 }
 
 install_qdisc_tc1()
@@ -44,7 +44,7 @@ install_qdisc_tc1()
 
 	tc qdisc add dev $swp3 parent 10:7 handle 107: red \
 	   limit 1000000 min $BACKLOG2 max $((BACKLOG2 + 1)) \
-	   probability 1.0 avpkt 8000 burst 63 "${args[@]}"
+	   probability 1.0 avpkt 8000 burst 126 "${args[@]}"
 }
 
 install_qdisc()
-- 
cgit v1.2.3


From 8fb5b60734564473c72ca85d617cd685738948f0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 7 Oct 2024 18:26:06 +0200
Subject: selftests: mlxsw: sch_red_core: Increase backlog size tolerance

Backlog fluctuates on Spectrum-4 much more than on <4. In practice we can
sample queue depth values going from about -12% to about +7% of the
configured RED limit. The test which checks the queue size has a limit of
+-10%, and as a result often fails. We attempted to fix the issue by
busywaiting for several seconds hoping to get within the bounds, but that
still proved to be too noisy (or the wait time would be impractically
long). Unfortunately we have to bump the value tolerance from 10% to 15%,
which we do in this patch.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Link: https://patch.msgid.link/f54950df2a8fcba46c3ddc1053376352fa2e592b.1728316370.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
index 299e06a5808c..a25a15eb6d31 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -532,10 +532,11 @@ do_red_test()
 	check_fail $? "Traffic went into backlog instead of being early-dropped"
 	pct=$(check_marking get_nmarked $vlan "== 0")
 	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+	backlog=$(get_qdisc_backlog $vlan)
 	local diff=$((limit - backlog))
 	pct=$((100 * diff / limit))
-	((-10 <= pct && pct <= 10))
-	check_err $? "backlog $backlog / $limit expected <= 10% distance"
+	((-15 <= pct && pct <= 15))
+	check_err $? "backlog $backlog / $limit expected <= 15% distance"
 	log_test "TC $((vlan - 10)): RED backlog > limit"
 
 	stop_traffic
-- 
cgit v1.2.3


From 787f148cec340114dc22c49d8b045ff3ff0adad6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 7 Oct 2024 18:26:07 +0200
Subject: selftests: mlxsw: sch_red_core: Sleep before querying queue depth

The qdisc stats are taken from the port's periodic HW stats, which are
updated once a second. We try to accommodate the latency by using busywait
in build_backlog().

The issue with that seems to be that when do_mark_test() builds the backlog,
it makes the decision whether to send more packets based on the first
instance of the queue depth stat exceeding the current value, when in fact
more traffic is on the way and the queue depth would increase further. This
leads to failures in TC 1 of mark-mirror test, where we see the following
failure:

TEST: TC 0: marked packets mirror'd                                 [ OK ]
TEST: TC 1: marked packets mirror'd                                 [FAIL]
        Spurious packets (1680 -> 2290) observed without buffer pressure

Fix by waiting for the full second before reading the queue depth for the
first time, to make sure it reflects all in-flight traffic.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Link: https://patch.msgid.link/321dcf8b3e9a1f0766429c8cf3e3f1746f1bc375.1728316370.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
index a25a15eb6d31..b1ff395b3880 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -372,6 +372,7 @@ build_backlog()
 	local i=0
 
 	while :; do
+		sleep 1
 		local cur=$(busywait 1100 until_counter_is "> $cur" \
 					    get_qdisc_backlog $vlan)
 		local diff=$((size - cur))
-- 
cgit v1.2.3


From 7049166e51bc2b854935eda72f7fb8c3f4492f6f Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 7 Oct 2024 18:26:08 +0200
Subject: selftests: mlxsw: sch_red_core: Send more packets for drop tests

This test works by injecting a couple of packets into a port with a maxed-out
queue and checking whether a corresponding number of packets were dropped. This
has worked well on Spectrum<4, but on Spectrum-4 it has been noisy. This
is in line with the observation that on Spectrum-4, queue size tends to
fluctuate more. A handful of packets could then still be accepted to the
queue even though it was nominally full just recently.

In order to accommodate this behavior, send many more packets. The buffer
can fit N extra packets, but not N% packets. This therefore allows us to
set wider absolute margins, while actually narrowing them relatively.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/abc869b9f6003d400d6293ddd5edb2f4517f44d5.1728316370.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../testing/selftests/drivers/net/mlxsw/sch_red_core.sh  | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
index b1ff395b3880..316444389c4e 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -653,20 +653,22 @@ do_drop_test()
 	build_backlog $vlan $((3 * limit / 2)) udp >/dev/null
 
 	base=$($fetch_counter)
-	send_packets $vlan udp 11
+	send_packets $vlan udp 100
 
-	now=$(busywait 1100 until_counter_is ">= $((base + 10))" $fetch_counter)
-	check_err $? "Dropped packets not observed: 11 expected, $((now - base)) seen"
+	now=$(busywait 1100 until_counter_is ">= $((base + 95))" $fetch_counter)
+	check_err $? "${trigger}ped packets not observed: 100 expected, $((now - base)) seen"
 
 	# When no extra traffic is injected, there should be no mirroring.
-	busywait 1100 until_counter_is ">= $((base + 20))" $fetch_counter >/dev/null
+	busywait 1100 until_counter_is ">= $((base + 110))" \
+		 $fetch_counter >/dev/null
 	check_fail $? "Spurious packets observed"
 
 	# When the rule is uninstalled, there should be no mirroring.
 	qevent_rule_uninstall_$subtest
-	send_packets $vlan udp 11
-	busywait 1100 until_counter_is ">= $((base + 20))" $fetch_counter >/dev/null
-	check_fail $? "Spurious packets observed after uninstall"
+	send_packets $vlan udp 100
+	now=$(busywait 1100 until_counter_is ">= $((base + 110))" \
+		       $fetch_counter)
+	check_fail $? "$((now - base)) spurious packets observed after uninstall"
 
 	log_test "TC $((vlan - 10)): ${trigger}ped packets $subtest'd"
 
-- 
cgit v1.2.3


From 501fa2426b5ffbe08c2571b6f5c3f3afa1970aa2 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 7 Oct 2024 18:26:09 +0200
Subject: selftests: mlxsw: sch_red_core: Lower TBF rate

The RED test uses a pair of TBF shapers. The first is there to get a
predictably-sized stream of traffic, and the second to get a 100% saturated chokepoint. To this
chokepoint it injects individual packets. Because the chokepoint is
saturated, these additional packets go straight to the backlog. This allows
the test to check RED behavior across various queue sizes.

The shapers are rated at 1Gbps, for historical reasons (before mlxsw
supported TBF offload, the test used port speed to create the chokepoints).
Machines with a low-power CPU may have trouble consistently generating
1Gbps of traffic, and the test then spuriously fails.

Instead, drop the rate to 200Mbps (Spectrum has a guaranteed shaper rate
granularity of 200Mbps, so anything lower is not guaranteed to work well).
Because that means fewer packets will be mirrored in the ECN-mark test,
adjust the passing condition accordingly.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/c6712f9c5de75ae0bc2ab3d8ea7d92aaaf93af95.1728316370.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
index 316444389c4e..f4c324957dcc 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -137,7 +137,7 @@ h2_create()
 	# Prevent this by adding a shaper which limits the traffic in $h2 to
 	# 1Gbps.
 
-	tc qdisc replace dev $h2 root handle 10: tbf rate 1gbit \
+	tc qdisc replace dev $h2 root handle 10: tbf rate 200mbit \
 		burst 128K limit 1G
 }
 
@@ -199,7 +199,7 @@ switch_create()
 	done
 
 	for intf in $swp3 $swp4; do
-		tc qdisc replace dev $intf root handle 1: tbf rate 1gbit \
+		tc qdisc replace dev $intf root handle 1: tbf rate 200mbit \
 			burst 128K limit 1G
 	done
 
@@ -602,7 +602,7 @@ do_mark_test()
 	# Above limit, everything should be mirrored, we should see lots of
 	# packets.
 	build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01 >/dev/null
-	busywait_for_counter 1100 +10000 \
+	busywait_for_counter 1100 +2500 \
 		 $fetch_counter > /dev/null
 	check_err_fail "$should_fail" $? "ECN-marked packets $subtest'd"
 
-- 
cgit v1.2.3


From b3ea416419c83ba4a042163f17e0fd8bac417f1a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Oct 2024 10:09:57 +0200
Subject: testing: net-drv: add basic shaper test

Leverage a basic/dummy netdevsim implementation to do functional
coverage for the NL interface.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/43092afbf38365c796088bf8fc155e523ab434ae.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/Kconfig                            |   1 +
 drivers/net/netdevsim/ethtool.c                |   2 +
 drivers/net/netdevsim/netdev.c                 |  39 +++
 tools/testing/selftests/drivers/net/Makefile   |   1 +
 tools/testing/selftests/drivers/net/shaper.py  | 461 +++++++++++++++++++++++++
 tools/testing/selftests/net/lib/py/__init__.py |   1 +
 tools/testing/selftests/net/lib/py/ynl.py      |   5 +
 7 files changed, 510 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/shaper.py

(limited to 'tools/testing')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9920b3a68ed1..1fd5acdc73c6 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -641,6 +641,7 @@ config NETDEVSIM
 	depends on PTP_1588_CLOCK_MOCK || PTP_1588_CLOCK_MOCK=n
 	select NET_DEVLINK
 	select PAGE_POOL
+	select NET_SHAPER
 	help
 	  This driver is a developer testing tool and software model that can
 	  be used to test various control path networking APIs, especially
diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c
index 1436905bc106..5fe1eaef99b5 100644
--- a/drivers/net/netdevsim/ethtool.c
+++ b/drivers/net/netdevsim/ethtool.c
@@ -103,8 +103,10 @@ nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch)
 	struct netdevsim *ns = netdev_priv(dev);
 	int err;
 
+	mutex_lock(&dev->lock);
 	err = netif_set_real_num_queues(dev, ch->combined_count,
 					ch->combined_count);
+	mutex_unlock(&dev->lock);
 	if (err)
 		return err;
 
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 017a6102be0a..cad85bb0cf54 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -22,6 +22,7 @@
 #include <net/netdev_queues.h>
 #include <net/page_pool/helpers.h>
 #include <net/netlink.h>
+#include <net/net_shaper.h>
 #include <net/pkt_cls.h>
 #include <net/rtnetlink.h>
 #include <net/udp_tunnel.h>
@@ -475,6 +476,43 @@ static int nsim_stop(struct net_device *dev)
 	return 0;
 }
 
+static int nsim_shaper_set(struct net_shaper_binding *binding,
+			   const struct net_shaper *shaper,
+			   struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static int nsim_shaper_del(struct net_shaper_binding *binding,
+			   const struct net_shaper_handle *handle,
+			   struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static int nsim_shaper_group(struct net_shaper_binding *binding,
+			     int leaves_count,
+			     const struct net_shaper *leaves,
+			     const struct net_shaper *root,
+			     struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static void nsim_shaper_cap(struct net_shaper_binding *binding,
+			    enum net_shaper_scope scope,
+			    unsigned long *flags)
+{
+	*flags = ULONG_MAX;
+}
+
+static const struct net_shaper_ops nsim_shaper_ops = {
+	.set			= nsim_shaper_set,
+	.delete			= nsim_shaper_del,
+	.group			= nsim_shaper_group,
+	.capabilities		= nsim_shaper_cap,
+};
+
 static const struct net_device_ops nsim_netdev_ops = {
 	.ndo_start_xmit		= nsim_start_xmit,
 	.ndo_set_rx_mode	= nsim_set_rx_mode,
@@ -496,6 +534,7 @@ static const struct net_device_ops nsim_netdev_ops = {
 	.ndo_bpf		= nsim_bpf,
 	.ndo_open		= nsim_open,
 	.ndo_stop		= nsim_stop,
+	.net_shaper_ops		= &nsim_shaper_ops,
 };
 
 static const struct net_device_ops nsim_vf_netdev_ops = {
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 39fb97a8c1df..25aec5c081df 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -9,6 +9,7 @@ TEST_PROGS := \
 	ping.py \
 	queues.py \
 	stats.py \
+	shaper.py
 # end of TEST_PROGS
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/shaper.py b/tools/testing/selftests/drivers/net/shaper.py
new file mode 100755
index 000000000000..11310f19bfa0
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/shaper.py
@@ -0,0 +1,461 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true, KsftSkipEx
+from lib.py import EthtoolFamily, NetshaperFamily
+from lib.py import NetDrvEnv
+from lib.py import NlError
+from lib.py import cmd
+
+def get_shapers(cfg, nl_shaper) -> None:
+    try:
+        shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+
+    # Default configuration: no shapers configured.
+    ksft_eq(len(shapers), 0)
+
+def get_caps(cfg, nl_shaper) -> None:
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex}, dump=True)
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+
+    # Each device implementing shaper support must support some
+    # features in at least a scope.
+    ksft_true(len(caps)> 0)
+
+def set_qshapers(cfg, nl_shaper) -> None:
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                 'scope':'queue'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+    if not 'support-bw-max' in caps or not 'support-metric-bps' in caps:
+        raise KsftSkipEx("device does not support queue scope shapers with bw_max and metric bps")
+
+    cfg.queues = True;
+    netnl = EthtoolFamily()
+    channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    if channels['combined-count'] == 0:
+        cfg.rx_type = 'rx'
+        cfg.nr_queues = channels['rx-count']
+    else:
+        cfg.rx_type = 'combined'
+        cfg.nr_queues = channels['combined-count']
+    if cfg.nr_queues < 3:
+        raise KsftSkipEx(f"device does not support enough queues min 3 found {cfg.nr_queues}")
+
+    nl_shaper.set({'ifindex': cfg.ifindex,
+                   'handle': {'scope': 'queue', 'id': 1},
+                   'metric': 'bps',
+                   'bw-max': 10000})
+    nl_shaper.set({'ifindex': cfg.ifindex,
+                   'handle': {'scope': 'queue', 'id': 2},
+                   'metric': 'bps',
+                   'bw-max': 20000})
+
+    # Querying a specific shaper not yet configured must fail.
+    raised = False
+    try:
+        shaper_q0 = nl_shaper.get({'ifindex': cfg.ifindex,
+                                   'handle': {'scope': 'queue', 'id': 0}})
+    except (NlError):
+        raised = True
+    ksft_eq(raised, True)
+
+    shaper_q1 = nl_shaper.get({'ifindex': cfg.ifindex,
+                              'handle': {'scope': 'queue', 'id': 1}})
+    ksft_eq(shaper_q1, {'ifindex': cfg.ifindex,
+                        'parent': {'scope': 'netdev'},
+                        'handle': {'scope': 'queue', 'id': 1},
+                        'metric': 'bps',
+                        'bw-max': 10000})
+
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'metric': 'bps',
+                       'bw-max': 10000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 2},
+                       'metric': 'bps',
+                       'bw-max': 20000}])
+
+def del_qshapers(cfg, nl_shaper) -> None:
+    if not cfg.queues:
+        raise KsftSkipEx("queue shapers not supported by device, skipping delete")
+
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 2}})
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 1}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(len(shapers), 0)
+
+def set_nshapers(cfg, nl_shaper) -> None:
+    # Check required features.
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                  'scope':'netdev'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+    if not 'support-bw-max' in caps or not 'support-metric-bps' in caps:
+        raise KsftSkipEx("device does not support nested netdev scope shapers with weight")
+
+    cfg.netdev = True;
+    nl_shaper.set({'ifindex': cfg.ifindex,
+                   'handle': {'scope': 'netdev', 'id': 0},
+                   'bw-max': 100000})
+
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'handle': {'scope': 'netdev'},
+                       'metric': 'bps',
+                       'bw-max': 100000}])
+
+def del_nshapers(cfg, nl_shaper) -> None:
+    if not cfg.netdev:
+        raise KsftSkipEx("netdev shaper not supported by device, skipping delete")
+
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'netdev'}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(len(shapers), 0)
+
+def basic_groups(cfg, nl_shaper) -> None:
+    if not cfg.netdev:
+        raise KsftSkipEx("netdev shaper not supported by the device")
+    if cfg.nr_queues < 3:
+        raise KsftSkipEx(f"netdev does not have enough queues min 3 reported {cfg.nr_queues}")
+
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                  'scope':'queue'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+    if not 'support-weight' in caps:
+        raise KsftSkipEx("device does not support queue scope shapers with weight")
+
+    node_handle = nl_shaper.group({
+                        'ifindex': cfg.ifindex,
+                        'leaves':[{'handle': {'scope': 'queue', 'id': 1},
+                                   'weight': 1},
+                                  {'handle': {'scope': 'queue', 'id': 2},
+                                   'weight': 2}],
+                         'handle': {'scope':'netdev'},
+                         'metric': 'bps',
+                         'bw-max': 10000})
+    ksft_eq(node_handle, {'ifindex': cfg.ifindex,
+                          'handle': {'scope': 'netdev'}})
+
+    shaper = nl_shaper.get({'ifindex': cfg.ifindex,
+                            'handle': {'scope': 'queue', 'id': 1}})
+    ksft_eq(shaper, {'ifindex': cfg.ifindex,
+                     'parent': {'scope': 'netdev'},
+                     'handle': {'scope': 'queue', 'id': 1},
+                     'weight': 1 })
+
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 2}})
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 1}})
+
+    # Deleting all the leaves shaper does not affect the node one
+    # when the latter has 'netdev' scope.
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(len(shapers), 1)
+
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'netdev'}})
+
+def qgroups(cfg, nl_shaper) -> None:
+    if cfg.nr_queues < 4:
+        raise KsftSkipEx(f"netdev does not have enough queues min 4 reported {cfg.nr_queues}")
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                  'scope':'node'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+    if not 'support-bw-max' in caps or not 'support-metric-bps' in caps:
+        raise KsftSkipEx("device does not support node scope shapers with bw_max and metric bps")
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                  'scope':'queue'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("shapers not supported by the device")
+        raise
+    if not 'support-nesting' in caps or not 'support-weight' in caps or not 'support-metric-bps' in caps:
+            raise KsftSkipEx("device does not support nested queue scope shapers with weight")
+
+    cfg.groups = True;
+    node_handle = nl_shaper.group({
+                   'ifindex': cfg.ifindex,
+                   'leaves':[{'handle': {'scope': 'queue', 'id': 1},
+                              'weight': 3},
+                             {'handle': {'scope': 'queue', 'id': 2},
+                              'weight': 2}],
+                   'handle': {'scope':'node'},
+                   'metric': 'bps',
+                   'bw-max': 10000})
+    node_id = node_handle['handle']['id']
+
+    shaper = nl_shaper.get({'ifindex': cfg.ifindex,
+                            'handle': {'scope': 'queue', 'id': 1}})
+    ksft_eq(shaper, {'ifindex': cfg.ifindex,
+                     'parent': {'scope': 'node', 'id': node_id},
+                     'handle': {'scope': 'queue', 'id': 1},
+                     'weight': 3})
+    shaper = nl_shaper.get({'ifindex': cfg.ifindex,
+                            'handle': {'scope': 'node', 'id': node_id}})
+    ksft_eq(shaper, {'ifindex': cfg.ifindex,
+                     'handle': {'scope': 'node', 'id': node_id},
+                     'parent': {'scope': 'netdev'},
+                     'metric': 'bps',
+                     'bw-max': 10000})
+
+    # Grouping to a specified, not existing node scope shaper must fail
+    raised = False
+    try:
+        nl_shaper.group({
+                   'ifindex': cfg.ifindex,
+                   'leaves':[{'handle': {'scope': 'queue', 'id': 3},
+                              'weight': 3}],
+                   'handle': {'scope':'node', 'id': node_id + 1},
+                   'metric': 'bps',
+                   'bw-max': 10000})
+
+    except (NlError):
+        raised = True
+    ksft_eq(raised, True)
+
+    # Add to an existing node
+    node_handle = nl_shaper.group({
+                   'ifindex': cfg.ifindex,
+                   'leaves':[{'handle': {'scope': 'queue', 'id': 3},
+                              'weight': 4}],
+                   'handle': {'scope':'node', 'id': node_id}})
+    ksft_eq(node_handle, {'ifindex': cfg.ifindex,
+                          'handle': {'scope': 'node', 'id': node_id}})
+
+    shaper = nl_shaper.get({'ifindex': cfg.ifindex,
+                            'handle': {'scope': 'queue', 'id': 3}})
+    ksft_eq(shaper, {'ifindex': cfg.ifindex,
+                     'parent': {'scope': 'node', 'id': node_id},
+                     'handle': {'scope': 'queue', 'id': 3},
+                     'weight': 4})
+
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 2}})
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 1}})
+
+    # Deleting a non empty node will move the leaves downstream.
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'node', 'id': node_id}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 3},
+                       'weight': 4}])
+
+    # Finish and verify the complete cleanup.
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'queue', 'id': 3}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(len(shapers), 0)
+
+def delegation(cfg, nl_shaper) -> None:
+    if not cfg.groups:
+        raise KsftSkipEx("device does not support node scope")
+    try:
+        caps = nl_shaper.cap_get({'ifindex': cfg.ifindex,
+                                  'scope':'node'})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftSkipEx("node scope shapers not supported by the device")
+        raise
+    if not 'support-nesting' in caps:
+        raise KsftSkipEx("device does not support node scope shapers nesting")
+
+    node_handle = nl_shaper.group({
+                   'ifindex': cfg.ifindex,
+                   'leaves':[{'handle': {'scope': 'queue', 'id': 1},
+                              'weight': 3},
+                             {'handle': {'scope': 'queue', 'id': 2},
+                              'weight': 2},
+                             {'handle': {'scope': 'queue', 'id': 3},
+                              'weight': 1}],
+                   'handle': {'scope':'node'},
+                   'metric': 'bps',
+                   'bw-max': 10000})
+    node_id = node_handle['handle']['id']
+
+    # Create the nested node and validate the hierarchy
+    nested_node_handle = nl_shaper.group({
+                   'ifindex': cfg.ifindex,
+                   'leaves':[{'handle': {'scope': 'queue', 'id': 1},
+                              'weight': 3},
+                             {'handle': {'scope': 'queue', 'id': 2},
+                              'weight': 2}],
+                   'handle': {'scope':'node'},
+                   'metric': 'bps',
+                   'bw-max': 5000})
+    nested_node_id = nested_node_handle['handle']['id']
+    ksft_true(nested_node_id != node_id)
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': nested_node_id},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'weight': 3},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': nested_node_id},
+                       'handle': {'scope': 'queue', 'id': 2},
+                       'weight': 2},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': node_id},
+                       'handle': {'scope': 'queue', 'id': 3},
+                       'weight': 1},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'node', 'id': node_id},
+                       'metric': 'bps',
+                       'bw-max': 10000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': node_id},
+                       'handle': {'scope': 'node', 'id': nested_node_id},
+                       'metric': 'bps',
+                       'bw-max': 5000}])
+
+    # Deleting a non empty node will move the leaves downstream.
+    nl_shaper.delete({'ifindex': cfg.ifindex,
+                      'handle': {'scope': 'node', 'id': nested_node_id}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': node_id},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'weight': 3},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': node_id},
+                       'handle': {'scope': 'queue', 'id': 2},
+                       'weight': 2},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'node', 'id': node_id},
+                       'handle': {'scope': 'queue', 'id': 3},
+                       'weight': 1},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'node', 'id': node_id},
+                       'metric': 'bps',
+                       'bw-max': 10000}])
+
+    # Final cleanup.
+    for i in range(1, 4):
+        nl_shaper.delete({'ifindex': cfg.ifindex,
+                          'handle': {'scope': 'queue', 'id': i}})
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(len(shapers), 0)
+
+def queue_update(cfg, nl_shaper) -> None:
+    if cfg.nr_queues < 4:
+        raise KsftSkipEx(f"netdev does not have enough queues min 4 reported {cfg.nr_queues}")
+    if not cfg.queues:
+        raise KsftSkipEx("device does not support queue scope")
+
+    for i in range(3):
+        nl_shaper.set({'ifindex': cfg.ifindex,
+                       'handle': {'scope': 'queue', 'id': i},
+                       'metric': 'bps',
+                       'bw-max': (i + 1) * 1000})
+    # Delete a channel, with no shapers configured on top of the related
+    # queue: no changes expected
+    cmd(f"ethtool -L {cfg.dev['ifname']} {cfg.rx_type} 3", timeout=10)
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 0},
+                       'metric': 'bps',
+                       'bw-max': 1000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'metric': 'bps',
+                       'bw-max': 2000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 2},
+                       'metric': 'bps',
+                       'bw-max': 3000}])
+
+    # Delete a channel, with a shaper configured on top of the related
+    # queue: the shaper must be deleted, too
+    cmd(f"ethtool -L {cfg.dev['ifname']} {cfg.rx_type} 2", timeout=10)
+
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 0},
+                       'metric': 'bps',
+                       'bw-max': 1000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'metric': 'bps',
+                       'bw-max': 2000}])
+
+    # Restore the original channels number, no expected changes
+    cmd(f"ethtool -L {cfg.dev['ifname']} {cfg.rx_type} {cfg.nr_queues}", timeout=10)
+    shapers = nl_shaper.get({'ifindex': cfg.ifindex}, dump=True)
+    ksft_eq(shapers, [{'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 0},
+                       'metric': 'bps',
+                       'bw-max': 1000},
+                      {'ifindex': cfg.ifindex,
+                       'parent': {'scope': 'netdev'},
+                       'handle': {'scope': 'queue', 'id': 1},
+                       'metric': 'bps',
+                       'bw-max': 2000}])
+
+    # Final cleanup.
+    for i in range(0, 2):
+        nl_shaper.delete({'ifindex': cfg.ifindex,
+                          'handle': {'scope': 'queue', 'id': i}})
+
+def main() -> None:
+    with NetDrvEnv(__file__, queue_count=4) as cfg:
+        cfg.queues = False
+        cfg.netdev = False
+        cfg.groups = False
+        cfg.nr_queues = 0
+        ksft_run([get_shapers,
+                  get_caps,
+                  set_qshapers,
+                  del_qshapers,
+                  set_nshapers,
+                  del_nshapers,
+                  basic_groups,
+                  qgroups,
+                  delegation,
+                  queue_update], args=(cfg, NetshaperFamily()))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py
index b6d498d125fe..54d8f5eba810 100644
--- a/tools/testing/selftests/net/lib/py/__init__.py
+++ b/tools/testing/selftests/net/lib/py/__init__.py
@@ -6,3 +6,4 @@ from .netns import NetNS
 from .nsim import *
 from .utils import *
 from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily
+from .ynl import NetshaperFamily
diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py
index 1ace58370c06..a0d689d58c57 100644
--- a/tools/testing/selftests/net/lib/py/ynl.py
+++ b/tools/testing/selftests/net/lib/py/ynl.py
@@ -47,3 +47,8 @@ class NetdevFamily(YnlFamily):
     def __init__(self):
         super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(),
                          schema='')
+
+class NetshaperFamily(YnlFamily):
+    def __init__(self):
+        super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(),
+                         schema='')
-- 
cgit v1.2.3


From ac8d16b2d3772934f4cba44cb01bad05b4b2864c Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Wed, 9 Oct 2024 12:12:07 +0200
Subject: selftests/bpf: fix bpf_map_redirect call for cpu map test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_redir_prog currently redirects packets based on the entry at index 1
in cpu_map, but the corresponding test only manipulates the entry at
index 0. This does not really affect the test in its current form since
the program is detached before having the opportunity to execute, but it
needs to be fixed before being able to improve the corresponding test (i.e.,
not only test attach/detach but also the redirect feature).

Fix this XDP program by making it redirect packets based on entry 0 in
cpu_map instead of entry 1.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241009-convert_xdp_tests-v3-1-51cea913710c@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
index 20ec6723df18..d848fe96924e 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
@@ -15,7 +15,7 @@ struct {
 SEC("xdp")
 int xdp_redir_prog(struct xdp_md *ctx)
 {
-	return bpf_redirect_map(&cpu_map, 1, 0);
+	return bpf_redirect_map(&cpu_map, 0, 0);
 }
 
 SEC("xdp")
-- 
cgit v1.2.3


From d5fbcf46ee82574aee443423f3e4132d1154372b Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Wed, 9 Oct 2024 12:12:08 +0200
Subject: selftests/bpf: make xdp_cpumap_attach keep redirect prog attached
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current test only checks attach/detach of a cpu map type program, so
it checks neither that the program can be properly executed nor that it
redirects correctly.

Update the existing test to extend its coverage:
- keep the redirected program loaded
- try to execute it through bpf_prog_test_run_opts with some dummy
  context

While at it, bring the following minor improvements:
- isolate test interface in its own namespace

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241009-convert_xdp_tests-v3-2-51cea913710c@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/xdp_cpumap_attach.c   | 41 +++++++++++++++++-----
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
index 481626a875d1..88e8a886d1e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
@@ -2,35 +2,41 @@
 #include <uapi/linux/bpf.h>
 #include <linux/if_link.h>
 #include <test_progs.h>
+#include <network_helpers.h>
 
 #include "test_xdp_with_cpumap_frags_helpers.skel.h"
 #include "test_xdp_with_cpumap_helpers.skel.h"
 
 #define IFINDEX_LO	1
+#define TEST_NS "cpu_attach_ns"
 
 static void test_xdp_with_cpumap_helpers(void)
 {
-	struct test_xdp_with_cpumap_helpers *skel;
+	struct test_xdp_with_cpumap_helpers *skel = NULL;
 	struct bpf_prog_info info = {};
 	__u32 len = sizeof(info);
 	struct bpf_cpumap_val val = {
 		.qsize = 192,
 	};
-	int err, prog_fd, map_fd;
+	int err, prog_fd, prog_redir_fd, map_fd;
+	struct nstoken *nstoken = NULL;
 	__u32 idx = 0;
 
+	SYS(out_close, "ip netns add %s", TEST_NS);
+	nstoken = open_netns(TEST_NS);
+	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+		goto out_close;
+	SYS(out_close, "ip link set dev lo up");
+
 	skel = test_xdp_with_cpumap_helpers__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load"))
 		return;
 
-	prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
-	err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
+	prog_redir_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
+	err = bpf_xdp_attach(IFINDEX_LO, prog_redir_fd, XDP_FLAGS_SKB_MODE, NULL);
 	if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP"))
 		goto out_close;
 
-	err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
-	ASSERT_OK(err, "XDP program detach");
-
 	prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
 	map_fd = bpf_map__fd(skel->maps.cpu_map);
 	err = bpf_prog_get_info_by_fd(prog_fd, &info, &len);
@@ -45,6 +51,23 @@ static void test_xdp_with_cpumap_helpers(void)
 	ASSERT_OK(err, "Read cpumap entry");
 	ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id");
 
+	/* send a packet to trigger any potential bugs in there */
+	char data[10] = {};
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+			    .data_in = &data,
+			    .data_size_in = 10,
+			    .flags = BPF_F_TEST_XDP_LIVE_FRAMES,
+			    .repeat = 1,
+		);
+	err = bpf_prog_test_run_opts(prog_redir_fd, &opts);
+	ASSERT_OK(err, "XDP test run");
+
+	/* wait for the packets to be flushed */
+	kern_sync_rcu();
+
+	err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
+	ASSERT_OK(err, "XDP program detach");
+
 	/* can not attach BPF_XDP_CPUMAP program to a device */
 	err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
 	if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program"))
@@ -65,6 +88,8 @@ static void test_xdp_with_cpumap_helpers(void)
 	ASSERT_NEQ(err, 0, "Add BPF_XDP program with frags to cpumap entry");
 
 out_close:
+	close_netns(nstoken);
+	SYS_NOFAIL("ip netns del %s", TEST_NS);
 	test_xdp_with_cpumap_helpers__destroy(skel);
 }
 
@@ -111,7 +136,7 @@ out_close:
 	test_xdp_with_cpumap_frags_helpers__destroy(skel);
 }
 
-void serial_test_xdp_cpumap_attach(void)
+void test_xdp_cpumap_attach(void)
 {
 	if (test__start_subtest("CPUMAP with programs in entries"))
 		test_xdp_with_cpumap_helpers();
-- 
cgit v1.2.3


From d124d984c8a2d677e1cea6740a01ccdd0371a38d Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Wed, 9 Oct 2024 12:12:09 +0200
Subject: selftests/bpf: check program redirect in xdp_cpumap_attach
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_cpumap_attach, in its current form, only checks that an xdp cpumap
program can be executed, but not that it correctly performs the cpu
redirect as configured by userspace (bpf_prog_test_run_opts will return
success even if the redirect program returns an error).

Add a check to ensure that the program performs the configured redirect
as well. The check is based on a global variable incremented by a
chained program executed only if the redirect program properly executes.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241009-convert_xdp_tests-v3-3-51cea913710c@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c       | 5 ++++-
 tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
index 88e8a886d1e6..c7f74f068e78 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
@@ -62,8 +62,11 @@ static void test_xdp_with_cpumap_helpers(void)
 	err = bpf_prog_test_run_opts(prog_redir_fd, &opts);
 	ASSERT_OK(err, "XDP test run");
 
-	/* wait for the packets to be flushed */
+	/* wait for the packets to be flushed, then check that redirect has been
+	 * performed
+	 */
 	kern_sync_rcu();
+	ASSERT_NEQ(skel->bss->redirect_count, 0, "redirected packets");
 
 	err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL);
 	ASSERT_OK(err, "XDP program detach");
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
index d848fe96924e..3619239b01b7 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
@@ -12,6 +12,8 @@ struct {
 	__uint(max_entries, 4);
 } cpu_map SEC(".maps");
 
+__u32 redirect_count = 0;
+
 SEC("xdp")
 int xdp_redir_prog(struct xdp_md *ctx)
 {
@@ -27,6 +29,9 @@ int xdp_dummy_prog(struct xdp_md *ctx)
 SEC("xdp/cpumap")
 int xdp_dummy_cm(struct xdp_md *ctx)
 {
+	if (bpf_get_smp_processor_id() == 0)
+		redirect_count++;
+
 	if (ctx->ingress_ifindex == IFINDEX_LO)
 		return XDP_DROP;
 
-- 
cgit v1.2.3


From ba4fb3b3f7d891ba2fb82fe344d5068f87d7481c Mon Sep 17 00:00:00 2001
From: Zhu Jun <zhujun2@cmss.chinamobile.com>
Date: Wed, 9 Oct 2024 22:57:37 -0700
Subject: selftests/bpf: Removed redundant fd after close in
 bpf_prog_load_log_buf

Removed unnecessary `fd = -1` assignments after closing file descriptors,
because fd will be assigned by the function bpf_prog_load(). This improves
code readability and removes redundant operations.

Signed-off-by: Zhu Jun <zhujun2@cmss.chinamobile.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241010055737.4292-1-zhujun2@cmss.chinamobile.com
---
 tools/testing/selftests/bpf/prog_tests/log_buf.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c
index 27676a04d0b6..169ce689b97c 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_buf.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c
@@ -169,7 +169,6 @@ static void bpf_prog_load_log_buf(void)
 	ASSERT_GE(fd, 0, "good_fd1");
 	if (fd >= 0)
 		close(fd);
-	fd = -1;
 
 	/* log_level == 2 should always fill log_buf, even for good prog */
 	log_buf[0] = '\0';
@@ -180,7 +179,6 @@ static void bpf_prog_load_log_buf(void)
 	ASSERT_GE(fd, 0, "good_fd2");
 	if (fd >= 0)
 		close(fd);
-	fd = -1;
 
 	/* log_level == 0 should fill log_buf for bad prog */
 	log_buf[0] = '\0';
@@ -191,7 +189,6 @@ static void bpf_prog_load_log_buf(void)
 	ASSERT_LT(fd, 0, "bad_fd");
 	if (fd >= 0)
 		close(fd);
-	fd = -1;
 
 	free(log_buf);
 }
-- 
cgit v1.2.3


From ec6c4be073237814f4dd7f56ef72f08760cfaa90 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Fri, 11 Oct 2024 04:48:44 +0000
Subject: selftests/bpf: migrate cgroup sock create test for setting
 iface/mark/prio

This patch migrates the old test for cgroup BPF that sets
sk_bound_dev_if, mark, and priority when AF_INET{6} sockets are created.
The most closely related tests under selftests are 'test_sock' and
'sockopt'. However, these existing tests serve different purposes.
'test_sock' focuses mainly on verifying the socket binding process,
while 'sockopt' concentrates on testing the behavior of getsockopt and
setsockopt operations for various socket options.

Neither of these existing tests directly covers the ability of cgroup
BPF to set socket attributes such as sk_bound_dev_if, mark, and priority
during socket creation. To address this gap, this patch introduces a
migration of the old cgroup socket attribute test, now included as the
'sock_create' test in selftests/bpf. This ensures that the ability to
configure these attributes during socket creation is properly tested.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Link: https://lore.kernel.org/r/20241011044847.51584-2-danieltimlee@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                               |   2 -
 samples/bpf/test_cgrp2_sock.c                      | 296 ---------------------
 samples/bpf/test_cgrp2_sock.sh                     | 137 ----------
 .../testing/selftests/bpf/prog_tests/sock_create.c | 256 ++++++++++++++++++
 4 files changed, 256 insertions(+), 435 deletions(-)
 delete mode 100644 samples/bpf/test_cgrp2_sock.c
 delete mode 100755 samples/bpf/test_cgrp2_sock.sh
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sock_create.c

(limited to 'tools/testing')

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 7afe040cf43b..f514c6fb1ae2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -27,7 +27,6 @@ tprogs-y += map_perf_test
 tprogs-y += test_overhead
 tprogs-y += test_cgrp2_array_pin
 tprogs-y += test_cgrp2_attach
-tprogs-y += test_cgrp2_sock
 tprogs-y += test_cgrp2_sock2
 tprogs-y += xdp_router_ipv4
 tprogs-y += test_current_task_under_cgroup
@@ -76,7 +75,6 @@ map_perf_test-objs := map_perf_test_user.o
 test_overhead-objs := test_overhead_user.o
 test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := test_cgrp2_attach.o
-test_cgrp2_sock-objs := test_cgrp2_sock.o
 test_cgrp2_sock2-objs := test_cgrp2_sock2.o
 test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
deleted file mode 100644
index 8ca2a445ffa1..000000000000
--- a/samples/bpf/test_cgrp2_sock.c
+++ /dev/null
@@ -1,296 +0,0 @@
-/* eBPF example program:
- *
- * - Loads eBPF program
- *
- *   The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
- *   sockets opened by processes in the cgroup.
- *
- * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
- */
-
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <string.h>
-#include <unistd.h>
-#include <assert.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <net/if.h>
-#include <inttypes.h>
-#include <linux/bpf.h>
-#include <bpf/bpf.h>
-
-#include "bpf_insn.h"
-
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
-
-static int prog_load(__u32 idx, __u32 mark, __u32 prio)
-{
-	/* save pointer to context */
-	struct bpf_insn prog_start[] = {
-		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-	};
-	struct bpf_insn prog_end[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
-		BPF_EXIT_INSN(),
-	};
-
-	/* set sk_bound_dev_if on socket */
-	struct bpf_insn prog_dev[] = {
-		BPF_MOV64_IMM(BPF_REG_3, idx),
-		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
-		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
-	};
-
-	/* set mark on socket */
-	struct bpf_insn prog_mark[] = {
-		/* get uid of process */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-			     BPF_FUNC_get_current_uid_gid),
-		BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
-
-		/* if uid is 0, use given mark, else use the uid as the mark */
-		BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
-		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
-		BPF_MOV64_IMM(BPF_REG_3, mark),
-
-		/* set the mark on the new socket */
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
-		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
-		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
-	};
-
-	/* set priority on socket */
-	struct bpf_insn prog_prio[] = {
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
-		BPF_MOV64_IMM(BPF_REG_3, prio),
-		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
-		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)),
-	};
-	LIBBPF_OPTS(bpf_prog_load_opts, opts,
-		.log_buf = bpf_log_buf,
-		.log_size = BPF_LOG_BUF_SIZE,
-	);
-
-	struct bpf_insn *prog;
-	size_t insns_cnt;
-	void *p;
-	int ret;
-
-	insns_cnt = sizeof(prog_start) + sizeof(prog_end);
-	if (idx)
-		insns_cnt += sizeof(prog_dev);
-
-	if (mark)
-		insns_cnt += sizeof(prog_mark);
-
-	if (prio)
-		insns_cnt += sizeof(prog_prio);
-
-	p = prog = malloc(insns_cnt);
-	if (!prog) {
-		fprintf(stderr, "Failed to allocate memory for instructions\n");
-		return EXIT_FAILURE;
-	}
-
-	memcpy(p, prog_start, sizeof(prog_start));
-	p += sizeof(prog_start);
-
-	if (idx) {
-		memcpy(p, prog_dev, sizeof(prog_dev));
-		p += sizeof(prog_dev);
-	}
-
-	if (mark) {
-		memcpy(p, prog_mark, sizeof(prog_mark));
-		p += sizeof(prog_mark);
-	}
-
-	if (prio) {
-		memcpy(p, prog_prio, sizeof(prog_prio));
-		p += sizeof(prog_prio);
-	}
-
-	memcpy(p, prog_end, sizeof(prog_end));
-	p += sizeof(prog_end);
-
-	insns_cnt /= sizeof(struct bpf_insn);
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL",
-			    prog, insns_cnt, &opts);
-
-	free(prog);
-
-	return ret;
-}
-
-static int get_bind_to_device(int sd, char *name, size_t len)
-{
-	socklen_t optlen = len;
-	int rc;
-
-	name[0] = '\0';
-	rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
-	if (rc < 0)
-		perror("setsockopt(SO_BINDTODEVICE)");
-
-	return rc;
-}
-
-static unsigned int get_somark(int sd)
-{
-	unsigned int mark = 0;
-	socklen_t optlen = sizeof(mark);
-	int rc;
-
-	rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen);
-	if (rc < 0)
-		perror("getsockopt(SO_MARK)");
-
-	return mark;
-}
-
-static unsigned int get_priority(int sd)
-{
-	unsigned int prio = 0;
-	socklen_t optlen = sizeof(prio);
-	int rc;
-
-	rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen);
-	if (rc < 0)
-		perror("getsockopt(SO_PRIORITY)");
-
-	return prio;
-}
-
-static int show_sockopts(int family)
-{
-	unsigned int mark, prio;
-	char name[16];
-	int sd;
-
-	sd = socket(family, SOCK_DGRAM, 17);
-	if (sd < 0) {
-		perror("socket");
-		return 1;
-	}
-
-	if (get_bind_to_device(sd, name, sizeof(name)) < 0) {
-		close(sd);
-		return 1;
-	}
-
-	mark = get_somark(sd);
-	prio = get_priority(sd);
-
-	close(sd);
-
-	printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio);
-
-	return 0;
-}
-
-static int usage(const char *argv0)
-{
-	printf("Usage:\n");
-	printf("  Attach a program\n");
-	printf("  %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
-	printf("\n");
-	printf("  Detach a program\n");
-	printf("  %s -d cg-path\n", argv0);
-	printf("\n");
-	printf("  Show inherited socket settings (mark, priority, and device)\n");
-	printf("  %s [-6]\n", argv0);
-	return EXIT_FAILURE;
-}
-
-int main(int argc, char **argv)
-{
-	__u32 idx = 0, mark = 0, prio = 0;
-	const char *cgrp_path = NULL;
-	int cg_fd, prog_fd, ret;
-	int family = PF_INET;
-	int do_attach = 1;
-	int rc;
-
-	while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) {
-		switch (rc) {
-		case 'd':
-			do_attach = 0;
-			break;
-		case 'b':
-			idx = if_nametoindex(optarg);
-			if (!idx) {
-				idx = strtoumax(optarg, NULL, 0);
-				if (!idx) {
-					printf("Invalid device name\n");
-					return EXIT_FAILURE;
-				}
-			}
-			break;
-		case 'm':
-			mark = strtoumax(optarg, NULL, 0);
-			break;
-		case 'p':
-			prio = strtoumax(optarg, NULL, 0);
-			break;
-		case '6':
-			family = PF_INET6;
-			break;
-		default:
-			return usage(argv[0]);
-		}
-	}
-
-	if (optind == argc)
-		return show_sockopts(family);
-
-	cgrp_path = argv[optind];
-	if (!cgrp_path) {
-		fprintf(stderr, "cgroup path not given\n");
-		return EXIT_FAILURE;
-	}
-
-	if (do_attach && !idx && !mark && !prio) {
-		fprintf(stderr,
-			"One of device, mark or priority must be given\n");
-		return EXIT_FAILURE;
-	}
-
-	cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
-	if (cg_fd < 0) {
-		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
-		return EXIT_FAILURE;
-	}
-
-	if (do_attach) {
-		prog_fd = prog_load(idx, mark, prio);
-		if (prog_fd < 0) {
-			printf("Failed to load prog: '%s'\n", strerror(errno));
-			printf("Output from kernel verifier:\n%s\n-------\n",
-			       bpf_log_buf);
-			return EXIT_FAILURE;
-		}
-
-		ret = bpf_prog_attach(prog_fd, cg_fd,
-				      BPF_CGROUP_INET_SOCK_CREATE, 0);
-		if (ret < 0) {
-			printf("Failed to attach prog to cgroup: '%s'\n",
-			       strerror(errno));
-			return EXIT_FAILURE;
-		}
-	} else {
-		ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
-		if (ret < 0) {
-			printf("Failed to detach prog from cgroup: '%s'\n",
-			       strerror(errno));
-			return EXIT_FAILURE;
-		}
-	}
-
-	close(cg_fd);
-	return EXIT_SUCCESS;
-}
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
deleted file mode 100755
index 36bd7cb46f06..000000000000
--- a/samples/bpf/test_cgrp2_sock.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# Test various socket options that can be set by attaching programs to cgroups.
-
-MY_DIR=$(dirname $0)
-TEST=$MY_DIR/test_cgrp2_sock
-CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock"
-
-################################################################################
-#
-print_result()
-{
-	local rc=$1
-	local status=" OK "
-
-	[ $rc -ne 0 ] && status="FAIL"
-
-	printf "%-50s    [%4s]\n" "$2" "$status"
-}
-
-check_sock()
-{
-	out=$($TEST)
-	echo $out | grep -q "$1"
-	if [ $? -ne 0 ]; then
-		print_result 1 "IPv4: $2"
-		echo "    expected: $1"
-		echo "        have: $out"
-		rc=1
-	else
-		print_result 0 "IPv4: $2"
-	fi
-}
-
-check_sock6()
-{
-	out=$($TEST -6)
-	echo $out | grep -q "$1"
-	if [ $? -ne 0 ]; then
-		print_result 1 "IPv6: $2"
-		echo "    expected: $1"
-		echo "        have: $out"
-		rc=1
-	else
-		print_result 0 "IPv6: $2"
-	fi
-}
-
-################################################################################
-#
-
-cleanup()
-{
-	echo $$ >> ${CGRP_MNT}/cgroup.procs
-	rmdir ${CGRP_MNT}/sockopts
-}
-
-cleanup_and_exit()
-{
-	local rc=$1
-	local msg="$2"
-
-	[ -n "$msg" ] && echo "ERROR: $msg"
-
-	$TEST -d ${CGRP_MNT}/sockopts
-	ip li del cgrp2_sock
-	umount ${CGRP_MNT}
-
-	exit $rc
-}
-
-
-################################################################################
-# main
-
-rc=0
-
-ip li add cgrp2_sock type dummy 2>/dev/null
-
-set -e
-mkdir -p ${CGRP_MNT}
-mount -t cgroup2 none ${CGRP_MNT}
-set +e
-
-
-# make sure we have a known start point
-cleanup 2>/dev/null
-
-mkdir -p ${CGRP_MNT}/sockopts
-[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy"
-
-
-# set pid into cgroup
-echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs
-
-# no bpf program attached, so socket should show no settings
-check_sock "dev , mark 0, priority 0" "No programs attached"
-check_sock6 "dev , mark 0, priority 0" "No programs attached"
-
-# verify device is set
-#
-$TEST -b cgrp2_sock ${CGRP_MNT}/sockopts
-if [ $? -ne 0 ]; then
-	cleanup_and_exit 1 "Failed to install program to set device"
-fi
-check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set"
-check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set"
-
-# verify mark is set
-#
-$TEST -m 666 ${CGRP_MNT}/sockopts
-if [ $? -ne 0 ]; then
-	cleanup_and_exit 1 "Failed to install program to set mark"
-fi
-check_sock "dev , mark 666, priority 0" "Mark set"
-check_sock6 "dev , mark 666, priority 0" "Mark set"
-
-# verify priority is set
-#
-$TEST -p 123 ${CGRP_MNT}/sockopts
-if [ $? -ne 0 ]; then
-	cleanup_and_exit 1 "Failed to install program to set priority"
-fi
-check_sock "dev , mark 0, priority 123" "Priority set"
-check_sock6 "dev , mark 0, priority 123" "Priority set"
-
-# all 3 at once
-#
-$TEST -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts
-if [ $? -ne 0 ]; then
-	cleanup_and_exit 1 "Failed to install program to set device, mark and priority"
-fi
-check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set"
-check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set"
-
-cleanup_and_exit $rc
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_create.c b/tools/testing/selftests/bpf/prog_tests/sock_create.c
new file mode 100644
index 000000000000..072910c05c99
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sock_create.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+static char bpf_log_buf[4096];
+static bool verbose;
+
+static struct sock_create_test {
+	const char			*descr;
+	const struct bpf_insn		insns[64];
+	enum bpf_attach_type		attach_type;
+	enum bpf_attach_type		expected_attach_type;
+
+	int				domain;
+	int				type;
+
+	int				optname;
+	int				optval;
+} tests[] = {
+	{
+		.descr = "AF_INET set priority",
+		.insns = {
+			/* r3 = 123 (priority) */
+			BPF_MOV64_IMM(BPF_REG_3, 123),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, priority)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_PRIORITY,
+		.optval = 123,
+	},
+	{
+		.descr = "AF_INET6 set priority",
+		.insns = {
+			/* r3 = 123 (priority) */
+			BPF_MOV64_IMM(BPF_REG_3, 123),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, priority)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET6,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_PRIORITY,
+		.optval = 123,
+	},
+	{
+		.descr = "AF_INET set mark",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* get uid of process */
+			BPF_EMIT_CALL(BPF_FUNC_get_current_uid_gid),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
+
+			/* if uid is 0, use given mark(666), else use uid as the mark */
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 666),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, mark)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_MARK,
+		.optval = 666,
+	},
+	{
+		.descr = "AF_INET6 set mark",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* get uid of process */
+			BPF_EMIT_CALL(BPF_FUNC_get_current_uid_gid),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
+
+			/* if uid is 0, use given mark(666), else use uid as the mark */
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 666),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, mark)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET6,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_MARK,
+		.optval = 666,
+	},
+	{
+		.descr = "AF_INET bound to iface",
+		.insns = {
+			/* r3 = 1 (lo interface) */
+			BPF_MOV64_IMM(BPF_REG_3, 1),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, bound_dev_if)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_BINDTOIFINDEX,
+		.optval = 1,
+	},
+	{
+		.descr = "AF_INET6 bound to iface",
+		.insns = {
+			/* r3 = 1 (lo interface) */
+			BPF_MOV64_IMM(BPF_REG_3, 1),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct bpf_sock, bound_dev_if)),
+
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET6,
+		.type = SOCK_DGRAM,
+
+		.optname = SO_BINDTOIFINDEX,
+		.optval = 1,
+	},
+};
+
+static int load_prog(const struct bpf_insn *insns,
+		     enum bpf_attach_type expected_attach_type)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		    .expected_attach_type = expected_attach_type,
+		    .log_level = 2,
+		    .log_buf = bpf_log_buf,
+		    .log_size = sizeof(bpf_log_buf),
+	);
+	int fd, insns_cnt = 0;
+
+	for (;
+	     insns[insns_cnt].code != (BPF_JMP | BPF_EXIT);
+	     insns_cnt++) {
+	}
+	insns_cnt++;
+
+	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns,
+			   insns_cnt, &opts);
+	if (verbose && fd < 0)
+		fprintf(stderr, "%s\n", bpf_log_buf);
+
+	return fd;
+}
+
+static int run_test(int cgroup_fd, struct sock_create_test *test)
+{
+	int sock_fd, err, prog_fd, optval, ret = -1;
+	socklen_t optlen = sizeof(optval);
+
+	prog_fd = load_prog(test->insns, test->expected_attach_type);
+	if (prog_fd < 0) {
+		log_err("Failed to load BPF program");
+		return -1;
+	}
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+	if (err < 0) {
+		log_err("Failed to attach BPF program");
+		goto close_prog_fd;
+	}
+
+	sock_fd = socket(test->domain, test->type, 0);
+	if (sock_fd < 0) {
+		log_err("Failed to create socket");
+		goto detach_prog;
+	}
+
+	err = getsockopt(sock_fd, SOL_SOCKET, test->optname, &optval, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt");
+		goto cleanup;
+	}
+
+	if (optval != test->optval) {
+		errno = 0;
+		log_err("getsockopt returned unexpected optval");
+		goto cleanup;
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock_fd);
+detach_prog:
+	bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type);
+close_prog_fd:
+	close(prog_fd);
+	return ret;
+}
+
+void test_sock_create(void)
+{
+	int cgroup_fd, i;
+
+	cgroup_fd = test__join_cgroup("/sock_create");
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		if (!test__start_subtest(tests[i].descr))
+			continue;
+
+		ASSERT_OK(run_test(cgroup_fd, &tests[i]), tests[i].descr);
+	}
+
+	close(cgroup_fd);
+}
-- 
cgit v1.2.3


From 64a4658d6f766ca058e28277a4c2743525d7dc26 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Fri, 11 Oct 2024 04:48:45 +0000
Subject: selftests/bpf: migrate cgroup sock create test for prohibiting
 sockets

This patch continues the migration and removal process for cgroup
sock_create tests to selftests.

The test being migrated verifies the ability of cgroup BPF to block the
creation of specific types of sockets using a verdict. Specifically, the
test denies socket creation when the socket is of type AF_INET{6},
SOCK_DGRAM, and IPPROTO_ICMP{V6}. If the requested socket type matches
these attributes, the cgroup BPF verdict blocks the socket creation.

As with the previous commit, this test currently lacks coverage in
selftests, so this patch migrates the functionality into the sock_create
tests under selftests. This migration ensures that the socket creation
blocking behavior with cgroup bpf program is properly tested within the
selftest framework.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Link: https://lore.kernel.org/r/20241011044847.51584-3-danieltimlee@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                               |   3 -
 samples/bpf/sock_flags.bpf.c                       |  47 ----------
 samples/bpf/test_cgrp2_sock2.c                     |  95 -------------------
 samples/bpf/test_cgrp2_sock2.sh                    | 103 ---------------------
 .../testing/selftests/bpf/prog_tests/sock_create.c |  83 ++++++++++++++++-
 5 files changed, 80 insertions(+), 251 deletions(-)
 delete mode 100644 samples/bpf/sock_flags.bpf.c
 delete mode 100644 samples/bpf/test_cgrp2_sock2.c
 delete mode 100755 samples/bpf/test_cgrp2_sock2.sh

(limited to 'tools/testing')

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index f514c6fb1ae2..490833f8706a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -27,7 +27,6 @@ tprogs-y += map_perf_test
 tprogs-y += test_overhead
 tprogs-y += test_cgrp2_array_pin
 tprogs-y += test_cgrp2_attach
-tprogs-y += test_cgrp2_sock2
 tprogs-y += xdp_router_ipv4
 tprogs-y += test_current_task_under_cgroup
 tprogs-y += trace_event
@@ -75,7 +74,6 @@ map_perf_test-objs := map_perf_test_user.o
 test_overhead-objs := test_overhead_user.o
 test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := test_cgrp2_attach.o
-test_cgrp2_sock2-objs := test_cgrp2_sock2.o
 test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
 trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
@@ -106,7 +104,6 @@ always-y += tracex4.bpf.o
 always-y += tracex5.bpf.o
 always-y += tracex6.bpf.o
 always-y += tracex7.bpf.o
-always-y += sock_flags.bpf.o
 always-y += test_probe_write_user.bpf.o
 always-y += trace_output.bpf.o
 always-y += tcbpf1_kern.o
diff --git a/samples/bpf/sock_flags.bpf.c b/samples/bpf/sock_flags.bpf.c
deleted file mode 100644
index 0da749f6a9e1..000000000000
--- a/samples/bpf/sock_flags.bpf.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "vmlinux.h"
-#include "net_shared.h"
-#include <bpf/bpf_helpers.h>
-
-SEC("cgroup/sock")
-int bpf_prog1(struct bpf_sock *sk)
-{
-	char fmt[] = "socket: family %d type %d protocol %d\n";
-	char fmt2[] = "socket: uid %u gid %u\n";
-	__u64 gid_uid = bpf_get_current_uid_gid();
-	__u32 uid = gid_uid & 0xffffffff;
-	__u32 gid = gid_uid >> 32;
-
-	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
-	bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid);
-
-	/* block AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6 sockets
-	 * ie., make ping6 fail
-	 */
-	if (sk->family == AF_INET6 &&
-	    sk->type == SOCK_DGRAM   &&
-	    sk->protocol == IPPROTO_ICMPV6)
-		return 0;
-
-	return 1;
-}
-
-SEC("cgroup/sock")
-int bpf_prog2(struct bpf_sock *sk)
-{
-	char fmt[] = "socket: family %d type %d protocol %d\n";
-
-	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
-
-	/* block AF_INET, SOCK_DGRAM, IPPROTO_ICMP sockets
-	 * ie., make ping fail
-	 */
-	if (sk->family == AF_INET &&
-	    sk->type == SOCK_DGRAM  &&
-	    sk->protocol == IPPROTO_ICMP)
-		return 0;
-
-	return 1;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
deleted file mode 100644
index e7060aaa2f5a..000000000000
--- a/samples/bpf/test_cgrp2_sock2.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* eBPF example program:
- *
- * - Loads eBPF program
- *
- *   The eBPF program loads a filter from file and attaches the
- *   program to a cgroup using BPF_PROG_ATTACH
- */
-
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <string.h>
-#include <unistd.h>
-#include <assert.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <net/if.h>
-#include <linux/bpf.h>
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_insn.h"
-
-static int usage(const char *argv0)
-{
-	printf("Usage: %s cg-path filter-path [filter-id]\n", argv0);
-	return EXIT_FAILURE;
-}
-
-int main(int argc, char **argv)
-{
-	int cg_fd, err, ret = EXIT_FAILURE, filter_id = 0, prog_cnt = 0;
-	const char *link_pin_path = "/sys/fs/bpf/test_cgrp2_sock2";
-	struct bpf_link *link = NULL;
-	struct bpf_program *progs[2];
-	struct bpf_program *prog;
-	struct bpf_object *obj;
-
-	if (argc < 3)
-		return usage(argv[0]);
-
-	if (argc > 3)
-		filter_id = atoi(argv[3]);
-
-	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
-	if (cg_fd < 0) {
-		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
-		return ret;
-	}
-
-	obj = bpf_object__open_file(argv[2], NULL);
-	if (libbpf_get_error(obj)) {
-		printf("ERROR: opening BPF object file failed\n");
-		return ret;
-	}
-
-	bpf_object__for_each_program(prog, obj) {
-		progs[prog_cnt] = prog;
-		prog_cnt++;
-	}
-
-	if (filter_id >= prog_cnt) {
-		printf("Invalid program id; program not found in file\n");
-		goto cleanup;
-	}
-
-	/* load BPF program */
-	if (bpf_object__load(obj)) {
-		printf("ERROR: loading BPF object file failed\n");
-		goto cleanup;
-	}
-
-	link = bpf_program__attach_cgroup(progs[filter_id], cg_fd);
-	if (libbpf_get_error(link)) {
-		printf("ERROR: bpf_program__attach failed\n");
-		link = NULL;
-		goto cleanup;
-	}
-
-	err = bpf_link__pin(link, link_pin_path);
-	if (err < 0) {
-		printf("ERROR: bpf_link__pin failed: %d\n", err);
-		goto cleanup;
-	}
-
-	ret = EXIT_SUCCESS;
-
-cleanup:
-	bpf_link__destroy(link);
-	bpf_object__close(obj);
-	return ret;
-}
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh
deleted file mode 100755
index 82acff93d739..000000000000
--- a/samples/bpf/test_cgrp2_sock2.sh
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-BPFFS=/sys/fs/bpf
-MY_DIR=$(dirname $0)
-TEST=$MY_DIR/test_cgrp2_sock2
-LINK_PIN=$BPFFS/test_cgrp2_sock2
-BPF_PROG=$MY_DIR/sock_flags.bpf.o
-
-function config_device {
-	ip netns add at_ns0
-	ip link add veth0 type veth peer name veth0b
-	ip link set veth0 netns at_ns0
-	ip netns exec at_ns0 sysctl -q net.ipv6.conf.veth0.disable_ipv6=0
-	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
-	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
-	ip netns exec at_ns0 ip link set dev veth0 up
-	sysctl -q net.ipv6.conf.veth0b.disable_ipv6=0
-	ip addr add 172.16.1.101/24 dev veth0b
-	ip addr add 2401:db00::2/64 dev veth0b nodad
-	ip link set veth0b up
-}
-
-function config_cgroup {
-	rm -rf /tmp/cgroupv2
-	mkdir -p /tmp/cgroupv2
-	mount -t cgroup2 none /tmp/cgroupv2
-	mkdir -p /tmp/cgroupv2/foo
-	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
-}
-
-function config_bpffs {
-	if mount | grep $BPFFS > /dev/null; then
-		echo "bpffs already mounted"
-	else
-		echo "bpffs not mounted. Mounting..."
-		mount -t bpf none $BPFFS
-	fi
-}
-
-function attach_bpf {
-	$TEST /tmp/cgroupv2/foo $BPF_PROG $1
-	[ $? -ne 0 ] && exit 1
-}
-
-function cleanup {
-	rm -rf $LINK_PIN
-	ip link del veth0b
-	ip netns delete at_ns0
-	umount /tmp/cgroupv2
-	rm -rf /tmp/cgroupv2
-}
-
-cleanup 2>/dev/null
-
-set -e
-config_device
-config_cgroup
-config_bpffs
-set +e
-
-#
-# Test 1 - fail ping6
-#
-attach_bpf 0
-ping -c1 -w1 172.16.1.100
-if [ $? -ne 0 ]; then
-	echo "ping failed when it should succeed"
-	cleanup
-	exit 1
-fi
-
-ping6 -c1 -w1 2401:db00::1
-if [ $? -eq 0 ]; then
-	echo "ping6 succeeded when it should not"
-	cleanup
-	exit 1
-fi
-
-rm -rf $LINK_PIN
-sleep 1                 # Wait for link detach
-
-#
-# Test 2 - fail ping
-#
-attach_bpf 1
-ping6 -c1 -w1 2401:db00::1
-if [ $? -ne 0 ]; then
-	echo "ping6 failed when it should succeed"
-	cleanup
-	exit 1
-fi
-
-ping -c1 -w1 172.16.1.100
-if [ $? -eq 0 ]; then
-	echo "ping succeeded when it should not"
-	cleanup
-	exit 1
-fi
-
-cleanup
-echo
-echo "*** PASS ***"
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_create.c b/tools/testing/selftests/bpf/prog_tests/sock_create.c
index 072910c05c99..17a3713621dd 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_create.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_create.c
@@ -6,6 +6,11 @@
 static char bpf_log_buf[4096];
 static bool verbose;
 
+enum sock_create_test_error {
+	OK = 0,		/* socket() must succeed and the option check must pass */
+	DENY_CREATE,	/* socket() itself is expected to fail */
+};
+
 static struct sock_create_test {
 	const char			*descr;
 	const struct bpf_insn		insns[64];
@@ -14,9 +19,11 @@ static struct sock_create_test {
 
 	int				domain;
 	int				type;
+	int				protocol;
 
 	int				optname;
 	int				optval;
+	enum sock_create_test_error	error;
 } tests[] = {
 	{
 		.descr = "AF_INET set priority",
@@ -164,6 +171,72 @@ static struct sock_create_test {
 		.optname = SO_BINDTOIFINDEX,
 		.optval = 1,
 	},
+	{
+		.descr = "block AF_INET, SOCK_DGRAM, IPPROTO_ICMP socket",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),	/* r0 = verdict */
+
+			/* sock->family == AF_INET */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, family)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, AF_INET, 5),
+
+			/* sock->type == SOCK_DGRAM */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, type)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, SOCK_DGRAM, 3),
+
+			/* sock->protocol == IPPROTO_ICMP */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, protocol)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, IPPROTO_ICMP, 1),
+
+			/* return 0 (block) */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_ICMP,
+
+		.error = DENY_CREATE,
+	},
+	{
+		.descr = "block AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6 socket",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),	/* r0 = verdict, allow by default */
+
+			/* jump to exit (allow) unless sock->family == AF_INET6 */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, family)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, AF_INET6, 5),
+
+			/* jump to exit (allow) unless sock->type == SOCK_DGRAM */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, type)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, SOCK_DGRAM, 3),
+
+			/* jump to exit (allow) unless sock->protocol == IPPROTO_ICMPV6 */
+			BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct bpf_sock, protocol)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, IPPROTO_ICMPV6, 1),
+
+			/* all three matched: return 0 (block) */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET6,	/* must be the family the program matches on */
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_ICMPV6,
+
+		.error = DENY_CREATE,
+	},
 };
 
 static int load_prog(const struct bpf_insn *insns,
@@ -208,9 +281,13 @@ static int run_test(int cgroup_fd, struct sock_create_test *test)
 		goto close_prog_fd;
 	}
 
-	sock_fd = socket(test->domain, test->type, 0);
+	sock_fd = socket(test->domain, test->type, test->protocol);
 	if (sock_fd < 0) {
-		log_err("Failed to create socket");
+		if (test->error == DENY_CREATE)
+			ret = 0;
+		else
+			log_err("Failed to create socket");
+
 		goto detach_prog;
 	}
 
@@ -226,7 +303,7 @@ static int run_test(int cgroup_fd, struct sock_create_test *test)
 		goto cleanup;
 	}
 
-	ret = 0;
+	ret = test->error != OK;
 
 cleanup:
 	close(sock_fd);
-- 
cgit v1.2.3


From 82370ed5ade58d99484a607a6000fc8333921c63 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 10 Oct 2024 14:17:31 -0700
Subject: selftests/bpf: add subprog to BPF object file with no entry programs

Add a subprogram to BPF object file that otherwise has no entry BPF
programs to validate that libbpf can still load this correctly.

Until this was fixed, user could expect this very confusing error message:

  libbpf: prog 'dangling_subprog': missing BPF prog type, check ELF section name '.text'
  libbpf: prog 'dangling_subprog': failed to load: -22
  libbpf: failed to load object 'struct_ops_detach'
  libbpf: failed to load BPF skeleton 'struct_ops_detach': -22

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241010211731.4121837-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/struct_ops_detach.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/struct_ops_detach.c b/tools/testing/selftests/bpf/progs/struct_ops_detach.c
index 56b787a89876..d7fdcabe7d90 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_detach.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_detach.c
@@ -6,5 +6,17 @@
 
 char _license[] SEC("license") = "GPL";
 
+/*
+ * This subprogram validates that libbpf handles the situation in which BPF
+ * object has subprograms in .text section, but has no entry BPF programs.
+ * At some point that was causing issues due to legacy logic of treating such
+ * subprogram as entry program (with unknown program type, which would fail).
+ */
+int dangling_subprog(void)
+{
+	/* do nothing, just be here */
+	return 0;
+}
+
 SEC(".struct_ops.link")
 struct bpf_testmod_ops testmod_do_detach;
-- 
cgit v1.2.3


From e6c209da7e0e9aaf955a7b59e91ed78c2b6c96fb Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@pm.me>
Date: Fri, 11 Oct 2024 15:31:07 +0000
Subject: selftests/bpf: Check for timeout in perf_link test

Recently perf_link test started unreliably failing on libbpf CI:
  * https://github.com/libbpf/libbpf/actions/runs/11260672407/job/31312405473
  * https://github.com/libbpf/libbpf/actions/runs/11260992334/job/31315514626
  * https://github.com/libbpf/libbpf/actions/runs/11263162459/job/31320458251

Part of the test is running a dummy loop for a while and then checking
for a counter incremented by the test program.

Instead of waiting for an arbitrary number of loop iterations once,
check for the test counter in a loop and use get_time_ns() helper to
enforce a 100ms timeout.

v1: https://lore.kernel.org/bpf/zuRd072x9tumn2iN4wDNs5av0nu5nekMNV4PkR-YwCT10eFFTrUtZBRkLWFbrcCe7guvLStGQlhibo8qWojCO7i2-NGajes5GYIyynexD-w=@pm.me/

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241011153104.249800-1-ihor.solodrai@pm.me
---
 tools/testing/selftests/bpf/prog_tests/perf_link.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/perf_link.c b/tools/testing/selftests/bpf/prog_tests/perf_link.c
index 3a25f1c743a1..d940ff87fa08 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_link.c
@@ -4,8 +4,12 @@
 #include <pthread.h>
 #include <sched.h>
 #include <test_progs.h>
+#include "testing_helpers.h"
 #include "test_perf_link.skel.h"
 
+#define BURN_TIMEOUT_MS 100	/* max time to wait for the first prog run */
+#define BURN_TIMEOUT_NS (BURN_TIMEOUT_MS * 1000000ULL)	/* same, in 64-bit ns */
+
 static void burn_cpu(void)
 {
 	volatile int j = 0;
@@ -32,6 +36,7 @@ void serial_test_perf_link(void)
 	int run_cnt_before, run_cnt_after;
 	struct bpf_link_info info;
 	__u32 info_len = sizeof(info);
+	__u64 timeout_time_ns;
 
 	/* create perf event */
 	memset(&attr, 0, sizeof(attr));
@@ -63,8 +68,14 @@ void serial_test_perf_link(void)
 	ASSERT_GT(info.prog_id, 0, "link_prog_id");
 
 	/* ensure we get at least one perf_event prog execution */
-	burn_cpu();
-	ASSERT_GT(skel->bss->run_cnt, 0, "run_cnt");
+	timeout_time_ns = get_time_ns() + BURN_TIMEOUT_NS;
+	while (true) {
+		burn_cpu();
+		if (skel->bss->run_cnt > 0)	/* prog ran at least once */
+			break;
+		if (!ASSERT_LT(get_time_ns(), timeout_time_ns, "run_cnt_timeout"))
+			break;
+	}
 
 	/* perf_event is still active, but we close link and BPF program
 	 * shouldn't be executed anymore
-- 
cgit v1.2.3


From 3ec920bb978ccdc68a7dfb304d303d598d038cb1 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 10 Oct 2024 04:00:27 +0000
Subject: selftests: rtnetlink: update netdevsim ipsec output format

After the netdevsim update to use human-readable IP address formats for
IPsec, we can now use the source and destination IPs directly in testing.
Here is the result:
  # ./rtnetlink.sh -t kci_test_ipsec_offload
  PASS: ipsec_offload

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241010040027.21440-4-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/rtnetlink.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index bdf6f10d0558..87dce3efe31e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -809,10 +809,10 @@ kci_test_ipsec_offload()
 	# does driver have correct offload info
 	run_cmd diff $sysfsf - << EOF
 SA count=2 tx=3
-sa[0] tx ipaddr=0x00000000 00000000 00000000 00000000
+sa[0] tx ipaddr=$dstip
 sa[0]    spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
 sa[0]    key=0x34333231 38373635 32313039 36353433
-sa[1] rx ipaddr=0x00000000 00000000 00000000 037ba8c0
+sa[1] rx ipaddr=$srcip
 sa[1]    spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
 sa[1]    key=0x34333231 38373635 32313039 36353433
 EOF
-- 
cgit v1.2.3


From ec35b0c53cc7398143315d42342a9798094dada7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 10 Oct 2024 14:18:57 -0700
Subject: selftests: drv-net: add missing trailing backslash

Commit b3ea416419c8 ("testing: net-drv: add basic shaper test")
removed the trailing backslash from the last entry. We have
a terminating comment here to avoid having to modify the last
line when adding at the end.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20241010211857.2193076-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 25aec5c081df..0fec8f9801ad 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -9,7 +9,7 @@ TEST_PROGS := \
 	ping.py \
 	queues.py \
 	stats.py \
-	shaper.py
+	shaper.py \
 # end of TEST_PROGS
 
 include ../../lib.mk
-- 
cgit v1.2.3


From 96ea65295337fed271ce9f136edf6f7eaf3b657c Mon Sep 17 00:00:00 2001
From: Ba Jing <bajing@cmss.chinamobile.com>
Date: Tue, 3 Sep 2024 12:16:20 +0800
Subject: binderfs: binderfs_test: remove unused variable

The variable "wret" is never referenced in the code, just remove it.

Signed-off-by: Ba Jing <bajing@cmss.chinamobile.com>
Link: https://lore.kernel.org/r/20240903041620.10812-1-bajing@cmss.chinamobile.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 tools/testing/selftests/filesystems/binderfs/binderfs_test.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
index 319567f0fae1..81db85a5cc16 100644
--- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
+++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
@@ -57,7 +57,6 @@ static int __do_binderfs_test(struct __test_metadata *_metadata)
 {
 	int fd, ret, saved_errno, result = 1;
 	size_t len;
-	ssize_t wret;
 	struct binderfs_device device = { 0 };
 	struct binder_version version = { 0 };
 	char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX",
-- 
cgit v1.2.3


From e94fdd5d9aa263ec259e0bc1ae53b89829c09aad Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 14 Oct 2024 11:40:59 +0200
Subject: selftests: use shared header

So that we don't have to redefine the same system calls over and over.

Link: https://lore.kernel.org/r/20241014-work-overlayfs-v3-4-32b3fed1286e@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/overlayfs/dev_in_maps.c  | 27 +-------------
 .../selftests/filesystems/overlayfs/wrappers.h     | 43 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 26 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/overlayfs/wrappers.h

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
index 2862aae58b79..3b796264223f 100644
--- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
+++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
@@ -17,32 +17,7 @@
 
 #include "../../kselftest.h"
 #include "log.h"
-
-static int sys_fsopen(const char *fsname, unsigned int flags)
-{
-	return syscall(__NR_fsopen, fsname, flags);
-}
-
-static int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux)
-{
-	return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
-}
-
-static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags)
-{
-	return syscall(__NR_fsmount, fd, flags, attr_flags);
-}
-static int sys_mount(const char *src, const char *tgt, const char *fst,
-		unsigned long flags, const void *data)
-{
-	return syscall(__NR_mount, src, tgt, fst, flags, data);
-}
-static int sys_move_mount(int from_dfd, const char *from_pathname,
-			  int to_dfd, const char *to_pathname,
-			  unsigned int flags)
-{
-	return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
-}
+#include "wrappers.h"
 
 static long get_file_dev_and_inode(void *addr, struct statx *stx)
 {
diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
new file mode 100644
index 000000000000..4f99e10f7f01
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+#ifndef __SELFTEST_OVERLAYFS_WRAPPERS_H__
+#define __SELFTEST_OVERLAYFS_WRAPPERS_H__
+
+#define _GNU_SOURCE
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+	return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+			       const char *value, int aux)
+{
+	return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+			      unsigned int attr_flags)
+{
+	return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+static inline int sys_mount(const char *src, const char *tgt, const char *fst,
+			    unsigned long flags, const void *data)
+{
+	return syscall(__NR_mount, src, tgt, fst, flags, data);
+}
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+				 int to_dfd, const char *to_pathname,
+				 unsigned int flags)
+{
+	return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+		       to_pathname, flags);
+}
+
+#endif
-- 
cgit v1.2.3


From af9199145b1977316b3c752e2124543e320f087f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 14 Oct 2024 11:41:00 +0200
Subject: selftests: add overlayfs fd mounting selftests

Link: https://lore.kernel.org/r/20241014-work-overlayfs-v3-5-32b3fed1286e@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/overlayfs/.gitignore     |   1 +
 .../selftests/filesystems/overlayfs/Makefile       |   2 +-
 .../filesystems/overlayfs/set_layers_via_fds.c     | 152 +++++++++++++++++++++
 .../selftests/filesystems/overlayfs/wrappers.h     |   4 +
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/filesystems/overlayfs/.gitignore b/tools/testing/selftests/filesystems/overlayfs/.gitignore
index 52ae618fdd98..e23a18c8b37f 100644
--- a/tools/testing/selftests/filesystems/overlayfs/.gitignore
+++ b/tools/testing/selftests/filesystems/overlayfs/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 dev_in_maps
+set_layers_via_fds
diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
index 56b2b48a765b..e8d1adb021af 100644
--- a/tools/testing/selftests/filesystems/overlayfs/Makefile
+++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-TEST_GEN_PROGS := dev_in_maps
+TEST_GEN_PROGS := dev_in_maps set_layers_via_fds
 
 CFLAGS := -Wall -Werror
 
diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
new file mode 100644
index 000000000000..301fb5c02852
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+#include "log.h"
+#include "wrappers.h"
+
+FIXTURE(set_layers_via_fds) {
+};
+
+FIXTURE_SETUP(set_layers_via_fds)
+{
+	ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
+}
+
+FIXTURE_TEARDOWN(set_layers_via_fds)
+{
+	umount2("/set_layers_via_fds", 0);
+	ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);
+}
+
+TEST_F(set_layers_via_fds, set_layers_via_fds)
+{
+	int fd_context, fd_tmpfs, fd_overlay;
+	int layer_fds[] = { [0 ... 8] = -EBADF };
+	bool layers_found[] = { [0 ... 8] =  false };
+	size_t len = 0;
+	char *line = NULL;
+	FILE *f_mountinfo;
+
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+	fd_context = sys_fsopen("tmpfs", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_tmpfs, 0);
+	ASSERT_EQ(close(fd_context), 0);
+
+	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);
+
+	layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+	ASSERT_GE(layer_fds[0], 0);
+
+	layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+	ASSERT_GE(layer_fds[1], 0);
+
+	layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+	ASSERT_GE(layer_fds[2], 0);
+
+	layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+	ASSERT_GE(layer_fds[3], 0);
+
+	layer_fds[4] = openat(fd_tmpfs, "l3", O_DIRECTORY);
+	ASSERT_GE(layer_fds[4], 0);
+
+	layer_fds[5] = openat(fd_tmpfs, "l4", O_DIRECTORY);
+	ASSERT_GE(layer_fds[5], 0);
+
+	layer_fds[6] = openat(fd_tmpfs, "d1", O_DIRECTORY);
+	ASSERT_GE(layer_fds[6], 0);
+
+	layer_fds[7] = openat(fd_tmpfs, "d2", O_DIRECTORY);
+	ASSERT_GE(layer_fds[7], 0);
+
+	layer_fds[8] = openat(fd_tmpfs, "d3", O_DIRECTORY);
+	ASSERT_GE(layer_fds[8], 0);
+
+	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+	ASSERT_EQ(close(fd_tmpfs), 0);
+
+	fd_context = sys_fsopen("overlay", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+",  NULL, layer_fds[6]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+",  NULL, layer_fds[7]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+",  NULL, layer_fds[8]), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+	fd_overlay = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_overlay, 0);
+
+	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+	f_mountinfo = fopen("/proc/self/mountinfo", "r");
+	ASSERT_NE(f_mountinfo, NULL);
+
+	while (getline(&line, &len, f_mountinfo) != -1) {
+		char *haystack = line;
+
+		if (strstr(haystack, "workdir=/tmp/w"))
+			layers_found[0] = true;
+		if (strstr(haystack, "upperdir=/tmp/u"))
+			layers_found[1] = true;
+		if (strstr(haystack, "lowerdir+=/tmp/l1"))
+			layers_found[2] = true;
+		if (strstr(haystack, "lowerdir+=/tmp/l2"))
+			layers_found[3] = true;
+		if (strstr(haystack, "lowerdir+=/tmp/l3"))
+			layers_found[4] = true;
+		if (strstr(haystack, "lowerdir+=/tmp/l4"))
+			layers_found[5] = true;
+		if (strstr(haystack, "datadir+=/tmp/d1"))
+			layers_found[6] = true;
+		if (strstr(haystack, "datadir+=/tmp/d2"))
+			layers_found[7] = true;
+		if (strstr(haystack, "datadir+=/tmp/d3"))
+			layers_found[8] = true;
+	}
+	free(line);
+
+	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+		ASSERT_EQ(layers_found[i], true);
+		ASSERT_EQ(close(layer_fds[i]), 0);
+	}
+
+	ASSERT_EQ(close(fd_context), 0);
+	ASSERT_EQ(close(fd_overlay), 0);
+	ASSERT_EQ(fclose(f_mountinfo), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
index 4f99e10f7f01..071b95fd2ac0 100644
--- a/tools/testing/selftests/filesystems/overlayfs/wrappers.h
+++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
@@ -32,6 +32,10 @@ static inline int sys_mount(const char *src, const char *tgt, const char *fst,
 	return syscall(__NR_mount, src, tgt, fst, flags, data);
 }
 
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
 static inline int sys_move_mount(int from_dfd, const char *from_pathname,
 				 int to_dfd, const char *to_pathname,
 				 unsigned int flags)
-- 
cgit v1.2.3


From 80fa614e2fbcf11069f0995e1601fb2e5702e2f4 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 23 Sep 2024 18:30:36 -0600
Subject: selftests: timers: Remove local NSEC_PER_SEC and USEC_PER_SEC defines

Remove local NSEC_PER_SEC and USEC_PER_SEC defines. Pick them up from
include/vdso/time64.h. This requires adding -I $(top_srcdir) to the
timers Makefile so that include/vdso/time64.h can be found.

posix_timers test names the defines NSECS_PER_SEC and USECS_PER_SEC.
Change posix_timers test references to the defines to match the
defines in the header file.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: John Stultz <jstultz@google.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/Makefile              |  2 +-
 tools/testing/selftests/timers/adjtick.c             |  4 +---
 tools/testing/selftests/timers/alarmtimer-suspend.c  |  2 +-
 tools/testing/selftests/timers/inconsistency-check.c |  2 +-
 tools/testing/selftests/timers/leap-a-day.c          |  2 +-
 tools/testing/selftests/timers/mqueue-lat.c          |  2 +-
 tools/testing/selftests/timers/nanosleep.c           |  3 +--
 tools/testing/selftests/timers/nsleep-lat.c          |  3 +--
 tools/testing/selftests/timers/posix_timers.c        | 15 +++++++--------
 tools/testing/selftests/timers/raw_skew.c            |  2 +-
 tools/testing/selftests/timers/set-2038.c            |  3 +--
 tools/testing/selftests/timers/set-timer-lat.c       |  3 +--
 tools/testing/selftests/timers/valid-adjtimex.c      |  4 +---
 13 files changed, 19 insertions(+), 28 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile
index 0e73a16874c4..32203593c62e 100644
--- a/tools/testing/selftests/timers/Makefile
+++ b/tools/testing/selftests/timers/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -O3 -Wl,-no-as-needed -Wall
+CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir)
 LDLIBS += -lrt -lpthread -lm
 
 # these are all "safe" tests that don't modify
diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c
index 205b76a4abb4..cb9a30f54662 100644
--- a/tools/testing/selftests/timers/adjtick.c
+++ b/tools/testing/selftests/timers/adjtick.c
@@ -22,14 +22,12 @@
 #include <sys/time.h>
 #include <sys/timex.h>
 #include <time.h>
+#include <include/vdso/time64.h>
 
 #include "../kselftest.h"
 
 #define CLOCK_MONOTONIC_RAW	4
 
-#define NSEC_PER_SEC		1000000000LL
-#define USEC_PER_SEC		1000000
-
 #define MILLION			1000000
 
 long systick;
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index ad52e608b88e..62da2a3f949e 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -28,6 +28,7 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <pthread.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
 #define CLOCK_REALTIME			0
@@ -45,7 +46,6 @@
 #define NR_CLOCKIDS			12
 
 
-#define NSEC_PER_SEC 1000000000ULL
 #define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */
 
 #define SUSPEND_SECS 15
diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c
index 36a49fba6c9b..75650cf0503f 100644
--- a/tools/testing/selftests/timers/inconsistency-check.c
+++ b/tools/testing/selftests/timers/inconsistency-check.c
@@ -28,10 +28,10 @@
 #include <sys/timex.h>
 #include <string.h>
 #include <signal.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
 #define CALLS_PER_LOOP 64
-#define NSEC_PER_SEC 1000000000ULL
 
 #define CLOCK_REALTIME			0
 #define CLOCK_MONOTONIC			1
diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c
index 986abbdb1521..04004a7c0934 100644
--- a/tools/testing/selftests/timers/leap-a-day.c
+++ b/tools/testing/selftests/timers/leap-a-day.c
@@ -48,9 +48,9 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000ULL
 #define CLOCK_TAI 11
 
 time_t next_leap;
diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c
index f3179a605bba..63de2334a291 100644
--- a/tools/testing/selftests/timers/mqueue-lat.c
+++ b/tools/testing/selftests/timers/mqueue-lat.c
@@ -29,9 +29,9 @@
 #include <signal.h>
 #include <errno.h>
 #include <mqueue.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000ULL
 
 #define TARGET_TIMEOUT		100000000	/* 100ms in nanoseconds */
 #define UNRESONABLE_LATENCY	40000000	/* 40ms in nanosecs */
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
index df1d03516e7b..9a354e38a569 100644
--- a/tools/testing/selftests/timers/nanosleep.c
+++ b/tools/testing/selftests/timers/nanosleep.c
@@ -27,10 +27,9 @@
 #include <sys/timex.h>
 #include <string.h>
 #include <signal.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000ULL
-
 #define CLOCK_REALTIME			0
 #define CLOCK_MONOTONIC			1
 #define CLOCK_PROCESS_CPUTIME_ID	2
diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c
index edb5acacf214..f6a99490b291 100644
--- a/tools/testing/selftests/timers/nsleep-lat.c
+++ b/tools/testing/selftests/timers/nsleep-lat.c
@@ -24,10 +24,9 @@
 #include <sys/timex.h>
 #include <string.h>
 #include <signal.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000ULL
-
 #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
 
 
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index ddb1cebc844e..9814b3a1c77d 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -15,13 +15,12 @@
 #include <string.h>
 #include <unistd.h>
 #include <time.h>
+#include <include/vdso/time64.h>
 #include <pthread.h>
 
 #include "../kselftest.h"
 
 #define DELAY 2
-#define USECS_PER_SEC 1000000
-#define NSECS_PER_SEC 1000000000
 
 static void __fatal_error(const char *test, const char *name, const char *what)
 {
@@ -86,9 +85,9 @@ static int check_diff(struct timeval start, struct timeval end)
 	long long diff;
 
 	diff = end.tv_usec - start.tv_usec;
-	diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC;
+	diff += (end.tv_sec - start.tv_sec) * USEC_PER_SEC;
 
-	if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) {
+	if (llabs(diff - DELAY * USEC_PER_SEC) > USEC_PER_SEC / 2) {
 		printf("Diff too high: %lld..", diff);
 		return -1;
 	}
@@ -448,7 +447,7 @@ static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2)
 {
 	int64_t diff;
 
-	diff = NSECS_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec);
+	diff = NSEC_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec);
 	diff += ((int) t1.tv_nsec - (int) t2.tv_nsec);
 	return diff;
 }
@@ -479,7 +478,7 @@ static void check_sigev_none(int which, const char *name)
 	do {
 		if (clock_gettime(which, &now))
 			fatal_error(name, "clock_gettime()");
-	} while (calcdiff_ns(now, start) < NSECS_PER_SEC);
+	} while (calcdiff_ns(now, start) < NSEC_PER_SEC);
 
 	if (timer_gettime(timerid, &its))
 		fatal_error(name, "timer_gettime()");
@@ -536,7 +535,7 @@ static void check_gettime(int which, const char *name)
 			wraps++;
 		prev = its;
 
-	} while (calcdiff_ns(now, start) < NSECS_PER_SEC);
+	} while (calcdiff_ns(now, start) < NSEC_PER_SEC);
 
 	if (timer_delete(timerid))
 		fatal_error(name, "timer_delete()");
@@ -587,7 +586,7 @@ static void check_overrun(int which, const char *name)
 	do {
 		if (clock_gettime(which, &now))
 			fatal_error(name, "clock_gettime()");
-	} while (calcdiff_ns(now, start) < NSECS_PER_SEC);
+	} while (calcdiff_ns(now, start) < NSEC_PER_SEC);
 
 	/* Unblock it, which should deliver a signal */
 	if (sigprocmask(SIG_UNBLOCK, &set, NULL))
diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c
index 030143eb09b4..ea50e4efc422 100644
--- a/tools/testing/selftests/timers/raw_skew.c
+++ b/tools/testing/selftests/timers/raw_skew.c
@@ -25,10 +25,10 @@
 #include <sys/time.h>
 #include <sys/timex.h>
 #include <time.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
 #define CLOCK_MONOTONIC_RAW		4
-#define NSEC_PER_SEC 1000000000LL
 
 #define shift_right(x, s) ({		\
 	__typeof__(x) __x = (x);	\
diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c
index f7d978721b9e..ed244315e11c 100644
--- a/tools/testing/selftests/timers/set-2038.c
+++ b/tools/testing/selftests/timers/set-2038.c
@@ -27,10 +27,9 @@
 #include <unistd.h>
 #include <time.h>
 #include <sys/time.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000LL
-
 #define KTIME_MAX	((long long)~((unsigned long long)1 << 63))
 #define KTIME_SEC_MAX	(KTIME_MAX / NSEC_PER_SEC)
 
diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
index 7ce240c89b21..5365e9ae61c3 100644
--- a/tools/testing/selftests/timers/set-timer-lat.c
+++ b/tools/testing/selftests/timers/set-timer-lat.c
@@ -28,6 +28,7 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <pthread.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
 #define CLOCK_REALTIME			0
@@ -44,8 +45,6 @@
 #define CLOCK_TAI			11
 #define NR_CLOCKIDS			12
 
-
-#define NSEC_PER_SEC 1000000000ULL
 #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
 
 #define TIMER_SECS 1
diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c
index d500884801d8..6b7801055ad1 100644
--- a/tools/testing/selftests/timers/valid-adjtimex.c
+++ b/tools/testing/selftests/timers/valid-adjtimex.c
@@ -29,11 +29,9 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
+#include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define NSEC_PER_SEC 1000000000LL
-#define USEC_PER_SEC 1000000LL
-
 #define ADJ_SETOFFSET 0x0100
 
 #include <sys/syscall.h>
-- 
cgit v1.2.3


From d70d4218339e657e80ea478e43ec327cf374826b Mon Sep 17 00:00:00 2001
From: Gianfranco Trad <gianf.trad@gmail.com>
Date: Thu, 29 Aug 2024 17:37:25 +0200
Subject: selftests: timers: improve timer_create failure message

Improve the timer_create failure message by appending strerror(errno),
to give more information to the user.

Signed-off-by: Gianfranco Trad <gianf.trad@gmail.com>
Acked-by: John Stultz <jstultz@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/alarmtimer-suspend.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index 62da2a3f949e..9877158a0853 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 #include <pthread.h>
 #include <include/vdso/time64.h>
+#include <errno.h>
 #include "../kselftest.h"
 
 #define CLOCK_REALTIME			0
@@ -142,8 +143,8 @@ int main(void)
 
 		alarmcount = 0;
 		if (timer_create(alarm_clock_id, &se, &tm1) == -1) {
-			printf("timer_create failed, %s unsupported?\n",
-					clockstring(alarm_clock_id));
+			printf("timer_create failed, %s unsupported?: %s\n",
+					clockstring(alarm_clock_id), strerror(errno));
 			break;
 		}
 
-- 
cgit v1.2.3


From 488be88a3237f840fd5992465945ed6914b46257 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Thu, 10 Oct 2024 15:37:07 +0800
Subject: selftests: timers: Remove unneeded semicolon

Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Acked-by: John Stultz <jstultz@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/set-timer-lat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
index 5365e9ae61c3..7a1a2382538c 100644
--- a/tools/testing/selftests/timers/set-timer-lat.c
+++ b/tools/testing/selftests/timers/set-timer-lat.c
@@ -79,7 +79,7 @@ char *clockstring(int clockid)
 		return "CLOCK_BOOTTIME_ALARM";
 	case CLOCK_TAI:
 		return "CLOCK_TAI";
-	};
+	}
 	return "UNKNOWN_CLOCKID";
 }
 
-- 
cgit v1.2.3


From ecfe6870abac400036d802e28dde4822ec153ffd Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Fri, 11 Oct 2024 16:52:36 -0600
Subject: selftests:timers: remove local CLOCKID defines

The timers tests define CLOCKIDs locally. Remove all local CLOCKIDs except
CLOCK_HWSPECIFIC and use the defines from the time.h header file.

CLOCK_HWSPECIFIC and CLOCK_SGI_CYCLE are the same, and CLOCK_SGI_CYCLE
is deprecated.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: John Stultz <jstultz@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/adjtick.c             |  2 --
 tools/testing/selftests/timers/alarmtimer-suspend.c  | 15 ---------------
 tools/testing/selftests/timers/inconsistency-check.c | 19 ++++---------------
 tools/testing/selftests/timers/nanosleep.c           | 18 ++++--------------
 tools/testing/selftests/timers/nsleep-lat.c          | 19 ++++---------------
 tools/testing/selftests/timers/raw_skew.c            |  2 --
 tools/testing/selftests/timers/set-timer-lat.c       | 16 +++-------------
 7 files changed, 15 insertions(+), 76 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c
index cb9a30f54662..777d9494b683 100644
--- a/tools/testing/selftests/timers/adjtick.c
+++ b/tools/testing/selftests/timers/adjtick.c
@@ -26,8 +26,6 @@
 
 #include "../kselftest.h"
 
-#define CLOCK_MONOTONIC_RAW	4
-
 #define MILLION			1000000
 
 long systick;
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index 9877158a0853..a9ef76ea6051 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -32,21 +32,6 @@
 #include <errno.h>
 #include "../kselftest.h"
 
-#define CLOCK_REALTIME			0
-#define CLOCK_MONOTONIC			1
-#define CLOCK_PROCESS_CPUTIME_ID	2
-#define CLOCK_THREAD_CPUTIME_ID		3
-#define CLOCK_MONOTONIC_RAW		4
-#define CLOCK_REALTIME_COARSE		5
-#define CLOCK_MONOTONIC_COARSE		6
-#define CLOCK_BOOTTIME			7
-#define CLOCK_REALTIME_ALARM		8
-#define CLOCK_BOOTTIME_ALARM		9
-#define CLOCK_HWSPECIFIC		10
-#define CLOCK_TAI			11
-#define NR_CLOCKIDS			12
-
-
 #define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */
 
 #define SUSPEND_SECS 15
diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c
index 75650cf0503f..9d1573769d55 100644
--- a/tools/testing/selftests/timers/inconsistency-check.c
+++ b/tools/testing/selftests/timers/inconsistency-check.c
@@ -31,21 +31,10 @@
 #include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define CALLS_PER_LOOP 64
-
-#define CLOCK_REALTIME			0
-#define CLOCK_MONOTONIC			1
-#define CLOCK_PROCESS_CPUTIME_ID	2
-#define CLOCK_THREAD_CPUTIME_ID		3
-#define CLOCK_MONOTONIC_RAW		4
-#define CLOCK_REALTIME_COARSE		5
-#define CLOCK_MONOTONIC_COARSE		6
-#define CLOCK_BOOTTIME			7
-#define CLOCK_REALTIME_ALARM		8
-#define CLOCK_BOOTTIME_ALARM		9
+/* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */
 #define CLOCK_HWSPECIFIC		10
-#define CLOCK_TAI			11
-#define NR_CLOCKIDS			12
+
+#define CALLS_PER_LOOP 64
 
 char *clockstring(int clockid)
 {
@@ -152,7 +141,7 @@ int main(int argc, char *argv[])
 {
 	int clockid, opt;
 	int userclock = CLOCK_REALTIME;
-	int maxclocks = NR_CLOCKIDS;
+	int maxclocks = CLOCK_TAI + 1;
 	int runtime = 10;
 	struct timespec ts;
 
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
index 9a354e38a569..252c6308c569 100644
--- a/tools/testing/selftests/timers/nanosleep.c
+++ b/tools/testing/selftests/timers/nanosleep.c
@@ -30,19 +30,8 @@
 #include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define CLOCK_REALTIME			0
-#define CLOCK_MONOTONIC			1
-#define CLOCK_PROCESS_CPUTIME_ID	2
-#define CLOCK_THREAD_CPUTIME_ID		3
-#define CLOCK_MONOTONIC_RAW		4
-#define CLOCK_REALTIME_COARSE		5
-#define CLOCK_MONOTONIC_COARSE		6
-#define CLOCK_BOOTTIME			7
-#define CLOCK_REALTIME_ALARM		8
-#define CLOCK_BOOTTIME_ALARM		9
+/* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */
 #define CLOCK_HWSPECIFIC		10
-#define CLOCK_TAI			11
-#define NR_CLOCKIDS			12
 
 #define UNSUPPORTED 0xf00f
 
@@ -131,11 +120,12 @@ int main(int argc, char **argv)
 {
 	long long length;
 	int clockid, ret;
+	int max_clocks = CLOCK_TAI + 1;
 
 	ksft_print_header();
-	ksft_set_plan(NR_CLOCKIDS);
+	ksft_set_plan(max_clocks);
 
-	for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+	for (clockid = CLOCK_REALTIME; clockid < max_clocks; clockid++) {
 
 		/* Skip cputime clockids since nanosleep won't increment cputime */
 		if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c
index f6a99490b291..de23dc0c9f97 100644
--- a/tools/testing/selftests/timers/nsleep-lat.c
+++ b/tools/testing/selftests/timers/nsleep-lat.c
@@ -29,20 +29,8 @@
 
 #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
 
-
-#define CLOCK_REALTIME			0
-#define CLOCK_MONOTONIC			1
-#define CLOCK_PROCESS_CPUTIME_ID	2
-#define CLOCK_THREAD_CPUTIME_ID		3
-#define CLOCK_MONOTONIC_RAW		4
-#define CLOCK_REALTIME_COARSE		5
-#define CLOCK_MONOTONIC_COARSE		6
-#define CLOCK_BOOTTIME			7
-#define CLOCK_REALTIME_ALARM		8
-#define CLOCK_BOOTTIME_ALARM		9
+/* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */
 #define CLOCK_HWSPECIFIC		10
-#define CLOCK_TAI			11
-#define NR_CLOCKIDS			12
 
 #define UNSUPPORTED 0xf00f
 
@@ -144,11 +132,12 @@ int main(int argc, char **argv)
 {
 	long long length;
 	int clockid, ret;
+	int max_clocks = CLOCK_TAI + 1;
 
 	ksft_print_header();
-	ksft_set_plan(NR_CLOCKIDS - CLOCK_REALTIME - SKIPPED_CLOCK_COUNT);
+	ksft_set_plan(max_clocks - CLOCK_REALTIME - SKIPPED_CLOCK_COUNT);
 
-	for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+	for (clockid = CLOCK_REALTIME; clockid < max_clocks; clockid++) {
 
 		/* Skip cputime clockids since nanosleep won't increment cputime */
 		if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c
index ea50e4efc422..957f7cd29cb1 100644
--- a/tools/testing/selftests/timers/raw_skew.c
+++ b/tools/testing/selftests/timers/raw_skew.c
@@ -28,8 +28,6 @@
 #include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define CLOCK_MONOTONIC_RAW		4
-
 #define shift_right(x, s) ({		\
 	__typeof__(x) __x = (x);	\
 	__typeof__(s) __s = (s);	\
diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
index 7a1a2382538c..9d8437c13929 100644
--- a/tools/testing/selftests/timers/set-timer-lat.c
+++ b/tools/testing/selftests/timers/set-timer-lat.c
@@ -31,19 +31,8 @@
 #include <include/vdso/time64.h>
 #include "../kselftest.h"
 
-#define CLOCK_REALTIME			0
-#define CLOCK_MONOTONIC			1
-#define CLOCK_PROCESS_CPUTIME_ID	2
-#define CLOCK_THREAD_CPUTIME_ID		3
-#define CLOCK_MONOTONIC_RAW		4
-#define CLOCK_REALTIME_COARSE		5
-#define CLOCK_MONOTONIC_COARSE		6
-#define CLOCK_BOOTTIME			7
-#define CLOCK_REALTIME_ALARM		8
-#define CLOCK_BOOTTIME_ALARM		9
+/* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */
 #define CLOCK_HWSPECIFIC		10
-#define CLOCK_TAI			11
-#define NR_CLOCKIDS			12
 
 #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
 
@@ -253,6 +242,7 @@ int main(void)
 	struct sigaction act;
 	int signum = SIGRTMAX;
 	int ret = 0;
+	int max_clocks = CLOCK_TAI + 1;
 
 	/* Set up signal handler: */
 	sigfillset(&act.sa_mask);
@@ -261,7 +251,7 @@ int main(void)
 	sigaction(signum, &act, NULL);
 
 	printf("Setting timers for every %i seconds\n", TIMER_SECS);
-	for (clock_id = 0; clock_id < NR_CLOCKIDS; clock_id++) {
+	for (clock_id = 0; clock_id < max_clocks; clock_id++) {
 
 		if ((clock_id == CLOCK_PROCESS_CPUTIME_ID) ||
 				(clock_id == CLOCK_THREAD_CPUTIME_ID) ||
-- 
cgit v1.2.3


From 0cb06dc6c42b1b2940e01f207ddf980f2d637545 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 11 Oct 2024 16:03:10 -0700
Subject: selftests: net: rebuild YNL if dependencies changed

Try to rebuild YNL if either the user added a new family or the specs
of the families have changed. Stanislav's ncdevmem causes a false
positive build failure in NIPA because libynl.a isn't rebuilt
after ethtool is added to YNL_GENS.

Note that sha1sum is already used in other parts of the build system.

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241011230311.2529760-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ynl.mk | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk
index 1ef24119def0..add5c0cdeac4 100644
--- a/tools/testing/selftests/net/ynl.mk
+++ b/tools/testing/selftests/net/ynl.mk
@@ -9,6 +9,8 @@
 # YNL_GEN_FILES: TEST_GEN_FILES which need YNL
 
 YNL_OUTPUTS := $(patsubst %,$(OUTPUT)/%,$(YNL_GEN_FILES))
+YNL_SPECS := \
+	$(patsubst %,$(top_srcdir)/Documentation/netlink/specs/%.yaml,$(YNL_GENS))
 
 $(YNL_OUTPUTS): $(OUTPUT)/libynl.a
 $(YNL_OUTPUTS): CFLAGS += \
@@ -16,10 +18,19 @@ $(YNL_OUTPUTS): CFLAGS += \
 	-I$(top_srcdir)/tools/net/ynl/lib/ \
 	-I$(top_srcdir)/tools/net/ynl/generated/
 
-$(OUTPUT)/libynl.a:
+# Make sure we rebuild libynl if user added a new family. We can't easily
+# depend on the contents of a variable so create a fake file with a hash.
+YNL_GENS_HASH := $(shell echo $(YNL_GENS) | sha1sum | cut -c1-8)
+$(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig:
+	$(Q)rm -f $(OUTPUT)/.libynl-*.sig
+	$(Q)touch $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
+
+$(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
+	$(Q)rm -f $(top_srcdir)/tools/net/ynl/libynl.a
 	$(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl GENS="$(YNL_GENS)" libynl.a
 	$(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a
 
 EXTRA_CLEAN += \
 	$(top_srcdir)/tools/net/ynl/lib/__pycache__ \
-	$(top_srcdir)/tools/net/ynl/lib/*.[ado]
+	$(top_srcdir)/tools/net/ynl/lib/*.[ado] \
+	$(OUTPUT)/.libynl-*.sig
-- 
cgit v1.2.3


From 60b4d49b9621db4b000c9065dd6457c9a0eda80b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 11 Oct 2024 16:03:11 -0700
Subject: selftests: net: move EXTRA_CLEAN of libynl.a into ynl.mk

Commit 1fd9e4f25782 ("selftests: make kselftest-clean remove libynl outputs")
added EXTRA_CLEAN of YNL generated files to ynl.mk. We already had
an EXTRA_CLEAN in the Makefile which includes the snippet. Consolidate them.

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241011230311.2529760-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile | 1 -
 tools/testing/selftests/net/ynl.mk   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 649f1fe0dc46..26a4883a65c9 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -98,7 +98,6 @@ TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
 
 # YNL files, must be before "include ..lib.mk"
-EXTRA_CLEAN += $(OUTPUT)/libynl.a
 YNL_GEN_FILES := ncdevmem
 TEST_GEN_FILES += $(YNL_GEN_FILES)
 
diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk
index add5c0cdeac4..d43afe243779 100644
--- a/tools/testing/selftests/net/ynl.mk
+++ b/tools/testing/selftests/net/ynl.mk
@@ -33,4 +33,5 @@ $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
 EXTRA_CLEAN += \
 	$(top_srcdir)/tools/net/ynl/lib/__pycache__ \
 	$(top_srcdir)/tools/net/ynl/lib/*.[ado] \
-	$(OUTPUT)/.libynl-*.sig
+	$(OUTPUT)/.libynl-*.sig \
+	$(OUTPUT)/libynl.a
-- 
cgit v1.2.3


From 11312c86f9d7d1bffe0587185934a7070ce9ec33 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Fri, 11 Oct 2024 06:11:53 +0000
Subject: selftests/cgroup: Fix compile error in test_cpu.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling the cgroup selftests with the following command:

make -C tools/testing/selftests/cgroup/

the compiler complains as below:

test_cpu.c: In function ‘test_cpucg_nice’:
test_cpu.c:284:39: error: incompatible type for argument 2 of ‘hog_cpus_timed’
  284 |                 hog_cpus_timed(cpucg, param);
      |                                       ^~~~~
      |                                       |
      |                                       struct cpu_hog_func_param
test_cpu.c:132:53: note: expected ‘void *’ but argument is of type ‘struct cpu_hog_func_param’
  132 | static int hog_cpus_timed(const char *cgroup, void *arg)
      |                                               ~~~~~~^~~

Fix it by passing the address of param to hog_cpus_timed().

Fixes: 2e82c0d4562a ("cgroup/rstat: Selftests for niced CPU statistics")
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index 201ce14cb422..a2b50af8e9ee 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -281,7 +281,7 @@ static int test_cpucg_nice(const char *root)
 
 		/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
 		nice(1);
-		hog_cpus_timed(cpucg, param);
+		hog_cpus_timed(cpucg, &param);
 		exit(0);
 	} else {
 		waitpid(pid, &status, 0);
-- 
cgit v1.2.3


From d59dfd625a8bae3bfc527dd61f24750c4f87266c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 15 Oct 2024 13:15:29 +0200
Subject: selftests: add test for specifying 500 lower layers

Verify that we can actually specify 500 lower layers and fail at the
501st one.

Link: https://lore.kernel.org/r/20241015-leiht-filmabend-a86eed4ff304@brauner
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/overlayfs/set_layers_via_fds.c     | 65 ++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
index 301fb5c02852..1d0ae785a667 100644
--- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -149,4 +149,69 @@ TEST_F(set_layers_via_fds, set_layers_via_fds)
 	ASSERT_EQ(fclose(f_mountinfo), 0);
 }
 
+TEST_F(set_layers_via_fds, set_500_layers_via_fds)
+{
+	int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower;
+	int layer_fds[500] = { [0 ... 499] = -EBADF };
+
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+	fd_context = sys_fsopen("tmpfs", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_tmpfs, 0);
+	ASSERT_EQ(close(fd_context), 0);
+
+	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+		char path[100];
+
+		sprintf(path, "l%d", i);
+		ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0);
+		layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY);
+		ASSERT_GE(layer_fds[i], 0);
+	}
+
+	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+	fd_work = openat(fd_tmpfs, "w", O_DIRECTORY);
+	ASSERT_GE(fd_work, 0);
+
+	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+	fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY);
+	ASSERT_GE(fd_upper, 0);
+
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0);
+	fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY);
+	ASSERT_GE(fd_lower, 0);
+
+	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+	ASSERT_EQ(close(fd_tmpfs), 0);
+
+	fd_context = sys_fsopen("overlay", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, fd_work), 0);
+	ASSERT_EQ(close(fd_work), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, fd_upper), 0);
+	ASSERT_EQ(close(fd_upper), 0);
+
+	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+		ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0);
+		ASSERT_EQ(close(layer_fds[i]), 0);
+	}
+
+	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0);
+	ASSERT_EQ(close(fd_lower), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+	fd_overlay = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_overlay, 0);
+	ASSERT_EQ(close(fd_context), 0);
+	ASSERT_EQ(close(fd_overlay), 0);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 9317e8933e27501575cbef923dfab30867484e7e Mon Sep 17 00:00:00 2001
From: Gur Stavi <gur.stavi@huawei.com>
Date: Sun, 13 Oct 2024 10:15:26 +0300
Subject: selftests: net/psock_fanout: socket joins fanout when link is down

Modify test_control_group to take a toggle parameter.
When toggle is non-zero, the loopback device is set down while fds[1]
is initialized; fds[1] is still expected to successfully join the
fanout.

Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/6f4a506ed5f08f8fc00a966dec8febd1030c6e98.1728802323.git.gur.stavi@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/psock_fanout.c | 42 +++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 4f31e92ebd96..acdfae8f8a9a 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -48,6 +48,7 @@
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -59,6 +60,33 @@
 
 static uint32_t cfg_max_num_members;
 
+static void loopback_set_up_down(int state_up)
+{
+	struct ifreq ifreq = {};
+	int fd, err;
+
+	fd = socket(AF_PACKET, SOCK_RAW, 0);
+	if (fd < 0) {
+		perror("socket loopback");
+		exit(1);
+	}
+	strcpy(ifreq.ifr_name, "lo");
+	err = ioctl(fd, SIOCGIFFLAGS, &ifreq);
+	if (err) {
+		perror("SIOCGIFFLAGS");
+		exit(1);
+	}
+	if (state_up != !!(ifreq.ifr_flags & IFF_UP)) {
+		ifreq.ifr_flags ^= IFF_UP;
+		err = ioctl(fd, SIOCSIFFLAGS, &ifreq);
+		if (err) {
+			perror("SIOCSIFFLAGS");
+			exit(1);
+		}
+	}
+	close(fd);
+}
+
 /* Open a socket in a given fanout mode.
  * @return -1 if mode is bad, a valid socket otherwise */
 static int sock_fanout_open(uint16_t typeflags, uint16_t group_id)
@@ -264,17 +292,22 @@ static void test_control_single(void)
 }
 
 /* Test illegal group with different modes or flags */
-static void test_control_group(void)
+static void test_control_group(int toggle)
 {
 	int fds[2];
 
-	fprintf(stderr, "test: control multiple sockets\n");
+	if (toggle)
+		fprintf(stderr, "test: control multiple sockets with link down toggle\n");
+	else
+		fprintf(stderr, "test: control multiple sockets\n");
 
 	fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 0);
 	if (fds[0] == -1) {
 		fprintf(stderr, "ERROR: failed to open HASH socket\n");
 		exit(1);
 	}
+	if (toggle)
+		loopback_set_up_down(0);
 	if (sock_fanout_open(PACKET_FANOUT_HASH |
 			       PACKET_FANOUT_FLAG_DEFRAG, 0) != -1) {
 		fprintf(stderr, "ERROR: joined group with wrong flag defrag\n");
@@ -294,6 +327,8 @@ static void test_control_group(void)
 		fprintf(stderr, "ERROR: failed to join group\n");
 		exit(1);
 	}
+	if (toggle)
+		loopback_set_up_down(1);
 	if (close(fds[1]) || close(fds[0])) {
 		fprintf(stderr, "ERROR: closing sockets\n");
 		exit(1);
@@ -489,7 +524,8 @@ int main(int argc, char **argv)
 	int port_off = 2, tries = 20, ret;
 
 	test_control_single();
-	test_control_group();
+	test_control_group(0);
+	test_control_group(1);
 	test_control_group_max_num_members();
 	test_unique_fanout_group_ids();
 
-- 
cgit v1.2.3


From 7ec02a3aef05098a413e1d1c7326c15b92189d0c Mon Sep 17 00:00:00 2001
From: Gur Stavi <gur.stavi@huawei.com>
Date: Sun, 13 Oct 2024 10:15:27 +0300
Subject: selftests: net/psock_fanout: unbound socket fanout

Add a test that validates that an unbound packet socket cannot create/join
a fanout group.

Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/7612fa90f613100e2b64c563cab3d7fdf36010db.1728802323.git.gur.stavi@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/psock_fanout.c | 36 ++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index acdfae8f8a9a..84c524357075 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -279,6 +279,41 @@ static int sock_fanout_read(int fds[], char *rings[], const int expect[])
 	return 0;
 }
 
+/* Test that creating/joining a fanout group fails for unbound socket without
+ * a specified protocol
+ */
+static void test_unbound_fanout(void)
+{
+	int val, fd0, fd1, err;
+
+	fprintf(stderr, "test: unbound fanout\n");
+	fd0 = socket(PF_PACKET, SOCK_RAW, 0);
+	if (fd0 < 0) {
+		perror("socket packet");
+		exit(1);
+	}
+	/* Try to create a new fanout group. Should fail. */
+	val = (PACKET_FANOUT_HASH << 16) | 1;
+	err = setsockopt(fd0, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
+	if (!err) {
+		fprintf(stderr, "ERROR: unbound socket fanout create\n");
+		exit(1);
+	}
+	fd1 = sock_fanout_open(PACKET_FANOUT_HASH, 1);
+	if (fd1 == -1) {
+		fprintf(stderr, "ERROR: failed to open HASH socket\n");
+		exit(1);
+	}
+	/* Try to join an existing fanout group. Should fail. */
+	err = setsockopt(fd0, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
+	if (!err) {
+		fprintf(stderr, "ERROR: unbound socket fanout join\n");
+		exit(1);
+	}
+	close(fd0);
+	close(fd1);
+}
+
 /* Test illegal mode + flag combination */
 static void test_control_single(void)
 {
@@ -523,6 +558,7 @@ int main(int argc, char **argv)
 	const int expect_uniqueid[2][2] = { { 20, 20},  { 20, 20 } };
 	int port_off = 2, tries = 20, ret;
 
+	test_unbound_fanout();
 	test_control_single();
 	test_control_group(0);
 	test_control_group(1);
-- 
cgit v1.2.3


From 8684f2f37d65eeb30b6f704750c691b6e697854b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 11 Oct 2024 13:20:42 -0400
Subject: selftests/ftrace: Fix check of return value in fgraph-retval.tc test

The addition of recording both the function name and return address to the
function graph tracer updated the selftest to check for "=-5" from "= -5".
But this causes the test to fail on certain configs, as "= -5" is still a
value that can be returned if function addresses are not enabled (older kernels).

Check for both "=-5" and "= -5" as a success value.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Donglin Peng <pengdonglin@xiaomi.com>
Link: https://lore.kernel.org/20241011132042.435f43cc@gandalf.local.home
Fixes: 21e92806d39c6 ("function_graph: Support recording and printing the function return address")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
index e8e46378b88d..4307d4eef417 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc
@@ -29,7 +29,7 @@ set -e
 
 : "Test printing the error code in signed decimal format"
 echo 0 > options/funcgraph-retval-hex
-count=`cat trace | grep 'proc_reg_write' | grep '=-5' | wc -l`
+count=`cat trace | grep 'proc_reg_write' | grep -e '=-5 ' -e '= -5 '  | wc -l`
 if [ $count -eq 0 ]; then
     fail "Return value can not be printed in signed decimal format"
 fi
-- 
cgit v1.2.3


From 0161bd38c24312853ed5ae9a425a1c41c4ac674a Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Thu, 10 Oct 2024 00:17:57 +0200
Subject: powerpc/vdso: Flag VDSO64 entry points as functions

On powerpc64, as shown below by readelf, vDSO function symbols have
type NOTYPE.

$ powerpc64-linux-gnu-readelf -a arch/powerpc/kernel/vdso/vdso64.so.dbg
ELF Header:
  Magic:   7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00
  Class:                             ELF64
  Data:                              2's complement, big endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              DYN (Shared object file)
  Machine:                           PowerPC64
  Version:                           0x1
...

Symbol table '.dynsym' contains 12 entries:
   Num:    Value          Size Type    Bind   Vis      Ndx Name
...
     1: 0000000000000524    84 NOTYPE  GLOBAL DEFAULT    8 __[...]@@LINUX_2.6.15
...
     4: 0000000000000000     0 OBJECT  GLOBAL DEFAULT  ABS LINUX_2.6.15
     5: 00000000000006c0    48 NOTYPE  GLOBAL DEFAULT    8 __[...]@@LINUX_2.6.15

Symbol table '.symtab' contains 56 entries:
   Num:    Value          Size Type    Bind   Vis      Ndx Name
...
    45: 0000000000000000     0 OBJECT  GLOBAL DEFAULT  ABS LINUX_2.6.15
    46: 00000000000006c0    48 NOTYPE  GLOBAL DEFAULT    8 __kernel_getcpu
    47: 0000000000000524    84 NOTYPE  GLOBAL DEFAULT    8 __kernel_clock_getres

To overcome that, commit ba83b3239e65 ("selftests: vDSO: fix vDSO
symbols lookup for powerpc64") was applied to have selftests also
look for NOTYPE symbols, but the correct fix should be to flag VDSO
entry points as functions.

The original commit that brought VDSO support into powerpc/64 has the
following explanation:

    Note that the symbols exposed by the vDSO aren't "normal" function symbols, apps
    can't be expected to link against them directly, the vDSO's are both seen
    as if they were linked at 0 and the symbols just contain offsets to the
    various functions.  This is done on purpose to avoid a relocation step
    (ppc64 functions normally have descriptors with abs addresses in them).
    When glibc uses those functions, it's expected to use it's own trampolines
    that know how to reach them.

The descriptors it's talking about are the OPD function descriptors
used on ABI v1 (big endian). But it would be more correct for a text
symbol to have type function, even if there's no function descriptor
for it.

glibc has a special case already for handling the VDSO symbols which
creates a fake opd pointing at the kernel symbol. So changing the VDSO
symbol type to function shouldn't affect that.

For ABI v2, there are no function descriptors and VDSO functions can
safely have function type.

So let's flag VDSO entry points as functions and revert the
selftest change.

Link: https://github.com/mpe/linux-fullhistory/commit/5f2dd691b62da9d9cc54b938f8b29c22c93cb805
Fixes: ba83b3239e65 ("selftests: vDSO: fix vDSO symbols lookup for powerpc64")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-By: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/b6ad2f1ee9887af3ca5ecade2a56f4acda517a85.1728512263.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/vdso.h           | 1 +
 tools/testing/selftests/vDSO/parse_vdso.c | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 7650b6ce14c8..8d972bc98b55 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -25,6 +25,7 @@ int vdso_getcpu_init(void);
 #ifdef __VDSO64__
 #define V_FUNCTION_BEGIN(name)		\
 	.globl name;			\
+	.type name,@function; 		\
 	name:				\
 
 #define V_FUNCTION_END(name)		\
diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c
index 7dd5668ea8a6..28f35620c499 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -222,8 +222,7 @@ void *vdso_sym(const char *version, const char *name)
 		ELF(Sym) *sym = &vdso_info.symtab[chain];
 
 		/* Check for a defined global or weak function w/ right name. */
-		if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
-		    ELF64_ST_TYPE(sym->st_info) != STT_NOTYPE)
+		if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
 			continue;
 		if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
 		    ELF64_ST_BIND(sym->st_info) != STB_WEAK)
-- 
cgit v1.2.3


From 27879e8cb6b0fdb5cdcd76685f290729309711c6 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang@os.amperecomputing.com>
Date: Tue, 1 Oct 2024 15:52:20 -0700
Subject: selftests: arm64: add hugetlb mte tests

The tests cover mmap and mprotect of hugetlb memory with PROT_MTE, as well as COW.

Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20241001225220.271178-2-yang@os.amperecomputing.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../selftests/arm64/mte/check_hugetlb_options.c    | 285 +++++++++++++++++++++
 1 file changed, 285 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/mte/check_hugetlb_options.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c
new file mode 100644
index 000000000000..303260a6dc65
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2024 Ampere Computing LLC
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define TAG_CHECK_ON		0
+#define TAG_CHECK_OFF		1
+
+static unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+static bool is_hugetlb_allocated(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return false;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugetlb:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+
+	if (hps > 0)
+		return true;
+
+	return false;
+}
+
+static void write_sysfs(char *str, unsigned long val)
+{
+	FILE *f;
+
+	f = fopen(str, "w");
+	if (!f) {
+		ksft_print_msg("ERR: missing %s\n", str);
+		return;
+	}
+	fprintf(f, "%lu", val);
+	fclose(f);
+}
+
+static void allocate_hugetlb(void)
+{
+	write_sysfs("/proc/sys/vm/nr_hugepages", 2);	/* request a pool of two huge pages */
+}
+
+static void free_hugetlb(void)
+{
+	write_sysfs("/proc/sys/vm/nr_hugepages", 0);	/* set the huge page pool size to zero */
+}
+
+static int check_child_tag_inheritance(char *ptr, int size, int mode)
+{
+	int i, parent_tag, child_tag, fault, child_status;
+	pid_t child;
+
+	parent_tag = MT_FETCH_TAG((uintptr_t)ptr);
+	fault = 0;
+
+	child = fork();
+	if (child == -1) {
+		ksft_print_msg("FAIL: child process creation\n");
+		return KSFT_FAIL;
+	} else if (child == 0) {
+		mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+		/* Do copy on write */
+		memset(ptr, '1', size);
+		mte_wait_after_trig();
+		if (cur_mte_cxt.fault_valid == true) {
+			fault = 1;
+			goto check_child_tag_inheritance_err;
+		}
+		for (i = 0; i < size; i += MT_GRANULE_SIZE) {
+			child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i)));
+			if (parent_tag != child_tag) {
+				ksft_print_msg("FAIL: child mte tag (%d) mismatch\n", i);
+				fault = 1;
+				goto check_child_tag_inheritance_err;
+			}
+		}
+check_child_tag_inheritance_err:
+		_exit(fault);
+	}
+	/* Wait for child process to terminate */
+	wait(&child_status);
+	if (WIFEXITED(child_status))
+		fault = WEXITSTATUS(child_status);
+	else
+		fault = 1;
+	return (fault) ? KSFT_FAIL : KSFT_PASS;
+}
+
+static int check_mte_memory(char *ptr, int size, int mode, int tag_check)
+{
+	mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+	memset(ptr, '1', size);
+	mte_wait_after_trig();
+	if (cur_mte_cxt.fault_valid == true)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
+
+static int check_hugetlb_memory_mapping(int mem_type, int mode, int mapping, int tag_check)
+{
+	char *ptr, *map_ptr;
+	int result;
+	unsigned long map_size;
+
+	map_size = default_huge_page_size();
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false);
+	if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS)
+		return KSFT_FAIL;
+
+	mte_initialize_current_context(mode, (uintptr_t)map_ptr, map_size);
+	/* Only mte enabled memory will allow tag insertion */
+	ptr = mte_insert_tags((void *)map_ptr, map_size);
+	if (!ptr || cur_mte_cxt.fault_valid == true) {
+		ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n");
+		munmap((void *)map_ptr, map_size);
+		return KSFT_FAIL;
+	}
+	result = check_mte_memory(ptr, map_size, mode, tag_check);
+	mte_clear_tags((void *)ptr, map_size);
+	mte_free_memory((void *)map_ptr, map_size, mem_type, false);
+	if (result == KSFT_FAIL)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
+
+static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping)
+{
+	char *map_ptr;
+	int prot_flag, result;
+	unsigned long map_size;
+
+	prot_flag = PROT_READ | PROT_WRITE;
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	map_size = default_huge_page_size();
+	map_ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping,
+							0, 0);
+	if (check_allocated_memory_range(map_ptr, map_size, mem_type,
+					 0, 0) != KSFT_PASS)
+		return KSFT_FAIL;
+	/* Try to clear PROT_MTE property and verify it by tag checking */
+	if (mprotect(map_ptr, map_size, prot_flag)) {
+		mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type,
+					  0, 0);
+		ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n");
+		return KSFT_FAIL;
+	}
+	result = check_mte_memory(map_ptr, map_size, mode, TAG_CHECK_ON);
+	mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, 0, 0);
+	if (result != KSFT_PASS)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
+
+static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mapping)
+{
+	char *ptr;
+	int result;
+	unsigned long map_size;
+
+	map_size = default_huge_page_size();
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping,
+						    0, 0);
+	if (check_allocated_memory_range(ptr, map_size, mem_type,
+					 0, 0) != KSFT_PASS)
+		return KSFT_FAIL;
+	result = check_child_tag_inheritance(ptr, map_size, mode);
+	mte_free_memory_tag_range((void *)ptr, map_size, mem_type, 0, 0);
+	if (result == KSFT_FAIL)
+		return result;
+
+	return KSFT_PASS;
+}
+
+int main(int argc, char *argv[])
+{
+	int err;
+
+	err = mte_default_setup();
+	if (err)
+		return err;
+
+	/* Route SIGBUS and SIGSEGV to mte_default_handler */
+	mte_register_signal(SIGBUS, mte_default_handler);
+	mte_register_signal(SIGSEGV, mte_default_handler);
+
+	allocate_hugetlb();
+	/* Give up early if /proc/meminfo reports no hugetlb memory */
+	if (!is_hugetlb_allocated()) {
+		ksft_print_msg("ERR: Unable to allocate hugetlb pages\n");
+		return KSFT_FAIL;
+	}
+
+	/* The plan must equal the number of evaluate_test() calls below */
+	ksft_set_plan(12);
+
+	mte_enable_pstate_tco();
+
+	evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+	"Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check off\n");
+
+	mte_disable_pstate_tco();
+	evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+	"Check hugetlb memory with private mapping, no error mode, mmap memory and tag check off\n");
+
+	evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+	"Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check on\n");
+	evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+	"Check hugetlb memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+	evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+	"Check hugetlb memory with private mapping, async error mode, mmap memory and tag check on\n");
+	evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+	"Check hugetlb memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+	evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+	"Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n");
+	evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+	"Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n");
+
+	evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+		"Check child hugetlb memory with private mapping, sync error mode and mmap memory\n");
+	evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+		"Check child hugetlb memory with private mapping, async error mode and mmap memory\n");
+	evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+		"Check child hugetlb memory with private mapping, sync error mode and mmap/mprotect memory\n");
+	evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+		"Check child hugetlb memory with private mapping, async error mode and mmap/mprotect memory\n");
+
+	mte_restore_setup();
+	free_hugetlb();
+	ksft_print_cnts();
+	return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
-- 
cgit v1.2.3


From 48f8d9cef766f8ed4bbccc0d759710262d34f40b Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Sat, 5 Oct 2024 01:17:18 +0100
Subject: kselftest/arm64: Validate that GCS push and write permissions work

Add trivial assembly programs which give themselves the appropriate
permissions and then execute GCSPUSHM and GCSSTR. Errors are reported by
the signals generated when an instruction is not permitted. Not using libc
minimises the interaction with any policy set for the system, but we skip
if we fail to get the permissions in case the system is locked down to
make them inaccessible.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241005-arm64-gcs-test-flags-v1-1-03cb9786c5cd@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/.gitignore |  2 +
 tools/testing/selftests/arm64/gcs/Makefile   |  8 ++-
 tools/testing/selftests/arm64/gcs/gcspushm.S | 96 +++++++++++++++++++++++++++
 tools/testing/selftests/arm64/gcs/gcsstr.S   | 99 ++++++++++++++++++++++++++++
 4 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/gcs/gcspushm.S
 create mode 100644 tools/testing/selftests/arm64/gcs/gcsstr.S

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
index 1e8d1f6b27f2..bbb8e40a7e52 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -3,3 +3,5 @@ libc-gcs
 gcs-locking
 gcs-stress
 gcs-stress-thread
+gcspushm
+gcsstr
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index d8b06ca51e22..d2f3497a9103 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,7 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress gcspushm gcsstr
 TEST_GEN_PROGS_EXTENDED := gcs-stress-thread
 
 LDLIBS+=-lpthread
@@ -22,3 +22,9 @@ $(OUTPUT)/basic-gcs: basic-gcs.c
 
 $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
 	$(CC) -nostdlib $^ -o $@
+
+$(OUTPUT)/gcspushm: gcspushm.S
+	$(CC) -nostdlib $^ -o $@
+
+$(OUTPUT)/gcsstr: gcsstr.S
+	$(CC) -nostdlib $^ -o $@
diff --git a/tools/testing/selftests/arm64/gcs/gcspushm.S b/tools/testing/selftests/arm64/gcs/gcspushm.S
new file mode 100644
index 000000000000..bbe17c1325ac
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcspushm.S
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright 2024 Arm Limited
+//
+// Give ourselves GCS push permissions then use them
+
+#include <asm/unistd.h>
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS	74
+#define PR_SET_SHADOW_STACK_STATUS      75
+#define PR_LOCK_SHADOW_STACK_STATUS     76
+
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+# define PR_SHADOW_STACK_WRITE		(1UL << 1)
+# define PR_SHADOW_STACK_PUSH		(1UL << 2)
+
+#define KSFT_SKIP 4
+
+.macro function name
+	.macro endfunction
+		.type \name, @function
+		.purgem endfunction
+	.endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+	str	x0, [sp, #-16]!
+
+	mov	x0, #1			// STDOUT_FILENO
+	mov	x1, sp
+	mov	x2, #1
+	mov	x8, #__NR_write
+	svc	#0
+
+	add	sp, sp, #16
+	ret
+endfunction
+.globl	putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+	mov	x1, x0
+
+	mov	x2, #0
+0:	ldrb	w3, [x0], #1
+	cbz	w3, 1f
+	add	x2, x2, #1
+	b	0b
+
+1:	mov	w0, #1			// STDOUT_FILENO
+	mov	x8, #__NR_write
+	svc	#0
+
+	ret
+endfunction
+.globl	puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+	.pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+	.popsection
+
+	ldr	x0, =.L__puts_literal\@
+	bl	puts
+.endm
+
+.globl _start
+function _start
+	// Run with GCS
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH
+	mov	x2, xzr
+	mov	x3, xzr
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x8, #__NR_prctl
+	svc	#0
+	cbz	x0, 1f
+	puts	"Failed to enable GCS with push permission\n"
+	mov	x0, #KSFT_SKIP
+	b	2f
+1:
+	sys	#3, c7, c7, #0, x0	// GCSPUSHM
+	sysl	x0, #3, c7, c7, #1	// GCSPOPM
+
+	mov	x0, #0
+2:
+	mov	x8, #__NR_exit
+	svc	#0
diff --git a/tools/testing/selftests/arm64/gcs/gcsstr.S b/tools/testing/selftests/arm64/gcs/gcsstr.S
new file mode 100644
index 000000000000..a42bba6e30b1
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcsstr.S
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright 2024 Arm Limited
+//
+// Give ourselves GCS write permissions then use them
+
+#include <asm/unistd.h>
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS	74
+#define PR_SET_SHADOW_STACK_STATUS      75
+#define PR_LOCK_SHADOW_STACK_STATUS     76
+
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+# define PR_SHADOW_STACK_WRITE		(1UL << 1)
+# define PR_SHADOW_STACK_PUSH		(1UL << 2)
+
+#define	GCSPR_EL0 S3_3_C2_C5_1
+
+#define KSFT_SKIP 4
+
+.macro function name
+	.macro endfunction
+		.type \name, @function
+		.purgem endfunction
+	.endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+	str	x0, [sp, #-16]!
+
+	mov	x0, #1			// STDOUT_FILENO
+	mov	x1, sp
+	mov	x2, #1
+	mov	x8, #__NR_write
+	svc	#0
+
+	add	sp, sp, #16
+	ret
+endfunction
+.globl	putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+	mov	x1, x0
+
+	mov	x2, #0
+0:	ldrb	w3, [x0], #1
+	cbz	w3, 1f
+	add	x2, x2, #1
+	b	0b
+
+1:	mov	w0, #1			// STDOUT_FILENO
+	mov	x8, #__NR_write
+	svc	#0
+
+	ret
+endfunction
+.globl	puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+	.pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+	.popsection
+
+	ldr	x0, =.L__puts_literal\@
+	bl	puts
+.endm
+
+.globl _start
+function _start
+	// Run with GCS
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE
+	mov	x2, xzr
+	mov	x3, xzr
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x8, #__NR_prctl
+	svc	#0
+	cbz	x0, 1f
+	puts	"Failed to enable GCS with write permission\n"
+	mov	x0, #KSFT_SKIP
+	b	2f
+1:
+	mrs	x0, GCSPR_EL0
+	sub	x0, x0, #8
+	.inst	0xd91f1c01	// GCSSTR x1, x0
+
+	mov	x0, #0
+2:
+	mov	x8, #__NR_exit
+	svc	#0
-- 
cgit v1.2.3


From 9b9be78258511e67767e4aa51f587cf22feb5065 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 11 Oct 2024 15:36:25 +0100
Subject: kselftest/arm64: Ensure stable names for GCS stress test results

The GCS stress test program currently uses the PID of the threads it
creates in the test names it reports, resulting in unstable test names
between runs. Fix this by using a thread number instead.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241011-arm64-gcs-stress-stable-name-v1-1-4950f226218e@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/gcs-stress.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c
index bdec7ee8cfd5..03222c36c436 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-stress.c
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c
@@ -56,7 +56,7 @@ static int num_processors(void)
 	return nproc;
 }
 
-static void start_thread(struct child_data *child)
+static void start_thread(struct child_data *child, int id)
 {
 	int ret, pipefd[2], i;
 	struct epoll_event ev;
@@ -132,7 +132,7 @@ static void start_thread(struct child_data *child)
 		ev.events = EPOLLIN | EPOLLHUP;
 		ev.data.ptr = child;
 
-		ret = asprintf(&child->name, "Thread-%d", child->pid);
+		ret = asprintf(&child->name, "Thread-%d", id);
 		if (ret == -1)
 			ksft_exit_fail_msg("asprintf() failed\n");
 
@@ -437,7 +437,7 @@ int main(int argc, char **argv)
 				   tests);
 
 	for (i = 0; i < gcs_threads; i++)
-		start_thread(&children[i]);
+		start_thread(&children[i], i);
 
 	/*
 	 * All children started, close the startup pipe and let them
-- 
cgit v1.2.3


From a496d0cdc84d81fbfd2026ef41c8ae9385d01fbb Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 10 Oct 2024 16:25:05 -0700
Subject: selftests/bpf: Add a test for kmem_cache_iter

The test traverses all slab caches using kmem_cache_iter and saves the
data into the slab_result array map.  It also checks that the current task's
pointer is from the "task_struct" slab cache using bpf_get_kmem_cache().

Also compare the result array with /proc/slabinfo if available (when
CONFIG_SLUB_DEBUG is on).  Note that many of the fields in the slabinfo
are transient, so it only compares the name and objsize fields.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20241010232505.1339892-4-namhyung@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/kmem_cache_iter.c     | 115 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_iter.h       |   7 ++
 .../testing/selftests/bpf/progs/kmem_cache_iter.c  |  87 ++++++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
 create mode 100644 tools/testing/selftests/bpf/progs/kmem_cache_iter.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
new file mode 100644
index 000000000000..848d8fc9171f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google */
+
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+#include "kmem_cache_iter.skel.h"
+
+#define SLAB_NAME_MAX  32
+
+struct kmem_cache_result {
+	char name[SLAB_NAME_MAX];
+	long obj_size;
+};
+
+static void subtest_kmem_cache_iter_check_task_struct(struct kmem_cache_iter *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.flags = 0,  /* Run it with the current task */
+	);
+	int prog_fd = bpf_program__fd(skel->progs.check_task_struct);
+
+	/* Get task_struct and check if it's from a slab cache */
+	ASSERT_OK(bpf_prog_test_run_opts(prog_fd, &opts), "prog_test_run");
+
+	/* The BPF program should set 'task_struct_found' to 1 on success */
+	ASSERT_EQ(skel->bss->task_struct_found, 1, "task_struct_found");
+}
+
+static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel)
+{
+	FILE *fp;
+	int map_fd;
+	char name[SLAB_NAME_MAX];	/* "%31s" below must stay SLAB_NAME_MAX - 1 */
+	unsigned long objsize;
+	char rest_of_line[1000];	/* "%999[^\n]" below must stay size - 1 */
+	struct kmem_cache_result r;
+	int seen = 0;	/* number of slabinfo entries matched; also the map index */
+
+	fp = fopen("/proc/slabinfo", "r");
+	if (fp == NULL) {
+		/* CONFIG_SLUB_DEBUG is not enabled */
+		return;
+	}
+
+	map_fd = bpf_map__fd(skel->maps.slab_result);
+
+	/* Ignore first two lines for header */
+	fscanf(fp, "slabinfo - version: %*d.%*d\n");
+	fscanf(fp, "# %*s %*s %*s %*s %*s %*s : %999[^\n]\n", rest_of_line);
+
+	/* Compare name and objsize only - others can change frequently */
+	while (fscanf(fp, "%31s %*u %*u %lu %*u %*u : %999[^\n]\n",
+		      name, &objsize, rest_of_line) == 3) {
+		int ret = bpf_map_lookup_elem(map_fd, &seen, &r);
+
+		if (!ASSERT_OK(ret, "kmem_cache_lookup"))
+			break;
+
+		ASSERT_STREQ(r.name, name, "kmem_cache_name");
+		ASSERT_EQ(r.obj_size, objsize, "kmem_cache_objsize");
+
+		seen++;
+	}
+
+	ASSERT_EQ(skel->bss->kmem_cache_seen, seen, "kmem_cache_seen_eq");
+
+	fclose(fp);
+}
+
+void test_kmem_cache_iter(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct kmem_cache_iter *skel = NULL;
+	union bpf_iter_link_info linfo = {};
+	struct bpf_link *link;
+	char buf[256];
+	int iter_fd, len;
+
+	skel = kmem_cache_iter__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kmem_cache_iter__open_and_load"))
+		return;
+
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.slab_info_collector, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		goto destroy;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_GE(iter_fd, 0, "iter_create"))
+		goto free_link;
+
+	memset(buf, 0, sizeof(buf));
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0) {
+		/* Read out all contents; print only the bytes just read */
+		printf("%.*s", len, buf);
+	}
+
+	/* Next reads should return 0 */
+	ASSERT_EQ(read(iter_fd, buf, sizeof(buf)), 0, "read");
+
+	if (test__start_subtest("check_task_struct"))
+		subtest_kmem_cache_iter_check_task_struct(skel);
+	if (test__start_subtest("check_slabinfo"))
+		subtest_kmem_cache_iter_check_slabinfo(skel);
+
+	close(iter_fd);
+
+free_link:
+	bpf_link__destroy(link);
+destroy:
+	kmem_cache_iter__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
index c41ee80533ca..3305dc3a74b3 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
@@ -24,6 +24,7 @@
 #define BTF_F_PTR_RAW BTF_F_PTR_RAW___not_used
 #define BTF_F_ZERO BTF_F_ZERO___not_used
 #define bpf_iter__ksym bpf_iter__ksym___not_used
+#define bpf_iter__kmem_cache bpf_iter__kmem_cache___not_used
 #include "vmlinux.h"
 #undef bpf_iter_meta
 #undef bpf_iter__bpf_map
@@ -48,6 +49,7 @@
 #undef BTF_F_PTR_RAW
 #undef BTF_F_ZERO
 #undef bpf_iter__ksym
+#undef bpf_iter__kmem_cache
 
 struct bpf_iter_meta {
 	struct seq_file *seq;
@@ -165,3 +167,8 @@ struct bpf_iter__ksym {
 	struct bpf_iter_meta *meta;
 	struct kallsym_iter *ksym;
 };
+
+struct bpf_iter__kmem_cache {
+	struct bpf_iter_meta *meta;
+	struct kmem_cache *s;
+} __attribute__((preserve_access_index));
diff --git a/tools/testing/selftests/bpf/progs/kmem_cache_iter.c b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
new file mode 100644
index 000000000000..72c9dafecd98
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google */
+
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define SLAB_NAME_MAX  32
+
+struct kmem_cache_result {
+	char name[SLAB_NAME_MAX];
+	long obj_size;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(void *));
+	__uint(value_size, SLAB_NAME_MAX);
+	__uint(max_entries, 1);
+} slab_hash SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(struct kmem_cache_result));
+	__uint(max_entries, 1024);
+} slab_result SEC(".maps");
+
+extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym;
+
+/* Result, will be checked by userspace */
+int task_struct_found;
+int kmem_cache_seen;
+
+SEC("iter/kmem_cache")
+int slab_info_collector(struct bpf_iter__kmem_cache *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct kmem_cache *s = ctx->s;
+	struct kmem_cache_result *r;
+	int idx;
+
+	if (s) {
+		/* Print to the seq file to check that the slab iterator
+		 * implements the seq interface; also useful for debugging.
+		 */
+		BPF_SEQ_PRINTF(seq, "%s: %u\n", s->name, s->size);
+
+		idx = kmem_cache_seen;	/* next unused slot in slab_result */
+		r = bpf_map_lookup_elem(&slab_result, &idx);
+		if (r == NULL)	/* no slot for this index; cache is not counted */
+			return 0;
+
+		kmem_cache_seen++;
+
+		/* Save name and size to match /proc/slabinfo */
+		bpf_probe_read_kernel_str(r->name, sizeof(r->name), s->name);
+		r->obj_size = s->size;
+
+		if (!bpf_strncmp(r->name, 11, "task_struct"))	/* 11 == strlen("task_struct") */
+			bpf_map_update_elem(&slab_hash, &s, r->name, BPF_NOEXIST);
+	}
+
+	return 0;
+}
+
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(check_task_struct)
+{
+	u64 curr = bpf_get_current_task();
+	struct kmem_cache *s;
+	char *name;
+
+	s = bpf_get_kmem_cache(curr);
+	if (s == NULL) {
+		task_struct_found = -1;	/* no slab cache returned for the task */
+		return 0;
+	}
+	name = bpf_map_lookup_elem(&slab_hash, &s);	/* keyed by the cache pointer */
+	if (name && !bpf_strncmp(name, 11, "task_struct"))
+		task_struct_found = 1;	/* cache was recorded as "task_struct" */
+	else
+		task_struct_found = -2;	/* cache not recorded, or name mismatch */
+	return 0;
+}
-- 
cgit v1.2.3


From f987a640e853e96c85f8193d0c2f79744622e3d8 Mon Sep 17 00:00:00 2001
From: Juntong Deng <juntong.deng@outlook.com>
Date: Mon, 14 Oct 2024 10:25:53 +0100
Subject: selftests/bpf: Add tests for bpf_task_from_vpid() kfunc

This patch adds test cases for bpf_task_from_vpid() kfunc.

task_kfunc_from_vpid_no_null_check is used to test the case where
the return value is not checked for NULL pointer.

test_task_from_vpid_current is used to test obtaining the
struct task_struct of the process in the pid namespace based on vpid.

test_task_from_vpid_invalid is used to test the case of invalid vpid.

test_task_from_vpid_current and test_task_from_vpid_invalid will run
in the new namespace.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
Link: https://lore.kernel.org/r/AM6PR03MB5848F13435CD650AC4B7BD7099442@AM6PR03MB5848.eurprd03.prod.outlook.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/task_kfunc.c  | 80 ++++++++++++++++++++++
 .../selftests/bpf/progs/task_kfunc_common.h        |  1 +
 .../selftests/bpf/progs/task_kfunc_failure.c       | 14 ++++
 .../selftests/bpf/progs/task_kfunc_success.c       | 51 ++++++++++++++
 4 files changed, 146 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
index d4579f735398..83b90335967a 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
@@ -68,6 +68,74 @@ cleanup:
 	task_kfunc_success__destroy(skel);
 }
 
+static int run_vpid_test(void *prog_name)
+{
+	struct task_kfunc_success *skel;
+	struct bpf_program *prog;
+	int prog_fd, err = 0;	/* err: 1-6 = setup failure, 8 and up = BPF-side error */
+
+	if (getpid() != 1)	/* caller clones with CLONE_NEWPID, so pid 1 is expected */
+		return 1;
+
+	skel = open_load_task_kfunc_skel();
+	if (!skel)
+		return 2;
+
+	if (skel->bss->err) {
+		err = 3;
+		goto cleanup;
+	}
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!prog) {
+		err = 4;
+		goto cleanup;
+	}
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		err = 5;
+		goto cleanup;
+	}
+
+	if (bpf_prog_test_run_opts(prog_fd, NULL)) {
+		err = 6;
+		goto cleanup;
+	}
+
+	if (skel->bss->err)
+		err = 7 + skel->bss->err;	/* offset by 7 so the caller can tell it apart */
+cleanup:
+	task_kfunc_success__destroy(skel);
+	return err;
+}
+
+static void run_vpid_success_test(const char *prog_name)
+{
+	const int stack_size = 1024 * 1024;
+	int child_pid, wstatus;
+	char *stack;
+
+	stack = (char *)malloc(stack_size);
+	if (!ASSERT_OK_PTR(stack, "clone_stack"))
+		return;
+
+	child_pid = clone(run_vpid_test, stack + stack_size,	/* pass the top of the stack */
+			  CLONE_NEWPID | SIGCHLD, (void *)prog_name);
+	if (!ASSERT_GT(child_pid, -1, "child_pid"))
+		goto cleanup;
+
+	if (!ASSERT_GT(waitpid(child_pid, &wstatus, 0), -1, "waitpid") ||
+	    !ASSERT_TRUE(WIFEXITED(wstatus), "child_exited")) /* signal death is a failure */
+		goto cleanup;
+	if (WEXITSTATUS(wstatus) > 7)	/* above 7: error reported by the BPF program */
+		ASSERT_OK(WEXITSTATUS(wstatus) - 7, "vpid_test_failure");
+	else
+		ASSERT_OK(WEXITSTATUS(wstatus), "run_vpid_test_err");
+cleanup:
+	free(stack);
+}
+
 static const char * const success_tests[] = {
 	"test_task_acquire_release_argument",
 	"test_task_acquire_release_current",
@@ -83,6 +151,11 @@ static const char * const success_tests[] = {
 	"test_task_kfunc_flavor_relo_not_found",
 };
 
+static const char * const vpid_success_tests[] = {
+	"test_task_from_vpid_current",
+	"test_task_from_vpid_invalid",
+};
+
 void test_task_kfunc(void)
 {
 	int i;
@@ -94,5 +167,12 @@ void test_task_kfunc(void)
 		run_success_test(success_tests[i]);
 	}
 
+	for (i = 0; i < ARRAY_SIZE(vpid_success_tests); i++) {
+		if (!test__start_subtest(vpid_success_tests[i]))
+			continue;
+
+		run_vpid_success_test(vpid_success_tests[i]);
+	}
+
 	RUN_TESTS(task_kfunc_failure);
 }
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
index 6720c4b5be41..e9c4fea7a4bb 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
@@ -23,6 +23,7 @@ struct {
 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
 void bpf_task_release(struct task_struct *p) __ksym;
 struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_from_vpid(s32 vpid) __ksym;
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
 
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index ad88a3796ddf..4c07ea193f72 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -247,6 +247,20 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 	return 0;
 }
 
+SEC("tp_btf/task_newtask")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
+int BPF_PROG(task_kfunc_from_vpid_no_null_check, struct task_struct *task, u64 clone_flags)
+{
+	struct task_struct *acquired;
+
+	acquired = bpf_task_from_vpid(task->pid);
+
+	/* Release the bpf_task_from_vpid() result without a NULL check; must fail to load. */
+	bpf_task_release(acquired);
+
+	return 0;
+}
+
 SEC("lsm/task_free")
 __failure __msg("R1 must be a rcu pointer")
 int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
index a55149015063..5fb4fc19d26a 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
@@ -366,3 +366,54 @@ int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 cl
 
 	return 0;
 }
+
+SEC("syscall")
+int test_task_from_vpid_current(const void *ctx)
+{
+	struct task_struct *current, *v_task;
+
+	v_task = bpf_task_from_vpid(1);
+	if (!v_task) {
+		err = 1;	/* no task found for vpid 1 */
+		return 0;
+	}
+
+	current = bpf_get_current_task_btf();
+
+	/* The current process should be the init process (pid 1) in the new pid namespace. */
+	if (current != v_task)
+		err = 2;	/* vpid 1 resolved to a different task */
+
+	bpf_task_release(v_task);	/* release the task returned by bpf_task_from_vpid() */
+	return 0;
+}
+
+SEC("syscall")
+int test_task_from_vpid_invalid(const void *ctx)
+{
+	struct task_struct *v_task;
+
+	v_task = bpf_task_from_vpid(-1);
+	if (v_task) {
+		err = 1;	/* negative vpid unexpectedly resolved to a task */
+		goto err;
+	}
+
+	/* There should be only one process (current process) in the new pid namespace. */
+	v_task = bpf_task_from_vpid(2);
+	if (v_task) {
+		err = 2;	/* vpid 2 unexpectedly resolved to a task */
+		goto err;
+	}
+
+	v_task = bpf_task_from_vpid(9999);
+	if (v_task) {
+		err = 3;	/* vpid 9999 unexpectedly resolved to a task */
+		goto err;
+	}
+
+	return 0;
+err:
+	bpf_task_release(v_task);	/* only reached with a non-NULL v_task */
+	return 0;
+}
-- 
cgit v1.2.3


From 021611d33e78694f4bd54573093c6fc70a812644 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Tue, 15 Oct 2024 23:02:07 +0800
Subject: selftests/bpf: Add test to verify tailcall and freplace restrictions

Add a test case to ensure that attaching a tail callee program with an
freplace program fails, and that updating an extended program to a
prog_array map is also prohibited.

This test is designed to prevent the potential infinite loop issue caused
by the combination of tail calls and freplace, ensuring the correct
behavior and stability of the system.

Additionally, fix the broken tailcalls/tailcall_freplace selftest
because an extension prog should not be tailcalled.

cd tools/testing/selftests/bpf; ./test_progs -t tailcalls
337/25  tailcalls/tailcall_freplace:OK
337/26  tailcalls/tailcall_bpf2bpf_freplace:OK
337     tailcalls:OK
Summary: 1/26 PASSED, 0 SKIPPED, 0 FAILED

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20241015150207.70264-3-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/tailcalls.c | 120 ++++++++++++++++++---
 tools/testing/selftests/bpf/progs/tc_bpf2bpf.c     |   5 +-
 2 files changed, 109 insertions(+), 16 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 21c5a37846ad..40f22454cf05 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -1496,8 +1496,8 @@ static void test_tailcall_bpf2bpf_hierarchy_3(void)
 	RUN_TESTS(tailcall_bpf2bpf_hierarchy3);
 }
 
-/* test_tailcall_freplace checks that the attached freplace prog is OK to
- * update the prog_array map.
+/* test_tailcall_freplace checks that the freplace prog fails to update the
+ * prog_array map, no matter whether the freplace prog attaches to its target.
  */
 static void test_tailcall_freplace(void)
 {
@@ -1505,7 +1505,7 @@ static void test_tailcall_freplace(void)
 	struct bpf_link *freplace_link = NULL;
 	struct bpf_program *freplace_prog;
 	struct tc_bpf2bpf *tc_skel = NULL;
-	int prog_fd, map_fd;
+	int prog_fd, tc_prog_fd, map_fd;
 	char buff[128] = {};
 	int err, key;
 
@@ -1523,9 +1523,10 @@ static void test_tailcall_freplace(void)
 	if (!ASSERT_OK_PTR(tc_skel, "tc_bpf2bpf__open_and_load"))
 		goto out;
 
-	prog_fd = bpf_program__fd(tc_skel->progs.entry_tc);
+	tc_prog_fd = bpf_program__fd(tc_skel->progs.entry_tc);
 	freplace_prog = freplace_skel->progs.entry_freplace;
-	err = bpf_program__set_attach_target(freplace_prog, prog_fd, "subprog");
+	err = bpf_program__set_attach_target(freplace_prog, tc_prog_fd,
+					     "subprog_tc");
 	if (!ASSERT_OK(err, "set_attach_target"))
 		goto out;
 
@@ -1533,27 +1534,116 @@ static void test_tailcall_freplace(void)
 	if (!ASSERT_OK(err, "tailcall_freplace__load"))
 		goto out;
 
-	freplace_link = bpf_program__attach_freplace(freplace_prog, prog_fd,
-						     "subprog");
+	map_fd = bpf_map__fd(freplace_skel->maps.jmp_table);
+	prog_fd = bpf_program__fd(freplace_prog);
+	key = 0;
+	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
+	ASSERT_ERR(err, "update jmp_table failure");
+
+	freplace_link = bpf_program__attach_freplace(freplace_prog, tc_prog_fd,
+						     "subprog_tc");
 	if (!ASSERT_OK_PTR(freplace_link, "attach_freplace"))
 		goto out;
 
-	map_fd = bpf_map__fd(freplace_skel->maps.jmp_table);
-	prog_fd = bpf_program__fd(freplace_prog);
+	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
+	ASSERT_ERR(err, "update jmp_table failure");
+
+out:
+	bpf_link__destroy(freplace_link);
+	tailcall_freplace__destroy(freplace_skel);
+	tc_bpf2bpf__destroy(tc_skel);
+}
+
+/* test_tailcall_bpf2bpf_freplace checks that attaching a freplace prog to a
+ * tail callee prog fails, and that updating a prog_array map with an
+ * extended prog fails.
+ */
+static void test_tailcall_bpf2bpf_freplace(void)
+{
+	struct tailcall_freplace *freplace_skel = NULL;
+	struct bpf_link *freplace_link = NULL;
+	struct tc_bpf2bpf *tc_skel = NULL;
+	char buff[128] = {};
+	int prog_fd, map_fd;
+	int err, key;
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		    .data_in = buff,
+		    .data_size_in = sizeof(buff),
+		    .repeat = 1,
+	);
+
+	tc_skel = tc_bpf2bpf__open_and_load();
+	if (!ASSERT_OK_PTR(tc_skel, "tc_bpf2bpf__open_and_load"))
+		goto out;
+
+	prog_fd = bpf_program__fd(tc_skel->progs.entry_tc);
+	freplace_skel = tailcall_freplace__open();
+	if (!ASSERT_OK_PTR(freplace_skel, "tailcall_freplace__open"))
+		goto out;
+
+	err = bpf_program__set_attach_target(freplace_skel->progs.entry_freplace,
+					     prog_fd, "subprog_tc");
+	if (!ASSERT_OK(err, "set_attach_target"))
+		goto out;
+
+	err = tailcall_freplace__load(freplace_skel);
+	if (!ASSERT_OK(err, "tailcall_freplace__load"))
+		goto out;
+
+	/* OK to attach then detach freplace prog. */
+
+	freplace_link = bpf_program__attach_freplace(freplace_skel->progs.entry_freplace,
+						     prog_fd, "subprog_tc");
+	if (!ASSERT_OK_PTR(freplace_link, "attach_freplace"))
+		goto out;
+
+	err = bpf_link__destroy(freplace_link);
+	freplace_link = NULL; /* link is gone; "out" must not destroy it again */
+	if (!ASSERT_OK(err, "destroy link"))
+		goto out;
+
+	/* OK to update prog_array map then delete element from the map. */
 	key = 0;
+	map_fd = bpf_map__fd(freplace_skel->maps.jmp_table);
 	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
 	if (!ASSERT_OK(err, "update jmp_table"))
 		goto out;
 
-	prog_fd = bpf_program__fd(tc_skel->progs.entry_tc);
-	err = bpf_prog_test_run_opts(prog_fd, &topts);
-	ASSERT_OK(err, "test_run");
-	ASSERT_EQ(topts.retval, 34, "test_run retval");
+	err = bpf_map_delete_elem(map_fd, &key);
+	if (!ASSERT_OK(err, "delete_elem from jmp_table"))
+		goto out;
+
+	/* Fail to attach a tail callee prog with freplace prog. */
+
+	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "update jmp_table"))
+		goto out;
+
+	freplace_link = bpf_program__attach_freplace(freplace_skel->progs.entry_freplace,
+						     prog_fd, "subprog_tc");
+	if (!ASSERT_ERR_PTR(freplace_link, "attach_freplace failure"))
+		goto out;
+
+	err = bpf_map_delete_elem(map_fd, &key);
+	if (!ASSERT_OK(err, "delete_elem from jmp_table"))
+		goto out;
+
+	/* Fail to update an extended prog to prog_array map. */
+
+	freplace_link = bpf_program__attach_freplace(freplace_skel->progs.entry_freplace,
+						     prog_fd, "subprog_tc");
+	if (!ASSERT_OK_PTR(freplace_link, "attach_freplace"))
+		goto out;
+
+	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
+	if (!ASSERT_ERR(err, "update jmp_table failure"))
+		goto out;
 
 out:
 	bpf_link__destroy(freplace_link);
-	tc_bpf2bpf__destroy(tc_skel);
 	tailcall_freplace__destroy(freplace_skel);
+	tc_bpf2bpf__destroy(tc_skel);
 }
 
 void test_tailcalls(void)
@@ -1606,4 +1696,6 @@ void test_tailcalls(void)
 	test_tailcall_bpf2bpf_hierarchy_3();
 	if (test__start_subtest("tailcall_freplace"))
 		test_tailcall_freplace();
+	if (test__start_subtest("tailcall_bpf2bpf_freplace"))
+		test_tailcall_bpf2bpf_freplace();
 }
diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c
index 8a0632c37839..d1a57f7d09bd 100644
--- a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c
@@ -5,10 +5,11 @@
 #include "bpf_misc.h"
 
 __noinline
-int subprog(struct __sk_buff *skb)
+int subprog_tc(struct __sk_buff *skb)
 {
 	int ret = 1;
 
+	__sink(skb);
 	__sink(ret);
 	return ret;
 }
@@ -16,7 +17,7 @@ int subprog(struct __sk_buff *skb)
 SEC("tc")
 int entry_tc(struct __sk_buff *skb)
 {
-	return subprog(skb);
+	return subprog_tc(skb);
 }
 
 char __license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From ee9b352ce4650ffc0d8ca0ac373d7c009c7e561e Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Sat, 12 Oct 2024 20:37:30 +0000
Subject: selftests/bpf: Fix msg_verify_data in test_sockmap

Function msg_verify_data should carry the context of bytes_cnt and k across
calls instead of assuming they are zero. Otherwise, test_sockmap with the
data integrity test will report some errors. Also fix the logic related to
size and index j.

1/ 6  sockmap::txmsg test passthrough:FAIL
2/ 6  sockmap::txmsg test redirect:FAIL
7/12  sockmap::txmsg test apply:FAIL
10/11  sockmap::txmsg test push_data:FAIL
11/17  sockmap::txmsg test pull-data:FAIL
12/ 9  sockmap::txmsg test pop-data:FAIL
13/ 1  sockmap::txmsg test push/pop data:FAIL
...
Pass: 24 Fail: 52

After applying this patch, some of the errors are resolved, but push,
pull and pop may need more fixes to msg_verify_data; a TODO is added.

10/11  sockmap::txmsg test push_data:FAIL
11/17  sockmap::txmsg test pull-data:FAIL
12/ 9  sockmap::txmsg test pop-data:FAIL
...
Pass: 37 Fail: 15

Besides, add a custom errno EDATAINTEGRITY for msg_verify_data, since
this error must not be ignored in the txmsg_cork case.

Fixes: 753fb2ee0934 ("bpf: sockmap, add msg_peek tests to test_sockmap")
Fixes: 16edddfe3c5d ("selftests/bpf: test_sockmap, check test failure")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Link: https://lore.kernel.org/r/20241012203731.1248619-2-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 3e02d7267de8..8249f3c1fbd6 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -56,6 +56,8 @@ static void running_handler(int a);
 #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.bpf.o"
 #define CG_PATH "/sockmap"
 
+#define EDATAINTEGRITY 2001
+
 /* global sockets */
 int s1, s2, c1, c2, p1, p2;
 int test_cnt;
@@ -510,23 +512,25 @@ unwind_iov:
 	return -ENOMEM;
 }
 
-static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
+/* TODO: Add verification logic for push, pull and pop data */
+static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz,
+				 unsigned char *k_p, int *bytes_cnt_p)
 {
-	int i, j = 0, bytes_cnt = 0;
-	unsigned char k = 0;
+	int i, j, bytes_cnt = *bytes_cnt_p;
+	unsigned char k = *k_p;
 
-	for (i = 0; i < msg->msg_iovlen; i++) {
+	for (i = 0, j = 0; i < msg->msg_iovlen && size; i++, j = 0) {
 		unsigned char *d = msg->msg_iov[i].iov_base;
 
 		/* Special case test for skb ingress + ktls */
 		if (i == 0 && txmsg_ktls_skb) {
 			if (msg->msg_iov[i].iov_len < 4)
-				return -EIO;
+				return -EDATAINTEGRITY;
 			if (memcmp(d, "PASS", 4) != 0) {
 				fprintf(stderr,
 					"detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n",
 					i, 0, d[0], d[1], d[2], d[3]);
-				return -EIO;
+				return -EDATAINTEGRITY;
 			}
 			j = 4; /* advance index past PASS header */
 		}
@@ -536,7 +540,7 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
 				fprintf(stderr,
 					"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
 					i, j, d[j], k - 1, d[j+1], k);
-				return -EIO;
+				return -EDATAINTEGRITY;
 			}
 			bytes_cnt++;
 			if (bytes_cnt == chunk_sz) {
@@ -546,6 +550,8 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
 			size--;
 		}
 	}
+	*k_p = k;
+	*bytes_cnt_p = bytes_cnt;
 	return 0;
 }
 
@@ -602,6 +608,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		float total_bytes, txmsg_pop_total;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
+		unsigned char k = 0;
+		int bytes_cnt = 0;
 		fd_set w;
 
 		fcntl(fd, fd_flags);
@@ -696,7 +704,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 						iov_length * cnt :
 						iov_length * iov_count;
 
-				errno = msg_verify_data(&msg, recv, chunk_sz);
+				errno = msg_verify_data(&msg, recv, chunk_sz, &k, &bytes_cnt);
 				if (errno) {
 					perror("data verify msg failed");
 					goto out_errno;
@@ -704,7 +712,9 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				if (recvp) {
 					errno = msg_verify_data(&msg_peek,
 								recvp,
-								chunk_sz);
+								chunk_sz,
+								&k,
+								&bytes_cnt);
 					if (errno) {
 						perror("data verify msg_peek failed");
 						goto out_errno;
@@ -812,7 +822,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga,
 				peek_flag ? "(peek_msg)" : "");
-		if (err && txmsg_cork)
+		if (err && err != -EDATAINTEGRITY && txmsg_cork)
 			err = 0;
 		exit(err ? 1 : 0);
 	} else if (rxpid == -1) {
-- 
cgit v1.2.3


From b29e231d66303c12b7b8ac3ac2a057df06b161e8 Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Sat, 12 Oct 2024 20:37:31 +0000
Subject: selftests/bpf: Fix txmsg_redir of test_txmsg_pull in test_sockmap

txmsg_redir in "Test pull + redirect" case of test_txmsg_pull should be
1 instead of 0.

Fixes: 328aa08a081b ("bpf: Selftests, break down test_sockmap into subtests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Link: https://lore.kernel.org/r/20241012203731.1248619-3-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 8249f3c1fbd6..075c93ed143e 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1606,7 +1606,7 @@ static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
 	test_send_large(opt, cgrp);
 
 	/* Test pull + redirect */
-	txmsg_redir = 0;
+	txmsg_redir = 1;
 	txmsg_start = 1;
 	txmsg_end = 2;
 	test_send(opt, cgrp);
-- 
cgit v1.2.3


From 1ec43493c94f1661e5db79743bc720e1fe5ddb7f Mon Sep 17 00:00:00 2001
From: Daniel Zahka <daniel.zahka@gmail.com>
Date: Fri, 11 Oct 2024 11:35:48 -0700
Subject: selftests: drv-net: rss_ctx: add rss ctx busy testcase

It should be invalid to delete an rss context while it is being
referenced from an ntuple filter. ethtool core should prevent this
from happening. This patch adds a testcase to verify this behavior.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/rss_ctx.py | 32 +++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index 9d7adb3cf33b..29995586993c 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -6,7 +6,7 @@ import random
 from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ne, ksft_ge, ksft_lt
 from lib.py import NetDrvEpEnv
 from lib.py import EthtoolFamily, NetdevFamily
-from lib.py import KsftSkipEx
+from lib.py import KsftSkipEx, KsftFailEx
 from lib.py import rand_port
 from lib.py import ethtool, ip, defer, GenerateTraffic, CmdExitFailure
 
@@ -606,6 +606,33 @@ def test_rss_context_overlap2(cfg):
     test_rss_context_overlap(cfg, True)
 
 
+def test_delete_rss_context_busy(cfg):
+    """
+    Test that deletion fails (expected: -EBUSY) when an rss context is
+    being used by an ntuple filter.
+    """
+
+    require_ntuple(cfg)
+
+    # create additional rss context
+    ctx_id = ethtool_create(cfg, "-X", "context new")
+    ctx_deleter = defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete")
+
+    # utilize context from ntuple filter
+    port = rand_port()
+    flow = f"flow-type tcp{cfg.addr_ipver} dst-port {port} context {ctx_id}"
+    ntuple_id = ethtool_create(cfg, "-N", flow)
+    defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
+
+    # attempt to delete in-use context; the ethtool command should fail
+    try:
+        ctx_deleter.exec_only()
+        ctx_deleter.cancel()  # presumably drops the deferred delete, which already ran
+        raise KsftFailEx(f"deleted context {ctx_id} used by rule {ntuple_id}")
+    except CmdExitFailure:
+        pass  # expected: the deletion was refused
+
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
         cfg.ethnl = EthtoolFamily()
@@ -616,7 +643,8 @@ def main() -> None:
                   test_rss_context, test_rss_context4, test_rss_context32,
                   test_rss_context_dump, test_rss_context_queue_reconfigure,
                   test_rss_context_overlap, test_rss_context_overlap2,
-                  test_rss_context_out_of_order, test_rss_context4_create_with_cfg],
+                  test_rss_context_out_of_order, test_rss_context4_create_with_cfg,
+                  test_delete_rss_context_busy],
                  args=(cfg, ))
     ksft_exit()
 
-- 
cgit v1.2.3


From a2aa5dcc6393dc08844a3f76aa5b7694fbbf99c8 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:44 +0100
Subject: kselftest/arm64: signal: drop now redundant GNU_SOURCE definition

The definition of _GNU_SOURCE was recently centralised in an upper-layer
kselftest Makefile, so the definition in the arm64 signal tests Makefile
is no longer needed. To make things worse, since both definitions are
not strictly identical, the compiler warns about it:
<command-line>: warning: "_GNU_SOURCE" redefined
<command-line>: note: this is the location of the previous definition

Drop the definition in the arm64/signal Makefile.

Fixes: cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk")
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-2-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile
index edb3613513b8..1381039fb36f 100644
--- a/tools/testing/selftests/arm64/signal/Makefile
+++ b/tools/testing/selftests/arm64/signal/Makefile
@@ -2,7 +2,7 @@
 # Copyright (C) 2019 ARM Limited
 
 # Additional include paths needed by kselftest.h and local headers
-CFLAGS += -D_GNU_SOURCE -std=gnu99 -I.
+CFLAGS += -std=gnu99 -I.
 
 SRCS := $(filter-out testcases/testcases.c,$(wildcard testcases/*.c))
 PROGS := $(patsubst %.c,%,$(SRCS))
-- 
cgit v1.2.3


From b0d80dbc378d52155c9ecf9579986edccceed3aa Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:45 +0100
Subject: kselftest/arm64: hwcap: fix f8dp2 cpuinfo name

The F8DP2 DPISA extension has a separate cpuinfo field, named
accordingly.
Change the erroneously placed name of "f8dp4" to "f8dp2".

Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage")
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-3-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index f2d6007a2b98..7e95ba5fd496 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -490,7 +490,7 @@ static const struct hwcap_data {
 		.name = "F8DP2",
 		.at_hwcap = AT_HWCAP2,
 		.hwcap_bit = HWCAP2_F8DP2,
-		.cpuinfo = "f8dp4",
+		.cpuinfo = "f8dp2",
 		.sigill_fn = f8dp2_sigill,
 	},
 	{
-- 
cgit v1.2.3


From bf52ca5912c07664276c7b94db820fa2d638b681 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:46 +0100
Subject: kselftest/arm64: mte: use proper SKIP syntax

If MTE is not available on a system, we detect this early and skip all
the MTE selftests. However this happens before we print the TAP plan, so
tools parsing the TAP output get confused and report an error.

Use the existing ksft_exit_skip() function to handle this, which uses a
dummy plan to work with tools expecting proper TAP syntax, as described
in the TAP specification.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-4-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/mte_common_util.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
index 00ffd34c66d3..69e4a67853c4 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.c
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -319,10 +319,9 @@ int mte_default_setup(void)
 	unsigned long en = 0;
 	int ret;
 
-	if (!(hwcaps2 & HWCAP2_MTE)) {
-		ksft_print_msg("SKIP: MTE features unavailable\n");
-		return KSFT_SKIP;
-	}
+	if (!(hwcaps2 & HWCAP2_MTE))
+		ksft_exit_skip("MTE features unavailable\n");
+
 	/* Get current mte mode */
 	ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0);
 	if (ret < 0) {
-- 
cgit v1.2.3


From 0f995f22a03fef8f3bff51d22a0a78c768536814 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:47 +0100
Subject: kselftest/arm64: mte: use string literal for printf-style functions

Using pointers for the format specifier strings in printf-style
functions can create potential security problems, as the number of
arguments to be parsed could vary from call to call. Most compilers
consequently warn about those:
"format not a string literal and no format arguments [-Wformat-security]"

If we only want to print a constant string, we can just use a fixed "%s"
format instead, and pass the string as an argument.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-5-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/mte_common_util.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h
index 2d3e71724e55..a0017a303beb 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.h
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.h
@@ -77,13 +77,13 @@ static inline void evaluate_test(int err, const char *msg)
 {
 	switch (err) {
 	case KSFT_PASS:
-		ksft_test_result_pass(msg);
+		ksft_test_result_pass("%s", msg);
 		break;
 	case KSFT_FAIL:
-		ksft_test_result_fail(msg);
+		ksft_test_result_fail("%s", msg);
 		break;
 	case KSFT_SKIP:
-		ksft_test_result_skip(msg);
+		ksft_test_result_skip("%s", msg);
 		break;
 	default:
 		ksft_test_result_error("Unknown return code %d from %s",
-- 
cgit v1.2.3


From 7e893dc81de3e342156389ea0b83ec7d07f25281 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:49 +0100
Subject: kselftest/arm64: mte: fix printf type warnings about __u64

When printing the signal context's PC, we use a "%lx" format specifier,
which matches the common userland (glibc's) definition of uint64_t as an
"unsigned long". However the structure in question is defined in a
kernel uapi header, which uses a self defined __u64 type, and the arm64
kernel headers define this using "int-ll64.h", so it becomes an
"unsigned long long". This mismatch leads to the usual compiler warning.

The common fix would be to use "PRIx64", but because this is defined by
the userland's toolchain libc headers, it wouldn't match either. Since
we know the exact type of __u64, just use "%llx" here instead, to silence
this warning.

This also fixes a more severe typo: "$lx" is not a valid format
specifier.

Fixes: 191e678bdc9b ("kselftest/arm64: Log unexpected asynchronous MTE faults")
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-7-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/mte_common_util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
index 69e4a67853c4..9380edca29c7 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.c
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -38,7 +38,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc)
 			if (cur_mte_cxt.trig_si_code == si->si_code)
 				cur_mte_cxt.fault_valid = true;
 			else
-				ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=$lx, fault addr=%lx\n",
+				ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=%llx, fault addr=%lx\n",
 					       ((ucontext_t *)uc)->uc_mcontext.pc,
 					       addr);
 			return;
@@ -64,7 +64,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc)
 			exit(1);
 		}
 	} else if (signum == SIGBUS) {
-		ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%lx\n",
+		ksft_print_msg("INFO: SIGBUS signal at pc=%llx, fault addr=%lx, si_code=%x\n",
 				((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code);
 		if ((cur_mte_cxt.trig_range >= 0 &&
 		     addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) &&
-- 
cgit v1.2.3


From 4716f719202e900b52f5f2270ac16b6a8ae40a47 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:50 +0100
Subject: kselftest/arm64: mte: fix printf type warnings about pointers

When printing the value of a pointer, we should not use an integer
format specifier, but the dedicated "%p" instead.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-8-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/check_buffer_fill.c | 4 ++--
 tools/testing/selftests/arm64/mte/mte_common_util.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
index 1dbbbd47dd50..2ee7f114d7fa 100644
--- a/tools/testing/selftests/arm64/mte/check_buffer_fill.c
+++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
@@ -91,7 +91,7 @@ static int check_buffer_underflow_by_byte(int mem_type, int mode,
 		for (j = 0; j < sizes[i]; j++) {
 			if (ptr[j] != '1') {
 				err = true;
-				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n",
 						j, ptr);
 				break;
 			}
@@ -189,7 +189,7 @@ static int check_buffer_overflow_by_byte(int mem_type, int mode,
 		for (j = 0; j < sizes[i]; j++) {
 			if (ptr[j] != '1') {
 				err = true;
-				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n",
 						j, ptr);
 				break;
 			}
diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
index 9380edca29c7..17fbe5cfe472 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.c
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -100,7 +100,7 @@ void *mte_insert_tags(void *ptr, size_t size)
 	int align_size;
 
 	if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
-		ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr);
+		ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr);
 		return NULL;
 	}
 	align_size = MT_ALIGN_UP(size);
@@ -112,7 +112,7 @@ void *mte_insert_tags(void *ptr, size_t size)
 void mte_clear_tags(void *ptr, size_t size)
 {
 	if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
-		ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr);
+		ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr);
 		return;
 	}
 	size = MT_ALIGN_UP(size);
-- 
cgit v1.2.3


From 96dddb7b9406259baace9a1831e8da155311be6f Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 16:32:51 +0100
Subject: kselftest/arm64: mte: fix printf type warnings about longs

When checking MTE tags, we print some diagnostic messages when the tests
fail. Some variables used there are "longs"; however, we only use "%x"
for the format specifier.

Update the format specifiers to "%lx", to match the variable types they
are supposed to print.

Fixes: f3b2a26ca78d ("kselftest/arm64: Verify mte tag inclusion via prctl")
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20240816153251.2833702-9-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/check_tags_inclusion.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
index 2b1425b92b69..a3d1e23fe02a 100644
--- a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
+++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
@@ -65,7 +65,7 @@ static int check_single_included_tags(int mem_type, int mode)
 			ptr = mte_insert_tags(ptr, BUFFER_SIZE);
 			/* Check tag value */
 			if (MT_FETCH_TAG((uintptr_t)ptr) == tag) {
-				ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+				ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%x\n",
 					       MT_FETCH_TAG((uintptr_t)ptr),
 					       MT_INCLUDE_VALID_TAG(tag));
 				result = KSFT_FAIL;
@@ -97,7 +97,7 @@ static int check_multiple_included_tags(int mem_type, int mode)
 			ptr = mte_insert_tags(ptr, BUFFER_SIZE);
 			/* Check tag value */
 			if (MT_FETCH_TAG((uintptr_t)ptr) < tag) {
-				ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+				ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%lx\n",
 					       MT_FETCH_TAG((uintptr_t)ptr),
 					       MT_INCLUDE_VALID_TAGS(excl_mask));
 				result = KSFT_FAIL;
-- 
cgit v1.2.3


From fb6f20ecb121cef4d7946f834a6ee867c4e21b4a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 17 Oct 2024 12:28:23 +0200
Subject: reiserfs: The last commit

The deprecation period of reiserfs ends at the end of this year, so it is
time to remove it from the kernel.

Acked-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 Documentation/filesystems/porting.rst              |    2 +-
 Documentation/userspace-api/ioctl/ioctl-number.rst |    2 +-
 MAINTAINERS                                        |    5 -
 arch/alpha/configs/defconfig                       |    1 -
 arch/arm/configs/pxa_defconfig                     |    4 -
 arch/m68k/configs/amiga_defconfig                  |    1 -
 arch/m68k/configs/apollo_defconfig                 |    1 -
 arch/m68k/configs/atari_defconfig                  |    1 -
 arch/m68k/configs/bvme6000_defconfig               |    1 -
 arch/m68k/configs/hp300_defconfig                  |    1 -
 arch/m68k/configs/mac_defconfig                    |    1 -
 arch/m68k/configs/multi_defconfig                  |    1 -
 arch/m68k/configs/mvme147_defconfig                |    1 -
 arch/m68k/configs/mvme16x_defconfig                |    1 -
 arch/m68k/configs/q40_defconfig                    |    1 -
 arch/m68k/configs/sun3_defconfig                   |    1 -
 arch/m68k/configs/sun3x_defconfig                  |    1 -
 arch/sh/configs/landisk_defconfig                  |    1 -
 arch/sh/configs/titan_defconfig                    |    1 -
 arch/um/configs/i386_defconfig                     |    1 -
 arch/um/configs/x86_64_defconfig                   |    1 -
 drivers/block/Kconfig                              |    2 +-
 fs/Kconfig                                         |    1 -
 fs/Makefile                                        |    1 -
 fs/buffer.c                                        |    3 +-
 fs/quota/Kconfig                                   |   15 +-
 fs/reiserfs/Kconfig                                |   91 -
 fs/reiserfs/Makefile                               |   30 -
 fs/reiserfs/README                                 |  151 -
 fs/reiserfs/acl.h                                  |   78 -
 fs/reiserfs/bitmap.c                               | 1476 -------
 fs/reiserfs/dir.c                                  |  346 --
 fs/reiserfs/do_balan.c                             | 1900 ---------
 fs/reiserfs/file.c                                 |  270 --
 fs/reiserfs/fix_node.c                             | 2822 -------------
 fs/reiserfs/hashes.c                               |  177 -
 fs/reiserfs/ibalance.c                             | 1161 ------
 fs/reiserfs/inode.c                                | 3416 ---------------
 fs/reiserfs/ioctl.c                                |  221 -
 fs/reiserfs/item_ops.c                             |  737 ----
 fs/reiserfs/journal.c                              | 4404 --------------------
 fs/reiserfs/lbalance.c                             | 1426 -------
 fs/reiserfs/lock.c                                 |  101 -
 fs/reiserfs/namei.c                                | 1725 --------
 fs/reiserfs/objectid.c                             |  216 -
 fs/reiserfs/prints.c                               |  792 ----
 fs/reiserfs/procfs.c                               |  490 ---
 fs/reiserfs/reiserfs.h                             | 3419 ---------------
 fs/reiserfs/resize.c                               |  230 -
 fs/reiserfs/stree.c                                | 2280 ----------
 fs/reiserfs/super.c                                | 2646 ------------
 fs/reiserfs/tail_conversion.c                      |  318 --
 fs/reiserfs/xattr.c                                | 1039 -----
 fs/reiserfs/xattr.h                                |  117 -
 fs/reiserfs/xattr_acl.c                            |  411 --
 fs/reiserfs/xattr_security.c                       |  127 -
 fs/reiserfs/xattr_trusted.c                        |   46 -
 fs/reiserfs/xattr_user.c                           |   43 -
 include/uapi/linux/reiserfs_fs.h                   |   27 -
 include/uapi/linux/reiserfs_xattr.h                |   25 -
 scripts/selinux/mdp/mdp.c                          |    3 -
 tools/objtool/noreturns.h                          |    1 -
 .../filesystems/statmount/statmount_test.c         |    2 +-
 63 files changed, 12 insertions(+), 32804 deletions(-)
 delete mode 100644 fs/reiserfs/Kconfig
 delete mode 100644 fs/reiserfs/Makefile
 delete mode 100644 fs/reiserfs/README
 delete mode 100644 fs/reiserfs/acl.h
 delete mode 100644 fs/reiserfs/bitmap.c
 delete mode 100644 fs/reiserfs/dir.c
 delete mode 100644 fs/reiserfs/do_balan.c
 delete mode 100644 fs/reiserfs/file.c
 delete mode 100644 fs/reiserfs/fix_node.c
 delete mode 100644 fs/reiserfs/hashes.c
 delete mode 100644 fs/reiserfs/ibalance.c
 delete mode 100644 fs/reiserfs/inode.c
 delete mode 100644 fs/reiserfs/ioctl.c
 delete mode 100644 fs/reiserfs/item_ops.c
 delete mode 100644 fs/reiserfs/journal.c
 delete mode 100644 fs/reiserfs/lbalance.c
 delete mode 100644 fs/reiserfs/lock.c
 delete mode 100644 fs/reiserfs/namei.c
 delete mode 100644 fs/reiserfs/objectid.c
 delete mode 100644 fs/reiserfs/prints.c
 delete mode 100644 fs/reiserfs/procfs.c
 delete mode 100644 fs/reiserfs/reiserfs.h
 delete mode 100644 fs/reiserfs/resize.c
 delete mode 100644 fs/reiserfs/stree.c
 delete mode 100644 fs/reiserfs/super.c
 delete mode 100644 fs/reiserfs/tail_conversion.c
 delete mode 100644 fs/reiserfs/xattr.c
 delete mode 100644 fs/reiserfs/xattr.h
 delete mode 100644 fs/reiserfs/xattr_acl.c
 delete mode 100644 fs/reiserfs/xattr_security.c
 delete mode 100644 fs/reiserfs/xattr_trusted.c
 delete mode 100644 fs/reiserfs/xattr_user.c
 delete mode 100644 include/uapi/linux/reiserfs_fs.h
 delete mode 100644 include/uapi/linux/reiserfs_xattr.h

(limited to 'tools/testing')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 92bffcc6747a..9ab2a3d6f2b4 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -177,7 +177,7 @@ settles down a bit.
 **mandatory**
 
 s_export_op is now required for exporting a filesystem.
-isofs, ext2, ext3, reiserfs, fat
+isofs, ext2, ext3, fat
 can be used as examples of very different filesystems.
 
 ---
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index e4be1378ba26..243f1f1b554a 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -375,7 +375,7 @@ Code  Seq#    Include File                                           Comments
 0xCB  00-1F                                                          CBM serial IEC bus in development:
                                                                      <mailto:michael.klein@puffin.lb.shuttle.de>
 0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
-0xCD  01     linux/reiserfs_fs.h
+0xCD  01     linux/reiserfs_fs.h                                     Dead since 6.13
 0xCE  01-02  uapi/linux/cxl_mem.h                                    Compute Express Link Memory Devices
 0xCF  02     fs/smb/client/cifs_ioctl.h
 0xDB  00-0F  drivers/char/mwave/mwavepub.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 7ad507f49324..02de04d4ae1e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19578,11 +19578,6 @@ F:	Documentation/devicetree/bindings/regmap/
 F:	drivers/base/regmap/
 F:	include/linux/regmap.h
 
-REISERFS FILE SYSTEM
-L:	reiserfs-devel@vger.kernel.org
-S:	Obsolete
-F:	fs/reiserfs/
-
 REMOTE PROCESSOR (REMOTEPROC) SUBSYSTEM
 M:	Bjorn Andersson <andersson@kernel.org>
 M:	Mathieu Poirier <mathieu.poirier@linaro.org>
diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
index 1816c1dc22b1..3280bd9e6578 100644
--- a/arch/alpha/configs/defconfig
+++ b/arch/alpha/configs/defconfig
@@ -51,7 +51,6 @@ CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_CMOS=y
 CONFIG_EXT2_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_ISO9660_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index e1cb170c2bf0..38916ac4bce4 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -583,10 +583,6 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=m
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
 CONFIG_XFS_FS=m
 CONFIG_AUTOFS_FS=m
 CONFIG_FUSE_FS=m
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index d01dc47d52ea..fba7b68c235b 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -449,7 +449,6 @@ CONFIG_RTC_DRV_RP5C01=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 46808e581d7b..308655a98bb1 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -406,7 +406,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 4469a7839c9d..956a3aed97c6 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -426,7 +426,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index c0719322c028..8790b6756a76 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 8d429e63f8f2..dfb2fface338 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -408,7 +408,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index bafd33da27c1..6577b4390c38 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -425,7 +425,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 6f5ca3f85ea1..ad2bbc92d8d1 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -511,7 +511,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index d16b328c7136..3b4a2d2d966f 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -397,7 +397,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 80f6c15a5ed5..9711f37d2ef7 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 0e81589f0ee2..5ae3b707c849 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -415,7 +415,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 8cd785290339..55efa85492d8 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 78035369f60f..cf1c78e02fda 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index 0311380160f4..d871623955c5 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -95,7 +95,6 @@ CONFIG_USB_SISUSBVGA=m
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_REISERFS_FS=y
 CONFIG_ISO9660_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index c1032559ecd4..99bc0e889287 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -220,7 +220,6 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
-CONFIG_REISERFS_FS=m
 CONFIG_XFS_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index e543cbac8792..9c9c77f1255a 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -61,7 +61,6 @@ CONFIG_UML_NET_DAEMON=y
 CONFIG_UML_NET_MCAST=y
 CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 939cb12318ca..03b10d3f6816 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -59,7 +59,6 @@ CONFIG_UML_NET_DAEMON=y
 CONFIG_UML_NET_MCAST=y
 CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ed209f4f2798..a97f2c40c640 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -130,7 +130,7 @@ config BLK_DEV_UBD_SYNC
           kernel command line option.  Alternatively, you can say Y here to
           turn on synchronous operation by default for all block devices.
 
-          If you're running a journalling file system (like reiserfs, for
+          If you're running a journalling file system (like xfs, for
           example) in your virtual machine, you will want to say Y here.  If
           you care for the safety of the data in your virtual machine, Y is a
           wise choice too.  In all other cases (for example, if you're just
diff --git a/fs/Kconfig b/fs/Kconfig
index aae170fc2795..64d420e3c475 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,7 +43,6 @@ config FS_MBCACHE
 	default y if EXT4_FS=y
 	default m if EXT2_FS_XATTR || EXT4_FS
 
-source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
 
 source "fs/xfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 61679fd587b7..15df0a923d3a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
-obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT4_FS)		+= ext4/
 # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
 # ext2 driver, which doesn't know about journalling!  Explicitly request ext2
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..c17011bc7120 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -855,8 +855,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
  * done a sync().  Just drop the buffers from the inode list.
  *
  * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
- * assumes that all the buffers are against the blockdev.  Not true
- * for reiserfs.
+ * assumes that all the buffers are against the blockdev.
  */
 void invalidate_inode_buffers(struct inode *inode)
 {
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 4c925e55dbcd..818083a36bef 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -9,14 +9,13 @@ config QUOTA
 	help
 	  If you say Y here, you will be able to set per user limits for disk
 	  usage (also called disk quotas). Currently, it works for the
-	  ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems.
-	  Note that gfs2 and xfs use their own quota system.
-	  Ext3, ext4 and reiserfs also support journaled quotas for which
-	  you don't need to run quotacheck(8) after an unclean shutdown.
-	  For further details, read the Quota mini-HOWTO, available from
-	  <https://www.tldp.org/docs.html#howto>, or the documentation provided
-	  with the quota tools. Probably the quota support is only useful for
-	  multi user systems. If unsure, say N.
+	  ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2
+	  and xfs use their own quota system. Ext3 and ext4 also support
+	  journaled quotas for which you don't need to run quotacheck(8) after
+	  an unclean shutdown. For further details, read the Quota mini-HOWTO,
+	  available from <https://www.tldp.org/docs.html#howto>, or the
+	  documentation provided with the quota tools. Probably the quota
+	  support is only useful for multi user systems. If unsure, say N.
 
 config QUOTA_NETLINK_INTERFACE
 	bool "Report quota messages through netlink interface"
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
deleted file mode 100644
index 0e6fe26458fe..000000000000
--- a/fs/reiserfs/Kconfig
+++ /dev/null
@@ -1,91 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config REISERFS_FS
-	tristate "Reiserfs support (deprecated)"
-	select BUFFER_HEAD
-	select CRC32
-	select LEGACY_DIRECT_IO
-	help
-	  Reiserfs is deprecated and scheduled to be removed from the kernel
-	  in 2025. If you are still using it, please migrate to another
-	  filesystem or tell us your usecase for reiserfs.
-
-	  Reiserfs stores not just filenames but the files themselves in a
-	  balanced tree.  Uses journalling.
-
-	  Balanced trees are more efficient than traditional file system
-	  architectural foundations.
-
-	  In general, ReiserFS is as fast as ext2, but is very efficient with
-	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see 
-	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
-
-	  It is more easily extended to have features currently found in
-	  database and keyword search systems than block allocation based file
-	  systems are.  The next version will be so extended, and will support
-	  plugins consistent with our motto ``It takes more than a license to
-	  make source code open.''
-
-	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
-	  to learn more about reiserfs.
-
-	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-
-	  If you like it, you can pay us to add new features to it that you
-	  need, buy a support contract, or pay us to port it to another OS.
-
-config REISERFS_CHECK
-	bool "Enable reiserfs debug mode"
-	depends on REISERFS_FS
-	help
-	  If you set this to Y, then ReiserFS will perform every check it can
-	  possibly imagine of its internal consistency throughout its
-	  operation.  It will also go substantially slower.  More than once we
-	  have forgotten that this was on, and then gone despondent over the
-	  latest benchmarks.:-) Use of this option allows our team to go all
-	  out in checking for consistency when debugging without fear of its
-	  effect on end users.  If you are on the verge of sending in a bug
-	  report, say Y and you might get a useful error message.  Almost
-	  everyone should say N.
-
-config REISERFS_PROC_INFO
-	bool "Stats in /proc/fs/reiserfs"
-	depends on REISERFS_FS && PROC_FS
-	help
-	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
-	  various ReiserFS statistics and internal data at the expense of
-	  making your kernel or module slightly larger (+8 KB). This also
-	  increases the amount of kernel memory required for each mount.
-	  Almost everyone but ReiserFS developers and people fine-tuning
-	  reiserfs or tracing problems should say N.
-
-config REISERFS_FS_XATTR
-	bool "ReiserFS extended attributes"
-	depends on REISERFS_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page for details).
-
-	  If unsure, say N.
-
-config REISERFS_FS_POSIX_ACL
-	bool "ReiserFS POSIX Access Control Lists"
-	depends on REISERFS_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  If you don't know what Access Control Lists are, say N
-
-config REISERFS_FS_SECURITY
-	bool "ReiserFS Security Labels"
-	depends on REISERFS_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ReiserFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
deleted file mode 100644
index bd29c58ccbd8..000000000000
--- a/fs/reiserfs/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux reiser-filesystem routines.
-#
-
-obj-$(CONFIG_REISERFS_FS) += reiserfs.o
-
-reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
-		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
-		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o xattr.o lock.o
-
-ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
-reiserfs-objs += procfs.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
-reiserfs-objs += xattr_user.o xattr_trusted.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
-reiserfs-objs += xattr_security.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
-reiserfs-objs += xattr_acl.o
-endif
-
-TAGS:
-	etags *.c
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
deleted file mode 100644
index 11e9ecf24b63..000000000000
--- a/fs/reiserfs/README
+++ /dev/null
@@ -1,151 +0,0 @@
-[LICENSING]
-
-ReiserFS is hereby licensed under the GNU General
-Public License version 2.
-
-Source code files that contain the phrase "licensing governed by
-reiserfs/README" are "governed files" throughout this file.  Governed
-files are licensed under the GPL.  The portions of them owned by Hans
-Reiser, or authorized to be licensed by him, have been in the past,
-and likely will be in the future, licensed to other parties under
-other licenses.  If you add your code to governed files, and don't
-want it to be owned by Hans Reiser, put your copyright label on that
-code so the poor blight and his customers can keep things straight.
-All portions of governed files not labeled otherwise are owned by Hans
-Reiser, and by adding your code to it, widely distributing it to
-others or sending us a patch, and leaving the sentence in stating that
-licensing is governed by the statement in this file, you accept this.
-It will be a kindness if you identify whether Hans Reiser is allowed
-to license code labeled as owned by you on your behalf other than
-under the GPL, because he wants to know if it is okay to do so and put
-a check in the mail to you (for non-trivial improvements) when he
-makes his next sale.  He makes no guarantees as to the amount if any,
-though he feels motivated to motivate contributors, and you can surely
-discuss this with him before or after contributing.  You have the
-right to decline to allow him to license your code contribution other
-than under the GPL.
-
-Further licensing options are available for commercial and/or other
-interests directly from Hans Reiser: hans@reiser.to.  If you interpret
-the GPL as not allowing those additional licensing options, you read
-it wrongly, and Richard Stallman agrees with me, when carefully read
-you can see that those restrictions on additional terms do not apply
-to the owner of the copyright, and my interpretation of this shall
-govern for this license.
-
-Finally, nothing in this license shall be interpreted to allow you to
-fail to fairly credit me, or to remove my credits, without my
-permission, unless you are an end user not redistributing to others.
-If you have doubts about how to properly do that, or about what is
-fair, ask.  (Last I spoke with him Richard was contemplating how best
-to address the fair crediting issue in the next GPL version.)
-
-[END LICENSING]
-
-Reiserfs is a file system based on balanced tree algorithms, which is
-described at https://reiser4.wiki.kernel.org/index.php/Main_Page 
-
-Stop reading here.  Go there, then return.
-
-Send bug reports to yura@namesys.botik.ru.
-
-mkreiserfs and other utilities are in reiserfs/utils, or wherever your
-Linux provider put them.  There is some disagreement about how useful
-it is for users to get their fsck and mkreiserfs out of sync with the
-version of reiserfs that is in their kernel, with many important
-distributors wanting them out of sync.:-) Please try to remember to
-recompile and reinstall fsck and mkreiserfs with every update of
-reiserfs, this is a common source of confusion.  Note that some of the
-utilities cannot be compiled without accessing the balancing code
-which is in the kernel code, and relocating the utilities may require
-you to specify where that code can be found.
-
-Yes, if you update your reiserfs kernel module you do have to
-recompile your kernel, most of the time.  The errors you get will be
-quite cryptic if your forget to do so.
-
-Real users, as opposed to folks who want to hack and then understand
-what went wrong, will want REISERFS_CHECK off.
-
-Hideous Commercial Pitch: Spread your development costs across other OS
-vendors.  Select from the best in the world, not the best in your
-building, by buying from third party OS component suppliers.  Leverage
-the software component development power of the internet.  Be the most
-aggressive in taking advantage of the commercial possibilities of
-decentralized internet development, and add value through your branded
-integration that you sell as an operating system.  Let your competitors
-be the ones to compete against the entire internet by themselves.  Be
-hip, get with the new economic trend, before your competitors do.  Send
-email to hans@reiser.to.
-
-To understand the code, after reading the website, start reading the
-code by reading reiserfs_fs.h first.
-
-Hans Reiser was the project initiator, primary architect, source of all
-funding for the first 5.5 years, and one of the programmers.  He owns
-the copyright.
-
-Vladimir Saveljev was one of the programmers, and he worked long hours
-writing the cleanest code.  He always made the effort to be the best he
-could be, and to make his code the best that it could be.  What resulted
-was quite remarkable. I don't think that money can ever motivate someone
-to work the way he did, he is one of the most selfless men I know.
-
-Yura helps with benchmarking, coding hashes, and block pre-allocation
-code.
-
-Anatoly Pinchuk is a former member of our team who worked closely with
-Vladimir throughout the project's development.  He wrote a quite
-substantial portion of the total code.  He realized that there was a
-space problem with packing tails of files for files larger than a node
-that start on a node aligned boundary (there are reasons to want to node
-align files), and he invented and implemented indirect items and
-unformatted nodes as the solution.
-
-Konstantin Shvachko was taking part in the early days.
-
-Mikhail Gilula was a brilliant innovator that has shown much generosity.
-
-Grigory Zaigralin was an extremely effective system administrator for
-our group.
-
-Igor Krasheninnikov was wonderful at hardware procurement, repair, and
-network installation.
-
-Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
-textbook he got the algorithm from in the code.  Note that his analysis
-of how we could use the hashing code in making 32 bit NFS cookies work
-was probably more important than the actual algorithm.  Colin Plumb also
-contributed to it.
-
-Chris Mason dived right into our code, and in just a few months produced
-the journaling code that dramatically increased the value of ReiserFS.
-He is just an amazing programmer.
-
-Igor Zagorovsky is writing much of the new item handler and extent code
-for our next major release.
-
-Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
-resizer, and is hard at work on implementing allocate on flush.  SGI
-implemented allocate on flush before us for XFS, and generously took
-the time to convince me we should do it also.  They are great people,
-and a great company.
-
-Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
-
-Vitaly Fertman is doing fsck.
-
-Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
-the endian safe patches which allow ReiserFS to run on any platform
-supported by the Linux kernel.
-
-SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
-Alpha PC Company made it possible for me to not have a day job
-anymore, and to dramatically increase our staffing.  Ecila funded
-hypertext feature development, MP3.com funded journaling, SuSE funded
-core development, IntegratedLinux.com funded squid web cache
-appliances, bigstorage.com funded HSM, and the alpha PC company funded
-the alpha port.  Many of these tasks were helped by sponsors other
-than the ones just named.  SuSE has helped in much more than just
-funding....
-
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
deleted file mode 100644
index 2571b1a8be84..000000000000
--- a/fs/reiserfs/acl.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/init.h>
-#include <linux/posix_acl.h>
-
-#define REISERFS_ACL_VERSION	0x0001
-
-typedef struct {
-	__le16 e_tag;
-	__le16 e_perm;
-	__le32 e_id;
-} reiserfs_acl_entry;
-
-typedef struct {
-	__le16 e_tag;
-	__le16 e_perm;
-} reiserfs_acl_entry_short;
-
-typedef struct {
-	__le32 a_version;
-} reiserfs_acl_header;
-
-static inline size_t reiserfs_acl_size(int count)
-{
-	if (count <= 4) {
-		return sizeof(reiserfs_acl_header) +
-		    count * sizeof(reiserfs_acl_entry_short);
-	} else {
-		return sizeof(reiserfs_acl_header) +
-		    4 * sizeof(reiserfs_acl_entry_short) +
-		    (count - 4) * sizeof(reiserfs_acl_entry);
-	}
-}
-
-static inline int reiserfs_acl_count(size_t size)
-{
-	ssize_t s;
-	size -= sizeof(reiserfs_acl_header);
-	s = size - 4 * sizeof(reiserfs_acl_entry_short);
-	if (s < 0) {
-		if (size % sizeof(reiserfs_acl_entry_short))
-			return -1;
-		return size / sizeof(reiserfs_acl_entry_short);
-	} else {
-		if (s % sizeof(reiserfs_acl_entry))
-			return -1;
-		return s / sizeof(reiserfs_acl_entry) + 4;
-	}
-}
-
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct dentry *dentry);
-int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-				 struct inode *dir, struct dentry *dentry,
-				 struct inode *inode);
-int reiserfs_cache_default_acl(struct inode *dir);
-
-#else
-
-#define reiserfs_cache_default_acl(inode) 0
-#define reiserfs_get_acl NULL
-#define reiserfs_set_acl NULL
-
-static inline int reiserfs_acl_chmod(struct dentry *dentry)
-{
-	return 0;
-}
-
-static inline int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-			     const struct inode *dir, struct dentry *dentry,
-			     struct inode *inode)
-{
-	return 0;
-}
-#endif
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
deleted file mode 100644
index bf708ac287b4..000000000000
--- a/fs/reiserfs/bitmap.c
+++ /dev/null
@@ -1,1476 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-/* Reiserfs block (de)allocator, bitmap-based. */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/vmalloc.h>
-#include <linux/quotaops.h>
-#include <linux/seq_file.h>
-
-#define PREALLOCATION_SIZE 9
-
-/* different reiserfs block allocator options */
-
-#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
-
-#define  _ALLOC_concentrating_formatted_nodes 0
-#define  _ALLOC_displacing_large_files 1
-#define  _ALLOC_displacing_new_packing_localities 2
-#define  _ALLOC_old_hashed_relocation 3
-#define  _ALLOC_new_hashed_relocation 4
-#define  _ALLOC_skip_busy 5
-#define  _ALLOC_displace_based_on_dirid 6
-#define  _ALLOC_hashed_formatted_nodes 7
-#define  _ALLOC_old_way 8
-#define  _ALLOC_hundredth_slices 9
-#define  _ALLOC_dirid_groups 10
-#define  _ALLOC_oid_groups 11
-#define  _ALLOC_packing_groups 12
-
-#define  concentrating_formatted_nodes(s)	test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
-#define  displacing_large_files(s)		test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
-#define  displacing_new_packing_localities(s)	test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
-
-#define SET_OPTION(optname) \
-   do { \
-	reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
-	set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
-    } while(0)
-#define TEST_OPTION(optname, s) \
-    test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
-
-static inline void get_bit_address(struct super_block *s,
-				   b_blocknr_t block,
-				   unsigned int *bmap_nr,
-				   unsigned int *offset)
-{
-	/*
-	 * It is in the bitmap block number equal to the block
-	 * number divided by the number of bits in a block.
-	 */
-	*bmap_nr = block >> (s->s_blocksize_bits + 3);
-	/* Within that bitmap block it is located at bit offset *offset. */
-	*offset = block & ((s->s_blocksize << 3) - 1);
-}
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
-{
-	unsigned int bmap, offset;
-	unsigned int bmap_count = reiserfs_bmap_count(s);
-
-	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
-		reiserfs_error(s, "vs-4010",
-			       "block number is out of range %lu (%u)",
-			       block, SB_BLOCK_COUNT(s));
-		return 0;
-	}
-
-	get_bit_address(s, block, &bmap, &offset);
-
-	/*
-	 * Old format filesystem? Unlikely, but the bitmaps are all
-	 * up front so we need to account for it.
-	 */
-	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
-			      &REISERFS_SB(s)->s_properties))) {
-		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
-		if (block >= bmap1 &&
-		    block <= bmap1 + bmap_count) {
-			reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
-				       "can't be freed or reused",
-				       block, bmap_count);
-			return 0;
-		}
-	} else {
-		if (offset == 0) {
-			reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
-				       "can't be freed or reused",
-				       block, bmap_count);
-			return 0;
-		}
-	}
-
-	if (bmap >= bmap_count) {
-		reiserfs_error(s, "vs-4030", "bitmap for requested block "
-			       "is out of range: block=%lu, bitmap_nr=%u",
-			       block, bmap);
-		return 0;
-	}
-
-	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
-		reiserfs_error(s, "vs-4050", "this is root block (%u), "
-			       "it must be busy", SB_ROOT_BLOCK(s));
-		return 0;
-	}
-
-	return 1;
-}
-
-/*
- * Searches in journal structures for a given block number (bmap, off).
- * If block is found in reiserfs journal it suggests next free block
- * candidate to test.
- */
-static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
-				      int off, int *next)
-{
-	b_blocknr_t tmp;
-
-	if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
-		if (tmp) {	/* hint supplied */
-			*next = tmp;
-			PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
-		} else {
-			(*next) = off + 1;  /* inc offset to avoid looping. */
-			PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
-		}
-		PROC_INFO_INC(s, scan_bitmap.retry);
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Searches for a window of zero bits with given minimum and maximum
- * lengths in one bitmap block
- */
-static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
-			     unsigned int bmap_n, int *beg, int boundary,
-			     int min, int max, int unfm)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
-	struct buffer_head *bh;
-	int end, next;
-	int org = *beg;
-
-	BUG_ON(!th->t_trans_id);
-	RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
-	       "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
-	PROC_INFO_INC(s, scan_bitmap.bmap);
-
-	if (!bi) {
-		reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
-			       "for bitmap %d", bmap_n);
-		return 0;
-	}
-
-	bh = reiserfs_read_bitmap_block(s, bmap_n);
-	if (bh == NULL)
-		return 0;
-
-	while (1) {
-cont:
-		if (bi->free_count < min) {
-			brelse(bh);
-			return 0;	/* No free blocks in this bitmap */
-		}
-
-		/* search for a first zero bit -- beginning of a window */
-		*beg = reiserfs_find_next_zero_le_bit
-		    ((unsigned long *)(bh->b_data), boundary, *beg);
-
-		/*
-		 * search for a zero bit fails or the rest of bitmap block
-		 * cannot contain a zero window of minimum size
-		 */
-		if (*beg + min > boundary) {
-			brelse(bh);
-			return 0;
-		}
-
-		if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
-			continue;
-		/* first zero bit found; we check next bits */
-		for (end = *beg + 1;; end++) {
-			if (end >= *beg + max || end >= boundary
-			    || reiserfs_test_le_bit(end, bh->b_data)) {
-				next = end;
-				break;
-			}
-
-			/*
-			 * finding the other end of zero bit window requires
-			 * looking into journal structures (in case of
-			 * searching for free blocks for unformatted nodes)
-			 */
-			if (unfm && is_block_in_journal(s, bmap_n, end, &next))
-				break;
-		}
-
-		/*
-		 * now (*beg) points to beginning of zero bits window,
-		 * (end) points to one bit after the window end
-		 */
-
-		/* found window of proper size */
-		if (end - *beg >= min) {
-			int i;
-			reiserfs_prepare_for_journal(s, bh, 1);
-			/*
-			 * try to set all blocks used checking are
-			 * they still free
-			 */
-			for (i = *beg; i < end; i++) {
-				/* Don't check in journal again. */
-				if (reiserfs_test_and_set_le_bit
-				    (i, bh->b_data)) {
-					/*
-					 * bit was set by another process while
-					 * we slept in prepare_for_journal()
-					 */
-					PROC_INFO_INC(s, scan_bitmap.stolen);
-
-					/*
-					 * we can continue with smaller set
-					 * of allocated blocks, if length of
-					 * this set is more or equal to `min'
-					 */
-					if (i >= *beg + min) {
-						end = i;
-						break;
-					}
-
-					/*
-					 * otherwise we clear all bit
-					 * were set ...
-					 */
-					while (--i >= *beg)
-						reiserfs_clear_le_bit
-						    (i, bh->b_data);
-					reiserfs_restore_prepared_buffer(s, bh);
-					*beg = org;
-
-					/*
-					 * Search again in current block
-					 * from beginning
-					 */
-					goto cont;
-				}
-			}
-			bi->free_count -= (end - *beg);
-			journal_mark_dirty(th, bh);
-			brelse(bh);
-
-			/* free block count calculation */
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
-			journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
-			return end - (*beg);
-		} else {
-			*beg = next;
-		}
-	}
-}
-
-static int bmap_hash_id(struct super_block *s, u32 id)
-{
-	char *hash_in = NULL;
-	unsigned long hash;
-	unsigned bm;
-
-	if (id <= 2) {
-		bm = 1;
-	} else {
-		hash_in = (char *)(&id);
-		hash = keyed_hash(hash_in, 4);
-		bm = hash % reiserfs_bmap_count(s);
-		if (!bm)
-			bm = 1;
-	}
-	/* this can only be true when SB_BMAP_NR = 1 */
-	if (bm >= reiserfs_bmap_count(s))
-		bm = 0;
-	return bm;
-}
-
-/*
- * hashes the id and then returns > 0 if the block group for the
- * corresponding hash is full
- */
-static inline int block_group_used(struct super_block *s, u32 id)
-{
-	int bm = bmap_hash_id(s, id);
-	struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
-
-	/*
-	 * If we don't have cached information on this bitmap block, we're
-	 * going to have to load it later anyway. Loading it here allows us
-	 * to make a better decision. This favors long-term performance gain
-	 * with a better on-disk layout vs. a short term gain of skipping the
-	 * read and potentially having a bad placement.
-	 */
-	if (info->free_count == UINT_MAX) {
-		struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
-		brelse(bh);
-	}
-
-	if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * the packing is returned in disk byte order
- */
-__le32 reiserfs_choose_packing(struct inode * dir)
-{
-	__le32 packing;
-	if (TEST_OPTION(packing_groups, dir->i_sb)) {
-		u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
-		/*
-		 * some versions of reiserfsck expect packing locality 1 to be
-		 * special
-		 */
-		if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
-			packing = INODE_PKEY(dir)->k_objectid;
-		else
-			packing = INODE_PKEY(dir)->k_dir_id;
-	} else
-		packing = INODE_PKEY(dir)->k_objectid;
-	return packing;
-}
-
-/*
- * Tries to find contiguous zero bit window (given size) in given region of
- * bitmap and place new blocks there. Returns number of allocated blocks.
- */
-static int scan_bitmap(struct reiserfs_transaction_handle *th,
-		       b_blocknr_t * start, b_blocknr_t finish,
-		       int min, int max, int unfm, sector_t file_block)
-{
-	int nr_allocated = 0;
-	struct super_block *s = th->t_super;
-	unsigned int bm, off;
-	unsigned int end_bm, end_off;
-	unsigned int off_max = s->s_blocksize << 3;
-
-	BUG_ON(!th->t_trans_id);
-	PROC_INFO_INC(s, scan_bitmap.call);
-
-	/* No point in looking for more free blocks */
-	if (SB_FREE_BLOCKS(s) <= 0)
-		return 0;
-
-	get_bit_address(s, *start, &bm, &off);
-	get_bit_address(s, finish, &end_bm, &end_off);
-	if (bm > reiserfs_bmap_count(s))
-		return 0;
-	if (end_bm > reiserfs_bmap_count(s))
-		end_bm = reiserfs_bmap_count(s);
-
-	/*
-	 * When the bitmap is more than 10% free, anyone can allocate.
-	 * When it's less than 10% free, only files that already use the
-	 * bitmap are allowed. Once we pass 80% full, this restriction
-	 * is lifted.
-	 *
-	 * We do this so that files that grow later still have space close to
-	 * their original allocation. This improves locality, and presumably
-	 * performance as a result.
-	 *
-	 * This is only an allocation policy and does not make up for getting a
-	 * bad hint. Decent hinting must be implemented for this to work well.
-	 */
-	if (TEST_OPTION(skip_busy, s)
-	    && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
-		for (; bm < end_bm; bm++, off = 0) {
-			if ((off && (!unfm || (file_block != 0)))
-			    || SB_AP_BITMAP(s)[bm].free_count >
-			    (s->s_blocksize << 3) / 10)
-				nr_allocated =
-				    scan_bitmap_block(th, bm, &off, off_max,
-						      min, max, unfm);
-			if (nr_allocated)
-				goto ret;
-		}
-		/* we know from above that start is a reasonable number */
-		get_bit_address(s, *start, &bm, &off);
-	}
-
-	for (; bm < end_bm; bm++, off = 0) {
-		nr_allocated =
-		    scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
-		if (nr_allocated)
-			goto ret;
-	}
-
-	nr_allocated =
-	    scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
-
-ret:
-	*start = bm * off_max + off;
-	return nr_allocated;
-
-}
-
-static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
-				 struct inode *inode, b_blocknr_t block,
-				 int for_unformatted)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs;
-	struct buffer_head *sbh, *bmbh;
-	struct reiserfs_bitmap_info *apbi;
-	unsigned int nr, offset;
-
-	BUG_ON(!th->t_trans_id);
-	PROC_INFO_INC(s, free_block);
-	rs = SB_DISK_SUPER_BLOCK(s);
-	sbh = SB_BUFFER_WITH_SB(s);
-	apbi = SB_AP_BITMAP(s);
-
-	get_bit_address(s, block, &nr, &offset);
-
-	if (nr >= reiserfs_bmap_count(s)) {
-		reiserfs_error(s, "vs-4075", "block %lu is out of range",
-			       block);
-		return;
-	}
-
-	bmbh = reiserfs_read_bitmap_block(s, nr);
-	if (!bmbh)
-		return;
-
-	reiserfs_prepare_for_journal(s, bmbh, 1);
-
-	/* clear bit for the given block in bit map */
-	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
-		reiserfs_error(s, "vs-4080",
-			       "block %lu: bit already cleared", block);
-	}
-	apbi[nr].free_count++;
-	journal_mark_dirty(th, bmbh);
-	brelse(bmbh);
-
-	reiserfs_prepare_for_journal(s, sbh, 1);
-	/* update super block */
-	set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
-
-	journal_mark_dirty(th, sbh);
-	if (for_unformatted) {
-		int depth = reiserfs_write_unlock_nested(s);
-		dquot_free_block_nodirty(inode, 1);
-		reiserfs_write_lock_nested(s, depth);
-	}
-}
-
-void reiserfs_free_block(struct reiserfs_transaction_handle *th,
-			 struct inode *inode, b_blocknr_t block,
-			 int for_unformatted)
-{
-	struct super_block *s = th->t_super;
-
-	BUG_ON(!th->t_trans_id);
-	RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
-	if (!is_reusable(s, block, 1))
-		return;
-
-	if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
-		reiserfs_error(th->t_super, "bitmap-4072",
-			       "Trying to free block outside file system "
-			       "boundaries (%lu > %lu)",
-			       block, sb_block_count(REISERFS_SB(s)->s_rs));
-		return;
-	}
-	/* mark it before we clear it, just in case */
-	journal_mark_freed(th, s, block);
-	_reiserfs_free_block(th, inode, block, for_unformatted);
-}
-
-/* preallocated blocks don't need to be run through journal_mark_freed */
-static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
-					 struct inode *inode, b_blocknr_t block)
-{
-	BUG_ON(!th->t_trans_id);
-	RFALSE(!th->t_super,
-	       "vs-4060: trying to free block on nonexistent device");
-	if (!is_reusable(th->t_super, block, 1))
-		return;
-	_reiserfs_free_block(th, inode, block, 1);
-}
-
-static void __discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct reiserfs_inode_info *ei)
-{
-	unsigned long save = ei->i_prealloc_block;
-	int dirty = 0;
-	struct inode *inode = &ei->vfs_inode;
-
-	BUG_ON(!th->t_trans_id);
-#ifdef CONFIG_REISERFS_CHECK
-	if (ei->i_prealloc_count < 0)
-		reiserfs_error(th->t_super, "zam-4001",
-			       "inode has negative prealloc blocks count.");
-#endif
-	while (ei->i_prealloc_count > 0) {
-		b_blocknr_t block_to_free;
-
-		/*
-		 * reiserfs_free_prealloc_block can drop the write lock,
-		 * which could allow another caller to free the same block.
-		 * We can protect against it by modifying the prealloc
-		 * state before calling it.
-		 */
-		block_to_free = ei->i_prealloc_block++;
-		ei->i_prealloc_count--;
-		reiserfs_free_prealloc_block(th, inode, block_to_free);
-		dirty = 1;
-	}
-	if (dirty)
-		reiserfs_update_sd(th, inode);
-	ei->i_prealloc_block = save;
-	list_del_init(&ei->i_prealloc_list);
-}
-
-/* FIXME: It should be inline function */
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct inode *inode)
-{
-	struct reiserfs_inode_info *ei = REISERFS_I(inode);
-
-	BUG_ON(!th->t_trans_id);
-	if (ei->i_prealloc_count)
-		__discard_prealloc(th, ei);
-}
-
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
-{
-	struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
-
-	BUG_ON(!th->t_trans_id);
-	while (!list_empty(plist)) {
-		struct reiserfs_inode_info *ei;
-		ei = list_entry(plist->next, struct reiserfs_inode_info,
-				i_prealloc_list);
-#ifdef CONFIG_REISERFS_CHECK
-		if (!ei->i_prealloc_count) {
-			reiserfs_error(th->t_super, "zam-4001",
-				       "inode is in prealloc list but has "
-				       "no preallocated blocks.");
-		}
-#endif
-		__discard_prealloc(th, ei);
-	}
-}
-
-void reiserfs_init_alloc_options(struct super_block *s)
-{
-	set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
-	set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
-	set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
-}
-
-/* block allocator related options are parsed here */
-int reiserfs_parse_alloc_options(struct super_block *s, char *options)
-{
-	char *this_char, *value;
-
-	/* clear default settings */
-	REISERFS_SB(s)->s_alloc_options.bits = 0;
-
-	while ((this_char = strsep(&options, ":")) != NULL) {
-		if ((value = strchr(this_char, '=')) != NULL)
-			*value++ = 0;
-
-		if (!strcmp(this_char, "concentrating_formatted_nodes")) {
-			int temp;
-			SET_OPTION(concentrating_formatted_nodes);
-			temp = (value
-				&& *value) ? simple_strtoul(value, &value,
-							    0) : 10;
-			if (temp <= 0 || temp > 100) {
-				REISERFS_SB(s)->s_alloc_options.border = 10;
-			} else {
-				REISERFS_SB(s)->s_alloc_options.border =
-				    100 / temp;
-			}
-			continue;
-		}
-		if (!strcmp(this_char, "displacing_large_files")) {
-			SET_OPTION(displacing_large_files);
-			REISERFS_SB(s)->s_alloc_options.large_file_size =
-			    (value
-			     && *value) ? simple_strtoul(value, &value, 0) : 16;
-			continue;
-		}
-		if (!strcmp(this_char, "displacing_new_packing_localities")) {
-			SET_OPTION(displacing_new_packing_localities);
-			continue;
-		}
-
-		if (!strcmp(this_char, "old_hashed_relocation")) {
-			SET_OPTION(old_hashed_relocation);
-			continue;
-		}
-
-		if (!strcmp(this_char, "new_hashed_relocation")) {
-			SET_OPTION(new_hashed_relocation);
-			continue;
-		}
-
-		if (!strcmp(this_char, "dirid_groups")) {
-			SET_OPTION(dirid_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "oid_groups")) {
-			SET_OPTION(oid_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "packing_groups")) {
-			SET_OPTION(packing_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "hashed_formatted_nodes")) {
-			SET_OPTION(hashed_formatted_nodes);
-			continue;
-		}
-
-		if (!strcmp(this_char, "skip_busy")) {
-			SET_OPTION(skip_busy);
-			continue;
-		}
-
-		if (!strcmp(this_char, "hundredth_slices")) {
-			SET_OPTION(hundredth_slices);
-			continue;
-		}
-
-		if (!strcmp(this_char, "old_way")) {
-			SET_OPTION(old_way);
-			continue;
-		}
-
-		if (!strcmp(this_char, "displace_based_on_dirid")) {
-			SET_OPTION(displace_based_on_dirid);
-			continue;
-		}
-
-		if (!strcmp(this_char, "preallocmin")) {
-			REISERFS_SB(s)->s_alloc_options.preallocmin =
-			    (value
-			     && *value) ? simple_strtoul(value, &value, 0) : 4;
-			continue;
-		}
-
-		if (!strcmp(this_char, "preallocsize")) {
-			REISERFS_SB(s)->s_alloc_options.preallocsize =
-			    (value
-			     && *value) ? simple_strtoul(value, &value,
-							 0) :
-			    PREALLOCATION_SIZE;
-			continue;
-		}
-
-		reiserfs_warning(s, "zam-4001", "unknown option - %s",
-				 this_char);
-		return 1;
-	}
-
-	reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
-	return 0;
-}
-
-static void print_sep(struct seq_file *seq, int *first)
-{
-	if (!*first)
-		seq_puts(seq, ":");
-	else
-		*first = 0;
-}
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s)
-{
-	int first = 1;
-
-	if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
-		(1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
-		return;
-
-	seq_puts(seq, ",alloc=");
-
-	if (TEST_OPTION(concentrating_formatted_nodes, s)) {
-		print_sep(seq, &first);
-		if (REISERFS_SB(s)->s_alloc_options.border != 10) {
-			seq_printf(seq, "concentrating_formatted_nodes=%d",
-				100 / REISERFS_SB(s)->s_alloc_options.border);
-		} else
-			seq_puts(seq, "concentrating_formatted_nodes");
-	}
-	if (TEST_OPTION(displacing_large_files, s)) {
-		print_sep(seq, &first);
-		if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
-			seq_printf(seq, "displacing_large_files=%lu",
-			    REISERFS_SB(s)->s_alloc_options.large_file_size);
-		} else
-			seq_puts(seq, "displacing_large_files");
-	}
-	if (TEST_OPTION(displacing_new_packing_localities, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "displacing_new_packing_localities");
-	}
-	if (TEST_OPTION(old_hashed_relocation, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "old_hashed_relocation");
-	}
-	if (TEST_OPTION(new_hashed_relocation, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "new_hashed_relocation");
-	}
-	if (TEST_OPTION(dirid_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "dirid_groups");
-	}
-	if (TEST_OPTION(oid_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "oid_groups");
-	}
-	if (TEST_OPTION(packing_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "packing_groups");
-	}
-	if (TEST_OPTION(hashed_formatted_nodes, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "hashed_formatted_nodes");
-	}
-	if (TEST_OPTION(skip_busy, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "skip_busy");
-	}
-	if (TEST_OPTION(hundredth_slices, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "hundredth_slices");
-	}
-	if (TEST_OPTION(old_way, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "old_way");
-	}
-	if (TEST_OPTION(displace_based_on_dirid, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "displace_based_on_dirid");
-	}
-	if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
-		print_sep(seq, &first);
-		seq_printf(seq, "preallocmin=%d",
-				REISERFS_SB(s)->s_alloc_options.preallocmin);
-	}
-	if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
-		print_sep(seq, &first);
-		seq_printf(seq, "preallocsize=%d",
-				REISERFS_SB(s)->s_alloc_options.preallocsize);
-	}
-}
-
-static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
-	char *hash_in;
-
-	if (hint->formatted_node) {
-		hash_in = (char *)&hint->key.k_dir_id;
-	} else {
-		if (!hint->inode) {
-			/*hint->search_start = hint->beg;*/
-			hash_in = (char *)&hint->key.k_dir_id;
-		} else
-		    if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-			hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-		else
-			hash_in =
-			    (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-	}
-
-	hint->search_start =
-	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
-
-/*
- * Relocation based on dirid, hashing them into a given bitmap block
- * files. Formatted nodes are unaffected, a separate policy covers them
- */
-static void dirid_groups(reiserfs_blocknr_hint_t * hint)
-{
-	unsigned long hash;
-	__u32 dirid = 0;
-	int bm = 0;
-	struct super_block *sb = hint->th->t_super;
-
-	if (hint->inode)
-		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-	else if (hint->formatted_node)
-		dirid = hint->key.k_dir_id;
-
-	if (dirid) {
-		bm = bmap_hash_id(sb, dirid);
-		hash = bm * (sb->s_blocksize << 3);
-		/* give a portion of the block group to metadata */
-		if (hint->inode)
-			hash += sb->s_blocksize / 2;
-		hint->search_start = hash;
-	}
-}
-
-/*
- * Relocation based on oid, hashing them into a given bitmap block
- * files. Formatted nodes are unaffected, a separate policy covers them
- */
-static void oid_groups(reiserfs_blocknr_hint_t * hint)
-{
-	if (hint->inode) {
-		unsigned long hash;
-		__u32 oid;
-		__u32 dirid;
-		int bm;
-
-		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-
-		/*
-		 * keep the root dir and it's first set of subdirs close to
-		 * the start of the disk
-		 */
-		if (dirid <= 2)
-			hash = (hint->inode->i_sb->s_blocksize << 3);
-		else {
-			oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
-			bm = bmap_hash_id(hint->inode->i_sb, oid);
-			hash = bm * (hint->inode->i_sb->s_blocksize << 3);
-		}
-		hint->search_start = hash;
-	}
-}
-
-/*
- * returns 1 if it finds an indirect item and gets valid hint info
- * from it, otherwise 0
- */
-static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
-{
-	struct treepath *path;
-	struct buffer_head *bh;
-	struct item_head *ih;
-	int pos_in_item;
-	__le32 *item;
-	int ret = 0;
-
-	/*
-	 * reiserfs code can call this function w/o pointer to path
-	 * structure supplied; then we rely on supplied search_start
-	 */
-	if (!hint->path)
-		return 0;
-
-	path = hint->path;
-	bh = get_last_bh(path);
-	RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
-	ih = tp_item_head(path);
-	pos_in_item = path->pos_in_item;
-	item = tp_item_body(path);
-
-	hint->search_start = bh->b_blocknr;
-
-	/*
-	 * for indirect item: go to left and look for the first non-hole entry
-	 * in the indirect item
-	 */
-	if (!hint->formatted_node && is_indirect_le_ih(ih)) {
-		if (pos_in_item == I_UNFM_NUM(ih))
-			pos_in_item--;
-		while (pos_in_item >= 0) {
-			int t = get_block_num(item, pos_in_item);
-			if (t) {
-				hint->search_start = t;
-				ret = 1;
-				break;
-			}
-			pos_in_item--;
-		}
-	}
-
-	/* does result value fit into specified region? */
-	return ret;
-}
-
-/*
- * should be, if formatted node, then try to put on first part of the device
- * specified as number of percent with mount option device, else try to put
- * on last of device.  This is not to say it is good code to do so,
- * but the effect should be measured.
- */
-static inline void set_border_in_hint(struct super_block *s,
-				      reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border =
-	    SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
-
-	if (hint->formatted_node)
-		hint->end = border - 1;
-	else
-		hint->beg = border;
-}
-
-static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
-{
-	if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-		hint->search_start =
-		    hint->beg +
-		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
-			       4) % (hint->end - hint->beg);
-	else
-		hint->search_start =
-		    hint->beg +
-		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
-			       4) % (hint->end - hint->beg);
-}
-
-static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
-{
-	char *hash_in;
-
-	if (!hint->inode)
-		hash_in = (char *)&hint->key.k_dir_id;
-	else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-	else
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-
-	hint->search_start =
-	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
-
-static inline int
-this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
-						   hint)
-{
-	return hint->block ==
-	    REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
-}
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
-{
-	struct in_core_key *key = &hint->key;
-
-	hint->th->displace_new_blocks = 0;
-	hint->search_start =
-	    hint->beg + keyed_hash((char *)(&key->k_objectid),
-				   4) % (hint->end - hint->beg);
-}
-#endif
-
-static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border;
-	u32 hash_in;
-
-	if (hint->formatted_node || hint->inode == NULL) {
-		return 0;
-	}
-
-	hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
-	border =
-	    hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
-					 4) % (hint->end - hint->beg - 1);
-	if (border > hint->search_start)
-		hint->search_start = border;
-
-	return 1;
-}
-
-static inline int old_way(reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border;
-
-	if (hint->formatted_node || hint->inode == NULL) {
-		return 0;
-	}
-
-	border =
-	    hint->beg +
-	    le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
-							      hint->beg);
-	if (border > hint->search_start)
-		hint->search_start = border;
-
-	return 1;
-}
-
-static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
-{
-	struct in_core_key *key = &hint->key;
-	b_blocknr_t slice_start;
-
-	slice_start =
-	    (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
-	if (slice_start > hint->search_start
-	    || slice_start + (hint->end / 100) <= hint->search_start) {
-		hint->search_start = slice_start;
-	}
-}
-
-static void determine_search_start(reiserfs_blocknr_hint_t * hint,
-				   int amount_needed)
-{
-	struct super_block *s = hint->th->t_super;
-	int unfm_hint;
-
-	hint->beg = 0;
-	hint->end = SB_BLOCK_COUNT(s) - 1;
-
-	/* This is former border algorithm. Now with tunable border offset */
-	if (concentrating_formatted_nodes(s))
-		set_border_in_hint(s, hint);
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * whenever we create a new directory, we displace it.  At first
-	 * we will hash for location, later we might look for a moderately
-	 * empty place for it
-	 */
-	if (displacing_new_packing_localities(s)
-	    && hint->th->displace_new_blocks) {
-		displace_new_packing_locality(hint);
-
-		/*
-		 * we do not continue determine_search_start,
-		 * if new packing locality is being displaced
-		 */
-		return;
-	}
-#endif
-
-	/*
-	 * all persons should feel encouraged to add more special cases
-	 * here and test them
-	 */
-
-	if (displacing_large_files(s) && !hint->formatted_node
-	    && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
-		displace_large_file(hint);
-		return;
-	}
-
-	/*
-	 * if none of our special cases is relevant, use the left
-	 * neighbor in the tree order of the new node we are allocating for
-	 */
-	if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
-		hash_formatted_node(hint);
-		return;
-	}
-
-	unfm_hint = get_left_neighbor(hint);
-
-	/*
-	 * Mimic old block allocator behaviour, that is if VFS allowed for
-	 * preallocation, new blocks are displaced based on directory ID.
-	 * Also, if suggested search_start is less than last preallocated
-	 * block, we start searching from it, assuming that HDD dataflow
-	 * is faster in forward direction
-	 */
-	if (TEST_OPTION(old_way, s)) {
-		if (!hint->formatted_node) {
-			if (!reiserfs_hashed_relocation(s))
-				old_way(hint);
-			else if (!reiserfs_no_unhashed_relocation(s))
-				old_hashed_relocation(hint);
-
-			if (hint->inode
-			    && hint->search_start <
-			    REISERFS_I(hint->inode)->i_prealloc_block)
-				hint->search_start =
-				    REISERFS_I(hint->inode)->i_prealloc_block;
-		}
-		return;
-	}
-
-	/* This is an approach proposed by Hans */
-	if (TEST_OPTION(hundredth_slices, s)
-	    && !(displacing_large_files(s) && !hint->formatted_node)) {
-		hundredth_slices(hint);
-		return;
-	}
-
-	/* old_hashed_relocation only works on unformatted */
-	if (!unfm_hint && !hint->formatted_node &&
-	    TEST_OPTION(old_hashed_relocation, s)) {
-		old_hashed_relocation(hint);
-	}
-
-	/* new_hashed_relocation works with both formatted/unformatted nodes */
-	if ((!unfm_hint || hint->formatted_node) &&
-	    TEST_OPTION(new_hashed_relocation, s)) {
-		new_hashed_relocation(hint);
-	}
-
-	/* dirid grouping works only on unformatted nodes */
-	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
-		dirid_groups(hint);
-	}
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
-		dirid_groups(hint);
-	}
-#endif
-
-	/* oid grouping works only on unformatted nodes */
-	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
-		oid_groups(hint);
-	}
-	return;
-}
-
-static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
-{
-	/* make minimum size a mount option and benchmark both ways */
-	/* we preallocate blocks only for regular files, specific size */
-	/* benchmark preallocating always and see what happens */
-
-	hint->prealloc_size = 0;
-
-	if (!hint->formatted_node && hint->preallocate) {
-		if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode)
-		    && hint->inode->i_size >=
-		    REISERFS_SB(hint->th->t_super)->s_alloc_options.
-		    preallocmin * hint->inode->i_sb->s_blocksize)
-			hint->prealloc_size =
-			    REISERFS_SB(hint->th->t_super)->s_alloc_options.
-			    preallocsize - 1;
-	}
-	return CARRY_ON;
-}
-
-static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
-						 b_blocknr_t * new_blocknrs,
-						 b_blocknr_t start,
-						 b_blocknr_t finish, int min,
-						 int amount_needed,
-						 int prealloc_size)
-{
-	int rest = amount_needed;
-	int nr_allocated;
-
-	while (rest > 0 && start <= finish) {
-		nr_allocated = scan_bitmap(hint->th, &start, finish, min,
-					   rest + prealloc_size,
-					   !hint->formatted_node, hint->block);
-
-		if (nr_allocated == 0)	/* no new blocks allocated, return */
-			break;
-
-		/* fill free_blocknrs array first */
-		while (rest > 0 && nr_allocated > 0) {
-			*new_blocknrs++ = start++;
-			rest--;
-			nr_allocated--;
-		}
-
-		/* do we have something to fill prealloc. array also ? */
-		if (nr_allocated > 0) {
-			/*
-			 * it means prealloc_size was greater that 0 and
-			 * we do preallocation
-			 */
-			list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
-				 &SB_JOURNAL(hint->th->t_super)->
-				 j_prealloc_list);
-			REISERFS_I(hint->inode)->i_prealloc_block = start;
-			REISERFS_I(hint->inode)->i_prealloc_count =
-			    nr_allocated;
-			break;
-		}
-	}
-
-	return (amount_needed - rest);
-}
-
-static inline int blocknrs_and_prealloc_arrays_from_search_start
-    (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
-     int amount_needed) {
-	struct super_block *s = hint->th->t_super;
-	b_blocknr_t start = hint->search_start;
-	b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-	int passno = 0;
-	int nr_allocated = 0;
-	int depth;
-
-	determine_prealloc_size(hint);
-	if (!hint->formatted_node) {
-		int quota_ret;
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(s, REISERFS_DEBUG_CODE,
-			       "reiserquota: allocating %d blocks id=%u",
-			       amount_needed, hint->inode->i_uid);
-#endif
-		depth = reiserfs_write_unlock_nested(s);
-		quota_ret =
-		    dquot_alloc_block_nodirty(hint->inode, amount_needed);
-		if (quota_ret) {	/* Quota exceeded? */
-			reiserfs_write_lock_nested(s, depth);
-			return QUOTA_EXCEEDED;
-		}
-		if (hint->preallocate && hint->prealloc_size) {
-#ifdef REISERQUOTA_DEBUG
-			reiserfs_debug(s, REISERFS_DEBUG_CODE,
-				       "reiserquota: allocating (prealloc) %d blocks id=%u",
-				       hint->prealloc_size, hint->inode->i_uid);
-#endif
-			quota_ret = dquot_prealloc_block_nodirty(hint->inode,
-							 hint->prealloc_size);
-			if (quota_ret)
-				hint->preallocate = hint->prealloc_size = 0;
-		}
-		/* for unformatted nodes, force large allocations */
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	do {
-		switch (passno++) {
-		case 0:	/* Search from hint->search_start to end of disk */
-			start = hint->search_start;
-			finish = SB_BLOCK_COUNT(s) - 1;
-			break;
-		case 1:	/* Search from hint->beg to hint->search_start */
-			start = hint->beg;
-			finish = hint->search_start;
-			break;
-		case 2:	/* Last chance: Search from 0 to hint->beg */
-			start = 0;
-			finish = hint->beg;
-			break;
-		default:
-			/* We've tried searching everywhere, not enough space */
-			/* Free the blocks */
-			if (!hint->formatted_node) {
-#ifdef REISERQUOTA_DEBUG
-				reiserfs_debug(s, REISERFS_DEBUG_CODE,
-					       "reiserquota: freeing (nospace) %d blocks id=%u",
-					       amount_needed +
-					       hint->prealloc_size -
-					       nr_allocated,
-					       hint->inode->i_uid);
-#endif
-				/* Free not allocated blocks */
-				depth = reiserfs_write_unlock_nested(s);
-				dquot_free_block_nodirty(hint->inode,
-					amount_needed + hint->prealloc_size -
-					nr_allocated);
-				reiserfs_write_lock_nested(s, depth);
-			}
-			while (nr_allocated--)
-				reiserfs_free_block(hint->th, hint->inode,
-						    new_blocknrs[nr_allocated],
-						    !hint->formatted_node);
-
-			return NO_DISK_SPACE;
-		}
-	} while ((nr_allocated += allocate_without_wrapping_disk(hint,
-								 new_blocknrs +
-								 nr_allocated,
-								 start, finish,
-								 1,
-								 amount_needed -
-								 nr_allocated,
-								 hint->
-								 prealloc_size))
-		 < amount_needed);
-	if (!hint->formatted_node &&
-	    amount_needed + hint->prealloc_size >
-	    nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
-		/* Some of preallocation blocks were not allocated */
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(s, REISERFS_DEBUG_CODE,
-			       "reiserquota: freeing (failed prealloc) %d blocks id=%u",
-			       amount_needed + hint->prealloc_size -
-			       nr_allocated -
-			       REISERFS_I(hint->inode)->i_prealloc_count,
-			       hint->inode->i_uid);
-#endif
-
-		depth = reiserfs_write_unlock_nested(s);
-		dquot_free_block_nodirty(hint->inode, amount_needed +
-					 hint->prealloc_size - nr_allocated -
-					 REISERFS_I(hint->inode)->
-					 i_prealloc_count);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	return CARRY_ON;
-}
-
-/* grab new blocknrs from preallocated list */
-/* return amount still needed after using them */
-static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
-					      b_blocknr_t * new_blocknrs,
-					      int amount_needed)
-{
-	struct inode *inode = hint->inode;
-
-	if (REISERFS_I(inode)->i_prealloc_count > 0) {
-		while (amount_needed) {
-
-			*new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
-			REISERFS_I(inode)->i_prealloc_count--;
-
-			amount_needed--;
-
-			if (REISERFS_I(inode)->i_prealloc_count <= 0) {
-				list_del(&REISERFS_I(inode)->i_prealloc_list);
-				break;
-			}
-		}
-	}
-	/* return amount still needed after using preallocated blocks */
-	return amount_needed;
-}
-
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
-			       b_blocknr_t *new_blocknrs,
-			       int amount_needed,
-			       /* Amount of blocks we have already reserved */
-			       int reserved_by_us)
-{
-	int initial_amount_needed = amount_needed;
-	int ret;
-	struct super_block *s = hint->th->t_super;
-
-	/* Check if there is enough space, taking into account reserved space */
-	if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
-	    amount_needed - reserved_by_us)
-		return NO_DISK_SPACE;
-	/* should this be if !hint->inode &&  hint->preallocate? */
-	/* do you mean hint->formatted_node can be removed ? - Zam */
-	/*
-	 * hint->formatted_node cannot be removed because we try to access
-	 * inode information here, and there is often no inode associated with
-	 * metadata allocations - green
-	 */
-
-	if (!hint->formatted_node && hint->preallocate) {
-		amount_needed = use_preallocated_list_if_available
-		    (hint, new_blocknrs, amount_needed);
-
-		/*
-		 * We have all the block numbers we need from the
-		 * prealloc list
-		 */
-		if (amount_needed == 0)
-			return CARRY_ON;
-		new_blocknrs += (initial_amount_needed - amount_needed);
-	}
-
-	/* find search start and save it in hint structure */
-	determine_search_start(hint, amount_needed);
-	if (hint->search_start >= SB_BLOCK_COUNT(s))
-		hint->search_start = SB_BLOCK_COUNT(s) - 1;
-
-	/* allocation itself; fill new_blocknrs and preallocation arrays */
-	ret = blocknrs_and_prealloc_arrays_from_search_start
-	    (hint, new_blocknrs, amount_needed);
-
-	/*
-	 * We used prealloc. list to fill (partially) new_blocknrs array.
-	 * If final allocation fails we need to return blocks back to
-	 * prealloc. list or just free them. -- Zam (I chose second
-	 * variant)
-	 */
-	if (ret != CARRY_ON) {
-		while (amount_needed++ < initial_amount_needed) {
-			reiserfs_free_block(hint->th, hint->inode,
-					    *(--new_blocknrs), 1);
-		}
-	}
-	return ret;
-}
-
-void reiserfs_cache_bitmap_metadata(struct super_block *sb,
-                                    struct buffer_head *bh,
-                                    struct reiserfs_bitmap_info *info)
-{
-	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
-
-	/* The first bit must ALWAYS be 1 */
-	if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
-		reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
-			       "corrupted: first bit must be 1", bh->b_blocknr);
-
-	info->free_count = 0;
-
-	while (--cur >= (unsigned long *)bh->b_data) {
-		/* 0 and ~0 are special, we can optimize for them */
-		if (*cur == 0)
-			info->free_count += BITS_PER_LONG;
-		else if (*cur != ~0L)	/* A mix, investigate */
-			info->free_count += BITS_PER_LONG - hweight_long(*cur);
-	}
-}
-
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
-                                               unsigned int bitmap)
-{
-	b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
-	struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
-	struct buffer_head *bh;
-
-	/*
-	 * Way old format filesystems had the bitmaps packed up front.
-	 * I doubt there are any of these left, but just in case...
-	 */
-	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
-			      &REISERFS_SB(sb)->s_properties)))
-		block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
-	else if (bitmap == 0)
-		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
-
-	bh = sb_bread(sb, block);
-	if (bh == NULL)
-		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
-		                 "reading failed", __func__, block);
-	else {
-		if (buffer_locked(bh)) {
-			int depth;
-			PROC_INFO_INC(sb, scan_bitmap.wait);
-			depth = reiserfs_write_unlock_nested(sb);
-			__wait_on_buffer(bh);
-			reiserfs_write_lock_nested(sb, depth);
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(atomic_read(&bh->b_count) == 0);
-
-		if (info->free_count == UINT_MAX)
-			reiserfs_cache_bitmap_metadata(sb, bh, info);
-	}
-
-	return bh;
-}
-
-int reiserfs_init_bitmap_cache(struct super_block *sb)
-{
-	struct reiserfs_bitmap_info *bitmap;
-	unsigned int bmap_nr = reiserfs_bmap_count(sb);
-
-	bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap)));
-	if (bitmap == NULL)
-		return -ENOMEM;
-
-	memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
-
-	SB_AP_BITMAP(sb) = bitmap;
-
-	return 0;
-}
-
-void reiserfs_free_bitmap_cache(struct super_block *sb)
-{
-	if (SB_AP_BITMAP(sb)) {
-		vfree(SB_AP_BITMAP(sb));
-		SB_AP_BITMAP(sb) = NULL;
-	}
-}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
deleted file mode 100644
index 79ee2b436685..000000000000
--- a/fs/reiserfs/dir.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/stat.h>
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-extern const struct reiserfs_key MIN_KEY;
-
-static int reiserfs_readdir(struct file *, struct dir_context *);
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			      int datasync);
-
-const struct file_operations reiserfs_dir_operations = {
-	.llseek = generic_file_llseek,
-	.read = generic_read_dir,
-	.iterate_shared = reiserfs_readdir,
-	.fsync = reiserfs_dir_fsync,
-	.unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = reiserfs_compat_ioctl,
-#endif
-};
-
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			      int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	int err;
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-
-	inode_lock(inode);
-	reiserfs_write_lock(inode->i_sb);
-	err = reiserfs_commit_for_inode(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	inode_unlock(inode);
-	if (err < 0)
-		return err;
-	return 0;
-}
-
-#define store_ih(where,what) copy_item_head (where, what)
-
-static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
-{
-	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
-	return (d_really_is_positive(privroot) &&
-	        deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
-}
-
-int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
-{
-
-	/* key of current position in the directory (key of directory entry) */
-	struct cpu_key pos_key;
-
-	INITIALIZE_PATH(path_to_entry);
-	struct buffer_head *bh;
-	int item_num, entry_num;
-	const struct reiserfs_key *rkey;
-	struct item_head *ih, tmp_ih;
-	int search_res;
-	char *local_buf;
-	loff_t next_pos;
-	char small_buf[32];	/* avoid kmalloc if we can */
-	struct reiserfs_dir_entry de;
-	int ret = 0;
-	int depth;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	reiserfs_check_lock_depth(inode->i_sb, "readdir");
-
-	/*
-	 * form key for search the next directory entry using
-	 * f_pos field of file structure
-	 */
-	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
-	next_pos = cpu_key_k_offset(&pos_key);
-
-	path_to_entry.reada = PATH_READA;
-	while (1) {
-research:
-		/*
-		 * search the directory item, containing entry with
-		 * specified key
-		 */
-		search_res =
-		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
-					&de);
-		if (search_res == IO_ERROR) {
-			/*
-			 * FIXME: we could just skip part of directory
-			 * which could not be read
-			 */
-			ret = -EIO;
-			goto out;
-		}
-		entry_num = de.de_entry_num;
-		bh = de.de_bh;
-		item_num = de.de_item_num;
-		ih = de.de_ih;
-		store_ih(&tmp_ih, ih);
-
-		/* we must have found item, that is item of this directory, */
-		RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
-		       "vs-9000: found item %h does not match to dir we readdir %K",
-		       ih, &pos_key);
-		RFALSE(item_num > B_NR_ITEMS(bh) - 1,
-		       "vs-9005 item_num == %d, item amount == %d",
-		       item_num, B_NR_ITEMS(bh));
-
-		/*
-		 * and entry must be not more than number of entries
-		 * in the item
-		 */
-		RFALSE(ih_entry_count(ih) < entry_num,
-		       "vs-9010: entry number is too big %d (%d)",
-		       entry_num, ih_entry_count(ih));
-
-		/*
-		 * go through all entries in the directory item beginning
-		 * from the entry, that has been found
-		 */
-		if (search_res == POSITION_FOUND
-		    || entry_num < ih_entry_count(ih)) {
-			struct reiserfs_de_head *deh =
-			    B_I_DEH(bh, ih) + entry_num;
-
-			for (; entry_num < ih_entry_count(ih);
-			     entry_num++, deh++) {
-				int d_reclen;
-				char *d_name;
-				ino_t d_ino;
-				loff_t cur_pos = deh_offset(deh);
-
-				/* it is hidden entry */
-				if (!de_visible(deh))
-					continue;
-				d_reclen = entry_length(bh, ih, entry_num);
-				d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
-
-				if (d_reclen <= 0 ||
-				    d_name + d_reclen > bh->b_data + bh->b_size) {
-					/*
-					 * There is corrupted data in entry,
-					 * We'd better stop here
-					 */
-					pathrelse(&path_to_entry);
-					ret = -EIO;
-					goto out;
-				}
-
-				if (!d_name[d_reclen - 1])
-					d_reclen = strlen(d_name);
-
-				/* too big to send back to VFS */
-				if (d_reclen >
-				    REISERFS_MAX_NAME(inode->i_sb->
-						      s_blocksize)) {
-					continue;
-				}
-
-				/* Ignore the .reiserfs_priv entry */
-				if (is_privroot_deh(inode, deh))
-					continue;
-
-				ctx->pos = deh_offset(deh);
-				d_ino = deh_objectid(deh);
-				if (d_reclen <= 32) {
-					local_buf = small_buf;
-				} else {
-					local_buf = kmalloc(d_reclen,
-							    GFP_NOFS);
-					if (!local_buf) {
-						pathrelse(&path_to_entry);
-						ret = -ENOMEM;
-						goto out;
-					}
-					if (item_moved(&tmp_ih, &path_to_entry)) {
-						kfree(local_buf);
-						goto research;
-					}
-				}
-
-				/*
-				 * Note, that we copy name to user space via
-				 * temporary buffer (local_buf) because
-				 * filldir will block if user space buffer is
-				 * swapped out. At that time entry can move to
-				 * somewhere else
-				 */
-				memcpy(local_buf, d_name, d_reclen);
-
-				/*
-				 * Since filldir might sleep, we can release
-				 * the write lock here for other waiters
-				 */
-				depth = reiserfs_write_unlock_nested(inode->i_sb);
-				if (!dir_emit
-				    (ctx, local_buf, d_reclen, d_ino,
-				     DT_UNKNOWN)) {
-					reiserfs_write_lock_nested(inode->i_sb, depth);
-					if (local_buf != small_buf) {
-						kfree(local_buf);
-					}
-					goto end;
-				}
-				reiserfs_write_lock_nested(inode->i_sb, depth);
-				if (local_buf != small_buf) {
-					kfree(local_buf);
-				}
-
-				/* deh_offset(deh) may be invalid now. */
-				next_pos = cur_pos + 1;
-
-				if (item_moved(&tmp_ih, &path_to_entry)) {
-					set_cpu_key_k_offset(&pos_key,
-							     next_pos);
-					goto research;
-				}
-			}	/* for */
-		}
-
-		/* end of directory has been reached */
-		if (item_num != B_NR_ITEMS(bh) - 1)
-			goto end;
-
-		/*
-		 * item we went through is last item of node. Using right
-		 * delimiting key check is it directory end
-		 */
-		rkey = get_rkey(&path_to_entry, inode->i_sb);
-		if (!comp_le_keys(rkey, &MIN_KEY)) {
-			/*
-			 * set pos_key to key, that is the smallest and greater
-			 * that key of the last entry in the item
-			 */
-			set_cpu_key_k_offset(&pos_key, next_pos);
-			continue;
-		}
-
-		/* end of directory has been reached */
-		if (COMP_SHORT_KEYS(rkey, &pos_key)) {
-			goto end;
-		}
-
-		/* directory continues in the right neighboring block */
-		set_cpu_key_k_offset(&pos_key,
-				     le_key_k_offset(KEY_FORMAT_3_5, rkey));
-
-	}			/* while */
-
-end:
-	ctx->pos = next_pos;
-	pathrelse(&path_to_entry);
-	reiserfs_check_path(&path_to_entry);
-out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
-static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	return reiserfs_readdir_inode(file_inode(file), ctx);
-}
-
-/*
- * compose directory item containing "." and ".." entries (entries are
- * not aligned to 4 byte boundary)
- */
-void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
-			    __le32 par_dirid, __le32 par_objid)
-{
-	struct reiserfs_de_head *dot, *dotdot;
-
-	memset(body, 0, EMPTY_DIR_SIZE_V1);
-	dot = (struct reiserfs_de_head *)body;
-	dotdot = dot + 1;
-
-	/* direntry header of "." */
-	put_deh_offset(dot, DOT_OFFSET);
-	/* these two are from make_le_item_head, and are LE */
-	dot->deh_dir_id = dirid;
-	dot->deh_objectid = objid;
-	dot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
-	mark_de_visible(dot);
-
-	/* direntry header of ".." */
-	put_deh_offset(dotdot, DOT_DOT_OFFSET);
-	/* key of ".." for the root directory */
-	/* these two are from the inode, and are LE */
-	dotdot->deh_dir_id = par_dirid;
-	dotdot->deh_objectid = par_objid;
-	dotdot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dotdot, deh_location(dot) - strlen(".."));
-	mark_de_visible(dotdot);
-
-	/* copy ".." and "." */
-	memcpy(body + deh_location(dot), ".", 1);
-	memcpy(body + deh_location(dotdot), "..", 2);
-}
-
-/* compose directory item containing "." and ".." entries */
-void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
-			 __le32 par_dirid, __le32 par_objid)
-{
-	struct reiserfs_de_head *dot, *dotdot;
-
-	memset(body, 0, EMPTY_DIR_SIZE);
-	dot = (struct reiserfs_de_head *)body;
-	dotdot = dot + 1;
-
-	/* direntry header of "." */
-	put_deh_offset(dot, DOT_OFFSET);
-	/* these two are from make_le_item_head, and are LE */
-	dot->deh_dir_id = dirid;
-	dot->deh_objectid = objid;
-	dot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
-	mark_de_visible(dot);
-
-	/* direntry header of ".." */
-	put_deh_offset(dotdot, DOT_DOT_OFFSET);
-	/* key of ".." for the root directory */
-	/* these two are from the inode, and are LE */
-	dotdot->deh_dir_id = par_dirid;
-	dotdot->deh_objectid = par_objid;
-	dotdot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
-	mark_de_visible(dotdot);
-
-	/* copy ".." and "." */
-	memcpy(body + deh_location(dot), ".", 1);
-	memcpy(body + deh_location(dotdot), "..", 2);
-}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
deleted file mode 100644
index 5129efc6f2e6..000000000000
--- a/fs/reiserfs/do_balan.c
+++ /dev/null
@@ -1,1900 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Now we have all buffers that must be used in balancing of the tree
- * Further calculations can not cause schedule(), and thus the buffer
- * tree will be stable until the balancing will be finished
- * balance the tree according to the analysis made before,
- * and using buffers obtained after all above.
- */
-
-#include <linux/uaccess.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-
-static inline void buffer_info_init_left(struct tree_balance *tb,
-                                         struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = tb->L[0];
-	bi->bi_parent   = tb->FL[0];
-	bi->bi_position = get_left_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_right(struct tree_balance *tb,
-                                          struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = tb->R[0];
-	bi->bi_parent   = tb->FR[0];
-	bi->bi_position = get_right_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_tbS0(struct tree_balance *tb,
-                                         struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh        = PATH_PLAST_BUFFER(tb->tb_path);
-	bi->bi_parent   = PATH_H_PPARENT(tb->tb_path, 0);
-	bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
-}
-
-static inline void buffer_info_init_bh(struct tree_balance *tb,
-                                       struct buffer_info *bi,
-                                       struct buffer_head *bh)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = bh;
-	bi->bi_parent   = NULL;
-	bi->bi_position = 0;
-}
-
-inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
-				       struct buffer_head *bh, int flag)
-{
-	journal_mark_dirty(tb->transaction_handle, bh);
-}
-
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-/*
- * summary:
- *  if deleting something ( tb->insert_size[0] < 0 )
- *    return(balance_leaf_when_delete()); (flag d handled here)
- *  else
- *    if lnum is larger than 0 we put items into the left node
- *    if rnum is larger than 0 we put items into the right node
- *    if snum1 is larger than 0 we put items into the new node s1
- *    if snum2 is larger than 0 we put items into the new node s2
- * Note that all *num* count new items being created.
- */
-
-static void balance_leaf_when_delete_del(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
-	struct buffer_info bi;
-#ifdef CONFIG_REISERFS_CHECK
-	struct item_head *ih = item_head(tbS0, item_pos);
-#endif
-
-	RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
-	       "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
-	       -tb->insert_size[0], ih);
-
-	buffer_info_init_tbS0(tb, &bi);
-	leaf_delete_items(&bi, 0, item_pos, 1, -1);
-
-	if (!item_pos && tb->CFL[0]) {
-		if (B_NR_ITEMS(tbS0)) {
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-		} else {
-			if (!PATH_H_POSITION(tb->tb_path, 1))
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    PATH_H_PPARENT(tb->tb_path, 0), 0);
-		}
-	}
-
-	RFALSE(!item_pos && !tb->CFL[0],
-	       "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
-	       tb->L[0]);
-}
-
-/* cut item in S[0] */
-static void balance_leaf_when_delete_cut(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
-	struct item_head *ih = item_head(tbS0, item_pos);
-	int pos_in_item = tb->tb_path->pos_in_item;
-	struct buffer_info bi;
-	buffer_info_init_tbS0(tb, &bi);
-
-	if (is_direntry_le_ih(ih)) {
-		/*
-		 * UFS unlink semantics are such that you can only
-		 * delete one directory entry at a time.
-		 *
-		 * when we cut a directory tb->insert_size[0] means
-		 * number of entries to be cut (always 1)
-		 */
-		tb->insert_size[0] = -1;
-		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
-				     -tb->insert_size[0]);
-
-		RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
-		       "PAP-12030: can not change delimiting key. CFL[0]=%p",
-		       tb->CFL[0]);
-
-		if (!item_pos && !pos_in_item && tb->CFL[0])
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-	} else {
-		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
-				     -tb->insert_size[0]);
-
-		RFALSE(!ih_item_len(ih),
-		       "PAP-12035: cut must leave non-zero dynamic "
-		       "length of item");
-	}
-}
-
-static int balance_leaf_when_delete_left(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* L[0] must be joined with S[0] */
-	if (tb->lnum[0] == -1) {
-		/* R[0] must be also joined with S[0] */
-		if (tb->rnum[0] == -1) {
-			if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
-				/*
-				 * all contents of all the
-				 * 3 buffers will be in L[0]
-				 */
-				if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
-				    1 < B_NR_ITEMS(tb->FR[0]))
-					replace_key(tb, tb->CFL[0],
-						    tb->lkey[0], tb->FR[0], 1);
-
-				leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
-						NULL);
-				leaf_move_items(LEAF_FROM_R_TO_L, tb,
-						B_NR_ITEMS(tb->R[0]), -1,
-						NULL);
-
-				reiserfs_invalidate_buffer(tb, tbS0);
-				reiserfs_invalidate_buffer(tb, tb->R[0]);
-
-				return 0;
-			}
-
-			/* all contents of all the 3 buffers will be in R[0] */
-			leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
-			leaf_move_items(LEAF_FROM_L_TO_R, tb,
-					B_NR_ITEMS(tb->L[0]), -1, NULL);
-
-			/* right_delimiting_key is correct in R[0] */
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-			reiserfs_invalidate_buffer(tb, tbS0);
-			reiserfs_invalidate_buffer(tb, tb->L[0]);
-
-			return -1;
-		}
-
-		RFALSE(tb->rnum[0] != 0,
-		       "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
-		/* all contents of L[0] and S[0] will be in L[0] */
-		leaf_shift_left(tb, n, -1);
-
-		reiserfs_invalidate_buffer(tb, tbS0);
-
-		return 0;
-	}
-
-	/*
-	 * a part of contents of S[0] will be in L[0] and
-	 * the rest part of S[0] will be in R[0]
-	 */
-
-	RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
-	       (tb->lnum[0] + tb->rnum[0] > n + 1),
-	       "PAP-12050: rnum(%d) and lnum(%d) and item "
-	       "number(%d) in S[0] are not consistent",
-	       tb->rnum[0], tb->lnum[0], n);
-	RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
-	       (tb->lbytes != -1 || tb->rbytes != -1),
-	       "PAP-12055: bad rbytes (%d)/lbytes (%d) "
-	       "parameters when items are not split",
-	       tb->rbytes, tb->lbytes);
-	RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
-	       (tb->lbytes < 1 || tb->rbytes != -1),
-	       "PAP-12060: bad rbytes (%d)/lbytes (%d) "
-	       "parameters when items are split",
-	       tb->rbytes, tb->lbytes);
-
-	leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
-	reiserfs_invalidate_buffer(tb, tbS0);
-
-	return 0;
-}
-
-/*
- * Balance leaf node in case of delete or cut: insert_size[0] < 0
- *
- * lnum, rnum can have values >= -1
- *	-1 means that the neighbor must be joined with S
- *	 0 means that nothing should be done with the neighbor
- *	>0 means to shift entirely or partly the specified number of items
- *         to the neighbor
- */
-static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	int n;
-
-	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
-	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
-	RFALSE(tb->blknum[0] > 1,
-	       "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
-	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
-	       "PAP-12010: tree can not be empty");
-
-	buffer_info_init_tbS0(tb, &bi);
-
-	/* Delete or truncate the item */
-
-	BUG_ON(flag != M_DELETE && flag != M_CUT);
-	if (flag == M_DELETE)
-		balance_leaf_when_delete_del(tb);
-	else /* M_CUT */
-		balance_leaf_when_delete_cut(tb);
-
-
-	/*
-	 * the rule is that no shifting occurs unless by shifting
-	 * a node can be freed
-	 */
-	n = B_NR_ITEMS(tbS0);
-
-
-	/* L[0] takes part in balancing */
-	if (tb->lnum[0])
-		return balance_leaf_when_delete_left(tb);
-
-	if (tb->rnum[0] == -1) {
-		/* all contents of R[0] and S[0] will be in R[0] */
-		leaf_shift_right(tb, n, -1);
-		reiserfs_invalidate_buffer(tb, tbS0);
-		return 0;
-	}
-
-	RFALSE(tb->rnum[0],
-	       "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
-	return 0;
-}
-
-static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
-					     struct item_head *const ih,
-					     const char * const body)
-{
-	int ret;
-	struct buffer_info bi;
-	int n = B_NR_ITEMS(tb->L[0]);
-	unsigned body_shift_bytes = 0;
-
-	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
-		/* part of new item falls into L[0] */
-		int new_item_len, shift;
-
-		ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
-
-		/* Calculate item length to insert to S[0] */
-		new_item_len = ih_item_len(ih) - tb->lbytes;
-
-		/* Calculate and check item length to insert to L[0] */
-		put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
-
-		RFALSE(ih_item_len(ih) <= 0,
-		       "PAP-12080: there is nothing to insert into L[0]: "
-		       "ih_item_len=%d", ih_item_len(ih));
-
-		/* Insert new item into L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
-			     min_t(int, tb->zeroes_num, ih_item_len(ih)));
-
-		/*
-		 * Calculate key component, item length and body to
-		 * insert into S[0]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-
-		add_le_ih_k_offset(ih, tb->lbytes << shift);
-
-		put_ih_item_len(ih, new_item_len);
-		if (tb->lbytes > tb->zeroes_num) {
-			body_shift_bytes = tb->lbytes - tb->zeroes_num;
-			tb->zeroes_num = 0;
-		} else
-			tb->zeroes_num -= tb->lbytes;
-
-		RFALSE(ih_item_len(ih) <= 0,
-		       "PAP-12085: there is nothing to insert into S[0]: "
-		       "ih_item_len=%d", ih_item_len(ih));
-	} else {
-		/* new item in whole falls into L[0] */
-		/* Shift lnum[0]-1 items to L[0] */
-		ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
-
-		/* Insert new item into L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
-				     tb->zeroes_num);
-		tb->insert_size[0] = 0;
-		tb->zeroes_num = 0;
-	}
-	return body_shift_bytes;
-}
-
-static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
-						 struct item_head * const ih,
-						 const char * const body)
-{
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-
-	RFALSE(tb->zeroes_num,
-	       "PAP-12090: invalid parameter in case of a directory");
-
-	/* directory item */
-	if (tb->lbytes > tb->pos_in_item) {
-		/* new directory entry falls into L[0] */
-		struct item_head *pasted;
-		int ret, l_pos_in_item = tb->pos_in_item;
-
-		/*
-		 * Shift lnum[0] - 1 items in whole.
-		 * Shift lbytes - 1 entries from given directory item
-		 */
-		ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
-		if (ret && !tb->item_pos) {
-			pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
-			l_pos_in_item += ih_entry_count(pasted) -
-					 (tb->lbytes - 1);
-		}
-
-		/* Append given directory entry to directory item */
-		buffer_info_init_left(tb, &bi);
-		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
-				     l_pos_in_item, tb->insert_size[0],
-				     body, tb->zeroes_num);
-
-		/*
-		 * previous string prepared space for pasting new entry,
-		 * following string pastes this entry
-		 */
-
-		/*
-		 * when we have merge directory item, pos_in_item
-		 * has been changed too
-		 */
-
-		/* paste new directory entry. 1 is entry number */
-		leaf_paste_entries(&bi, n + tb->item_pos - ret,
-				   l_pos_in_item, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-		tb->insert_size[0] = 0;
-	} else {
-		/* new directory item doesn't fall into L[0] */
-		/*
-		 * Shift lnum[0]-1 items in whole. Shift lbytes
-		 * directory entries from directory item number lnum[0]
-		 */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	}
-
-	/* Calculate new position to append in item body */
-	tb->pos_in_item -= tb->lbytes;
-}
-
-static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
-						  struct item_head * const ih,
-						  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-	int body_shift_bytes = 0;
-
-	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
-		balance_leaf_paste_left_shift_dirent(tb, ih, body);
-		return 0;
-	}
-
-	RFALSE(tb->lbytes <= 0,
-	       "PAP-12095: there is nothing to shift to L[0]. "
-	       "lbytes=%d", tb->lbytes);
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
-	       "PAP-12100: incorrect position to paste: "
-	       "item_len=%d, pos_in_item=%d",
-	       ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
-
-	/* appended item will be in L[0] in whole */
-	if (tb->lbytes >= tb->pos_in_item) {
-		struct item_head *tbS0_pos_ih, *tbL0_ih;
-		struct item_head *tbS0_0_ih;
-		struct reiserfs_key *left_delim_key;
-		int ret, l_n, version, temp_l;
-
-		tbS0_pos_ih = item_head(tbS0, tb->item_pos);
-		tbS0_0_ih = item_head(tbS0, 0);
-
-		/*
-		 * this bytes number must be appended
-		 * to the last item of L[h]
-		 */
-		l_n = tb->lbytes - tb->pos_in_item;
-
-		/* Calculate new insert_size[0] */
-		tb->insert_size[0] -= l_n;
-
-		RFALSE(tb->insert_size[0] <= 0,
-		       "PAP-12105: there is nothing to paste into "
-		       "L[0]. insert_size=%d", tb->insert_size[0]);
-
-		ret = leaf_shift_left(tb, tb->lnum[0],
-				      ih_item_len(tbS0_pos_ih));
-
-		tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
-
-		/* Append to body of item in L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
-				     ih_item_len(tbL0_ih), l_n, body,
-				     min_t(int, l_n, tb->zeroes_num));
-
-		/*
-		 * 0-th item in S0 can be only of DIRECT type
-		 * when l_n != 0
-		 */
-		temp_l = l_n;
-
-		RFALSE(ih_item_len(tbS0_0_ih),
-		       "PAP-12106: item length must be 0");
-		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
-		       leaf_key(tb->L[0], n + tb->item_pos - ret)),
-		       "PAP-12107: items must be of the same file");
-
-		if (is_indirect_le_ih(tbL0_ih)) {
-			int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-			temp_l = l_n << shift;
-		}
-		/* update key of first item in S0 */
-		version = ih_version(tbS0_0_ih);
-		add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
-
-		/* update left delimiting key */
-		left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
-		add_le_key_k_offset(version, left_delim_key, temp_l);
-
-		/*
-		 * Calculate new body, position in item and
-		 * insert_size[0]
-		 */
-		if (l_n > tb->zeroes_num) {
-			body_shift_bytes = l_n - tb->zeroes_num;
-			tb->zeroes_num = 0;
-		} else
-			tb->zeroes_num -= l_n;
-		tb->pos_in_item = 0;
-
-		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
-					  leaf_key(tb->L[0],
-						 B_NR_ITEMS(tb->L[0]) - 1)) ||
-		       !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
-		       !op_is_left_mergeable(left_delim_key, tbS0->b_size),
-		       "PAP-12120: item must be merge-able with left "
-		       "neighboring item");
-	} else {
-		/* only part of the appended item will be in L[0] */
-
-		/* Calculate position in item for append in S[0] */
-		tb->pos_in_item -= tb->lbytes;
-
-		RFALSE(tb->pos_in_item <= 0,
-		       "PAP-12125: no place for paste. pos_in_item=%d",
-		       tb->pos_in_item);
-
-		/*
-		 * Shift lnum[0] - 1 items in whole.
-		 * Shift lbytes - 1 byte from item number lnum[0]
-		 */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	}
-	return body_shift_bytes;
-}
-
-
-/* appended item will be in L[0] in whole */
-static void balance_leaf_paste_left_whole(struct tree_balance *tb,
-					  struct item_head * const ih,
-					  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-	struct item_head *pasted;
-	int ret;
-
-	/* if we paste into first item of S[0] and it is left mergable */
-	if (!tb->item_pos &&
-	    op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
-		/*
-		 * then increment pos_in_item by the size of the
-		 * last item in L[0]
-		 */
-		pasted = item_head(tb->L[0], n - 1);
-		if (is_direntry_le_ih(pasted))
-			tb->pos_in_item += ih_entry_count(pasted);
-		else
-			tb->pos_in_item += ih_item_len(pasted);
-	}
-
-	/*
-	 * Shift lnum[0] - 1 items in whole.
-	 * Shift lbytes - 1 byte from item number lnum[0]
-	 */
-	ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-
-	/* Append to body of item in L[0] */
-	buffer_info_init_left(tb, &bi);
-	leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
-			     tb->insert_size[0], body, tb->zeroes_num);
-
-	/* if appended item is directory, paste entry */
-	pasted = item_head(tb->L[0], n + tb->item_pos - ret);
-	if (is_direntry_le_ih(pasted))
-		leaf_paste_entries(&bi, n + tb->item_pos - ret,
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-	/*
-	 * if appended item is indirect item, put unformatted node
-	 * into un list
-	 */
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-
-	tb->insert_size[0] = 0;
-	tb->zeroes_num = 0;
-}
-
-static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
-					    struct item_head * const ih,
-					    const char * const body)
-{
-	/* we must shift the part of the appended item */
-	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
-		return balance_leaf_paste_left_shift(tb, ih, body);
-	else
-		balance_leaf_paste_left_whole(tb, ih, body);
-	return 0;
-}
-
-/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static unsigned int balance_leaf_left(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body, int flag)
-{
-	if (tb->lnum[0] <= 0)
-		return 0;
-
-	/* new item or it part falls to L[0], shift it too */
-	if (tb->item_pos < tb->lnum[0]) {
-		BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-		if (flag == M_INSERT)
-			return balance_leaf_insert_left(tb, ih, body);
-		else /* M_PASTE */
-			return balance_leaf_paste_left(tb, ih, body);
-	} else
-		/* new item doesn't fall into L[0] */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	return 0;
-}
-
-
-static void balance_leaf_insert_right(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body)
-{
-
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct buffer_info bi;
-
-	/* new item or part of it doesn't fall into R[0] */
-	if (n - tb->rnum[0] >= tb->item_pos) {
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-		return;
-	}
-
-	/* new item or its part falls to R[0] */
-
-	/* part of new item falls into R[0] */
-	if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
-		loff_t old_key_comp, old_len, r_zeroes_number;
-		const char *r_body;
-		int shift;
-		loff_t offset;
-
-		leaf_shift_right(tb, tb->rnum[0] - 1, -1);
-
-		/* Remember key component and item length */
-		old_key_comp = le_ih_k_offset(ih);
-		old_len = ih_item_len(ih);
-
-		/*
-		 * Calculate key component and item length to insert
-		 * into R[0]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
-		set_le_ih_k_offset(ih, offset);
-		put_ih_item_len(ih, tb->rbytes);
-
-		/* Insert part of the item into R[0] */
-		buffer_info_init_right(tb, &bi);
-		if ((old_len - tb->rbytes) > tb->zeroes_num) {
-			r_zeroes_number = 0;
-			r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
-		} else {
-			r_body = body;
-			r_zeroes_number = tb->zeroes_num -
-					  (old_len - tb->rbytes);
-			tb->zeroes_num -= r_zeroes_number;
-		}
-
-		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
-		/* Replace right delimiting key by first key in R[0] */
-		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		/*
-		 * Calculate key component and item length to
-		 * insert into S[0]
-		 */
-		set_le_ih_k_offset(ih, old_key_comp);
-		put_ih_item_len(ih, old_len - tb->rbytes);
-
-		tb->insert_size[0] -= tb->rbytes;
-
-	} else {
-		/* whole new item falls into R[0] */
-
-		/* Shift rnum[0]-1 items to R[0] */
-		leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
-
-		/* Insert new item into R[0] */
-		buffer_info_init_right(tb, &bi);
-		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
-				     ih, body, tb->zeroes_num);
-
-		if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		tb->zeroes_num = tb->insert_size[0] = 0;
-	}
-}
-
-
-static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	int entry_count;
-
-	RFALSE(tb->zeroes_num,
-	       "PAP-12145: invalid parameter in case of a directory");
-	entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
-
-	/* new directory entry falls into R[0] */
-	if (entry_count - tb->rbytes < tb->pos_in_item) {
-		int paste_entry_position;
-
-		RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
-		       "PAP-12150: no enough of entries to shift to R[0]: "
-		       "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
-
-		/*
-		 * Shift rnum[0]-1 items in whole.
-		 * Shift rbytes-1 directory entries from directory
-		 * item number rnum[0]
-		 */
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
-
-		/* Paste given directory entry to directory item */
-		paste_entry_position = tb->pos_in_item - entry_count +
-				       tb->rbytes - 1;
-		buffer_info_init_right(tb, &bi);
-		leaf_paste_in_buffer(&bi, 0, paste_entry_position,
-				     tb->insert_size[0], body, tb->zeroes_num);
-
-		/* paste entry */
-		leaf_paste_entries(&bi, 0, paste_entry_position, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		/* change delimiting keys */
-		if (paste_entry_position == 0)
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		tb->insert_size[0] = 0;
-		tb->pos_in_item++;
-	} else {
-		/* new directory entry doesn't fall into R[0] */
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-	}
-}
-
-static void balance_leaf_paste_right_shift(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n_shift, n_rem, r_zeroes_number, version;
-	unsigned long temp_rem;
-	const char *r_body;
-	struct buffer_info bi;
-
-	/* we append to directory item */
-	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
-		balance_leaf_paste_right_shift_dirent(tb, ih, body);
-		return;
-	}
-
-	/* regular object */
-
-	/*
-	 * Calculate number of bytes which must be shifted
-	 * from appended item
-	 */
-	n_shift = tb->rbytes - tb->insert_size[0];
-	if (n_shift < 0)
-		n_shift = 0;
-
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
-	       "PAP-12155: invalid position to paste. ih_item_len=%d, "
-	       "pos_in_item=%d", tb->pos_in_item,
-	       ih_item_len(item_head(tbS0, tb->item_pos)));
-
-	leaf_shift_right(tb, tb->rnum[0], n_shift);
-
-	/*
-	 * Calculate number of bytes which must remain in body
-	 * after appending to R[0]
-	 */
-	n_rem = tb->insert_size[0] - tb->rbytes;
-	if (n_rem < 0)
-		n_rem = 0;
-
-	temp_rem = n_rem;
-
-	version = ih_version(item_head(tb->R[0], 0));
-
-	if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
-		int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		temp_rem = n_rem << shift;
-	}
-
-	add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
-	add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
-			    temp_rem);
-
-	do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
-
-	/* Append part of body into R[0] */
-	buffer_info_init_right(tb, &bi);
-	if (n_rem > tb->zeroes_num) {
-		r_zeroes_number = 0;
-		r_body = body + n_rem - tb->zeroes_num;
-	} else {
-		r_body = body;
-		r_zeroes_number = tb->zeroes_num - n_rem;
-		tb->zeroes_num -= r_zeroes_number;
-	}
-
-	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
-			     r_body, r_zeroes_number);
-
-	if (is_indirect_le_ih(item_head(tb->R[0], 0)))
-		set_ih_free_space(item_head(tb->R[0], 0), 0);
-
-	tb->insert_size[0] = n_rem;
-	if (!n_rem)
-		tb->pos_in_item++;
-}
-
-static void balance_leaf_paste_right_whole(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct item_head *pasted;
-	struct buffer_info bi;
-
-	buffer_info_init_right(tb, &bi);
-	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
-	/* append item in R[0] */
-	if (tb->pos_in_item >= 0) {
-		buffer_info_init_right(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
-				     tb->pos_in_item, tb->insert_size[0], body,
-				     tb->zeroes_num);
-	}
-
-	/* paste new entry, if item is directory item */
-	pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
-	if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
-		leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		if (!tb->pos_in_item) {
-
-			RFALSE(tb->item_pos - n + tb->rnum[0],
-			       "PAP-12165: directory item must be first "
-			       "item of node when pasting is in 0th position");
-
-			/* update delimiting keys */
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-		}
-	}
-
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-	tb->zeroes_num = tb->insert_size[0] = 0;
-}
-
-static void balance_leaf_paste_right(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* new item doesn't fall into R[0] */
-	if (n - tb->rnum[0] > tb->item_pos) {
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-		return;
-	}
-
-	/* pasted item or part of it falls to R[0] */
-
-	if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
-		/* we must shift the part of the appended item */
-		balance_leaf_paste_right_shift(tb, ih, body);
-	else
-		/* pasted item in whole falls into R[0] */
-		balance_leaf_paste_right_whole(tb, ih, body);
-}
-
-/* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb,
-			       struct item_head * const ih,
-			       const char * const body, int flag)
-{
-	if (tb->rnum[0] <= 0)
-		return;
-
-	BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-	if (flag == M_INSERT)
-		balance_leaf_insert_right(tb, ih, body);
-	else /* M_PASTE */
-		balance_leaf_paste_right(tb, ih, body);
-}
-
-static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
-					  struct item_head * const ih,
-					  const char * const body,
-					  struct item_head *insert_key,
-					  struct buffer_head **insert_ptr,
-					  int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct buffer_info bi;
-	int shift;
-
-	/* new item or it part don't falls into S_new[i] */
-	if (n - tb->snum[i] >= tb->item_pos) {
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
-		return;
-	}
-
-	/* new item or it's part falls to first new node S_new[i] */
-
-	/* part of new item falls into S_new[i] */
-	if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
-		int old_key_comp, old_len, r_zeroes_number;
-		const char *r_body;
-
-		/* Move snum[i]-1 items from S[0] to S_new[i] */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
-				tb->S_new[i]);
-
-		/* Remember key component and item length */
-		old_key_comp = le_ih_k_offset(ih);
-		old_len = ih_item_len(ih);
-
-		/*
-		 * Calculate key component and item length to insert
-		 * into S_new[i]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		set_le_ih_k_offset(ih,
-				   le_ih_k_offset(ih) +
-				   ((old_len - tb->sbytes[i]) << shift));
-
-		put_ih_item_len(ih, tb->sbytes[i]);
-
-		/* Insert part of the item into S_new[i] before 0-th item */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-
-		if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
-			r_zeroes_number = 0;
-			r_body = body + (old_len - tb->sbytes[i]) -
-					 tb->zeroes_num;
-		} else {
-			r_body = body;
-			r_zeroes_number = tb->zeroes_num - (old_len -
-					  tb->sbytes[i]);
-			tb->zeroes_num -= r_zeroes_number;
-		}
-
-		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
-		/*
-		 * Calculate key component and item length to
-		 * insert into S[i]
-		 */
-		set_le_ih_k_offset(ih, old_key_comp);
-		put_ih_item_len(ih, old_len - tb->sbytes[i]);
-		tb->insert_size[0] -= tb->sbytes[i];
-	} else {
-		/* whole new item falls into S_new[i] */
-
-		/*
-		 * Shift snum[0] - 1 items to S_new[i]
-		 * (sbytes[i] of split item)
-		 */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
-
-		/* Insert new item into S_new[i] */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
-				     ih, body, tb->zeroes_num);
-
-		tb->zeroes_num = tb->insert_size[0] = 0;
-	}
-}
-
-/* we append to directory item */
-static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
-	int entry_count = ih_entry_count(aux_ih);
-	struct buffer_info bi;
-
-	if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
-	    tb->pos_in_item <= entry_count) {
-		/* new directory entry falls into S_new[i] */
-
-		RFALSE(!tb->insert_size[0],
-		       "PAP-12215: insert_size is already 0");
-		RFALSE(tb->sbytes[i] - 1 >= entry_count,
-		       "PAP-12220: there are no so much entries (%d), only %d",
-		       tb->sbytes[i] - 1, entry_count);
-
-		/*
-		 * Shift snum[i]-1 items in whole.
-		 * Shift sbytes[i] directory entries
-		 * from directory item number snum[i]
-		 */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				tb->sbytes[i] - 1, tb->S_new[i]);
-
-		/*
-		 * Paste given directory entry to
-		 * directory item
-		 */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-		leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
-				     tb->sbytes[i] - 1, tb->insert_size[0],
-				     body, tb->zeroes_num);
-
-		/* paste new directory entry */
-		leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
-				   tb->sbytes[i] - 1, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		tb->insert_size[0] = 0;
-		tb->pos_in_item++;
-	} else {
-		/* new directory entry doesn't fall into S_new[i] */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				tb->sbytes[i], tb->S_new[i]);
-	}
-
-}
-
-static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
-	int n_shift, n_rem, r_zeroes_number, shift;
-	const char *r_body;
-	struct item_head *tmp;
-	struct buffer_info bi;
-
-	RFALSE(ih, "PAP-12210: ih must be 0");
-
-	if (is_direntry_le_ih(aux_ih)) {
-		balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
-						    insert_ptr, i);
-		return;
-	}
-
-	/* regular object */
-
-
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
-	       tb->insert_size[0] <= 0,
-	       "PAP-12225: item too short or insert_size <= 0");
-
-	/*
-	 * Calculate number of bytes which must be shifted from appended item
-	 */
-	n_shift = tb->sbytes[i] - tb->insert_size[0];
-	if (n_shift < 0)
-		n_shift = 0;
-	leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
-			tb->S_new[i]);
-
-	/*
-	 * Calculate number of bytes which must remain in body after
-	 * append to S_new[i]
-	 */
-	n_rem = tb->insert_size[0] - tb->sbytes[i];
-	if (n_rem < 0)
-		n_rem = 0;
-
-	/* Append part of body into S_new[0] */
-	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-	if (n_rem > tb->zeroes_num) {
-		r_zeroes_number = 0;
-		r_body = body + n_rem - tb->zeroes_num;
-	} else {
-		r_body = body;
-		r_zeroes_number = tb->zeroes_num - n_rem;
-		tb->zeroes_num -= r_zeroes_number;
-	}
-
-	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
-			     r_body, r_zeroes_number);
-
-	tmp = item_head(tb->S_new[i], 0);
-	shift = 0;
-	if (is_indirect_le_ih(tmp)) {
-		set_ih_free_space(tmp, 0);
-		shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-	}
-	add_le_ih_k_offset(tmp, n_rem << shift);
-
-	tb->insert_size[0] = n_rem;
-	if (!n_rem)
-		tb->pos_in_item++;
-}
-
-static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
-					       struct item_head * const ih,
-					       const char * const body,
-					       struct item_head *insert_key,
-					       struct buffer_head **insert_ptr,
-					       int i)
-
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	int leaf_mi;
-	struct item_head *pasted;
-	struct buffer_info bi;
-
-#ifdef CONFIG_REISERFS_CHECK
-	struct item_head *ih_check = item_head(tbS0, tb->item_pos);
-
-	if (!is_direntry_le_ih(ih_check) &&
-	    (tb->pos_in_item != ih_item_len(ih_check) ||
-	    tb->insert_size[0] <= 0))
-		reiserfs_panic(tb->tb_sb,
-			     "PAP-12235",
-			     "pos_in_item must be equal to ih_item_len");
-#endif
-
-	leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				  tb->sbytes[i], tb->S_new[i]);
-
-	RFALSE(leaf_mi,
-	       "PAP-12240: unexpected value returned by leaf_move_items (%d)",
-	       leaf_mi);
-
-	/* paste into item */
-	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-	leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
-			     tb->pos_in_item, tb->insert_size[0],
-			     body, tb->zeroes_num);
-
-	pasted = item_head(tb->S_new[i], tb->item_pos - n +
-			   tb->snum[i]);
-	if (is_direntry_le_ih(pasted))
-		leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-	/* if we paste to indirect item update ih_free_space */
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-
-	tb->zeroes_num = tb->insert_size[0] = 0;
-
-}
-static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* pasted item doesn't fall into S_new[i] */
-	if (n - tb->snum[i] > tb->item_pos) {
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
-		return;
-	}
-
-	/* pasted item or part if it falls to S_new[i] */
-
-	if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
-		/* we must shift part of the appended item */
-		balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
-						   insert_ptr, i);
-	else
-		/* item falls wholly into S_new[i] */
-		balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
-						   insert_ptr, i);
-}
-
-/* Fill new nodes that appear in place of S[0] */
-static void balance_leaf_new_nodes(struct tree_balance *tb,
-				   struct item_head * const ih,
-				   const char * const body,
-				   struct item_head *insert_key,
-				   struct buffer_head **insert_ptr,
-				   int flag)
-{
-	int i;
-	for (i = tb->blknum[0] - 2; i >= 0; i--) {
-		BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-		RFALSE(!tb->snum[i],
-		       "PAP-12200: snum[%d] == %d. Must be > 0", i,
-		       tb->snum[i]);
-
-		/* here we shift from S to S_new nodes */
-
-		tb->S_new[i] = get_FEB(tb);
-
-		/* initialized block type and tree level */
-		set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
-
-		if (flag == M_INSERT)
-			balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
-						      insert_ptr, i);
-		else /* M_PASTE */
-			balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
-						     insert_ptr, i);
-
-		memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
-		insert_ptr[i] = tb->S_new[i];
-
-		RFALSE(!buffer_journaled(tb->S_new[i])
-		       || buffer_journal_dirty(tb->S_new[i])
-		       || buffer_dirty(tb->S_new[i]),
-		       "PAP-12247: S_new[%d] : (%b)",
-		       i, tb->S_new[i]);
-	}
-}
-
-static void balance_leaf_finish_node_insert(struct tree_balance *tb,
-					    struct item_head * const ih,
-					    const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	buffer_info_init_tbS0(tb, &bi);
-	leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
-
-	/* If we insert the first key change the delimiting key */
-	if (tb->item_pos == 0) {
-		if (tb->CFL[0])	/* can be 0 in reiserfsck */
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-
-	}
-}
-
-static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
-						  struct item_head * const ih,
-						  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *pasted = item_head(tbS0, tb->item_pos);
-	struct buffer_info bi;
-
-	if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
-		RFALSE(!tb->insert_size[0],
-		       "PAP-12260: insert_size is 0 already");
-
-		/* prepare space */
-		buffer_info_init_tbS0(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
-				     tb->insert_size[0], body, tb->zeroes_num);
-
-		/* paste entry */
-		leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		if (!tb->item_pos && !tb->pos_in_item) {
-			RFALSE(!tb->CFL[0] || !tb->L[0],
-			       "PAP-12270: CFL[0]/L[0] must  be specified");
-			if (tb->CFL[0])
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    tbS0, 0);
-		}
-
-		tb->insert_size[0] = 0;
-	}
-}
-
-static void balance_leaf_finish_node_paste(struct tree_balance *tb,
-					   struct item_head * const ih,
-					   const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	struct item_head *pasted = item_head(tbS0, tb->item_pos);
-
-	/* when directory, may be new entry already pasted */
-	if (is_direntry_le_ih(pasted)) {
-		balance_leaf_finish_node_paste_dirent(tb, ih, body);
-		return;
-	}
-
-	/* regular object */
-
-	if (tb->pos_in_item == ih_item_len(pasted)) {
-		RFALSE(tb->insert_size[0] <= 0,
-		       "PAP-12275: insert size must not be %d",
-		       tb->insert_size[0]);
-		buffer_info_init_tbS0(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos,
-				     tb->pos_in_item, tb->insert_size[0], body,
-				     tb->zeroes_num);
-
-		if (is_indirect_le_ih(pasted))
-			set_ih_free_space(pasted, 0);
-
-		tb->insert_size[0] = 0;
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	else if (tb->insert_size[0]) {
-		print_cur_tb("12285");
-		reiserfs_panic(tb->tb_sb, "PAP-12285",
-		    "insert_size must be 0 (%d)", tb->insert_size[0]);
-	}
-#endif
-}
-
-/*
- * if the affected item was not wholly shifted then we
- * perform all necessary operations on that part or whole
- * of the affected item which remains in S
- */
-static void balance_leaf_finish_node(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body, int flag)
-{
-	/* if we must insert or append into buffer S[0] */
-	if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
-		if (flag == M_INSERT)
-			balance_leaf_finish_node_insert(tb, ih, body);
-		else /* M_PASTE */
-			balance_leaf_finish_node_paste(tb, ih, body);
-	}
-}
-
-/**
- * balance_leaf - reiserfs tree balancing algorithm
- * @tb: tree balance state
- * @ih: item header of inserted item (little endian)
- * @body: body of inserted item or bytes to paste
- * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
- * passed back:
- * @insert_key: key to insert new nodes
- * @insert_ptr: array of nodes to insert at the next level
- *
- * In our processing of one level we sometimes determine what must be
- * inserted into the next higher level.  This insertion consists of a
- * key or two keys and their corresponding pointers.
- */
-static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
-			const char *body, int flag,
-			struct item_head *insert_key,
-			struct buffer_head **insert_ptr)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
-	PROC_INFO_INC(tb->tb_sb, balance_at[0]);
-
-	/* Make balance in case insert_size[0] < 0 */
-	if (tb->insert_size[0] < 0)
-		return balance_leaf_when_delete(tb, flag);
-
-	tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
-	tb->pos_in_item = tb->tb_path->pos_in_item,
-	tb->zeroes_num = 0;
-	if (flag == M_INSERT && !body)
-		tb->zeroes_num = ih_item_len(ih);
-
-	/*
-	 * for indirect item pos_in_item is measured in unformatted node
-	 * pointers. Recalculate to bytes
-	 */
-	if (flag != M_INSERT
-	    && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
-		tb->pos_in_item *= UNFM_P_SIZE;
-
-	body += balance_leaf_left(tb, ih, body, flag);
-
-	/* tb->lnum[0] > 0 */
-	/* Calculate new item position */
-	tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
-
-	balance_leaf_right(tb, ih, body, flag);
-
-	/* tb->rnum[0] > 0 */
-	RFALSE(tb->blknum[0] > 3,
-	       "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
-	RFALSE(tb->blknum[0] < 0,
-	       "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
-
-	/*
-	 * if while adding to a node we discover that it is possible to split
-	 * it in two, and merge the left part into the left neighbor and the
-	 * right part into the right neighbor, eliminating the node
-	 */
-	if (tb->blknum[0] == 0) {	/* node S[0] is empty now */
-
-		RFALSE(!tb->lnum[0] || !tb->rnum[0],
-		       "PAP-12190: lnum and rnum must not be zero");
-		/*
-		 * if insertion was done before 0-th position in R[0], right
-		 * delimiting key of the tb->L[0]'s and left delimiting key are
-		 * not set correctly
-		 */
-		if (tb->CFL[0]) {
-			if (!tb->CFR[0])
-				reiserfs_panic(tb->tb_sb, "vs-12195",
-					       "CFR not initialized");
-			copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
-				 internal_key(tb->CFR[0], tb->rkey[0]));
-			do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
-		}
-
-		reiserfs_invalidate_buffer(tb, tbS0);
-		return 0;
-	}
-
-	balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
-
-	balance_leaf_finish_node(tb, ih, body, flag);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (flag == M_PASTE && tb->insert_size[0]) {
-		print_cur_tb("12290");
-		reiserfs_panic(tb->tb_sb,
-			       "PAP-12290", "insert_size is still not 0 (%d)",
-			       tb->insert_size[0]);
-	}
-#endif
-
-	/* Leaf level of the tree is balanced (end of balance_leaf) */
-	return 0;
-}
-
-/* Make empty node */
-void make_empty_node(struct buffer_info *bi)
-{
-	struct block_head *blkh;
-
-	RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
-
-	blkh = B_BLK_HEAD(bi->bi_bh);
-	set_blkh_nr_item(blkh, 0);
-	set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
-
-	if (bi->bi_parent)
-		B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0;	/* Endian safe if 0 */
-}
-
-/* Get first empty buffer */
-struct buffer_head *get_FEB(struct tree_balance *tb)
-{
-	int i;
-	struct buffer_info bi;
-
-	for (i = 0; i < MAX_FEB_SIZE; i++)
-		if (tb->FEB[i] != NULL)
-			break;
-
-	if (i == MAX_FEB_SIZE)
-		reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
-
-	buffer_info_init_bh(tb, &bi, tb->FEB[i]);
-	make_empty_node(&bi);
-	set_buffer_uptodate(tb->FEB[i]);
-	tb->used[i] = tb->FEB[i];
-	tb->FEB[i] = NULL;
-
-	return tb->used[i];
-}
-
-/* This is now used because reiserfs_free_block has to be able to schedule. */
-static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
-{
-	int i;
-
-	if (buffer_dirty(bh))
-		reiserfs_warning(tb->tb_sb, "reiserfs-12320",
-				 "called with dirty buffer");
-	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
-		if (!tb->thrown[i]) {
-			tb->thrown[i] = bh;
-			get_bh(bh);	/* free_thrown puts this */
-			return;
-		}
-	reiserfs_warning(tb->tb_sb, "reiserfs-12321",
-			 "too many thrown buffers");
-}
-
-static void free_thrown(struct tree_balance *tb)
-{
-	int i;
-	b_blocknr_t blocknr;
-	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
-		if (tb->thrown[i]) {
-			blocknr = tb->thrown[i]->b_blocknr;
-			if (buffer_dirty(tb->thrown[i]))
-				reiserfs_warning(tb->tb_sb, "reiserfs-12322",
-						 "called with dirty buffer %d",
-						 blocknr);
-			brelse(tb->thrown[i]);	/* incremented in store_thrown */
-			reiserfs_free_block(tb->transaction_handle, NULL,
-					    blocknr, 0);
-		}
-	}
-}
-
-void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	blkh = B_BLK_HEAD(bh);
-	set_blkh_level(blkh, FREE_LEVEL);
-	set_blkh_nr_item(blkh, 0);
-
-	clear_buffer_dirty(bh);
-	store_thrown(tb, bh);
-}
-
-/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
-void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
-		 struct buffer_head *src, int n_src)
-{
-
-	RFALSE(dest == NULL || src == NULL,
-	       "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
-	       src, dest);
-	RFALSE(!B_IS_KEYS_LEVEL(dest),
-	       "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
-	       dest);
-	RFALSE(n_dest < 0 || n_src < 0,
-	       "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
-	RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
-	       "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
-	       n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
-
-	if (B_IS_ITEMS_LEVEL(src))
-		/* source buffer contains leaf node */
-		memcpy(internal_key(dest, n_dest), item_head(src, n_src),
-		       KEY_SIZE);
-	else
-		memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
-		       KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, dest, 0);
-}
-
-int get_left_neighbor_position(struct tree_balance *tb, int h)
-{
-	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
-	       "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
-	       h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
-
-	if (Sh_position == 0)
-		return B_NR_ITEMS(tb->FL[h]);
-	else
-		return Sh_position - 1;
-}
-
-int get_right_neighbor_position(struct tree_balance *tb, int h)
-{
-	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
-	       "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
-	       h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
-
-	if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
-		return 0;
-	else
-		return Sh_position + 1;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-static void check_internal_node(struct super_block *s, struct buffer_head *bh,
-				char *mes)
-{
-	struct disk_child *dc;
-	int i;
-
-	RFALSE(!bh, "PAP-12336: bh == 0");
-
-	if (!bh || !B_IS_IN_TREE(bh))
-		return;
-
-	RFALSE(!buffer_dirty(bh) &&
-	       !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
-	       "PAP-12337: buffer (%b) must be dirty", bh);
-	dc = B_N_CHILD(bh, 0);
-
-	for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
-		if (!is_reusable(s, dc_block_number(dc), 1)) {
-			print_cur_tb(mes);
-			reiserfs_panic(s, "PAP-12338",
-				       "invalid child pointer %y in %b",
-				       dc, bh);
-		}
-	}
-}
-
-static int locked_or_not_in_tree(struct tree_balance *tb,
-				  struct buffer_head *bh, char *which)
-{
-	if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
-	    !B_IS_IN_TREE(bh)) {
-		reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
-		return 1;
-	}
-	return 0;
-}
-
-static int check_before_balancing(struct tree_balance *tb)
-{
-	int retval = 0;
-
-	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
-		reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
-			       "occurred based on cur_tb not being null at "
-			       "this point in code. do_balance cannot properly "
-			       "handle concurrent tree accesses on a same "
-			       "mount point.");
-	}
-
-	/*
-	 * double check that buffers that we will modify are unlocked.
-	 * (fix_nodes should already have prepped all of these for us).
-	 */
-	if (tb->lnum[0]) {
-		retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
-		retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
-		retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
-		check_leaf(tb->L[0]);
-	}
-	if (tb->rnum[0]) {
-		retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
-		retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
-		retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
-		check_leaf(tb->R[0]);
-	}
-	retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
-					"S[0]");
-	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-
-	return retval;
-}
-
-static void check_after_balance_leaf(struct tree_balance *tb)
-{
-	if (tb->lnum[0]) {
-		if (B_FREE_SPACE(tb->L[0]) !=
-		    MAX_CHILD_SIZE(tb->L[0]) -
-		    dc_size(B_N_CHILD
-			    (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
-			print_cur_tb("12221");
-			reiserfs_panic(tb->tb_sb, "PAP-12355",
-				       "shift to left was incorrect");
-		}
-	}
-	if (tb->rnum[0]) {
-		if (B_FREE_SPACE(tb->R[0]) !=
-		    MAX_CHILD_SIZE(tb->R[0]) -
-		    dc_size(B_N_CHILD
-			    (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
-			print_cur_tb("12222");
-			reiserfs_panic(tb->tb_sb, "PAP-12360",
-				       "shift to right was incorrect");
-		}
-	}
-	if (PATH_H_PBUFFER(tb->tb_path, 1) &&
-	    (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
-	     (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
-	      dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
-				PATH_H_POSITION(tb->tb_path, 1)))))) {
-		int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
-		int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
-			     dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
-					       PATH_H_POSITION(tb->tb_path,
-							       1))));
-		print_cur_tb("12223");
-		reiserfs_warning(tb->tb_sb, "reiserfs-12363",
-				 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
-				 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
-				 left,
-				 MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
-				 PATH_H_PBUFFER(tb->tb_path, 1),
-				 PATH_H_POSITION(tb->tb_path, 1),
-				 dc_size(B_N_CHILD
-					 (PATH_H_PBUFFER(tb->tb_path, 1),
-					  PATH_H_POSITION(tb->tb_path, 1))),
-				 right);
-		reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
-	}
-}
-
-static void check_leaf_level(struct tree_balance *tb)
-{
-	check_leaf(tb->L[0]);
-	check_leaf(tb->R[0]);
-	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-}
-
-static void check_internal_levels(struct tree_balance *tb)
-{
-	int h;
-
-	/* check all internal nodes */
-	for (h = 1; tb->insert_size[h]; h++) {
-		check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
-				    "BAD BUFFER ON PATH");
-		if (tb->lnum[h])
-			check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
-		if (tb->rnum[h])
-			check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
-	}
-
-}
-
-#endif
-
-/*
- * Now we have all of the buffers that must be used in balancing of
- * the tree.  We rely on the assumption that schedule() will not occur
- * while do_balance works. ( Only interrupt handlers are acceptable.)
- * We balance the tree according to the analysis made before this,
- * using buffers already obtained.  For SMP support it will someday be
- * necessary to add ordered locking of tb.
- */
-
-/*
- * Some interesting rules of balancing:
- * we delete a maximum of two nodes per level per balancing: we never
- * delete R, when we delete two of three nodes L, S, R then we move
- * them into R.
- *
- * we only delete L if we are deleting two nodes, if we delete only
- * one node we delete S
- *
- * if we shift leaves then we shift as much as we can: this is a
- * deliberate policy of extremism in node packing which results in
- * higher average utilization after repeated random balance operations
- * at the cost of more memory copies and more balancing as a result of
- * small insertions to full nodes.
- *
- * if we shift internal nodes we try to evenly balance the node
- * utilization, with consequent less balancing at the cost of lower
- * utilization.
- *
- * one could argue that the policy for directories in leaves should be
- * that of internal nodes, but we will wait until another day to
- * evaluate this....  It would be nice to someday measure and prove
- * these assumptions as to what is optimal....
- */
-
-static inline void do_balance_starts(struct tree_balance *tb)
-{
-	/* use print_cur_tb() to see initial state of struct tree_balance */
-
-	/* store_print_tb (tb); */
-
-	/* do not delete, just comment it out */
-	/*
-	print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
-		 tb->tb_path->pos_in_item, tb, "check");
-	*/
-	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
-#ifdef CONFIG_REISERFS_CHECK
-	REISERFS_SB(tb->tb_sb)->cur_tb = tb;
-#endif
-}
-
-static inline void do_balance_completed(struct tree_balance *tb)
-{
-
-#ifdef CONFIG_REISERFS_CHECK
-	check_leaf_level(tb);
-	check_internal_levels(tb);
-	REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
-#endif
-
-	/*
-	 * reiserfs_free_block is no longer schedule safe.  So, we need to
-	 * put the buffers we want freed on the thrown list during do_balance,
-	 * and then free them now
-	 */
-
-	REISERFS_SB(tb->tb_sb)->s_do_balance++;
-
-	/* release all nodes hold to perform the balancing */
-	unfix_nodes(tb);
-
-	free_thrown(tb);
-}
-
-/*
- * do_balance - balance the tree
- *
- * @tb: tree_balance structure
- * @ih: item header of inserted item
- * @body: body of inserted item or bytes to paste
- * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
- *
- * Cut means delete part of an item (includes removing an entry from a
- * directory).
- *
- * Delete means delete whole item.
- *
- * Insert means add a new item into the tree.
- *
- * Paste means to append to the end of an existing file or to
- * insert a directory entry.
- */
-void do_balance(struct tree_balance *tb, struct item_head *ih,
-		const char *body, int flag)
-{
-	int child_pos;		/* position of a child node in its parent */
-	int h;			/* level of the tree being processed */
-
-	/*
-	 * in our processing of one level we sometimes determine what
-	 * must be inserted into the next higher level.  This insertion
-	 * consists of a key or two keys and their corresponding
-	 * pointers
-	 */
-	struct item_head insert_key[2];
-
-	/* inserted node-ptrs for the next level */
-	struct buffer_head *insert_ptr[2];
-
-	tb->tb_mode = flag;
-	tb->need_balance_dirty = 0;
-
-	if (FILESYSTEM_CHANGED_TB(tb)) {
-		reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
-			       "changed");
-	}
-	/* if we have no real work to do  */
-	if (!tb->insert_size[0]) {
-		reiserfs_warning(tb->tb_sb, "PAP-12350",
-				 "insert_size == 0, mode == %c", flag);
-		unfix_nodes(tb);
-		return;
-	}
-
-	atomic_inc(&fs_generation(tb->tb_sb));
-	do_balance_starts(tb);
-
-	/*
-	 * balance_leaf returns 0 except if combining L R and S into
-	 * one node.  see balance_internal() for explanation of this
-	 * line of code.
-	 */
-	child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
-	    balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
-
-#ifdef CONFIG_REISERFS_CHECK
-	check_after_balance_leaf(tb);
-#endif
-
-	/* Balance internal level of the tree. */
-	for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
-		child_pos = balance_internal(tb, h, child_pos, insert_key,
-					     insert_ptr);
-
-	do_balance_completed(tb);
-}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
deleted file mode 100644
index 8eb3ad3e8ae9..000000000000
--- a/fs/reiserfs/file.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/*
- * We pack the tails of files on file close, not at the time they are written.
- * This implies an unnecessary copy of the tail and an unnecessary indirect item
- * insertion/balancing, for files that are written in one write.
- * It avoids unnecessary tail packings (balances) for files that are written in
- * multiple writes and are small enough to have tails.
- *
- * file_release is called by the VFS layer when the file is closed.  If
- * this is the last open file descriptor, and the file
- * small enough to have a tail, and the tail is currently in an
- * unformatted node, the tail is converted back into a direct item.
- *
- * We use reiserfs_truncate_file to pack the tail, since it already has
- * all the conditions coded.
- */
-static int reiserfs_file_release(struct inode *inode, struct file *filp)
-{
-
-	struct reiserfs_transaction_handle th;
-	int err;
-	int jbegin_failure = 0;
-
-	BUG_ON(!S_ISREG(inode->i_mode));
-
-	if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
-				       &REISERFS_I(inode)->tailpack))
-		return 0;
-
-	/* fast out for when nothing needs to be done */
-	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
-	     !tail_has_to_be_packed(inode)) &&
-	    REISERFS_I(inode)->i_prealloc_count <= 0) {
-		mutex_unlock(&REISERFS_I(inode)->tailpack);
-		return 0;
-	}
-
-	reiserfs_write_lock(inode->i_sb);
-	/*
-	 * freeing preallocation only involves relogging blocks that
-	 * are already in the current transaction.  preallocation gets
-	 * freed at the end of each transaction, so it is impossible for
-	 * us to log any additional blocks (including quota blocks)
-	 */
-	err = journal_begin(&th, inode->i_sb, 1);
-	if (err) {
-		/*
-		 * uh oh, we can't allow the inode to go away while there
-		 * is still preallocation blocks pending.  Try to join the
-		 * aborted transaction
-		 */
-		jbegin_failure = err;
-		err = journal_join_abort(&th, inode->i_sb);
-
-		if (err) {
-			/*
-			 * hmpf, our choices here aren't good.  We can pin
-			 * the inode which will disallow unmount from ever
-			 * happening, we can do nothing, which will corrupt
-			 * random memory on unmount, or we can forcibly
-			 * remove the file from the preallocation list, which
-			 * will leak blocks on disk.  Lets pin the inode
-			 * and let the admin know what is going on.
-			 */
-			igrab(inode);
-			reiserfs_warning(inode->i_sb, "clm-9001",
-					 "pinning inode %lu because the "
-					 "preallocation can't be freed",
-					 inode->i_ino);
-			goto out;
-		}
-	}
-	reiserfs_update_inode_transaction(inode);
-
-#ifdef REISERFS_PREALLOCATE
-	reiserfs_discard_prealloc(&th, inode);
-#endif
-	err = journal_end(&th);
-
-	/* copy back the error code from journal_begin */
-	if (!err)
-		err = jbegin_failure;
-
-	if (!err &&
-	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
-	    tail_has_to_be_packed(inode)) {
-
-		/*
-		 * if regular file is released by last holder and it has been
-		 * appended (we append by unformatted node only) or its direct
-		 * item(s) had to be converted, then it may have to be
-		 * indirect2direct converted
-		 */
-		err = reiserfs_truncate_file(inode, 0);
-	}
-out:
-	reiserfs_write_unlock(inode->i_sb);
-	mutex_unlock(&REISERFS_I(inode)->tailpack);
-	return err;
-}
-
-static int reiserfs_file_open(struct inode *inode, struct file *file)
-{
-	int err = dquot_file_open(inode, file);
-
-	/* somebody might be tailpacking on final close; wait for it */
-        if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
-		mutex_lock(&REISERFS_I(inode)->tailpack);
-		atomic_inc(&REISERFS_I(inode)->openers);
-		mutex_unlock(&REISERFS_I(inode)->tailpack);
-	}
-	return err;
-}
-
-void reiserfs_vfs_truncate_file(struct inode *inode)
-{
-	mutex_lock(&REISERFS_I(inode)->tailpack);
-	reiserfs_truncate_file(inode, 1);
-	mutex_unlock(&REISERFS_I(inode)->tailpack);
-}
-
-/* Sync a reiserfs file. */
-
-/*
- * FIXME: sync_mapping_buffers() never has anything to sync.  Can
- * be removed...
- */
-
-static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
-			      int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	int err;
-	int barrier_done;
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-
-	inode_lock(inode);
-	BUG_ON(!S_ISREG(inode->i_mode));
-	err = sync_mapping_buffers(inode->i_mapping);
-	reiserfs_write_lock(inode->i_sb);
-	barrier_done = reiserfs_commit_for_inode(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev);
-	inode_unlock(inode);
-	if (barrier_done < 0)
-		return barrier_done;
-	return (err < 0) ? -EIO : 0;
-}
-
-/* taken fs/buffer.c:__block_commit_write */
-int reiserfs_commit_page(struct inode *inode, struct page *page,
-			 unsigned from, unsigned to)
-{
-	unsigned block_start, block_end;
-	int partial = 0;
-	unsigned blocksize;
-	struct buffer_head *bh, *head;
-	unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
-	int new;
-	int logit = reiserfs_file_data_log(inode);
-	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_SIZE / s->s_blocksize;
-	struct reiserfs_transaction_handle th;
-	int ret = 0;
-
-	th.t_trans_id = 0;
-	blocksize = i_blocksize(inode);
-
-	if (logit) {
-		reiserfs_write_lock(s);
-		ret = journal_begin(&th, s, bh_per_page + 1);
-		if (ret)
-			goto drop_write_lock;
-		reiserfs_update_inode_transaction(inode);
-	}
-	for (bh = head = page_buffers(page), block_start = 0;
-	     bh != head || !block_start;
-	     block_start = block_end, bh = bh->b_this_page) {
-
-		new = buffer_new(bh);
-		clear_buffer_new(bh);
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (!buffer_uptodate(bh))
-				partial = 1;
-		} else {
-			set_buffer_uptodate(bh);
-			if (logit) {
-				reiserfs_prepare_for_journal(s, bh, 1);
-				journal_mark_dirty(&th, bh);
-			} else if (!buffer_dirty(bh)) {
-				mark_buffer_dirty(bh);
-				/*
-				 * do data=ordered on any page past the end
-				 * of file and any buffer marked BH_New.
-				 */
-				if (reiserfs_data_ordered(inode->i_sb) &&
-				    (new || page->index >= i_size_index)) {
-					reiserfs_add_ordered_list(inode, bh);
-				}
-			}
-		}
-	}
-	if (logit) {
-		ret = journal_end(&th);
-drop_write_lock:
-		reiserfs_write_unlock(s);
-	}
-	/*
-	 * If this is a partial write which happened to make all buffers
-	 * uptodate then we can optimize away a bogus read_folio() for
-	 * the next read(). Here we 'discover' whether the page went
-	 * uptodate as a result of this (potentially partial) write.
-	 */
-	if (!partial)
-		SetPageUptodate(page);
-	return ret;
-}
-
-const struct file_operations reiserfs_file_operations = {
-	.unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = reiserfs_compat_ioctl,
-#endif
-	.mmap = generic_file_mmap,
-	.open = reiserfs_file_open,
-	.release = reiserfs_file_release,
-	.fsync = reiserfs_sync_file,
-	.read_iter = generic_file_read_iter,
-	.write_iter = generic_file_write_iter,
-	.splice_read = filemap_splice_read,
-	.splice_write = iter_file_splice_write,
-	.llseek = generic_file_llseek,
-};
-
-const struct inode_operations reiserfs_file_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-const struct inode_operations reiserfs_priv_file_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
deleted file mode 100644
index 6c13a8d9a73c..000000000000
--- a/fs/reiserfs/fix_node.c
+++ /dev/null
@@ -1,2822 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * To make any changes in the tree we find a node that contains item
- * to be changed/deleted or position in the node we insert a new item
- * to. We call this node S. To do balancing we need to decide what we
- * will shift to left/right neighbor, or to a new node, where new item
- * will be etc. To make this analysis simpler we build virtual
- * node. Virtual node is an array of items, that will replace items of
- * node S. (For instance if we are going to delete an item, virtual
- * node does not contain it). Virtual node keeps information about
- * item sizes and types, mergeability of first and last items, sizes
- * of all entries in directory item. We use this array of items when
- * calculating what we can shift to neighbors and how many nodes we
- * have to have if we do not any shiftings, if we shift to left/right
- * neighbor or to both.
- */
-
-/*
- * Takes item number in virtual node, returns number of item
- * that it has in source buffer
- */
-static inline int old_item_num(int new_num, int affected_item_num, int mode)
-{
-	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
-		return new_num;
-
-	if (mode == M_INSERT) {
-
-		RFALSE(new_num == 0,
-		       "vs-8005: for INSERT mode and item number of inserted item");
-
-		return new_num - 1;
-	}
-
-	RFALSE(mode != M_DELETE,
-	       "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
-	       mode);
-	/* delete mode */
-	return new_num + 1;
-}
-
-static void create_virtual_node(struct tree_balance *tb, int h)
-{
-	struct item_head *ih;
-	struct virtual_node *vn = tb->tb_vn;
-	int new_num;
-	struct buffer_head *Sh;	/* this comes from tb->S[h] */
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-
-	/* size of changed node */
-	vn->vn_size =
-	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
-
-	/* for internal nodes array if virtual items is not created */
-	if (h) {
-		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* number of items in virtual node  */
-	vn->vn_nr_item =
-	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
-	    ((vn->vn_mode == M_DELETE) ? 1 : 0);
-
-	/* first virtual item */
-	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
-	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
-	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
-
-	/* first item in the node */
-	ih = item_head(Sh, 0);
-
-	/* define the mergeability for 0-th item (if it is not being deleted) */
-	if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
-	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
-		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
-
-	/*
-	 * go through all items that remain in the virtual
-	 * node (except for the new (inserted) one)
-	 */
-	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
-		int j;
-		struct virtual_item *vi = vn->vn_vi + new_num;
-		int is_affected =
-		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);
-
-		if (is_affected && vn->vn_mode == M_INSERT)
-			continue;
-
-		/* get item number in source node */
-		j = old_item_num(new_num, vn->vn_affected_item_num,
-				 vn->vn_mode);
-
-		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
-		vi->vi_ih = ih + j;
-		vi->vi_item = ih_item_body(Sh, ih + j);
-		vi->vi_uarea = vn->vn_free_ptr;
-
-		/*
-		 * FIXME: there is no check that item operation did not
-		 * consume too much memory
-		 */
-		vn->vn_free_ptr +=
-		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
-		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
-			reiserfs_panic(tb->tb_sb, "vs-8030",
-				       "virtual node space consumed");
-
-		if (!is_affected)
-			/* this is not being changed */
-			continue;
-
-		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
-			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
-			/* pointer to data which is going to be pasted */
-			vi->vi_new_data = vn->vn_data;
-		}
-	}
-
-	/* virtual inserted item is not defined yet */
-	if (vn->vn_mode == M_INSERT) {
-		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
-
-		RFALSE(vn->vn_ins_ih == NULL,
-		       "vs-8040: item header of inserted item is not specified");
-		vi->vi_item_len = tb->insert_size[0];
-		vi->vi_ih = vn->vn_ins_ih;
-		vi->vi_item = vn->vn_data;
-		vi->vi_uarea = vn->vn_free_ptr;
-
-		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
-			     tb->insert_size[0]);
-	}
-
-	/*
-	 * set right merge flag we take right delimiting key and
-	 * check whether it is a mergeable item
-	 */
-	if (tb->CFR[0]) {
-		struct reiserfs_key *key;
-
-		key = internal_key(tb->CFR[0], tb->rkey[0]);
-		if (op_is_left_mergeable(key, Sh->b_size)
-		    && (vn->vn_mode != M_DELETE
-			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
-			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
-			    VI_TYPE_RIGHT_MERGEABLE;
-
-#ifdef CONFIG_REISERFS_CHECK
-		if (op_is_left_mergeable(key, Sh->b_size) &&
-		    !(vn->vn_mode != M_DELETE
-		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
-			/*
-			 * we delete last item and it could be merged
-			 * with right neighbor's first item
-			 */
-			if (!
-			    (B_NR_ITEMS(Sh) == 1
-			     && is_direntry_le_ih(item_head(Sh, 0))
-			     && ih_entry_count(item_head(Sh, 0)) == 1)) {
-				/*
-				 * node contains more than 1 item, or item
-				 * is not directory item, or this item
-				 * contains more than 1 entry
-				 */
-				print_block(Sh, 0, -1, -1);
-				reiserfs_panic(tb->tb_sb, "vs-8045",
-					       "rdkey %k, affected item==%d "
-					       "(mode==%c) Must be %c",
-					       key, vn->vn_affected_item_num,
-					       vn->vn_mode, M_DELETE);
-			}
-		}
-#endif
-
-	}
-}
-
-/*
- * Using virtual node check, how many items can be
- * shifted to left neighbor
- */
-static void check_left(struct tree_balance *tb, int h, int cur_free)
-{
-	int i;
-	struct virtual_node *vn = tb->tb_vn;
-	struct virtual_item *vi;
-	int d_size, ih_size;
-
-	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
-
-	/* internal level */
-	if (h > 0) {
-		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* leaf level */
-
-	if (!cur_free || !vn->vn_nr_item) {
-		/* no free space or nothing to move */
-		tb->lnum[h] = 0;
-		tb->lbytes = -1;
-		return;
-	}
-
-	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
-	       "vs-8055: parent does not exist or invalid");
-
-	vi = vn->vn_vi;
-	if ((unsigned int)cur_free >=
-	    (vn->vn_size -
-	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
-		/* all contents of S[0] fits into L[0] */
-
-		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		       "vs-8055: invalid mode or balance condition failed");
-
-		tb->lnum[0] = vn->vn_nr_item;
-		tb->lbytes = -1;
-		return;
-	}
-
-	d_size = 0, ih_size = IH_SIZE;
-
-	/* first item may be merge with last item in left neighbor */
-	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
-		d_size = -((int)IH_SIZE), ih_size = 0;
-
-	tb->lnum[0] = 0;
-	for (i = 0; i < vn->vn_nr_item;
-	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
-		d_size += vi->vi_item_len;
-		if (cur_free >= d_size) {
-			/* the item can be shifted entirely */
-			cur_free -= d_size;
-			tb->lnum[0]++;
-			continue;
-		}
-
-		/* the item cannot be shifted entirely, try to split it */
-		/*
-		 * check whether L[0] can hold ih and at least one byte
-		 * of the item body
-		 */
-
-		/* cannot shift even a part of the current item */
-		if (cur_free <= ih_size) {
-			tb->lbytes = -1;
-			return;
-		}
-		cur_free -= ih_size;
-
-		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
-		if (tb->lbytes != -1)
-			/* count partially shifted item */
-			tb->lnum[0]++;
-
-		break;
-	}
-
-	return;
-}
-
-/*
- * Using virtual node check, how many items can be
- * shifted to right neighbor
- */
-static void check_right(struct tree_balance *tb, int h, int cur_free)
-{
-	int i;
-	struct virtual_node *vn = tb->tb_vn;
-	struct virtual_item *vi;
-	int d_size, ih_size;
-
-	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
-
-	/* internal level */
-	if (h > 0) {
-		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* leaf level */
-
-	if (!cur_free || !vn->vn_nr_item) {
-		/* no free space  */
-		tb->rnum[h] = 0;
-		tb->rbytes = -1;
-		return;
-	}
-
-	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
-	       "vs-8075: parent does not exist or invalid");
-
-	vi = vn->vn_vi + vn->vn_nr_item - 1;
-	if ((unsigned int)cur_free >=
-	    (vn->vn_size -
-	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
-		/* all contents of S[0] fits into R[0] */
-
-		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		       "vs-8080: invalid mode or balance condition failed");
-
-		tb->rnum[h] = vn->vn_nr_item;
-		tb->rbytes = -1;
-		return;
-	}
-
-	d_size = 0, ih_size = IH_SIZE;
-
-	/* last item may be merge with first item in right neighbor */
-	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
-		d_size = -(int)IH_SIZE, ih_size = 0;
-
-	tb->rnum[0] = 0;
-	for (i = vn->vn_nr_item - 1; i >= 0;
-	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
-		d_size += vi->vi_item_len;
-		if (cur_free >= d_size) {
-			/* the item can be shifted entirely */
-			cur_free -= d_size;
-			tb->rnum[0]++;
-			continue;
-		}
-
-		/*
-		 * check whether R[0] can hold ih and at least one
-		 * byte of the item body
-		 */
-
-		/* cannot shift even a part of the current item */
-		if (cur_free <= ih_size) {
-			tb->rbytes = -1;
-			return;
-		}
-
-		/*
-		 * R[0] can hold the header of the item and at least
-		 * one byte of its body
-		 */
-		cur_free -= ih_size;	/* cur_free is still > 0 */
-
-		tb->rbytes = op_check_right(vi, cur_free);
-		if (tb->rbytes != -1)
-			/* count partially shifted item */
-			tb->rnum[0]++;
-
-		break;
-	}
-
-	return;
-}
-
-/*
- * from - number of items, which are shifted to left neighbor entirely
- * to - number of item, which are shifted to right neighbor entirely
- * from_bytes - number of bytes of boundary item (or directory entries)
- *              which are shifted to left neighbor
- * to_bytes - number of bytes of boundary item (or directory entries)
- *            which are shifted to right neighbor
- */
-static int get_num_ver(int mode, struct tree_balance *tb, int h,
-		       int from, int from_bytes,
-		       int to, int to_bytes, short *snum012, int flow)
-{
-	int i;
-	int units;
-	struct virtual_node *vn = tb->tb_vn;
-	int total_node_size, max_node_size, current_item_size;
-	int needed_nodes;
-
-	/* position of item we start filling node from */
-	int start_item;
-
-	/* position of item we finish filling node by */
-	int end_item;
-
-	/*
-	 * number of first bytes (entries for directory) of start_item-th item
-	 * we do not include into node that is being filled
-	 */
-	int start_bytes;
-
-	/*
-	 * number of last bytes (entries for directory) of end_item-th item
-	 * we do node include into node that is being filled
-	 */
-	int end_bytes;
-
-	/*
-	 * these are positions in virtual item of items, that are split
-	 * between S[0] and S1new and S1new and S2new
-	 */
-	int split_item_positions[2];
-
-	split_item_positions[0] = -1;
-	split_item_positions[1] = -1;
-
-	/*
-	 * We only create additional nodes if we are in insert or paste mode
-	 * or we are in replace mode at the internal level. If h is 0 and
-	 * the mode is M_REPLACE then in fix_nodes we change the mode to
-	 * paste or insert before we get here in the code.
-	 */
-	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
-	       "vs-8100: insert_size < 0 in overflow");
-
-	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
-
-	/*
-	 * snum012 [0-2] - number of items, that lay
-	 * to S[0], first new node and second new node
-	 */
-	snum012[3] = -1;	/* s1bytes */
-	snum012[4] = -1;	/* s2bytes */
-
-	/* internal level */
-	if (h > 0) {
-		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
-		if (i == max_node_size)
-			return 1;
-		return (i / max_node_size + 1);
-	}
-
-	/* leaf level */
-	needed_nodes = 1;
-	total_node_size = 0;
-
-	/* start from 'from'-th item */
-	start_item = from;
-	/* skip its first 'start_bytes' units */
-	start_bytes = ((from_bytes != -1) ? from_bytes : 0);
-
-	/* last included item is the 'end_item'-th one */
-	end_item = vn->vn_nr_item - to - 1;
-	/* do not count last 'end_bytes' units of 'end_item'-th item */
-	end_bytes = (to_bytes != -1) ? to_bytes : 0;
-
-	/*
-	 * go through all item beginning from the start_item-th item
-	 * and ending by the end_item-th item. Do not count first
-	 * 'start_bytes' units of 'start_item'-th item and last
-	 * 'end_bytes' of 'end_item'-th item
-	 */
-	for (i = start_item; i <= end_item; i++) {
-		struct virtual_item *vi = vn->vn_vi + i;
-		int skip_from_end = ((i == end_item) ? end_bytes : 0);
-
-		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
-
-		/* get size of current item */
-		current_item_size = vi->vi_item_len;
-
-		/*
-		 * do not take in calculation head part (from_bytes)
-		 * of from-th item
-		 */
-		current_item_size -=
-		    op_part_size(vi, 0 /*from start */ , start_bytes);
-
-		/* do not take in calculation tail part of last item */
-		current_item_size -=
-		    op_part_size(vi, 1 /*from end */ , skip_from_end);
-
-		/* if item fits into current node entierly */
-		if (total_node_size + current_item_size <= max_node_size) {
-			snum012[needed_nodes - 1]++;
-			total_node_size += current_item_size;
-			start_bytes = 0;
-			continue;
-		}
-
-		/*
-		 * virtual item length is longer, than max size of item in
-		 * a node. It is impossible for direct item
-		 */
-		if (current_item_size > max_node_size) {
-			RFALSE(is_direct_le_ih(vi->vi_ih),
-			       "vs-8110: "
-			       "direct item length is %d. It can not be longer than %d",
-			       current_item_size, max_node_size);
-			/* we will try to split it */
-			flow = 1;
-		}
-
-		/* as we do not split items, take new node and continue */
-		if (!flow) {
-			needed_nodes++;
-			i--;
-			total_node_size = 0;
-			continue;
-		}
-
-		/*
-		 * calculate number of item units which fit into node being
-		 * filled
-		 */
-		{
-			int free_space;
-
-			free_space = max_node_size - total_node_size - IH_SIZE;
-			units =
-			    op_check_left(vi, free_space, start_bytes,
-					  skip_from_end);
-			/*
-			 * nothing fits into current node, take new
-			 * node and continue
-			 */
-			if (units == -1) {
-				needed_nodes++, i--, total_node_size = 0;
-				continue;
-			}
-		}
-
-		/* something fits into the current node */
-		start_bytes += units;
-		snum012[needed_nodes - 1 + 3] = units;
-
-		if (needed_nodes > 2)
-			reiserfs_warning(tb->tb_sb, "vs-8111",
-					 "split_item_position is out of range");
-		snum012[needed_nodes - 1]++;
-		split_item_positions[needed_nodes - 1] = i;
-		needed_nodes++;
-		/* continue from the same item with start_bytes != -1 */
-		start_item = i;
-		i--;
-		total_node_size = 0;
-	}
-
-	/*
-	 * sum012[4] (if it is not -1) contains number of units of which
-	 * are to be in S1new, snum012[3] - to be in S0. They are supposed
-	 * to be S1bytes and S2bytes correspondingly, so recalculate
-	 */
-	if (snum012[4] > 0) {
-		int split_item_num;
-		int bytes_to_r, bytes_to_l;
-		int bytes_to_S1new;
-
-		split_item_num = split_item_positions[1];
-		bytes_to_l =
-		    ((from == split_item_num
-		      && from_bytes != -1) ? from_bytes : 0);
-		bytes_to_r =
-		    ((end_item == split_item_num
-		      && end_bytes != -1) ? end_bytes : 0);
-		bytes_to_S1new =
-		    ((split_item_positions[0] ==
-		      split_item_positions[1]) ? snum012[3] : 0);
-
-		/* s2bytes */
-		snum012[4] =
-		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
-		    bytes_to_r - bytes_to_l - bytes_to_S1new;
-
-		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
-		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
-			reiserfs_warning(tb->tb_sb, "vs-8115",
-					 "not directory or indirect item");
-	}
-
-	/* now we know S2bytes, calculate S1bytes */
-	if (snum012[3] > 0) {
-		int split_item_num;
-		int bytes_to_r, bytes_to_l;
-		int bytes_to_S2new;
-
-		split_item_num = split_item_positions[0];
-		bytes_to_l =
-		    ((from == split_item_num
-		      && from_bytes != -1) ? from_bytes : 0);
-		bytes_to_r =
-		    ((end_item == split_item_num
-		      && end_bytes != -1) ? end_bytes : 0);
-		bytes_to_S2new =
-		    ((split_item_positions[0] == split_item_positions[1]
-		      && snum012[4] != -1) ? snum012[4] : 0);
-
-		/* s1bytes */
-		snum012[3] =
-		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
-		    bytes_to_r - bytes_to_l - bytes_to_S2new;
-	}
-
-	return needed_nodes;
-}
-
-
-/*
- * Set parameters for balancing.
- * Performs write of results of analysis of balancing into structure tb,
- * where it will later be used by the functions that actually do the balancing.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	lnum	number of items from S[h] that must be shifted to L[h];
- *	rnum	number of items from S[h] that must be shifted to R[h];
- *	blk_num	number of blocks that S[h] will be splitted into;
- *	s012	number of items that fall into splitted nodes.
- *	lbytes	number of bytes which flow to the left neighbor from the
- *              item that is not shifted entirely
- *	rbytes	number of bytes which flow to the right neighbor from the
- *              item that is not shifted entirely
- *	s1bytes	number of bytes which flow to the first  new node when
- *              S[0] splits (this number is contained in s012 array)
- */
-
-static void set_parameters(struct tree_balance *tb, int h, int lnum,
-			   int rnum, int blk_num, short *s012, int lb, int rb)
-{
-
-	tb->lnum[h] = lnum;
-	tb->rnum[h] = rnum;
-	tb->blknum[h] = blk_num;
-
-	/* only for leaf level */
-	if (h == 0) {
-		if (s012 != NULL) {
-			tb->s0num = *s012++;
-			tb->snum[0] = *s012++;
-			tb->snum[1] = *s012++;
-			tb->sbytes[0] = *s012++;
-			tb->sbytes[1] = *s012;
-		}
-		tb->lbytes = lb;
-		tb->rbytes = rb;
-	}
-	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
-	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
-
-	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
-	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
-}
-
-/*
- * check if node disappears if we shift tb->lnum[0] items to left
- * neighbor and tb->rnum[0] to the right one.
- */
-static int is_leaf_removable(struct tree_balance *tb)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	int to_left, to_right;
-	int size;
-	int remain_items;
-
-	/*
-	 * number of items that will be shifted to left (right) neighbor
-	 * entirely
-	 */
-	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
-	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
-	remain_items = vn->vn_nr_item;
-
-	/* how many items remain in S[0] after shiftings to neighbors */
-	remain_items -= (to_left + to_right);
-
-	/* all content of node can be shifted to neighbors */
-	if (remain_items < 1) {
-		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
-			       NULL, -1, -1);
-		return 1;
-	}
-
-	/* S[0] is not removable */
-	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
-		return 0;
-
-	/* check whether we can divide 1 remaining item between neighbors */
-
-	/* get size of remaining item (in item units) */
-	size = op_unit_num(&vn->vn_vi[to_left]);
-
-	if (tb->lbytes + tb->rbytes >= size) {
-		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
-			       tb->lbytes, -1);
-		return 1;
-	}
-
-	return 0;
-}
-
-/* check whether L, S, R can be joined in one node */
-static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	int ih_size;
-	struct buffer_head *S0;
-
-	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
-
-	ih_size = 0;
-	if (vn->vn_nr_item) {
-		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
-			ih_size += IH_SIZE;
-
-		if (vn->vn_vi[vn->vn_nr_item - 1].
-		    vi_type & VI_TYPE_RIGHT_MERGEABLE)
-			ih_size += IH_SIZE;
-	} else {
-		/* there was only one item and it will be deleted */
-		struct item_head *ih;
-
-		RFALSE(B_NR_ITEMS(S0) != 1,
-		       "vs-8125: item number must be 1: it is %d",
-		       B_NR_ITEMS(S0));
-
-		ih = item_head(S0, 0);
-		if (tb->CFR[0]
-		    && !comp_short_le_keys(&ih->ih_key,
-					   internal_key(tb->CFR[0],
-							  tb->rkey[0])))
-			/*
-			 * Directory must be in correct state here: that is
-			 * somewhere at the left side should exist first
-			 * directory item. But the item being deleted can
-			 * not be that first one because its right neighbor
-			 * is item of the same directory. (But first item
-			 * always gets deleted in last turn). So, neighbors
-			 * of deleted item can be merged, so we can save
-			 * ih_size
-			 */
-			if (is_direntry_le_ih(ih)) {
-				ih_size = IH_SIZE;
-
-				/*
-				 * we might check that left neighbor exists
-				 * and is of the same directory
-				 */
-				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
-				       "vs-8130: first directory item can not be removed until directory is not empty");
-			}
-
-	}
-
-	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
-		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
-		PROC_INFO_INC(tb->tb_sb, leaves_removable);
-		return 1;
-	}
-	return 0;
-
-}
-
-/* when we do not split item, lnum and rnum are numbers of entire items */
-#define SET_PAR_SHIFT_LEFT \
-if (h)\
-{\
-   int to_l;\
-   \
-   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
-	      (MAX_NR_KEY(Sh) + 1 - lpar);\
-	      \
-	      set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
-}\
-else \
-{\
-   if (lset==LEFT_SHIFT_FLOW)\
-     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
-		     tb->lbytes, -1);\
-   else\
-     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
-		     -1, -1);\
-}
-
-#define SET_PAR_SHIFT_RIGHT \
-if (h)\
-{\
-   int to_r;\
-   \
-   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
-   \
-   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
-}\
-else \
-{\
-   if (rset==RIGHT_SHIFT_FLOW)\
-     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
-		  -1, tb->rbytes);\
-   else\
-     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
-		  -1, -1);\
-}
-
-static void free_buffers_in_tb(struct tree_balance *tb)
-{
-	int i;
-
-	pathrelse(tb->tb_path);
-
-	for (i = 0; i < MAX_HEIGHT; i++) {
-		brelse(tb->L[i]);
-		brelse(tb->R[i]);
-		brelse(tb->FL[i]);
-		brelse(tb->FR[i]);
-		brelse(tb->CFL[i]);
-		brelse(tb->CFR[i]);
-
-		tb->L[i] = NULL;
-		tb->R[i] = NULL;
-		tb->FL[i] = NULL;
-		tb->FR[i] = NULL;
-		tb->CFL[i] = NULL;
-		tb->CFR[i] = NULL;
-	}
-}
-
-/*
- * Get new buffers for storing new nodes that are created while balancing.
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
- *	        CARRY_ON - schedule didn't occur while the function worked;
- *	        NO_DISK_SPACE - no disk space.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-static int get_empty_nodes(struct tree_balance *tb, int h)
-{
-	struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
-	int counter, number_of_freeblk;
-	int  amount_needed;	/* number of needed empty blocks */
-	int  retval = CARRY_ON;
-	struct super_block *sb = tb->tb_sb;
-
-	/*
-	 * number_of_freeblk is the number of empty blocks which have been
-	 * acquired for use by the balancing algorithm minus the number of
-	 * empty blocks used in the previous levels of the analysis,
-	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
-	 * occurs after empty blocks are acquired, and the balancing analysis
-	 * is then restarted, amount_needed is the number needed by this
-	 * level (h) of the balancing analysis.
-	 *
-	 * Note that for systems with many processes writing, it would be
-	 * more layout optimal to calculate the total number needed by all
-	 * levels and then to run reiserfs_new_blocks to get all of them at
-	 * once.
-	 */
-
-	/*
-	 * Initiate number_of_freeblk to the amount acquired prior to the
-	 * restart of the analysis or 0 if not restarted, then subtract the
-	 * amount needed by all of the levels of the tree below h.
-	 */
-	/* blknum includes S[h], so we subtract 1 in this calculation */
-	for (counter = 0, number_of_freeblk = tb->cur_blknum;
-	     counter < h; counter++)
-		number_of_freeblk -=
-		    (tb->blknum[counter]) ? (tb->blknum[counter] -
-						   1) : 0;
-
-	/* Allocate missing empty blocks. */
-	/* if Sh == 0  then we are getting a new root */
-	amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
-	/*
-	 * Amount_needed = the amount that we need more than the
-	 * amount that we have.
-	 */
-	if (amount_needed > number_of_freeblk)
-		amount_needed -= number_of_freeblk;
-	else	/* If we have enough already then there is nothing to do. */
-		return CARRY_ON;
-
-	/*
-	 * No need to check quota - is not allocated for blocks used
-	 * for formatted nodes
-	 */
-	if (reiserfs_new_form_blocknrs(tb, blocknrs,
-				       amount_needed) == NO_DISK_SPACE)
-		return NO_DISK_SPACE;
-
-	/* for each blocknumber we just got, get a buffer and stick it on FEB */
-	for (blocknr = blocknrs, counter = 0;
-	     counter < amount_needed; blocknr++, counter++) {
-
-		RFALSE(!*blocknr,
-		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
-
-		new_bh = sb_getblk(sb, *blocknr);
-		RFALSE(buffer_dirty(new_bh) ||
-		       buffer_journaled(new_bh) ||
-		       buffer_journal_dirty(new_bh),
-		       "PAP-8140: journaled or dirty buffer %b for the new block",
-		       new_bh);
-
-		/* Put empty buffers into the array. */
-		RFALSE(tb->FEB[tb->cur_blknum],
-		       "PAP-8141: busy slot for new buffer");
-
-		set_buffer_journal_new(new_bh);
-		tb->FEB[tb->cur_blknum++] = new_bh;
-	}
-
-	if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
-		retval = REPEAT_SEARCH;
-
-	return retval;
-}
-
-/*
- * Get free space of the left neighbor, which is stored in the parent
- * node of the left neighbor.
- */
-static int get_lfree(struct tree_balance *tb, int h)
-{
-	struct buffer_head *l, *f;
-	int order;
-
-	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
-	    (l = tb->FL[h]) == NULL)
-		return 0;
-
-	if (f == l)
-		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
-	else {
-		order = B_NR_ITEMS(l);
-		f = l;
-	}
-
-	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-}
-
-/*
- * Get free space of the right neighbor,
- * which is stored in the parent node of the right neighbor.
- */
-static int get_rfree(struct tree_balance *tb, int h)
-{
-	struct buffer_head *r, *f;
-	int order;
-
-	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
-	    (r = tb->FR[h]) == NULL)
-		return 0;
-
-	if (f == r)
-		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
-	else {
-		order = 0;
-		f = r;
-	}
-
-	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-
-}
-
-/* Check whether left neighbor is in memory. */
-static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
-{
-	struct buffer_head *father, *left;
-	struct super_block *sb = tb->tb_sb;
-	b_blocknr_t left_neighbor_blocknr;
-	int left_neighbor_position;
-
-	/* Father of the left neighbor does not exist. */
-	if (!tb->FL[h])
-		return 0;
-
-	/* Calculate father of the node to be balanced. */
-	father = PATH_H_PBUFFER(tb->tb_path, h + 1);
-
-	RFALSE(!father ||
-	       !B_IS_IN_TREE(father) ||
-	       !B_IS_IN_TREE(tb->FL[h]) ||
-	       !buffer_uptodate(father) ||
-	       !buffer_uptodate(tb->FL[h]),
-	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
-	       father, tb->FL[h]);
-
-	/*
-	 * Get position of the pointer to the left neighbor
-	 * into the left father.
-	 */
-	left_neighbor_position = (father == tb->FL[h]) ?
-	    tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
-	/* Get left neighbor block number. */
-	left_neighbor_blocknr =
-	    B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
-	/* Look for the left neighbor in the cache. */
-	if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
-
-		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
-		       "vs-8170: left neighbor (%b %z) is not in the tree",
-		       left, left);
-		put_bh(left);
-		return 1;
-	}
-
-	return 0;
-}
-
-#define LEFT_PARENTS  'l'
-#define RIGHT_PARENTS 'r'
-
-static void decrement_key(struct cpu_key *key)
-{
-	/* call item specific function for this key */
-	item_ops[cpu_key_k_type(key)]->decrement_key(key);
-}
-
-/*
- * Calculate far left/right parent of the left/right neighbor of the
- * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
- * of the parent F[h].
- * Calculate left/right common parent of the current node and L[h]/R[h].
- * Calculate left/right delimiting key position.
- * Returns:	PATH_INCORRECT    - path in the tree is not correct
- *		SCHEDULE_OCCURRED - schedule occurred while the function worked
- *	        CARRY_ON          - schedule didn't occur while the function
- *				    worked
- */
-static int get_far_parent(struct tree_balance *tb,
-			  int h,
-			  struct buffer_head **pfather,
-			  struct buffer_head **pcom_father, char c_lr_par)
-{
-	struct buffer_head *parent;
-	INITIALIZE_PATH(s_path_to_neighbor_father);
-	struct treepath *path = tb->tb_path;
-	struct cpu_key s_lr_father_key;
-	int counter,
-	    position = INT_MAX,
-	    first_last_position = 0,
-	    path_offset = PATH_H_PATH_OFFSET(path, h);
-
-	/*
-	 * Starting from F[h] go upwards in the tree, and look for the common
-	 * ancestor of F[h], and its neighbor l/r, that should be obtained.
-	 */
-
-	counter = path_offset;
-
-	RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-8180: invalid path length");
-
-	for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
-		/*
-		 * Check whether parent of the current buffer in the path
-		 * is really parent in the tree.
-		 */
-		if (!B_IS_IN_TREE
-		    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
-			return REPEAT_SEARCH;
-
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(path,
-					  counter - 1)) >
-		    B_NR_ITEMS(parent))
-			return REPEAT_SEARCH;
-
-		/*
-		 * Check whether parent at the path really points
-		 * to the child.
-		 */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
-			return REPEAT_SEARCH;
-
-		/*
-		 * Return delimiting key if position in the parent is not
-		 * equal to first/last one.
-		 */
-		if (c_lr_par == RIGHT_PARENTS)
-			first_last_position = B_NR_ITEMS(parent);
-		if (position != first_last_position) {
-			*pcom_father = parent;
-			get_bh(*pcom_father);
-			/*(*pcom_father = parent)->b_count++; */
-			break;
-		}
-	}
-
-	/* if we are in the root of the tree, then there is no common father */
-	if (counter == FIRST_PATH_ELEMENT_OFFSET) {
-		/*
-		 * Check whether first buffer in the path is the
-		 * root of the tree.
-		 */
-		if (PATH_OFFSET_PBUFFER
-		    (tb->tb_path,
-		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-		    SB_ROOT_BLOCK(tb->tb_sb)) {
-			*pfather = *pcom_father = NULL;
-			return CARRY_ON;
-		}
-		return REPEAT_SEARCH;
-	}
-
-	RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
-	       "PAP-8185: (%b %z) level too small",
-	       *pcom_father, *pcom_father);
-
-	/* Check whether the common parent is locked. */
-
-	if (buffer_locked(*pcom_father)) {
-
-		/* Release the write lock while the buffer is busy */
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(*pcom_father);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(*pcom_father);
-			return REPEAT_SEARCH;
-		}
-	}
-
-	/*
-	 * So, we got common parent of the current node and its
-	 * left/right neighbor.  Now we are getting the parent of the
-	 * left/right neighbor.
-	 */
-
-	/* Form key to get parent of the left/right neighbor. */
-	le_key2cpu_key(&s_lr_father_key,
-		       internal_key(*pcom_father,
-				      (c_lr_par ==
-				       LEFT_PARENTS) ? (tb->lkey[h - 1] =
-							position -
-							1) : (tb->rkey[h -
-									   1] =
-							      position)));
-
-	if (c_lr_par == LEFT_PARENTS)
-		decrement_key(&s_lr_father_key);
-
-	if (search_by_key
-	    (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
-	     h + 1) == IO_ERROR)
-		/* path is released */
-		return IO_ERROR;
-
-	if (FILESYSTEM_CHANGED_TB(tb)) {
-		pathrelse(&s_path_to_neighbor_father);
-		brelse(*pcom_father);
-		return REPEAT_SEARCH;
-	}
-
-	*pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
-
-	RFALSE(B_LEVEL(*pfather) != h + 1,
-	       "PAP-8190: (%b %z) level too small", *pfather, *pfather);
-	RFALSE(s_path_to_neighbor_father.path_length <
-	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
-
-	s_path_to_neighbor_father.path_length--;
-	pathrelse(&s_path_to_neighbor_father);
-	return CARRY_ON;
-}
-
-/*
- * Get parents of neighbors of node in the path(S[path_offset]) and
- * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
- * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
- * CFR[path_offset].
- * Calculate numbers of left and right delimiting keys position:
- * lkey[path_offset], rkey[path_offset].
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked
- *	        CARRY_ON - schedule didn't occur while the function worked
- */
-static int get_parents(struct tree_balance *tb, int h)
-{
-	struct treepath *path = tb->tb_path;
-	int position,
-	    ret,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
-	struct buffer_head *curf, *curcf;
-
-	/* Current node is the root of the tree or will be root of the tree */
-	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
-		/*
-		 * The root can not have parents.
-		 * Release nodes which previously were obtained as
-		 * parents of the current node neighbors.
-		 */
-		brelse(tb->FL[h]);
-		brelse(tb->CFL[h]);
-		brelse(tb->FR[h]);
-		brelse(tb->CFR[h]);
-		tb->FL[h]  = NULL;
-		tb->CFL[h] = NULL;
-		tb->FR[h]  = NULL;
-		tb->CFR[h] = NULL;
-		return CARRY_ON;
-	}
-
-	/* Get parent FL[path_offset] of L[path_offset]. */
-	position = PATH_OFFSET_POSITION(path, path_offset - 1);
-	if (position) {
-		/* Current node is not the first child of its parent. */
-		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		get_bh(curf);
-		get_bh(curf);
-		tb->lkey[h] = position - 1;
-	} else {
-		/*
-		 * Calculate current parent of L[path_offset], which is the
-		 * left neighbor of the current node.  Calculate current
-		 * common parent of L[path_offset] and the current node.
-		 * Note that CFL[path_offset] not equal FL[path_offset] and
-		 * CFL[path_offset] not equal F[path_offset].
-		 * Calculate lkey[path_offset].
-		 */
-		if ((ret = get_far_parent(tb, h + 1, &curf,
-						  &curcf,
-						  LEFT_PARENTS)) != CARRY_ON)
-			return ret;
-	}
-
-	brelse(tb->FL[h]);
-	tb->FL[h] = curf;	/* New initialization of FL[h]. */
-	brelse(tb->CFL[h]);
-	tb->CFL[h] = curcf;	/* New initialization of CFL[h]. */
-
-	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
-	       (curcf && !B_IS_IN_TREE(curcf)),
-	       "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
-
-	/* Get parent FR[h] of R[h]. */
-
-	/* Current node is the last child of F[h]. FR[h] != F[h]. */
-	if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
-		/*
-		 * Calculate current parent of R[h], which is the right
-		 * neighbor of F[h].  Calculate current common parent of
-		 * R[h] and current node. Note that CFR[h] not equal
-		 * FR[path_offset] and CFR[h] not equal F[h].
-		 */
-		if ((ret =
-		     get_far_parent(tb, h + 1, &curf, &curcf,
-				    RIGHT_PARENTS)) != CARRY_ON)
-			return ret;
-	} else {
-		/* Current node is not the last child of its parent F[h]. */
-		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		get_bh(curf);
-		get_bh(curf);
-		tb->rkey[h] = position;
-	}
-
-	brelse(tb->FR[h]);
-	/* New initialization of FR[path_offset]. */
-	tb->FR[h] = curf;
-
-	brelse(tb->CFR[h]);
-	/* New initialization of CFR[path_offset]. */
-	tb->CFR[h] = curcf;
-
-	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
-	       (curcf && !B_IS_IN_TREE(curcf)),
-	       "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
-
-	return CARRY_ON;
-}
-
-/*
- * it is possible to remove node as result of shiftings to
- * neighbors even when we insert or paste item.
- */
-static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
-				      struct tree_balance *tb, int h)
-{
-	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	int levbytes = tb->insert_size[h];
-	struct item_head *ih;
-	struct reiserfs_key *r_key = NULL;
-
-	ih = item_head(Sh, 0);
-	if (tb->CFR[h])
-		r_key = internal_key(tb->CFR[h], tb->rkey[h]);
-
-	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
-	    /* shifting may merge items which might save space */
-	    -
-	    ((!h
-	      && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
-	    -
-	    ((!h && r_key
-	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
-	    + ((h) ? KEY_SIZE : 0)) {
-		/* node can not be removed */
-		if (sfree >= levbytes) {
-			/* new item fits into node S[h] without any shifting */
-			if (!h)
-				tb->s0num =
-				    B_NR_ITEMS(Sh) +
-				    ((mode == M_INSERT) ? 1 : 0);
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-		}
-	}
-	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
-	return !NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when increasing its size by
- * Inserting or Pasting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-/* ip means Inserting or Pasting */
-static int ip_check_balance(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	/*
-	 * Number of bytes that must be inserted into (value is negative
-	 * if bytes are deleted) buffer which contains node being balanced.
-	 * The mnemonic is that the attempted change in node space used
-	 * level is levbytes bytes.
-	 */
-	int levbytes;
-	int ret;
-
-	int lfree, sfree, rfree /* free space in L, S and R */ ;
-
-	/*
-	 * nver is short for number of vertixes, and lnver is the number if
-	 * we shift to the left, rnver is the number if we shift to the
-	 * right, and lrnver is the number if we shift in both directions.
-	 * The goal is to minimize first the number of vertixes, and second,
-	 * the number of vertixes whose contents are changed by shifting,
-	 * and third the number of uncached vertixes whose contents are
-	 * changed by shifting and must be read from disk.
-	 */
-	int nver, lnver, rnver, lrnver;
-
-	/*
-	 * used at leaf level only, S0 = S[0] is the node being balanced,
-	 * sInum [ I = 0,1,2 ] is the number of items that will
-	 * remain in node SI after balancing.  S1 and S2 are new
-	 * nodes that might be created.
-	 */
-
-	/*
-	 * we perform 8 calls to get_num_ver().  For each call we
-	 * calculate five parameters.  where 4th parameter is s1bytes
-	 * and 5th - s2bytes
-	 *
-	 * s0num, s1num, s2num for 8 cases
-	 * 0,1 - do not shift and do not shift but bottle
-	 * 2   - shift only whole item to left
-	 * 3   - shift to left and bottle as much as possible
-	 * 4,5 - shift to right (whole items and as much as possible
-	 * 6,7 - shift to both directions (whole items and as much as possible)
-	 */
-	short snum012[40] = { 0, };
-
-	/* Sh is the node whose balance is currently being checked */
-	struct buffer_head *Sh;
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	levbytes = tb->insert_size[h];
-
-	/* Calculate balance parameters for creating new root. */
-	if (!Sh) {
-		if (!h)
-			reiserfs_panic(tb->tb_sb, "vs-8210",
-				       "S[0] can not be 0");
-		switch (ret = get_empty_nodes(tb, h)) {
-		/* no balancing for higher levels needed */
-		case CARRY_ON:
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-
-		case NO_DISK_SPACE:
-		case REPEAT_SEARCH:
-			return ret;
-		default:
-			reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
-				       "return value of get_empty_nodes");
-		}
-	}
-
-	/* get parents of S[h] neighbors. */
-	ret = get_parents(tb, h);
-	if (ret != CARRY_ON)
-		return ret;
-
-	sfree = B_FREE_SPACE(Sh);
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	/* and new item fits into node S[h] without any shifting */
-	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
-	    NO_BALANCING_NEEDED)
-		return NO_BALANCING_NEEDED;
-
-	create_virtual_node(tb, h);
-
-	/*
-	 * determine maximal number of items we can shift to the left
-	 * neighbor (in tb structure) and the maximal number of bytes
-	 * that can flow to the left neighbor from the left most liquid
-	 * item that cannot be shifted from S[0] entirely (returned value)
-	 */
-	check_left(tb, h, lfree);
-
-	/*
-	 * determine maximal number of items we can shift to the right
-	 * neighbor (in tb structure) and the maximal number of bytes
-	 * that can flow to the right neighbor from the right most liquid
-	 * item that cannot be shifted from S[0] entirely (returned value)
-	 */
-	check_right(tb, h, rfree);
-
-	/*
-	 * all contents of internal node S[h] can be moved into its
-	 * neighbors, S[h] will be removed after balancing
-	 */
-	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
-		int to_r;
-
-		/*
-		 * Since we are working on internal nodes, and our internal
-		 * nodes have fixed size entries, then we can balance by the
-		 * number of items rather than the space they consume.  In this
-		 * routine we set the left node equal to the right node,
-		 * allowing a difference of less than or equal to 1 child
-		 * pointer.
-		 */
-		to_r =
-		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
-		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
-						tb->rnum[h]);
-		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
-			       -1, -1);
-		return CARRY_ON;
-	}
-
-	/*
-	 * this checks balance condition, that any two neighboring nodes
-	 * can not fit in one node
-	 */
-	RFALSE(h &&
-	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
-		tb->rnum[h] >= vn->vn_nr_item + 1),
-	       "vs-8220: tree is not balanced on internal level");
-	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
-		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
-	       "vs-8225: tree is not balanced on leaf level");
-
-	/*
-	 * all contents of S[0] can be moved into its neighbors
-	 * S[0] will be removed after balancing.
-	 */
-	if (!h && is_leaf_removable(tb))
-		return CARRY_ON;
-
-	/*
-	 * why do we perform this check here rather than earlier??
-	 * Answer: we can win 1 node in some cases above. Moreover we
-	 * checked it above, when we checked, that S[0] is not removable
-	 * in principle
-	 */
-
-	 /* new item fits into node S[h] without any shifting */
-	if (sfree >= levbytes) {
-		if (!h)
-			tb->s0num = vn->vn_nr_item;
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	{
-		int lpar, rpar, nset, lset, rset, lrset;
-		/* regular overflowing of the node */
-
-		/*
-		 * get_num_ver works in 2 modes (FLOW & NO_FLOW)
-		 * lpar, rpar - number of items we can shift to left/right
-		 *              neighbor (including splitting item)
-		 * nset, lset, rset, lrset - shows, whether flowing items
-		 *                           give better packing
-		 */
-#define FLOW 1
-#define NO_FLOW 0		/* do not any splitting */
-
-		/* we choose one of the following */
-#define NOTHING_SHIFT_NO_FLOW	0
-#define NOTHING_SHIFT_FLOW	5
-#define LEFT_SHIFT_NO_FLOW	10
-#define LEFT_SHIFT_FLOW		15
-#define RIGHT_SHIFT_NO_FLOW	20
-#define RIGHT_SHIFT_FLOW	25
-#define LR_SHIFT_NO_FLOW	30
-#define LR_SHIFT_FLOW		35
-
-		lpar = tb->lnum[h];
-		rpar = tb->rnum[h];
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * nothing is shifted to the neighbors, as well as number of
-		 * items in each part of the split node (s012 numbers),
-		 * and number of bytes (s1bytes) of the shared drop which
-		 * flow to S1 if any
-		 */
-		nset = NOTHING_SHIFT_NO_FLOW;
-		nver = get_num_ver(vn->vn_mode, tb, h,
-				   0, -1, h ? vn->vn_nr_item : 0, -1,
-				   snum012, NO_FLOW);
-
-		if (!h) {
-			int nver1;
-
-			/*
-			 * note, that in this case we try to bottle
-			 * between S[0] and S1 (S1 - the first new node)
-			 */
-			nver1 = get_num_ver(vn->vn_mode, tb, h,
-					    0, -1, 0, -1,
-					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
-			if (nver > nver1)
-				nset = NOTHING_SHIFT_FLOW, nver = nver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * l_shift_num first items and l_shift_bytes of the right
-		 * most liquid item to be shifted are shifted to the left
-		 * neighbor, as well as number of items in each part of the
-		 * splitted node (s012 numbers), and number of bytes
-		 * (s1bytes) of the shared drop which flow to S1 if any
-		 */
-		lset = LEFT_SHIFT_NO_FLOW;
-		lnver = get_num_ver(vn->vn_mode, tb, h,
-				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
-				    -1, h ? vn->vn_nr_item : 0, -1,
-				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int lnver1;
-
-			lnver1 = get_num_ver(vn->vn_mode, tb, h,
-					     lpar -
-					     ((tb->lbytes != -1) ? 1 : 0),
-					     tb->lbytes, 0, -1,
-					     snum012 + LEFT_SHIFT_FLOW, FLOW);
-			if (lnver > lnver1)
-				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * r_shift_num first items and r_shift_bytes of the left most
-		 * liquid item to be shifted are shifted to the right neighbor,
-		 * as well as number of items in each part of the splitted
-		 * node (s012 numbers), and number of bytes (s1bytes) of the
-		 * shared drop which flow to S1 if any
-		 */
-		rset = RIGHT_SHIFT_NO_FLOW;
-		rnver = get_num_ver(vn->vn_mode, tb, h,
-				    0, -1,
-				    h ? (vn->vn_nr_item - rpar) : (rpar -
-								   ((tb->
-								     rbytes !=
-								     -1) ? 1 :
-								    0)), -1,
-				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int rnver1;
-
-			rnver1 = get_num_ver(vn->vn_mode, tb, h,
-					     0, -1,
-					     (rpar -
-					      ((tb->rbytes != -1) ? 1 : 0)),
-					     tb->rbytes,
-					     snum012 + RIGHT_SHIFT_FLOW, FLOW);
-
-			if (rnver > rnver1)
-				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * items are shifted in both directions, as well as number
-		 * of items in each part of the splitted node (s012 numbers),
-		 * and number of bytes (s1bytes) of the shared drop which
-		 * flow to S1 if any
-		 */
-		lrset = LR_SHIFT_NO_FLOW;
-		lrnver = get_num_ver(vn->vn_mode, tb, h,
-				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
-				     -1,
-				     h ? (vn->vn_nr_item - rpar) : (rpar -
-								    ((tb->
-								      rbytes !=
-								      -1) ? 1 :
-								     0)), -1,
-				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int lrnver1;
-
-			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
-					      lpar -
-					      ((tb->lbytes != -1) ? 1 : 0),
-					      tb->lbytes,
-					      (rpar -
-					       ((tb->rbytes != -1) ? 1 : 0)),
-					      tb->rbytes,
-					      snum012 + LR_SHIFT_FLOW, FLOW);
-			if (lrnver > lrnver1)
-				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
-		}
-
-		/*
-		 * Our general shifting strategy is:
-		 * 1) to minimized number of new nodes;
-		 * 2) to minimized number of neighbors involved in shifting;
-		 * 3) to minimized number of disk reads;
-		 */
-
-		/* we can win TWO or ONE nodes by shifting in both directions */
-		if (lrnver < lnver && lrnver < rnver) {
-			RFALSE(h &&
-			       (tb->lnum[h] != 1 ||
-				tb->rnum[h] != 1 ||
-				lrnver != 1 || rnver != 2 || lnver != 2
-				|| h != 1), "vs-8230: bad h");
-			if (lrset == LR_SHIFT_FLOW)
-				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
-					       lrnver, snum012 + lrset,
-					       tb->lbytes, tb->rbytes);
-			else
-				set_parameters(tb, h,
-					       tb->lnum[h] -
-					       ((tb->lbytes == -1) ? 0 : 1),
-					       tb->rnum[h] -
-					       ((tb->rbytes == -1) ? 0 : 1),
-					       lrnver, snum012 + lrset, -1, -1);
-
-			return CARRY_ON;
-		}
-
-		/*
-		 * if shifting doesn't lead to better packing
-		 * then don't shift
-		 */
-		if (nver == lrnver) {
-			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
-				       -1);
-			return CARRY_ON;
-		}
-
-		/*
-		 * now we know that for better packing shifting in only one
-		 * direction either to the left or to the right is required
-		 */
-
-		/*
-		 * if shifting to the left is better than
-		 * shifting to the right
-		 */
-		if (lnver < rnver) {
-			SET_PAR_SHIFT_LEFT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * if shifting to the right is better than
-		 * shifting to the left
-		 */
-		if (lnver > rnver) {
-			SET_PAR_SHIFT_RIGHT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * now shifting in either direction gives the same number
-		 * of nodes and we can make use of the cached neighbors
-		 */
-		if (is_left_neighbor_in_cache(tb, h)) {
-			SET_PAR_SHIFT_LEFT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * shift to the right independently on whether the
-		 * right neighbor in cache or not
-		 */
-		SET_PAR_SHIFT_RIGHT;
-		return CARRY_ON;
-	}
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting for INTERNAL node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- *
- * Note: Items of internal nodes have fixed size, so the balance condition for
- * the internal part of S+tree is as for the B-trees.
- */
-static int dc_check_balance_internal(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-
-	/*
-	 * Sh is the node whose balance is currently being checked,
-	 * and Fh is its father.
-	 */
-	struct buffer_head *Sh, *Fh;
-	int ret;
-	int lfree, rfree /* free space in L and R */ ;
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	Fh = PATH_H_PPARENT(tb->tb_path, h);
-
-	/*
-	 * using tb->insert_size[h], which is negative in this case,
-	 * create_virtual_node calculates:
-	 * new_nr_item = number of items node would have if operation is
-	 * performed without balancing (new_nr_item);
-	 */
-	create_virtual_node(tb, h);
-
-	if (!Fh) {		/* S[h] is the root. */
-		/* no balancing for higher levels needed */
-		if (vn->vn_nr_item > 0) {
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-		}
-		/*
-		 * new_nr_item == 0.
-		 * Current root will be deleted resulting in
-		 * decrementing the tree height.
-		 */
-		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	if ((ret = get_parents(tb, h)) != CARRY_ON)
-		return ret;
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	/* determine maximal number of items we can fit into neighbors */
-	check_left(tb, h, lfree);
-	check_right(tb, h, rfree);
-
-	/*
-	 * Balance condition for the internal node is valid.
-	 * In this case we balance only if it leads to better packing.
-	 */
-	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
-		/*
-		 * Here we join S[h] with one of its neighbors,
-		 * which is impossible with greater values of new_nr_item.
-		 */
-		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
-			/* All contents of S[h] can be moved to L[h]. */
-			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
-				int n;
-				int order_L;
-
-				order_L =
-				    ((n =
-				      PATH_H_B_ITEM_ORDER(tb->tb_path,
-							  h)) ==
-				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
-				    (DC_SIZE + KEY_SIZE);
-				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
-					       -1);
-				return CARRY_ON;
-			}
-
-			/* All contents of S[h] can be moved to R[h]. */
-			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
-				int n;
-				int order_R;
-
-				order_R =
-				    ((n =
-				      PATH_H_B_ITEM_ORDER(tb->tb_path,
-							  h)) ==
-				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
-				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
-				    (DC_SIZE + KEY_SIZE);
-				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
-					       -1);
-				return CARRY_ON;
-			}
-		}
-
-		/*
-		 * All contents of S[h] can be moved to the neighbors
-		 * (L[h] & R[h]).
-		 */
-		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
-			int to_r;
-
-			to_r =
-			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
-			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
-			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
-			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
-				       0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-		/* Balancing does not lead to better packing. */
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	/*
-	 * Current node contain insufficient number of items.
-	 * Balancing is required.
-	 */
-	/* Check whether we can merge S[h] with left neighbor. */
-	if (tb->lnum[h] >= vn->vn_nr_item + 1)
-		if (is_left_neighbor_in_cache(tb, h)
-		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
-			int n;
-			int order_L;
-
-			order_L =
-			    ((n =
-			      PATH_H_B_ITEM_ORDER(tb->tb_path,
-						  h)) ==
-			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
-								      KEY_SIZE);
-			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-	/* Check whether we can merge S[h] with right neighbor. */
-	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
-		int n;
-		int order_R;
-
-		order_R =
-		    ((n =
-		      PATH_H_B_ITEM_ORDER(tb->tb_path,
-					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
-		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
-							      KEY_SIZE);
-		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
-	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
-		int to_r;
-
-		to_r =
-		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
-		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
-						tb->rnum[h]);
-		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
-			       -1, -1);
-		return CARRY_ON;
-	}
-
-	/* For internal nodes try to borrow item from a neighbor */
-	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
-
-	/* Borrow one or two items from caching neighbor */
-	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
-		int from_l;
-
-		from_l =
-		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
-		     1) / 2 - (vn->vn_nr_item + 1);
-		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	set_parameters(tb, h, 0,
-		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
-			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
-	return CARRY_ON;
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Truncating for LEAF node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int dc_check_balance_leaf(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-
-	/*
-	 * Number of bytes that must be deleted from
-	 * (value is negative if bytes are deleted) buffer which
-	 * contains node being balanced.  The mnemonic is that the
-	 * attempted change in node space used level is levbytes bytes.
-	 */
-	int levbytes;
-
-	/* the maximal item size */
-	int maxsize, ret;
-
-	/*
-	 * S0 is the node whose balance is currently being checked,
-	 * and F0 is its father.
-	 */
-	struct buffer_head *S0, *F0;
-	int lfree, rfree /* free space in L and R */ ;
-
-	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
-	F0 = PATH_H_PPARENT(tb->tb_path, 0);
-
-	levbytes = tb->insert_size[h];
-
-	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */
-
-	if (!F0) {		/* S[0] is the root now. */
-
-		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
-		       "vs-8240: attempt to create empty buffer tree");
-
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	if ((ret = get_parents(tb, h)) != CARRY_ON)
-		return ret;
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	create_virtual_node(tb, h);
-
-	/* if 3 leaves can be merge to one, set parameters and return */
-	if (are_leaves_removable(tb, lfree, rfree))
-		return CARRY_ON;
-
-	/*
-	 * determine maximal number of items we can shift to the left/right
-	 * neighbor and the maximal number of bytes that can flow to the
-	 * left/right neighbor from the left/right most liquid item that
-	 * cannot be shifted from S[0] entirely
-	 */
-	check_left(tb, h, lfree);
-	check_right(tb, h, rfree);
-
-	/* check whether we can merge S with left neighbor. */
-	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
-		if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||	/* S can not be merged with R */
-		    !tb->FR[h]) {
-
-			RFALSE(!tb->FL[h],
-			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");
-
-			/* set parameter to merge S[0] with its left neighbor */
-			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-	/* check whether we can merge S[0] with right neighbor. */
-	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
-		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	/*
-	 * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
-	 * Set parameters and return
-	 */
-	if (is_leaf_removable(tb))
-		return CARRY_ON;
-
-	/* Balancing is not required. */
-	tb->s0num = vn->vn_nr_item;
-	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-	return NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	d - delete, c - cut.
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int dc_check_balance(struct tree_balance *tb, int h)
-{
-	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
-	       "vs-8250: S is not initialized");
-
-	if (h)
-		return dc_check_balance_internal(tb, h);
-	else
-		return dc_check_balance_leaf(tb, h);
-}
-
-/*
- * Check whether current node S[h] is balanced.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *
- *	tb	tree_balance structure:
- *
- *              tb is a large structure that must be read about in the header
- *		file at the same time as this procedure if the reader is
- *		to successfully understand this procedure
- *
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste, d - delete, c - cut.
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int check_balance(int mode,
-			 struct tree_balance *tb,
-			 int h,
-			 int inum,
-			 int pos_in_item,
-			 struct item_head *ins_ih, const void *data)
-{
-	struct virtual_node *vn;
-
-	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
-	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
-	vn->vn_mode = mode;
-	vn->vn_affected_item_num = inum;
-	vn->vn_pos_in_item = pos_in_item;
-	vn->vn_ins_ih = ins_ih;
-	vn->vn_data = data;
-
-	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
-	       "vs-8255: ins_ih can not be 0 in insert mode");
-
-	/* Calculate balance parameters when size of node is increasing. */
-	if (tb->insert_size[h] > 0)
-		return ip_check_balance(tb, h);
-
-	/* Calculate balance parameters when  size of node is decreasing. */
-	return dc_check_balance(tb, h);
-}
-
-/* Check whether parent at the path is the really parent of the current node.*/
-static int get_direct_parent(struct tree_balance *tb, int h)
-{
-	struct buffer_head *bh;
-	struct treepath *path = tb->tb_path;
-	int position,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
-
-	/* We are in the root or in the new root. */
-	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
-		       "PAP-8260: invalid offset in the path");
-
-		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
-		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
-			/* Root is not changed. */
-			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
-			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
-			return CARRY_ON;
-		}
-		/* Root is changed and we must recalculate the path. */
-		return REPEAT_SEARCH;
-	}
-
-	/* Parent in the path is not in the tree. */
-	if (!B_IS_IN_TREE
-	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
-		return REPEAT_SEARCH;
-
-	if ((position =
-	     PATH_OFFSET_POSITION(path,
-				  path_offset - 1)) > B_NR_ITEMS(bh))
-		return REPEAT_SEARCH;
-
-	/* Parent in the path is not parent of the current node in the tree. */
-	if (B_N_CHILD_NUM(bh, position) !=
-	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
-		return REPEAT_SEARCH;
-
-	if (buffer_locked(bh)) {
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(bh);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb))
-			return REPEAT_SEARCH;
-	}
-
-	/*
-	 * Parent in the path is unlocked and really parent
-	 * of the current node.
-	 */
-	return CARRY_ON;
-}
-
-/*
- * Using lnum[h] and rnum[h] we should determine what neighbors
- * of S[h] we
- * need in order to balance S[h], and get them if necessary.
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
- *	        CARRY_ON - schedule didn't occur while the function worked;
- */
-static int get_neighbors(struct tree_balance *tb, int h)
-{
-	int child_position,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
-	unsigned long son_number;
-	struct super_block *sb = tb->tb_sb;
-	struct buffer_head *bh;
-	int depth;
-
-	PROC_INFO_INC(sb, get_neighbors[h]);
-
-	if (tb->lnum[h]) {
-		/* We need left neighbor to balance S[h]. */
-		PROC_INFO_INC(sb, need_l_neighbor[h]);
-		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
-		RFALSE(bh == tb->FL[h] &&
-		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
-		       "PAP-8270: invalid position in the parent");
-
-		child_position =
-		    (bh ==
-		     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
-								       FL[h]);
-		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
-		depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		bh = sb_bread(sb, son_number);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (!bh)
-			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(bh);
-			PROC_INFO_INC(sb, get_neighbors_restart[h]);
-			return REPEAT_SEARCH;
-		}
-
-		RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
-		       child_position > B_NR_ITEMS(tb->FL[h]) ||
-		       B_N_CHILD_NUM(tb->FL[h], child_position) !=
-		       bh->b_blocknr, "PAP-8275: invalid parent");
-		RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
-		RFALSE(!h &&
-		       B_FREE_SPACE(bh) !=
-		       MAX_CHILD_SIZE(bh) -
-		       dc_size(B_N_CHILD(tb->FL[0], child_position)),
-		       "PAP-8290: invalid child size of left neighbor");
-
-		brelse(tb->L[h]);
-		tb->L[h] = bh;
-	}
-
-	/* We need right neighbor to balance S[path_offset]. */
-	if (tb->rnum[h]) {
-		PROC_INFO_INC(sb, need_r_neighbor[h]);
-		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
-		RFALSE(bh == tb->FR[h] &&
-		       PATH_OFFSET_POSITION(tb->tb_path,
-					    path_offset) >=
-		       B_NR_ITEMS(bh),
-		       "PAP-8295: invalid position in the parent");
-
-		child_position =
-		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
-		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
-		depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		bh = sb_bread(sb, son_number);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (!bh)
-			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(bh);
-			PROC_INFO_INC(sb, get_neighbors_restart[h]);
-			return REPEAT_SEARCH;
-		}
-		brelse(tb->R[h]);
-		tb->R[h] = bh;
-
-		RFALSE(!h
-		       && B_FREE_SPACE(bh) !=
-		       MAX_CHILD_SIZE(bh) -
-		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
-		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
-		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
-		       dc_size(B_N_CHILD(tb->FR[0], child_position)));
-
-	}
-	return CARRY_ON;
-}
-
-static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
-{
-	int max_num_of_items;
-	int max_num_of_entries;
-	unsigned long blocksize = sb->s_blocksize;
-
-#define MIN_NAME_LEN 1
-
-	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
-	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
-	    (DEH_SIZE + MIN_NAME_LEN);
-
-	return sizeof(struct virtual_node) +
-	    max(max_num_of_items * sizeof(struct virtual_item),
-		sizeof(struct virtual_item) +
-		struct_size_t(struct direntry_uarea, entry_sizes,
-			      max_num_of_entries));
-}
-
-/*
- * maybe we should fail balancing we are going to perform when kmalloc
- * fails several times. But now it will loop until kmalloc gets
- * required memory
- */
-static int get_mem_for_virtual_node(struct tree_balance *tb)
-{
-	int check_fs = 0;
-	int size;
-	char *buf;
-
-	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
-
-	/* we have to allocate more memory for virtual node */
-	if (size > tb->vn_buf_size) {
-		if (tb->vn_buf) {
-			/* free memory allocated before */
-			kfree(tb->vn_buf);
-			/* this is not needed if kfree is atomic */
-			check_fs = 1;
-		}
-
-		/* virtual node requires now more memory */
-		tb->vn_buf_size = size;
-
-		/* get memory for virtual item */
-		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!buf) {
-			/*
-			 * getting memory with GFP_KERNEL priority may involve
-			 * balancing now (due to indirect_to_direct conversion
-			 * on dcache shrinking). So, release path and collected
-			 * resources here
-			 */
-			free_buffers_in_tb(tb);
-			buf = kmalloc(size, GFP_NOFS);
-			if (!buf) {
-				tb->vn_buf_size = 0;
-			}
-			tb->vn_buf = buf;
-			schedule();
-			return REPEAT_SEARCH;
-		}
-
-		tb->vn_buf = buf;
-	}
-
-	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
-		return REPEAT_SEARCH;
-
-	return CARRY_ON;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check(struct super_block *sb,
-				   struct buffer_head *bh,
-				   const char *descr, int level)
-{
-	if (bh) {
-		if (atomic_read(&(bh->b_count)) <= 0)
-
-			reiserfs_panic(sb, "jmacd-1", "negative or zero "
-				       "reference counter for buffer %s[%d] "
-				       "(%b)", descr, level, bh);
-
-		if (!buffer_uptodate(bh))
-			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
-				       "to date %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (!B_IS_IN_TREE(bh))
-			reiserfs_panic(sb, "jmacd-3", "buffer is not "
-				       "in tree %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_bdev != sb->s_bdev)
-			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
-				       "device %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_size != sb->s_blocksize)
-			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
-				       "blocksize %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
-			reiserfs_panic(sb, "jmacd-6", "buffer block "
-				       "number too high %s[%d] (%b)",
-				       descr, level, bh);
-	}
-}
-#else
-static void tb_buffer_sanity_check(struct super_block *sb,
-				   struct buffer_head *bh,
-				   const char *descr, int level)
-{;
-}
-#endif
-
-static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
-{
-	return reiserfs_prepare_for_journal(s, bh, 0);
-}
-
-static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
-{
-	struct buffer_head *locked;
-#ifdef CONFIG_REISERFS_CHECK
-	int repeat_counter = 0;
-#endif
-	int i;
-
-	do {
-
-		locked = NULL;
-
-		for (i = tb->tb_path->path_length;
-		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
-			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
-				/*
-				 * if I understand correctly, we can only
-				 * be sure the last buffer in the path is
-				 * in the tree --clm
-				 */
-#ifdef CONFIG_REISERFS_CHECK
-				if (PATH_PLAST_BUFFER(tb->tb_path) ==
-				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
-					tb_buffer_sanity_check(tb->tb_sb,
-							       PATH_OFFSET_PBUFFER
-							       (tb->tb_path,
-								i), "S",
-							       tb->tb_path->
-							       path_length - i);
-#endif
-				if (!clear_all_dirty_bits(tb->tb_sb,
-							  PATH_OFFSET_PBUFFER
-							  (tb->tb_path,
-							   i))) {
-					locked =
-					    PATH_OFFSET_PBUFFER(tb->tb_path,
-								i);
-				}
-			}
-		}
-
-		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
-		     i++) {
-
-			if (tb->lnum[i]) {
-
-				if (tb->L[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->L[i],
-							       "L", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->L[i]))
-						locked = tb->L[i];
-				}
-
-				if (!locked && tb->FL[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->FL[i],
-							       "FL", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->FL[i]))
-						locked = tb->FL[i];
-				}
-
-				if (!locked && tb->CFL[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->CFL[i],
-							       "CFL", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->CFL[i]))
-						locked = tb->CFL[i];
-				}
-
-			}
-
-			if (!locked && (tb->rnum[i])) {
-
-				if (tb->R[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->R[i],
-							       "R", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->R[i]))
-						locked = tb->R[i];
-				}
-
-				if (!locked && tb->FR[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->FR[i],
-							       "FR", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->FR[i]))
-						locked = tb->FR[i];
-				}
-
-				if (!locked && tb->CFR[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->CFR[i],
-							       "CFR", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->CFR[i]))
-						locked = tb->CFR[i];
-				}
-			}
-		}
-
-		/*
-		 * as far as I can tell, this is not required.  The FEB list
-		 * seems to be full of newly allocated nodes, which will
-		 * never be locked, dirty, or anything else.
-		 * To be safe, I'm putting in the checks and waits in.
-		 * For the moment, they are needed to keep the code in
-		 * journal.c from complaining about the buffer.
-		 * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
-		 */
-		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
-			if (tb->FEB[i]) {
-				if (!clear_all_dirty_bits
-				    (tb->tb_sb, tb->FEB[i]))
-					locked = tb->FEB[i];
-			}
-		}
-
-		if (locked) {
-			int depth;
-#ifdef CONFIG_REISERFS_CHECK
-			repeat_counter++;
-			if ((repeat_counter % 10000) == 0) {
-				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
-						 "too many iterations waiting "
-						 "for buffer to unlock "
-						 "(%b)", locked);
-
-				/* Don't loop forever.  Try to recover from possible error. */
-
-				return (FILESYSTEM_CHANGED_TB(tb)) ?
-				    REPEAT_SEARCH : CARRY_ON;
-			}
-#endif
-			depth = reiserfs_write_unlock_nested(tb->tb_sb);
-			__wait_on_buffer(locked);
-			reiserfs_write_lock_nested(tb->tb_sb, depth);
-			if (FILESYSTEM_CHANGED_TB(tb))
-				return REPEAT_SEARCH;
-		}
-
-	} while (locked);
-
-	return CARRY_ON;
-}
-
-/*
- * Prepare for balancing, that is
- *	get all necessary parents, and neighbors;
- *	analyze what and where should be moved;
- *	get sufficient number of new nodes;
- * Balancing will start only after all resources will be collected at a time.
- *
- * When ported to SMP kernels, only at the last moment after all needed nodes
- * are collected in cache, will the resources be locked using the usual
- * textbook ordered lock acquisition algorithms.  Note that ensuring that
- * this code neither write locks what it does not need to write lock nor locks
- * out of order will be a pain in the butt that could have been avoided.
- * Grumble grumble. -Hans
- *
- * fix is meant in the sense of render unchanging
- *
- * Latency might be improved by first gathering a list of what buffers
- * are needed and then getting as many of them in parallel as possible? -Hans
- *
- * Parameters:
- *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
- *	tb	tree_balance structure;
- *	inum	item number in S[h];
- *      pos_in_item - comment this if you can
- *      ins_ih	item head of item being inserted
- *	data	inserted item or data to be pasted
- * Returns:	1 - schedule occurred while the function worked;
- *	        0 - schedule didn't occur while the function worked;
- *             -1 - if no_disk_space
- */
-
-int fix_nodes(int op_mode, struct tree_balance *tb,
-	      struct item_head *ins_ih, const void *data)
-{
-	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
-	int pos_in_item;
-
-	/*
-	 * we set wait_tb_buffers_run when we have to restore any dirty
-	 * bits cleared during wait_tb_buffers_run
-	 */
-	int wait_tb_buffers_run = 0;
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
-	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
-
-	pos_in_item = tb->tb_path->pos_in_item;
-
-	tb->fs_gen = get_generation(tb->tb_sb);
-
-	/*
-	 * we prepare and log the super here so it will already be in the
-	 * transaction when do_balance needs to change it.
-	 * This way do_balance won't have to schedule when trying to prepare
-	 * the super for logging
-	 */
-	reiserfs_prepare_for_journal(tb->tb_sb,
-				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
-	journal_mark_dirty(tb->transaction_handle,
-			   SB_BUFFER_WITH_SB(tb->tb_sb));
-	if (FILESYSTEM_CHANGED_TB(tb))
-		return REPEAT_SEARCH;
-
-	/* if it possible in indirect_to_direct conversion */
-	if (buffer_locked(tbS0)) {
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(tbS0);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb))
-			return REPEAT_SEARCH;
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
-		print_cur_tb("fix_nodes");
-		reiserfs_panic(tb->tb_sb, "PAP-8305",
-			       "there is pending do_balance");
-	}
-
-	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
-		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
-			       "not uptodate at the beginning of fix_nodes "
-			       "or not in tree (mode %c)",
-			       tbS0, tbS0, op_mode);
-
-	/* Check parameters. */
-	switch (op_mode) {
-	case M_INSERT:
-		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
-			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
-				       "item number %d (in S0 - %d) in case "
-				       "of insert", item_num,
-				       B_NR_ITEMS(tbS0));
-		break;
-	case M_PASTE:
-	case M_DELETE:
-	case M_CUT:
-		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
-			print_block(tbS0, 0, -1, -1);
-			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
-				       "item number(%d); mode = %c "
-				       "insert_size = %d",
-				       item_num, op_mode,
-				       tb->insert_size[0]);
-		}
-		break;
-	default:
-		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
-			       "of operation");
-	}
-#endif
-
-	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
-		/* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
-		return REPEAT_SEARCH;
-
-	/* Starting from the leaf level; for all levels h of the tree. */
-	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
-		ret = get_direct_parent(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		ret = check_balance(op_mode, tb, h, item_num,
-				    pos_in_item, ins_ih, data);
-		if (ret != CARRY_ON) {
-			if (ret == NO_BALANCING_NEEDED) {
-				/* No balancing for higher levels needed. */
-				ret = get_neighbors(tb, h);
-				if (ret != CARRY_ON)
-					goto repeat;
-				if (h != MAX_HEIGHT - 1)
-					tb->insert_size[h + 1] = 0;
-				/*
-				 * ok, analysis and resource gathering
-				 * are complete
-				 */
-				break;
-			}
-			goto repeat;
-		}
-
-		ret = get_neighbors(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		/*
-		 * No disk space, or schedule occurred and analysis may be
-		 * invalid and needs to be redone.
-		 */
-		ret = get_empty_nodes(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		/*
-		 * We have a positive insert size but no nodes exist on this
-		 * level, this means that we are creating a new root.
-		 */
-		if (!PATH_H_PBUFFER(tb->tb_path, h)) {
-
-			RFALSE(tb->blknum[h] != 1,
-			       "PAP-8350: creating new empty root");
-
-			if (h < MAX_HEIGHT - 1)
-				tb->insert_size[h + 1] = 0;
-		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
-			/*
-			 * The tree needs to be grown, so this node S[h]
-			 * which is the root node is split into two nodes,
-			 * and a new node (S[h+1]) will be created to
-			 * become the root node.
-			 */
-			if (tb->blknum[h] > 1) {
-
-				RFALSE(h == MAX_HEIGHT - 1,
-				       "PAP-8355: attempt to create too high of a tree");
-
-				tb->insert_size[h + 1] =
-				    (DC_SIZE +
-				     KEY_SIZE) * (tb->blknum[h] - 1) +
-				    DC_SIZE;
-			} else if (h < MAX_HEIGHT - 1)
-				tb->insert_size[h + 1] = 0;
-		} else
-			tb->insert_size[h + 1] =
-			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
-	}
-
-	ret = wait_tb_buffers_until_unlocked(tb);
-	if (ret == CARRY_ON) {
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			wait_tb_buffers_run = 1;
-			ret = REPEAT_SEARCH;
-			goto repeat;
-		} else {
-			return CARRY_ON;
-		}
-	} else {
-		wait_tb_buffers_run = 1;
-		goto repeat;
-	}
-
-repeat:
-	/*
-	 * fix_nodes was unable to perform its calculation due to
-	 * filesystem got changed under us, lack of free disk space or i/o
-	 * failure. If the first is the case - the search will be
-	 * repeated. For now - free all resources acquired so far except
-	 * for the new allocated nodes
-	 */
-	{
-		int i;
-
-		/* Release path buffers. */
-		if (wait_tb_buffers_run) {
-			pathrelse_and_restore(tb->tb_sb, tb->tb_path);
-		} else {
-			pathrelse(tb->tb_path);
-		}
-		/* brelse all resources collected for balancing */
-		for (i = 0; i < MAX_HEIGHT; i++) {
-			if (wait_tb_buffers_run) {
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->L[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->R[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->FL[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->FR[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->
-								 CFL[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->
-								 CFR[i]);
-			}
-
-			brelse(tb->L[i]);
-			brelse(tb->R[i]);
-			brelse(tb->FL[i]);
-			brelse(tb->FR[i]);
-			brelse(tb->CFL[i]);
-			brelse(tb->CFR[i]);
-
-			tb->L[i] = NULL;
-			tb->R[i] = NULL;
-			tb->FL[i] = NULL;
-			tb->FR[i] = NULL;
-			tb->CFL[i] = NULL;
-			tb->CFR[i] = NULL;
-		}
-
-		if (wait_tb_buffers_run) {
-			for (i = 0; i < MAX_FEB_SIZE; i++) {
-				if (tb->FEB[i])
-					reiserfs_restore_prepared_buffer
-					    (tb->tb_sb, tb->FEB[i]);
-			}
-		}
-		return ret;
-	}
-
-}
-
-void unfix_nodes(struct tree_balance *tb)
-{
-	int i;
-
-	/* Release path buffers. */
-	pathrelse_and_restore(tb->tb_sb, tb->tb_path);
-
-	/* brelse all resources collected for balancing */
-	for (i = 0; i < MAX_HEIGHT; i++) {
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
-
-		brelse(tb->L[i]);
-		brelse(tb->R[i]);
-		brelse(tb->FL[i]);
-		brelse(tb->FR[i]);
-		brelse(tb->CFL[i]);
-		brelse(tb->CFR[i]);
-	}
-
-	/* deal with list of allocated (used and unused) nodes */
-	for (i = 0; i < MAX_FEB_SIZE; i++) {
-		if (tb->FEB[i]) {
-			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
-			/*
-			 * de-allocated block which was not used by
-			 * balancing and bforget about buffer for it
-			 */
-			brelse(tb->FEB[i]);
-			reiserfs_free_block(tb->transaction_handle, NULL,
-					    blocknr, 0);
-		}
-		if (tb->used[i]) {
-			/* release used as new nodes including a new root */
-			brelse(tb->used[i]);
-		}
-	}
-
-	kfree(tb->vn_buf);
-
-}
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
deleted file mode 100644
index 7a26c4fe6c46..000000000000
--- a/fs/reiserfs/hashes.c
+++ /dev/null
@@ -1,177 +0,0 @@
-
-/*
- * Keyed 32-bit hash function using TEA in a Davis-Meyer function
- *   H0 = Key
- *   Hi = E Mi(Hi-1) + Hi-1
- *
- * (see Applied Cryptography, 2nd edition, p448).
- *
- * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
- *
- * Jeremy has agreed to the contents of reiserfs/README. -Hans
- * Yura's function is added (04/07/2000)
- */
-
-#include <linux/kernel.h>
-#include "reiserfs.h"
-#include <asm/types.h>
-
-#define DELTA 0x9E3779B9
-#define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
-#define PARTROUNDS 6		/* 6 gets complete mixing */
-
-/* a, b, c, d - data; h0, h1 - accumulated hash */
-#define TEACORE(rounds)							\
-	do {								\
-		u32 sum = 0;						\
-		int n = rounds;						\
-		u32 b0, b1;						\
-									\
-		b0 = h0;						\
-		b1 = h1;						\
-									\
-		do							\
-		{							\
-			sum += DELTA;					\
-			b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);	\
-			b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);	\
-		} while(--n);						\
-									\
-		h0 += b0;						\
-		h1 += b1;						\
-	} while(0)
-
-u32 keyed_hash(const signed char *msg, int len)
-{
-	u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
-
-	u32 h0 = k[0], h1 = k[1];
-	u32 a, b, c, d;
-	u32 pad;
-	int i;
-
-	/*      assert(len >= 0 && len < 256); */
-
-	pad = (u32) len | ((u32) len << 8);
-	pad |= pad << 16;
-
-	while (len >= 16) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-		c = (u32) msg[8] |
-		    (u32) msg[9] << 8 |
-		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
-		d = (u32) msg[12] |
-		    (u32) msg[13] << 8 |
-		    (u32) msg[14] << 16 | (u32) msg[15] << 24;
-
-		TEACORE(PARTROUNDS);
-
-		len -= 16;
-		msg += 16;
-	}
-
-	if (len >= 12) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-		c = (u32) msg[8] |
-		    (u32) msg[9] << 8 |
-		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
-
-		d = pad;
-		for (i = 12; i < len; i++) {
-			d <<= 8;
-			d |= msg[i];
-		}
-	} else if (len >= 8) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-
-		c = d = pad;
-		for (i = 8; i < len; i++) {
-			c <<= 8;
-			c |= msg[i];
-		}
-	} else if (len >= 4) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-
-		b = c = d = pad;
-		for (i = 4; i < len; i++) {
-			b <<= 8;
-			b |= msg[i];
-		}
-	} else {
-		a = b = c = d = pad;
-		for (i = 0; i < len; i++) {
-			a <<= 8;
-			a |= msg[i];
-		}
-	}
-
-	TEACORE(FULLROUNDS);
-
-/*	return 0;*/
-	return h0 ^ h1;
-}
-
-/*
- * What follows in this file is copyright 2000 by Hans Reiser, and the
- * licensing of what follows is governed by reiserfs/README
- */
-u32 yura_hash(const signed char *msg, int len)
-{
-	int j, pow;
-	u32 a, c;
-	int i;
-
-	for (pow = 1, i = 1; i < len; i++)
-		pow = pow * 10;
-
-	if (len == 1)
-		a = msg[0] - 48;
-	else
-		a = (msg[0] - 48) * pow;
-
-	for (i = 1; i < len; i++) {
-		c = msg[i] - 48;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	for (; i < 40; i++) {
-		c = '0' - 48;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	for (; i < 256; i++) {
-		c = i;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	a = a << 7;
-	return a;
-}
-
-u32 r5_hash(const signed char *msg, int len)
-{
-	u32 a = 0;
-	while (*msg) {
-		a += *msg << 4;
-		a += *msg >> 4;
-		a *= 11;
-		msg++;
-	}
-	return a;
-}
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
deleted file mode 100644
index 5db6f45b3fed..000000000000
--- a/fs/reiserfs/ibalance.c
+++ /dev/null
@@ -1,1161 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/* this is one and only function that is used outside (do_balance.c) */
-int balance_internal(struct tree_balance *,
-		     int, int, struct item_head *, struct buffer_head **);
-
-/*
- * modes of internal_shift_left, internal_shift_right and
- * internal_insert_childs
- */
-#define INTERNAL_SHIFT_FROM_S_TO_L 0
-#define INTERNAL_SHIFT_FROM_R_TO_S 1
-#define INTERNAL_SHIFT_FROM_L_TO_S 2
-#define INTERNAL_SHIFT_FROM_S_TO_R 3
-#define INTERNAL_INSERT_TO_S 4
-#define INTERNAL_INSERT_TO_L 5
-#define INTERNAL_INSERT_TO_R 6
-
-static void internal_define_dest_src_infos(int shift_mode,
-					   struct tree_balance *tb,
-					   int h,
-					   struct buffer_info *dest_bi,
-					   struct buffer_info *src_bi,
-					   int *d_key, struct buffer_head **cf)
-{
-	memset(dest_bi, 0, sizeof(struct buffer_info));
-	memset(src_bi, 0, sizeof(struct buffer_info));
-	/* define dest, src, dest parent, dest position */
-	switch (shift_mode) {
-
-	/* used in internal_shift_left */
-	case INTERNAL_SHIFT_FROM_S_TO_L:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[h];
-		dest_bi->bi_parent = tb->FL[h];
-		dest_bi->bi_position = get_left_neighbor_position(tb, h);
-		*d_key = tb->lkey[h];
-		*cf = tb->CFL[h];
-		break;
-	case INTERNAL_SHIFT_FROM_L_TO_S:
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->L[h];
-		src_bi->bi_parent = tb->FL[h];
-		src_bi->bi_position = get_left_neighbor_position(tb, h);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		/* dest position is analog of dest->b_item_order */
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		*d_key = tb->lkey[h];
-		*cf = tb->CFL[h];
-		break;
-
-	/* used in internal_shift_left */
-	case INTERNAL_SHIFT_FROM_R_TO_S:
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->R[h];
-		src_bi->bi_parent = tb->FR[h];
-		src_bi->bi_position = get_right_neighbor_position(tb, h);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		*d_key = tb->rkey[h];
-		*cf = tb->CFR[h];
-		break;
-
-	case INTERNAL_SHIFT_FROM_S_TO_R:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[h];
-		dest_bi->bi_parent = tb->FR[h];
-		dest_bi->bi_position = get_right_neighbor_position(tb, h);
-		*d_key = tb->rkey[h];
-		*cf = tb->CFR[h];
-		break;
-
-	case INTERNAL_INSERT_TO_L:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[h];
-		dest_bi->bi_parent = tb->FL[h];
-		dest_bi->bi_position = get_left_neighbor_position(tb, h);
-		break;
-
-	case INTERNAL_INSERT_TO_S:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		break;
-
-	case INTERNAL_INSERT_TO_R:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[h];
-		dest_bi->bi_parent = tb->FR[h];
-		dest_bi->bi_position = get_right_neighbor_position(tb, h);
-		break;
-
-	default:
-		reiserfs_panic(tb->tb_sb, "ibalance-1",
-			       "shift type is unknown (%d)",
-			       shift_mode);
-	}
-}
-
-/*
- * Insert count node pointers into buffer cur before position to + 1.
- * Insert count items into buffer cur before position to.
- * Items and node pointers are specified by inserted and bh respectively.
- */
-static void internal_insert_childs(struct buffer_info *cur_bi,
-				   int to, int count,
-				   struct item_head *inserted,
-				   struct buffer_head **bh)
-{
-	struct buffer_head *cur = cur_bi->bi_bh;
-	struct block_head *blkh;
-	int nr;
-	struct reiserfs_key *ih;
-	struct disk_child new_dc[2];
-	struct disk_child *dc;
-	int i;
-
-	if (count <= 0)
-		return;
-
-	blkh = B_BLK_HEAD(cur);
-	nr = blkh_nr_item(blkh);
-
-	RFALSE(count > 2, "too many children (%d) are to be inserted", count);
-	RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
-	       "no enough free space (%d), needed %d bytes",
-	       B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
-
-	/* prepare space for count disk_child */
-	dc = B_N_CHILD(cur, to + 1);
-
-	memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
-
-	/* copy to_be_insert disk children */
-	for (i = 0; i < count; i++) {
-		put_dc_size(&new_dc[i],
-			    MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
-		put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
-	}
-	memcpy(dc, new_dc, DC_SIZE * count);
-
-	/* prepare space for count items  */
-	ih = internal_key(cur, ((to == -1) ? 0 : to));
-
-	memmove(ih + count, ih,
-		(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
-
-	/* copy item headers (keys) */
-	memcpy(ih, inserted, KEY_SIZE);
-	if (count > 1)
-		memcpy(ih + 1, inserted + 1, KEY_SIZE);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) - count * (DC_SIZE +
-							     KEY_SIZE));
-
-	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(cur);
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (cur_bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
-		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
-					       0);
-
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(cur_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-
-}
-
-/*
- * Delete del_num items and node pointers from buffer cur starting from
- * the first_i'th item and first_p'th pointers respectively.
- */
-static void internal_delete_pointers_items(struct buffer_info *cur_bi,
-					   int first_p,
-					   int first_i, int del_num)
-{
-	struct buffer_head *cur = cur_bi->bi_bh;
-	int nr;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-
-	RFALSE(cur == NULL, "buffer is 0");
-	RFALSE(del_num < 0,
-	       "negative number of items (%d) can not be deleted", del_num);
-	RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
-	       || first_i < 0,
-	       "first pointer order (%d) < 0 or "
-	       "no so many pointers (%d), only (%d) or "
-	       "first key order %d < 0", first_p, first_p + del_num,
-	       B_NR_ITEMS(cur) + 1, first_i);
-	if (del_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(cur);
-	nr = blkh_nr_item(blkh);
-
-	if (first_p == 0 && del_num == nr + 1) {
-		RFALSE(first_i != 0,
-		       "1st deleted key must have order 0, not %d", first_i);
-		make_empty_node(cur_bi);
-		return;
-	}
-
-	RFALSE(first_i + del_num > B_NR_ITEMS(cur),
-	       "first_i = %d del_num = %d "
-	       "no so many keys (%d) in the node (%b)(%z)",
-	       first_i, del_num, first_i + del_num, cur, cur);
-
-	/* deleting */
-	dc = B_N_CHILD(cur, first_p);
-
-	memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
-	key = internal_key(cur, first_i);
-	memmove(key, key + del_num,
-		(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
-						       del_num) * DC_SIZE);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) +
-			    (del_num * (KEY_SIZE + DC_SIZE)));
-
-	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
-	/*&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(cur);
-	/*&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (cur_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
-
-		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
-					       0);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(cur_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-}
-
-/* delete n node pointers and items starting from given position */
-static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
-{
-	int i_from;
-
-	i_from = (from == 0) ? from : from - 1;
-
-	/*
-	 * delete n pointers starting from `from' position in CUR;
-	 * delete n keys starting from 'i_from' position in CUR;
-	 */
-	internal_delete_pointers_items(cur_bi, from, i_from, n);
-}
-
-/*
- * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
- * dest
- * last_first == FIRST_TO_LAST means that we copy first items
- *                             from src to tail of dest
- * last_first == LAST_TO_FIRST means that we copy last items
- *                             from src to head of dest
- */
-static void internal_copy_pointers_items(struct buffer_info *dest_bi,
-					 struct buffer_head *src,
-					 int last_first, int cpy_num)
-{
-	/*
-	 * ATTENTION! Number of node pointers in DEST is equal to number
-	 * of items in DEST  as delimiting key have already inserted to
-	 * buffer dest.
-	 */
-	struct buffer_head *dest = dest_bi->bi_bh;
-	int nr_dest, nr_src;
-	int dest_order, src_order;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-
-	nr_src = B_NR_ITEMS(src);
-
-	RFALSE(dest == NULL || src == NULL,
-	       "src (%p) or dest (%p) buffer is 0", src, dest);
-	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	       "invalid last_first parameter (%d)", last_first);
-	RFALSE(nr_src < cpy_num - 1,
-	       "no so many items (%d) in src (%d)", cpy_num, nr_src);
-	RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
-	RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
-	       "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
-	       cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
-
-	if (cpy_num == 0)
-		return;
-
-	/* coping */
-	blkh = B_BLK_HEAD(dest);
-	nr_dest = blkh_nr_item(blkh);
-
-	/*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */
-	/*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */
-	(last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order =
-					 nr_src - cpy_num + 1) : (dest_order =
-								  nr_dest,
-								  src_order =
-								  0);
-
-	/* prepare space for cpy_num pointers */
-	dc = B_N_CHILD(dest, dest_order);
-
-	memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
-
-	/* insert pointers */
-	memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
-
-	/* prepare space for cpy_num - 1 item headers */
-	key = internal_key(dest, dest_order);
-	memmove(key + cpy_num - 1, key,
-		KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
-							       cpy_num));
-
-	/* insert headers */
-	memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
-						     DC_SIZE * cpy_num));
-
-	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(dest);
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
-					     DC_SIZE * cpy_num));
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(dest_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-
-}
-
-/*
- * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
- * buffer dest.
- * Delete cpy_num - del_par items and node pointers from buffer src.
- * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
- * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
- */
-static void internal_move_pointers_items(struct buffer_info *dest_bi,
-					 struct buffer_info *src_bi,
-					 int last_first, int cpy_num,
-					 int del_par)
-{
-	int first_pointer;
-	int first_item;
-
-	internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
-				     cpy_num);
-
-	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
-		first_pointer = 0;
-		first_item = 0;
-		/*
-		 * delete cpy_num - del_par pointers and keys starting for
-		 * pointers with first_pointer, for key - with first_item
-		 */
-		internal_delete_pointers_items(src_bi, first_pointer,
-					       first_item, cpy_num - del_par);
-	} else {		/* shift_right occurs */
-		int i, j;
-
-		i = (cpy_num - del_par ==
-		     (j =
-		      B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num +
-		    del_par;
-
-		internal_delete_pointers_items(src_bi,
-					       j + 1 - cpy_num + del_par, i,
-					       cpy_num - del_par);
-	}
-}
-
-/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key(struct buffer_info *dest_bi,
-				/* insert key before key with n_dest number */
-				int dest_position_before,
-				struct buffer_head *src, int src_position)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	int nr;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-
-	RFALSE(dest == NULL || src == NULL,
-	       "source(%p) or dest(%p) buffer is 0", src, dest);
-	RFALSE(dest_position_before < 0 || src_position < 0,
-	       "source(%d) or dest(%d) key number less than 0",
-	       src_position, dest_position_before);
-	RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
-	       src_position >= B_NR_ITEMS(src),
-	       "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
-	       dest_position_before, B_NR_ITEMS(dest),
-	       src_position, B_NR_ITEMS(src));
-	RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
-	       "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
-
-	blkh = B_BLK_HEAD(dest);
-	nr = blkh_nr_item(blkh);
-
-	/* prepare space for inserting key */
-	key = internal_key(dest, dest_position_before);
-	memmove(key + 1, key,
-		(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
-
-	/* insert key */
-	memcpy(key, internal_key(src, src_position), KEY_SIZE);
-
-	/* Change dirt, free space, item number fields. */
-
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
-	set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
-
-	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-	}
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
- * Copy pointer_amount node pointers and pointer_amount - 1 items from
- * buffer src to buffer dest.
- * Replace  d_key'th key in buffer cfl.
- * Delete pointer_amount items and node pointers from buffer src.
- */
-/* this can be invoked both to shift from S to L and from R to S */
-static void internal_shift_left(
-				/*
-				 * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
-				 */
-				int mode,
-				struct tree_balance *tb,
-				int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
-				       &d_key_position, &cf);
-
-	/*printk("pointer_amount = %d\n",pointer_amount); */
-
-	if (pointer_amount) {
-		/*
-		 * insert delimiting key from common father of dest and
-		 * src to node dest into position B_NR_ITEM(dest)
-		 */
-		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
-				    d_key_position);
-
-		if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
-			if (src_bi.bi_position /*src->b_item_order */  == 0)
-				replace_key(tb, cf, d_key_position,
-					    src_bi.
-					    bi_parent /*src->b_parent */ , 0);
-		} else
-			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
-				    pointer_amount - 1);
-	}
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
-				     pointer_amount, 0);
-
-}
-
-/*
- * Insert delimiting key to L[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shifts from S[h] to L[h] */
-static void internal_shift1_left(struct tree_balance *tb,
-				 int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-				       &dest_bi, &src_bi, &d_key_position, &cf);
-
-	/* insert lkey[h]-th key  from CFL[h] to left neighbor L[h] */
-	if (pointer_amount > 0)
-		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
-				    d_key_position);
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
-				     pointer_amount, 1);
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
- * Copy n node pointers and n - 1 items from buffer src to buffer dest.
- * Replace  d_key'th key in buffer cfr.
- * Delete n items and node pointers from buffer src.
- */
-static void internal_shift_right(
-				 /*
-				  * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
-				  */
-				 int mode,
-				 struct tree_balance *tb,
-				 int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-	int nr;
-
-	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
-				       &d_key_position, &cf);
-
-	nr = B_NR_ITEMS(src_bi.bi_bh);
-
-	if (pointer_amount > 0) {
-		/*
-		 * insert delimiting key from common father of dest
-		 * and src to dest node into position 0
-		 */
-		internal_insert_key(&dest_bi, 0, cf, d_key_position);
-		if (nr == pointer_amount - 1) {
-			RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
-			       dest_bi.bi_bh != tb->R[h],
-			       "src (%p) must be == tb->S[h](%p) when it disappears",
-			       src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
-			/* when S[h] disappers replace left delemiting key as well */
-			if (tb->CFL[h])
-				replace_key(tb, cf, d_key_position, tb->CFL[h],
-					    tb->lkey[h]);
-		} else
-			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
-				    nr - pointer_amount);
-	}
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
-				     pointer_amount, 0);
-}
-
-/*
- * Insert delimiting key to R[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shift from S[h] to R[h] */
-static void internal_shift1_right(struct tree_balance *tb,
-				  int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-				       &dest_bi, &src_bi, &d_key_position, &cf);
-
-	/* insert rkey from CFR[h] to right neighbor R[h] */
-	if (pointer_amount > 0)
-		internal_insert_key(&dest_bi, 0, cf, d_key_position);
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
-				     pointer_amount, 1);
-}
-
-/*
- * Delete insert_num node pointers together with their left items
- * and balance current node.
- */
-static void balance_internal_when_delete(struct tree_balance *tb,
-					 int h, int child_pos)
-{
-	int insert_num;
-	int n;
-	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-	struct buffer_info bi;
-
-	insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
-
-	/* delete child-node-pointer(s) together with their left item(s) */
-	bi.tb = tb;
-	bi.bi_bh = tbSh;
-	bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-	bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	internal_delete_childs(&bi, child_pos, -insert_num);
-
-	RFALSE(tb->blknum[h] > 1,
-	       "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
-
-	n = B_NR_ITEMS(tbSh);
-
-	if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
-		if (tb->blknum[h] == 0) {
-			/* node S[h] (root of the tree) is empty now */
-			struct buffer_head *new_root;
-
-			RFALSE(n
-			       || B_FREE_SPACE(tbSh) !=
-			       MAX_CHILD_SIZE(tbSh) - DC_SIZE,
-			       "buffer must have only 0 keys (%d)", n);
-			RFALSE(bi.bi_parent, "root has parent (%p)",
-			       bi.bi_parent);
-
-			/* choose a new root */
-			if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
-				new_root = tb->R[h - 1];
-			else
-				new_root = tb->L[h - 1];
-			/*
-			 * switch super block's tree root block
-			 * number to the new value */
-			PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
-			/*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
-			PUT_SB_TREE_HEIGHT(tb->tb_sb,
-					   SB_TREE_HEIGHT(tb->tb_sb) - 1);
-
-			do_balance_mark_sb_dirty(tb,
-						 REISERFS_SB(tb->tb_sb)->s_sbh,
-						 1);
-			/*&&&&&&&&&&&&&&&&&&&&&& */
-			/* use check_internal if new root is an internal node */
-			if (h > 1)
-				check_internal(new_root);
-			/*&&&&&&&&&&&&&&&&&&&&&& */
-
-			/* do what is needed for buffer thrown from tree */
-			reiserfs_invalidate_buffer(tb, tbSh);
-			return;
-		}
-		return;
-	}
-
-	/* join S[h] with L[h] */
-	if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
-
-		RFALSE(tb->rnum[h] != 0,
-		       "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
-		       h, tb->rnum[h]);
-
-		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
-		reiserfs_invalidate_buffer(tb, tbSh);
-
-		return;
-	}
-
-	/* join S[h] with R[h] */
-	if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
-		RFALSE(tb->lnum[h] != 0,
-		       "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
-		       h, tb->lnum[h]);
-
-		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
-
-		reiserfs_invalidate_buffer(tb, tbSh);
-		return;
-	}
-
-	/* borrow from left neighbor L[h] */
-	if (tb->lnum[h] < 0) {
-		RFALSE(tb->rnum[h] != 0,
-		       "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
-		       tb->rnum[h]);
-		internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
-				     -tb->lnum[h]);
-		return;
-	}
-
-	/* borrow from right neighbor R[h] */
-	if (tb->rnum[h] < 0) {
-		RFALSE(tb->lnum[h] != 0,
-		       "invalid tb->lnum[%d]==%d when borrow from R[h]",
-		       h, tb->lnum[h]);
-		internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);	/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
-		return;
-	}
-
-	/* split S[h] into two parts and put them into neighbors */
-	if (tb->lnum[h] > 0) {
-		RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
-		       "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
-		       h, tb->lnum[h], h, tb->rnum[h], n);
-
-		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);	/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
-		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-				     tb->rnum[h]);
-
-		reiserfs_invalidate_buffer(tb, tbSh);
-
-		return;
-	}
-	reiserfs_panic(tb->tb_sb, "ibalance-2",
-		       "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
-		       h, tb->lnum[h], h, tb->rnum[h]);
-}
-
-/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
-static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
-{
-	RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
-	       "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
-	       tb->L[h], tb->CFL[h]);
-
-	if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
-		return;
-
-	memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
-}
-
-/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
-static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
-{
-	RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
-	       "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
-	       tb->R[h], tb->CFR[h]);
-	RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
-	       "R[h] can not be empty if it exists (item number=%d)",
-	       B_NR_ITEMS(tb->R[h]));
-
-	memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
-}
-
-
-/*
- * if inserting/pasting {
- *   child_pos is the position of the node-pointer in S[h] that
- *   pointed to S[h-1] before balancing of the h-1 level;
- *   this means that new pointers and items must be inserted AFTER
- *   child_pos
- * } else {
- *   it is the position of the leftmost pointer that must be deleted
- *   (together with its corresponding key to the left of the pointer)
- *   as a result of the previous level's balancing.
- * }
- */
-
-int balance_internal(struct tree_balance *tb,
-		     int h,	/* level of the tree */
-		     int child_pos,
-		     /* key for insertion on higher level    */
-		     struct item_head *insert_key,
-		     /* node for insertion on higher level */
-		     struct buffer_head **insert_ptr)
-{
-	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-	struct buffer_info bi;
-
-	/*
-	 * we return this: it is 0 if there is no S[h],
-	 * else it is tb->S[h]->b_item_order
-	 */
-	int order;
-	int insert_num, n, k;
-	struct buffer_head *S_new;
-	struct item_head new_insert_key;
-	struct buffer_head *new_insert_ptr = NULL;
-	struct item_head *new_insert_key_addr = insert_key;
-
-	RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
-
-	PROC_INFO_INC(tb->tb_sb, balance_at[h]);
-
-	order =
-	    (tbSh) ? PATH_H_POSITION(tb->tb_path,
-				     h + 1) /*tb->S[h]->b_item_order */ : 0;
-
-	/*
-	 * Using insert_size[h] calculate the number insert_num of items
-	 * that must be inserted to or deleted from S[h].
-	 */
-	insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
-
-	/* Check whether insert_num is proper * */
-	RFALSE(insert_num < -2 || insert_num > 2,
-	       "incorrect number of items inserted to the internal node (%d)",
-	       insert_num);
-	RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
-	       "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
-	       insert_num, h);
-
-	/* Make balance in case insert_num < 0 */
-	if (insert_num < 0) {
-		balance_internal_when_delete(tb, h, child_pos);
-		return order;
-	}
-
-	k = 0;
-	if (tb->lnum[h] > 0) {
-		/*
-		 * shift lnum[h] items from S[h] to the left neighbor L[h].
-		 * check how many of new items fall into L[h] or CFL[h] after
-		 * shifting
-		 */
-		n = B_NR_ITEMS(tb->L[h]);	/* number of items in L[h] */
-		if (tb->lnum[h] <= child_pos) {
-			/* new items don't fall into L[h] or CFL[h] */
-			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-					    tb->lnum[h]);
-			child_pos -= tb->lnum[h];
-		} else if (tb->lnum[h] > child_pos + insert_num) {
-			/* all new items fall into L[h] */
-			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-					    tb->lnum[h] - insert_num);
-			/* insert insert_num keys and node-pointers into L[h] */
-			bi.tb = tb;
-			bi.bi_bh = tb->L[h];
-			bi.bi_parent = tb->FL[h];
-			bi.bi_position = get_left_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->L[h], tb->S[h-1]->b_next */
-					       n + child_pos + 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/*
-			 * some items fall into L[h] or CFL[h],
-			 * but some don't fall
-			 */
-			internal_shift1_left(tb, h, child_pos + 1);
-			/* calculate number of new items that fall into L[h] */
-			k = tb->lnum[h] - child_pos - 1;
-			bi.tb = tb;
-			bi.bi_bh = tb->L[h];
-			bi.bi_parent = tb->FL[h];
-			bi.bi_position = get_left_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->L[h], tb->S[h-1]->b_next, */
-					       n + child_pos + 1, k,
-					       insert_key, insert_ptr);
-
-			replace_lkey(tb, h, insert_key + k);
-
-			/*
-			 * replace the first node-ptr in S[h] by
-			 * node-ptr to insert_ptr[k]
-			 */
-			dc = B_N_CHILD(tbSh, 0);
-			put_dc_size(dc,
-				    MAX_CHILD_SIZE(insert_ptr[k]) -
-				    B_FREE_SPACE(insert_ptr[k]));
-			put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, tbSh, 0);
-
-			k++;
-			insert_key += k;
-			insert_ptr += k;
-			insert_num -= k;
-			child_pos = 0;
-		}
-	}
-	/* tb->lnum[h] > 0 */
-	if (tb->rnum[h] > 0) {
-		/*shift rnum[h] items from S[h] to the right neighbor R[h] */
-		/*
-		 * check how many of new items fall into R or CFR
-		 * after shifting
-		 */
-		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
-		if (n - tb->rnum[h] >= child_pos)
-			/* new items fall into S[h] */
-			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-					     tb->rnum[h]);
-		else if (n + insert_num - tb->rnum[h] < child_pos) {
-			/* all new items fall into R[h] */
-			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-					     tb->rnum[h] - insert_num);
-
-			/* insert insert_num keys and node-pointers into R[h] */
-			bi.tb = tb;
-			bi.bi_bh = tb->R[h];
-			bi.bi_parent = tb->FR[h];
-			bi.bi_position = get_right_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->R[h],tb->S[h-1]->b_next */
-					       child_pos - n - insert_num +
-					       tb->rnum[h] - 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/* one of the items falls into CFR[h] */
-			internal_shift1_right(tb, h, n - child_pos + 1);
-			/* calculate number of new items that fall into R[h] */
-			k = tb->rnum[h] - n + child_pos - 1;
-			bi.tb = tb;
-			bi.bi_bh = tb->R[h];
-			bi.bi_parent = tb->FR[h];
-			bi.bi_position = get_right_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->R[h], tb->R[h]->b_child, */
-					       0, k, insert_key + 1,
-					       insert_ptr + 1);
-
-			replace_rkey(tb, h, insert_key + insert_num - k - 1);
-
-			/*
-			 * replace the first node-ptr in R[h] by
-			 * node-ptr insert_ptr[insert_num-k-1]
-			 */
-			dc = B_N_CHILD(tb->R[h], 0);
-			put_dc_size(dc,
-				    MAX_CHILD_SIZE(insert_ptr
-						   [insert_num - k - 1]) -
-				    B_FREE_SPACE(insert_ptr
-						 [insert_num - k - 1]));
-			put_dc_block_number(dc,
-					    insert_ptr[insert_num - k -
-						       1]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, tb->R[h], 0);
-
-			insert_num -= (k + 1);
-		}
-	}
-
-	/** Fill new node that appears instead of S[h] **/
-	RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
-	RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
-
-	if (!tb->blknum[h]) {	/* node S[h] is empty now */
-		RFALSE(!tbSh, "S[h] is equal NULL");
-
-		/* do what is needed for buffer thrown from tree */
-		reiserfs_invalidate_buffer(tb, tbSh);
-		return order;
-	}
-
-	if (!tbSh) {
-		/* create new root */
-		struct disk_child *dc;
-		struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
-		struct block_head *blkh;
-
-		if (tb->blknum[h] != 1)
-			reiserfs_panic(NULL, "ibalance-3", "One new node "
-				       "required for creating the new root");
-		/* S[h] = empty buffer from the list FEB. */
-		tbSh = get_FEB(tb);
-		blkh = B_BLK_HEAD(tbSh);
-		set_blkh_level(blkh, h + 1);
-
-		/* Put the unique node-pointer to S[h] that points to S[h-1]. */
-
-		dc = B_N_CHILD(tbSh, 0);
-		put_dc_block_number(dc, tbSh_1->b_blocknr);
-		put_dc_size(dc,
-			    (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
-
-		tb->insert_size[h] -= DC_SIZE;
-		set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
-
-		do_balance_mark_internal_dirty(tb, tbSh, 0);
-
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(tbSh);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-		/* put new root into path structure */
-		PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
-		    tbSh;
-
-		/* Change root in structure super block. */
-		PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
-		PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
-		do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	}
-
-	if (tb->blknum[h] == 2) {
-		int snum;
-		struct buffer_info dest_bi, src_bi;
-
-		/* S_new = free buffer from list FEB */
-		S_new = get_FEB(tb);
-
-		set_blkh_level(B_BLK_HEAD(S_new), h + 1);
-
-		dest_bi.tb = tb;
-		dest_bi.bi_bh = S_new;
-		dest_bi.bi_parent = NULL;
-		dest_bi.bi_position = 0;
-		src_bi.tb = tb;
-		src_bi.bi_bh = tbSh;
-		src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
-		snum = (insert_num + n + 1) / 2;
-		if (n - snum >= child_pos) {
-			/* new items don't fall into S_new */
-			/*  store the delimiting key for the next level */
-			/* new_insert_key = (n - snum)'th key in S[h] */
-			memcpy(&new_insert_key, internal_key(tbSh, n - snum),
-			       KEY_SIZE);
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST, snum, 0);
-		} else if (n + insert_num - snum < child_pos) {
-			/* all new items fall into S_new */
-			/*  store the delimiting key for the next level */
-			/*
-			 * new_insert_key = (n + insert_item - snum)'th
-			 * key in S[h]
-			 */
-			memcpy(&new_insert_key,
-			       internal_key(tbSh, n + insert_num - snum),
-			       KEY_SIZE);
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST,
-						     snum - insert_num, 0);
-
-			/*
-			 * insert insert_num keys and node-pointers
-			 * into S_new
-			 */
-			internal_insert_childs(&dest_bi,
-					       /*S_new,tb->S[h-1]->b_next, */
-					       child_pos - n - insert_num +
-					       snum - 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/* some items fall into S_new, but some don't fall */
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST,
-						     n - child_pos + 1, 1);
-			/* calculate number of new items that fall into S_new */
-			k = snum - n + child_pos - 1;
-
-			internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
-					       insert_key + 1, insert_ptr + 1);
-
-			/* new_insert_key = insert_key[insert_num - k - 1] */
-			memcpy(&new_insert_key, insert_key + insert_num - k - 1,
-			       KEY_SIZE);
-			/*
-			 * replace first node-ptr in S_new by node-ptr
-			 * to insert_ptr[insert_num-k-1]
-			 */
-
-			dc = B_N_CHILD(S_new, 0);
-			put_dc_size(dc,
-				    (MAX_CHILD_SIZE
-				     (insert_ptr[insert_num - k - 1]) -
-				     B_FREE_SPACE(insert_ptr
-						  [insert_num - k - 1])));
-			put_dc_block_number(dc,
-					    insert_ptr[insert_num - k -
-						       1]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, S_new, 0);
-
-			insert_num -= (k + 1);
-		}
-		/* new_insert_ptr = node_pointer to S_new */
-		new_insert_ptr = S_new;
-
-		RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
-		       || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
-		       S_new);
-
-		/* S_new is released in unfix_nodes */
-	}
-
-	n = B_NR_ITEMS(tbSh);	/*number of items in S[h] */
-
-	if (0 <= child_pos && child_pos <= n && insert_num > 0) {
-		bi.tb = tb;
-		bi.bi_bh = tbSh;
-		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		internal_insert_childs(&bi,	/*tbSh, */
-				       /*          ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next :  tb->S[h]->b_child->b_next, */
-				       child_pos, insert_num, insert_key,
-				       insert_ptr);
-	}
-
-	insert_ptr[0] = new_insert_ptr;
-	if (new_insert_ptr)
-		memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
-
-	return order;
-}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
deleted file mode 100644
index d39ee5f6c075..000000000000
--- a/fs/reiserfs/inode.c
+++ /dev/null
@@ -1,3416 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/exportfs.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/unaligned.h>
-#include <linux/buffer_head.h>
-#include <linux/mpage.h>
-#include <linux/writeback.h>
-#include <linux/quotaops.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-
-void reiserfs_evict_inode(struct inode *inode)
-{
-	/*
-	 * We need blocks for transaction + (user+group) quota
-	 * update (possibly delete)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 +
-	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
-	struct reiserfs_transaction_handle th;
-	int err;
-
-	if (!inode->i_nlink && !is_bad_inode(inode))
-		dquot_initialize(inode);
-
-	truncate_inode_pages_final(&inode->i_data);
-	if (inode->i_nlink)
-		goto no_delete;
-
-	/*
-	 * The = 0 happens when we abort creating a new inode
-	 * for some reason like lack of space..
-	 * also handles bad_inode case
-	 */
-	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
-
-		reiserfs_delete_xattrs(inode);
-
-		reiserfs_write_lock(inode->i_sb);
-
-		if (journal_begin(&th, inode->i_sb, jbegin_count))
-			goto out;
-		reiserfs_update_inode_transaction(inode);
-
-		reiserfs_discard_prealloc(&th, inode);
-
-		err = reiserfs_delete_object(&th, inode);
-
-		/*
-		 * Do quota update inside a transaction for journaled quotas.
-		 * We must do that after delete_object so that quota updates
-		 * go into the same transaction as stat data deletion
-		 */
-		if (!err) {
-			int depth = reiserfs_write_unlock_nested(inode->i_sb);
-			dquot_free_inode(inode);
-			reiserfs_write_lock_nested(inode->i_sb, depth);
-		}
-
-		if (journal_end(&th))
-			goto out;
-
-		/*
-		 * check return value from reiserfs_delete_object after
-		 * ending the transaction
-		 */
-		if (err)
-		    goto out;
-
-		/*
-		 * all items of file are deleted, so we can remove
-		 * "save" link
-		 * we can't do anything about an error here
-		 */
-		remove_save_link(inode, 0 /* not truncate */);
-out:
-		reiserfs_write_unlock(inode->i_sb);
-	} else {
-		/* no object items are in the tree */
-		;
-	}
-
-	/* note this must go after the journal_end to prevent deadlock */
-	clear_inode(inode);
-
-	dquot_drop(inode);
-	inode->i_blocks = 0;
-	return;
-
-no_delete:
-	clear_inode(inode);
-	dquot_drop(inode);
-}
-
-static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
-			  __u32 objectid, loff_t offset, int type, int length)
-{
-	key->version = version;
-
-	key->on_disk_key.k_dir_id = dirid;
-	key->on_disk_key.k_objectid = objectid;
-	set_cpu_key_k_offset(key, offset);
-	set_cpu_key_k_type(key, type);
-	key->key_length = length;
-}
-
-/*
- * take base of inode_key (it comes from inode always) (dirid, objectid)
- * and version from an inode, set offset and type of key
- */
-void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
-		  int type, int length)
-{
-	_make_cpu_key(key, get_inode_item_key_version(inode),
-		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
-		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
-		      length);
-}
-
-/* when key is 0, do not set version and short key */
-inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
-			      int version,
-			      loff_t offset, int type, int length,
-			      int entry_count /*or ih_free_space */ )
-{
-	if (key) {
-		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
-		ih->ih_key.k_objectid =
-		    cpu_to_le32(key->on_disk_key.k_objectid);
-	}
-	put_ih_version(ih, version);
-	set_le_ih_k_offset(ih, offset);
-	set_le_ih_k_type(ih, type);
-	put_ih_item_len(ih, length);
-	/*    set_ih_free_space (ih, 0); */
-	/*
-	 * for directory items it is entry count, for directs and stat
-	 * datas - 0xffff, for indirects - 0
-	 */
-	put_ih_entry_count(ih, entry_count);
-}
-
-/*
- * FIXME: we might cache recently accessed indirect item
- * Ugh.  Not too eager for that....
- * I cut the code until such time as I see a convincing argument (benchmark).
- * I don't want a bloated inode struct..., and I don't like code complexity....
- */
-
-/*
- * cutting the code is fine, since it really isn't in use yet and is easy
- * to add back in.  But, Vladimir has a really good idea here.  Think
- * about what happens for reading a file.  For each page,
- * The VFS layer calls reiserfs_read_folio, who searches the tree to find
- * an indirect item.  This indirect item has X number of pointers, where
- * X is a big number if we've done the block allocation right.  But,
- * we only use one or two of these pointers during each call to read_folio,
- * needlessly researching again later on.
- *
- * The size of the cache could be dynamic based on the size of the file.
- *
- * I'd also like to see us cache the location the stat data item, since
- * we are needlessly researching for that frequently.
- *
- * --chris
- */
-
-/*
- * If this page has a file tail in it, and
- * it was read in by get_block_create_0, the page data is valid,
- * but tail is still sitting in a direct item, and we can't write to
- * it.  So, look through this page, and check all the mapped buffers
- * to make sure they have valid block numbers.  Any that don't need
- * to be unmapped, so that __block_write_begin will correctly call
- * reiserfs_get_block to convert the tail into an unformatted node
- */
-static inline void fix_tail_page_for_writing(struct page *page)
-{
-	struct buffer_head *head, *next, *bh;
-
-	if (page && page_has_buffers(page)) {
-		head = page_buffers(page);
-		bh = head;
-		do {
-			next = bh->b_this_page;
-			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-				reiserfs_unmap_buffer(bh);
-			}
-			bh = next;
-		} while (bh != head);
-	}
-}
-
-/*
- * reiserfs_get_block does not need to allocate a block only if it has been
- * done already or non-hole position has been found in the indirect item
- */
-static inline int allocation_needed(int retval, b_blocknr_t allocated,
-				    struct item_head *ih,
-				    __le32 * item, int pos_in_item)
-{
-	if (allocated)
-		return 0;
-	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
-	    get_block_num(item, pos_in_item))
-		return 0;
-	return 1;
-}
-
-static inline int indirect_item_found(int retval, struct item_head *ih)
-{
-	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
-}
-
-static inline void set_block_dev_mapped(struct buffer_head *bh,
-					b_blocknr_t block, struct inode *inode)
-{
-	map_bh(bh, inode->i_sb, block);
-}
-
-/*
- * files which were created in the earlier version can not be longer,
- * than 2 gb
- */
-static int file_capable(struct inode *inode, sector_t block)
-{
-	/* it is new file. */
-	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
-	    /* old file, but 'block' is inside of 2gb */
-	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
-		return 1;
-
-	return 0;
-}
-
-static int restart_transaction(struct reiserfs_transaction_handle *th,
-			       struct inode *inode, struct treepath *path)
-{
-	struct super_block *s = th->t_super;
-	int err;
-
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(!th->t_refcount);
-
-	pathrelse(path);
-
-	/* we cannot restart while nested */
-	if (th->t_refcount > 1) {
-		return 0;
-	}
-	reiserfs_update_sd(th, inode);
-	err = journal_end(th);
-	if (!err) {
-		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
-		if (!err)
-			reiserfs_update_inode_transaction(inode);
-	}
-	return err;
-}
-
-/*
- * it is called by get_block when create == 0. Returns block number
- * for 'block'-th logical block of file. When it hits direct item it
- * returns 0 (being called from bmap) or read direct item into piece
- * of page (bh_result)
- * Please improve the english/clarity in the comment above, as it is
- * hard to understand.
- */
-static int _get_block_create_0(struct inode *inode, sector_t block,
-			       struct buffer_head *bh_result, int args)
-{
-	INITIALIZE_PATH(path);
-	struct cpu_key key;
-	struct buffer_head *bh;
-	struct item_head *ih, tmp_ih;
-	b_blocknr_t blocknr;
-	char *p;
-	int chars;
-	int ret;
-	int result;
-	int done = 0;
-	unsigned long offset;
-
-	/* prepare the key to look for the 'block'-th block of file */
-	make_cpu_key(&key, inode,
-		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
-		     3);
-
-	result = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (result != POSITION_FOUND) {
-		pathrelse(&path);
-		if (result == IO_ERROR)
-			return -EIO;
-		/*
-		 * We do not return -ENOENT if there is a hole but page is
-		 * uptodate, because it means that there is some MMAPED data
-		 * associated with it that is yet to be written to disk.
-		 */
-		if ((args & GET_BLOCK_NO_HOLE)
-		    && !PageUptodate(bh_result->b_page)) {
-			return -ENOENT;
-		}
-		return 0;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	if (is_indirect_le_ih(ih)) {
-		__le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
-
-		/*
-		 * FIXME: here we could cache indirect item or part of it in
-		 * the inode to avoid search_by_key in case of subsequent
-		 * access to file
-		 */
-		blocknr = get_block_num(ind_item, path.pos_in_item);
-		ret = 0;
-		if (blocknr) {
-			map_bh(bh_result, inode->i_sb, blocknr);
-			if (path.pos_in_item ==
-			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
-				set_buffer_boundary(bh_result);
-			}
-		} else
-			/*
-			 * We do not return -ENOENT if there is a hole but
-			 * page is uptodate, because it means that there is
-			 * some MMAPED data associated with it that is
-			 * yet to be written to disk.
-			 */
-		if ((args & GET_BLOCK_NO_HOLE)
-			    && !PageUptodate(bh_result->b_page)) {
-			ret = -ENOENT;
-		}
-
-		pathrelse(&path);
-		return ret;
-	}
-	/* requested data are in direct item(s) */
-	if (!(args & GET_BLOCK_READ_DIRECT)) {
-		/*
-		 * we are called by bmap. FIXME: we can not map block of file
-		 * when it is stored in direct item(s)
-		 */
-		pathrelse(&path);
-		return -ENOENT;
-	}
-
-	/*
-	 * if we've got a direct item, and the buffer or page was uptodate,
-	 * we don't want to pull data off disk again.  skip to the
-	 * end, where we map the buffer and return
-	 */
-	if (buffer_uptodate(bh_result)) {
-		goto finished;
-	} else
-		/*
-		 * grab_tail_page can trigger calls to reiserfs_get_block on
-		 * up to date pages without any buffers.  If the page is up
-		 * to date, we don't want read old data off disk.  Set the up
-		 * to date bit on the buffer instead and jump to the end
-		 */
-	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
-		set_buffer_uptodate(bh_result);
-		goto finished;
-	}
-	/* read file tail into part of page */
-	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
-	copy_item_head(&tmp_ih, ih);
-
-	/*
-	 * we only want to kmap if we are reading the tail into the page.
-	 * this is not the common case, so we don't kmap until we are
-	 * sure we need to.  But, this means the item might move if
-	 * kmap schedules
-	 */
-	p = (char *)kmap(bh_result->b_page);
-	p += offset;
-	memset(p, 0, inode->i_sb->s_blocksize);
-	do {
-		if (!is_direct_le_ih(ih)) {
-			BUG();
-		}
-		/*
-		 * make sure we don't read more bytes than actually exist in
-		 * the file.  This can happen in odd cases where i_size isn't
-		 * correct, and when direct item padding results in a few
-		 * extra bytes at the end of the direct item
-		 */
-		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
-			break;
-		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
-			chars =
-			    inode->i_size - (le_ih_k_offset(ih) - 1) -
-			    path.pos_in_item;
-			done = 1;
-		} else {
-			chars = ih_item_len(ih) - path.pos_in_item;
-		}
-		memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
-
-		if (done)
-			break;
-
-		p += chars;
-
-		/*
-		 * we done, if read direct item is not the last item of
-		 * node FIXME: we could try to check right delimiting key
-		 * to see whether direct item continues in the right
-		 * neighbor or rely on i_size
-		 */
-		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
-			break;
-
-		/* update key to look for the next piece */
-		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
-		result = search_for_position_by_key(inode->i_sb, &key, &path);
-		if (result != POSITION_FOUND)
-			/* i/o error most likely */
-			break;
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-	} while (1);
-
-	flush_dcache_page(bh_result->b_page);
-	kunmap(bh_result->b_page);
-
-finished:
-	pathrelse(&path);
-
-	if (result == IO_ERROR)
-		return -EIO;
-
-	/*
-	 * this buffer has valid data, but isn't valid for io.  mapping it to
-	 * block #0 tells the rest of reiserfs it just has a tail in it
-	 */
-	map_bh(bh_result, inode->i_sb, 0);
-	set_buffer_uptodate(bh_result);
-	return 0;
-}
-
-/*
- * this is called to create file map. So, _get_block_create_0 will not
- * read direct item
- */
-static int reiserfs_bmap(struct inode *inode, sector_t block,
-			 struct buffer_head *bh_result, int create)
-{
-	if (!file_capable(inode, block))
-		return -EFBIG;
-
-	reiserfs_write_lock(inode->i_sb);
-	/* do not read the direct item */
-	_get_block_create_0(inode, block, bh_result, 0);
-	reiserfs_write_unlock(inode->i_sb);
-	return 0;
-}
-
-/*
- * special version of get_block that is only used by grab_tail_page right
- * now.  It is sent to __block_write_begin, and when you try to get a
- * block past the end of the file (or a block from a hole) it returns
- * -ENOENT instead of a valid buffer.  __block_write_begin expects to
- * be able to do i/o on the buffers returned, unless an error value
- * is also returned.
- *
- * So, this allows __block_write_begin to be used for reading a single block
- * in a page.  Where it does not produce a valid page for holes, or past the
- * end of the file.  This turns out to be exactly what we need for reading
- * tails for conversion.
- *
- * The point of the wrapper is forcing a certain value for create, even
- * though the VFS layer is calling this function with create==1.  If you
- * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
- * don't use this function.
-*/
-static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
-				       struct buffer_head *bh_result,
-				       int create)
-{
-	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
-}
-
-/*
- * This is special helper for reiserfs_get_block in case we are executing
- * direct_IO request.
- */
-static int reiserfs_get_blocks_direct_io(struct inode *inode,
-					 sector_t iblock,
-					 struct buffer_head *bh_result,
-					 int create)
-{
-	int ret;
-
-	bh_result->b_page = NULL;
-
-	/*
-	 * We set the b_size before reiserfs_get_block call since it is
-	 * referenced in convert_tail_for_hole() that may be called from
-	 * reiserfs_get_block()
-	 */
-	bh_result->b_size = i_blocksize(inode);
-
-	ret = reiserfs_get_block(inode, iblock, bh_result,
-				 create | GET_BLOCK_NO_DANGLE);
-	if (ret)
-		goto out;
-
-	/* don't allow direct io onto tail pages */
-	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-		/*
-		 * make sure future calls to the direct io funcs for this
-		 * offset in the file fail by unmapping the buffer
-		 */
-		clear_buffer_mapped(bh_result);
-		ret = -EINVAL;
-	}
-
-	/*
-	 * Possible unpacked tail. Flush the data before pages have
-	 * disappeared
-	 */
-	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
-		int err;
-
-		reiserfs_write_lock(inode->i_sb);
-
-		err = reiserfs_commit_for_inode(inode);
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		reiserfs_write_unlock(inode->i_sb);
-
-		if (err < 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-/*
- * helper function for when reiserfs_get_block is called for a hole
- * but the file tail is still in a direct item
- * bh_result is the buffer head for the hole
- * tail_offset is the offset of the start of the tail in the file
- *
- * This calls prepare_write, which will start a new transaction
- * you should not be in a transaction, or have any paths held when you
- * call this.
- */
-static int convert_tail_for_hole(struct inode *inode,
-				 struct buffer_head *bh_result,
-				 loff_t tail_offset)
-{
-	unsigned long index;
-	unsigned long tail_end;
-	unsigned long tail_start;
-	struct page *tail_page;
-	struct page *hole_page = bh_result->b_page;
-	int retval = 0;
-
-	if ((tail_offset & (bh_result->b_size - 1)) != 1)
-		return -EIO;
-
-	/* always try to read until the end of the block */
-	tail_start = tail_offset & (PAGE_SIZE - 1);
-	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
-
-	index = tail_offset >> PAGE_SHIFT;
-	/*
-	 * hole_page can be zero in case of direct_io, we are sure
-	 * that we cannot get here if we write with O_DIRECT into tail page
-	 */
-	if (!hole_page || index != hole_page->index) {
-		tail_page = grab_cache_page(inode->i_mapping, index);
-		retval = -ENOMEM;
-		if (!tail_page) {
-			goto out;
-		}
-	} else {
-		tail_page = hole_page;
-	}
-
-	/*
-	 * we don't have to make sure the conversion did not happen while
-	 * we were locking the page because anyone that could convert
-	 * must first take i_mutex.
-	 *
-	 * We must fix the tail page for writing because it might have buffers
-	 * that are mapped, but have a block number of 0.  This indicates tail
-	 * data that has been read directly into the page, and
-	 * __block_write_begin won't trigger a get_block in this case.
-	 */
-	fix_tail_page_for_writing(tail_page);
-	retval = __reiserfs_write_begin(tail_page, tail_start,
-				      tail_end - tail_start);
-	if (retval)
-		goto unlock;
-
-	/* tail conversion might change the data in the page */
-	flush_dcache_page(tail_page);
-
-	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
-
-unlock:
-	if (tail_page != hole_page) {
-		unlock_page(tail_page);
-		put_page(tail_page);
-	}
-out:
-	return retval;
-}
-
-static inline int _allocate_block(struct reiserfs_transaction_handle *th,
-				  sector_t block,
-				  struct inode *inode,
-				  b_blocknr_t * allocated_block_nr,
-				  struct treepath *path, int flags)
-{
-	BUG_ON(!th->t_trans_id);
-
-#ifdef REISERFS_PREALLOCATE
-	if (!(flags & GET_BLOCK_NO_IMUX)) {
-		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
-						  path, block);
-	}
-#endif
-	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
-					 block);
-}
-
-int reiserfs_get_block(struct inode *inode, sector_t block,
-		       struct buffer_head *bh_result, int create)
-{
-	int repeat, retval = 0;
-	/* b_blocknr_t is (unsigned) 32 bit int*/
-	b_blocknr_t allocated_block_nr = 0;
-	INITIALIZE_PATH(path);
-	int pos_in_item;
-	struct cpu_key key;
-	struct buffer_head *bh, *unbh = NULL;
-	struct item_head *ih, tmp_ih;
-	__le32 *item;
-	int done;
-	int fs_gen;
-	struct reiserfs_transaction_handle *th = NULL;
-	/*
-	 * space reserved in transaction batch:
-	 * . 3 balancings in direct->indirect conversion
-	 * . 1 block involved into reiserfs_update_sd()
-	 * XXX in practically impossible worst case direct2indirect()
-	 * can incur (much) more than 3 balancings.
-	 * quota update for user, group
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
-	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
-	int version;
-	int dangle = 1;
-	loff_t new_offset =
-	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
-
-	reiserfs_write_lock(inode->i_sb);
-	version = get_inode_item_key_version(inode);
-
-	if (!file_capable(inode, block)) {
-		reiserfs_write_unlock(inode->i_sb);
-		return -EFBIG;
-	}
-
-	/*
-	 * if !create, we aren't changing the FS, so we don't need to
-	 * log anything, so we don't need to start a transaction
-	 */
-	if (!(create & GET_BLOCK_CREATE)) {
-		int ret;
-		/* find number of block-th logical block of the file */
-		ret = _get_block_create_0(inode, block, bh_result,
-					  create | GET_BLOCK_READ_DIRECT);
-		reiserfs_write_unlock(inode->i_sb);
-		return ret;
-	}
-
-	/*
-	 * if we're already in a transaction, make sure to close
-	 * any new transactions we start in this func
-	 */
-	if ((create & GET_BLOCK_NO_DANGLE) ||
-	    reiserfs_transaction_running(inode->i_sb))
-		dangle = 0;
-
-	/*
-	 * If file is of such a size, that it might have a tail and
-	 * tails are enabled  we should mark it as possibly needing
-	 * tail packing on close
-	 */
-	if ((have_large_tails(inode->i_sb)
-	     && inode->i_size < i_block_size(inode) * 4)
-	    || (have_small_tails(inode->i_sb)
-		&& inode->i_size < i_block_size(inode)))
-		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
-
-	/* set the key of the first byte in the 'block'-th block of file */
-	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
-	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-start_trans:
-		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
-		if (!th) {
-			retval = -ENOMEM;
-			goto failure;
-		}
-		reiserfs_update_inode_transaction(inode);
-	}
-research:
-
-	retval = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto failure;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	item = tp_item_body(&path);
-	pos_in_item = path.pos_in_item;
-
-	fs_gen = get_generation(inode->i_sb);
-	copy_item_head(&tmp_ih, ih);
-
-	if (allocation_needed
-	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
-		/* we have to allocate block for the unformatted node */
-		if (!th) {
-			pathrelse(&path);
-			goto start_trans;
-		}
-
-		repeat =
-		    _allocate_block(th, block, inode, &allocated_block_nr,
-				    &path, create);
-
-		/*
-		 * restart the transaction to give the journal a chance to free
-		 * some blocks.  releases the path, so we have to go back to
-		 * research if we succeed on the second try
-		 */
-		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
-			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
-			retval = restart_transaction(th, inode, &path);
-			if (retval)
-				goto failure;
-			repeat =
-			    _allocate_block(th, block, inode,
-					    &allocated_block_nr, NULL, create);
-
-			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
-				goto research;
-			}
-			if (repeat == QUOTA_EXCEEDED)
-				retval = -EDQUOT;
-			else
-				retval = -ENOSPC;
-			goto failure;
-		}
-
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			goto research;
-		}
-	}
-
-	if (indirect_item_found(retval, ih)) {
-		b_blocknr_t unfm_ptr;
-		/*
-		 * 'block'-th block is in the file already (there is
-		 * corresponding cell in some indirect item). But it may be
-		 * zero unformatted node pointer (hole)
-		 */
-		unfm_ptr = get_block_num(item, pos_in_item);
-		if (unfm_ptr == 0) {
-			/* use allocated block to plug the hole */
-			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-			if (fs_changed(fs_gen, inode->i_sb)
-			    && item_moved(&tmp_ih, &path)) {
-				reiserfs_restore_prepared_buffer(inode->i_sb,
-								 bh);
-				goto research;
-			}
-			set_buffer_new(bh_result);
-			if (buffer_dirty(bh_result)
-			    && reiserfs_data_ordered(inode->i_sb))
-				reiserfs_add_ordered_list(inode, bh_result);
-			put_block_num(item, pos_in_item, allocated_block_nr);
-			unfm_ptr = allocated_block_nr;
-			journal_mark_dirty(th, bh);
-			reiserfs_update_sd(th, inode);
-		}
-		set_block_dev_mapped(bh_result, unfm_ptr, inode);
-		pathrelse(&path);
-		retval = 0;
-		if (!dangle && th)
-			retval = reiserfs_end_persistent_transaction(th);
-
-		reiserfs_write_unlock(inode->i_sb);
-
-		/*
-		 * the item was found, so new blocks were not added to the file
-		 * there is no need to make sure the inode is updated with this
-		 * transaction
-		 */
-		return retval;
-	}
-
-	if (!th) {
-		pathrelse(&path);
-		goto start_trans;
-	}
-
-	/*
-	 * desired position is not found or is in the direct item. We have
-	 * to append file with holes up to 'block'-th block converting
-	 * direct items to indirect one if necessary
-	 */
-	done = 0;
-	do {
-		if (is_statdata_le_ih(ih)) {
-			__le32 unp = 0;
-			struct cpu_key tmp_key;
-
-			/* indirect item has to be inserted */
-			make_le_item_head(&tmp_ih, &key, version, 1,
-					  TYPE_INDIRECT, UNFM_P_SIZE,
-					  0 /* free_space */ );
-
-			/*
-			 * we are going to add 'block'-th block to the file.
-			 * Use allocated block for that
-			 */
-			if (cpu_key_k_offset(&key) == 1) {
-				unp = cpu_to_le32(allocated_block_nr);
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				set_buffer_new(bh_result);
-				done = 1;
-			}
-			tmp_key = key;	/* ;) */
-			set_cpu_key_k_offset(&tmp_key, 1);
-			PATH_LAST_POSITION(&path)++;
-
-			retval =
-			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
-						 inode, (char *)&unp);
-			if (retval) {
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				/*
-				 * retval == -ENOSPC, -EDQUOT or -EIO
-				 * or -EEXIST
-				 */
-				goto failure;
-			}
-		} else if (is_direct_le_ih(ih)) {
-			/* direct item has to be converted */
-			loff_t tail_offset;
-
-			tail_offset =
-			    ((le_ih_k_offset(ih) -
-			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
-
-			/*
-			 * direct item we just found fits into block we have
-			 * to map. Convert it into unformatted node: use
-			 * bh_result for the conversion
-			 */
-			if (tail_offset == cpu_key_k_offset(&key)) {
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				unbh = bh_result;
-				done = 1;
-			} else {
-				/*
-				 * we have to pad file tail stored in direct
-				 * item(s) up to block size and convert it
-				 * to unformatted node. FIXME: this should
-				 * also get into page cache
-				 */
-
-				pathrelse(&path);
-				/*
-				 * ugly, but we can only end the transaction if
-				 * we aren't nested
-				 */
-				BUG_ON(!th->t_refcount);
-				if (th->t_refcount == 1) {
-					retval =
-					    reiserfs_end_persistent_transaction
-					    (th);
-					th = NULL;
-					if (retval)
-						goto failure;
-				}
-
-				retval =
-				    convert_tail_for_hole(inode, bh_result,
-							  tail_offset);
-				if (retval) {
-					if (retval != -ENOSPC)
-						reiserfs_error(inode->i_sb,
-							"clm-6004",
-							"convert tail failed "
-							"inode %lu, error %d",
-							inode->i_ino,
-							retval);
-					if (allocated_block_nr) {
-						/*
-						 * the bitmap, the super,
-						 * and the stat data == 3
-						 */
-						if (!th)
-							th = reiserfs_persistent_transaction(inode->i_sb, 3);
-						if (th)
-							reiserfs_free_block(th,
-									    inode,
-									    allocated_block_nr,
-									    1);
-					}
-					goto failure;
-				}
-				goto research;
-			}
-			retval =
-			    direct2indirect(th, inode, &path, unbh,
-					    tail_offset);
-			if (retval) {
-				reiserfs_unmap_buffer(unbh);
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				goto failure;
-			}
-			/*
-			 * it is important the set_buffer_uptodate is done
-			 * after the direct2indirect.  The buffer might
-			 * contain valid data newer than the data on disk
-			 * (read by read_folio, changed, and then sent here by
-			 * writepage).  direct2indirect needs to know if unbh
-			 * was already up to date, so it can decide if the
-			 * data in unbh needs to be replaced with data from
-			 * the disk
-			 */
-			set_buffer_uptodate(unbh);
-
-			/*
-			 * unbh->b_page == NULL in case of DIRECT_IO request,
-			 * this means buffer will disappear shortly, so it
-			 * should not be added to
-			 */
-			if (unbh->b_page) {
-				/*
-				 * we've converted the tail, so we must
-				 * flush unbh before the transaction commits
-				 */
-				reiserfs_add_tail_list(inode, unbh);
-
-				/*
-				 * mark it dirty now to prevent commit_write
-				 * from adding this buffer to the inode's
-				 * dirty buffer list
-				 */
-				/*
-				 * AKPM: changed __mark_buffer_dirty to
-				 * mark_buffer_dirty().  It's still atomic,
-				 * but it sets the page dirty too, which makes
-				 * it eligible for writeback at any time by the
-				 * VM (which was also the case with
-				 * __mark_buffer_dirty())
-				 */
-				mark_buffer_dirty(unbh);
-			}
-		} else {
-			/*
-			 * append indirect item with holes if needed, when
-			 * appending pointer to 'block'-th block use block,
-			 * which is already allocated
-			 */
-			struct cpu_key tmp_key;
-			/*
-			 * We use this in case we need to allocate
-			 * only one block which is a fastpath
-			 */
-			unp_t unf_single = 0;
-			unp_t *un;
-			__u64 max_to_insert =
-			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
-			    UNFM_P_SIZE;
-			__u64 blocks_needed;
-
-			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
-			       "vs-804: invalid position for append");
-			/*
-			 * indirect item has to be appended,
-			 * set up key of that position
-			 * (key type is unimportant)
-			 */
-			make_cpu_key(&tmp_key, inode,
-				     le_key_k_offset(version,
-						     &ih->ih_key) +
-				     op_bytes_number(ih,
-						     inode->i_sb->s_blocksize),
-				     TYPE_INDIRECT, 3);
-
-			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
-			       "green-805: invalid offset");
-			blocks_needed =
-			    1 +
-			    ((cpu_key_k_offset(&key) -
-			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
-			     s_blocksize_bits);
-
-			if (blocks_needed == 1) {
-				un = &unf_single;
-			} else {
-				un = kcalloc(min(blocks_needed, max_to_insert),
-					     UNFM_P_SIZE, GFP_NOFS);
-				if (!un) {
-					un = &unf_single;
-					blocks_needed = 1;
-					max_to_insert = 0;
-				}
-			}
-			if (blocks_needed <= max_to_insert) {
-				/*
-				 * we are going to add target block to
-				 * the file. Use allocated block for that
-				 */
-				un[blocks_needed - 1] =
-				    cpu_to_le32(allocated_block_nr);
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				set_buffer_new(bh_result);
-				done = 1;
-			} else {
-				/* paste hole to the indirect item */
-				/*
-				 * If kcalloc failed, max_to_insert becomes
-				 * zero and it means we only have space for
-				 * one block
-				 */
-				blocks_needed =
-				    max_to_insert ? max_to_insert : 1;
-			}
-			retval =
-			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
-						     (char *)un,
-						     UNFM_P_SIZE *
-						     blocks_needed);
-
-			if (blocks_needed != 1)
-				kfree(un);
-
-			if (retval) {
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				goto failure;
-			}
-			if (!done) {
-				/*
-				 * We need to mark new file size in case
-				 * this function will be interrupted/aborted
-				 * later on. And we may do this only for
-				 * holes.
-				 */
-				inode->i_size +=
-				    inode->i_sb->s_blocksize * blocks_needed;
-			}
-		}
-
-		if (done == 1)
-			break;
-
-		/*
-		 * this loop could log more blocks than we had originally
-		 * asked for.  So, we have to allow the transaction to end
-		 * if it is too big or too full.  Update the inode so things
-		 * are consistent if we crash before the function returns
-		 * release the path so that anybody waiting on the path before
-		 * ending their transaction will be able to continue.
-		 */
-		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
-			retval = restart_transaction(th, inode, &path);
-			if (retval)
-				goto failure;
-		}
-		/*
-		 * inserting indirect pointers for a hole can take a
-		 * long time.  reschedule if needed and also release the write
-		 * lock for others.
-		 */
-		reiserfs_cond_resched(inode->i_sb);
-
-		retval = search_for_position_by_key(inode->i_sb, &key, &path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto failure;
-		}
-		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb, "vs-825",
-					 "%K should not be found", &key);
-			retval = -EEXIST;
-			if (allocated_block_nr)
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-			pathrelse(&path);
-			goto failure;
-		}
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-		item = tp_item_body(&path);
-		pos_in_item = path.pos_in_item;
-	} while (1);
-
-	retval = 0;
-
-failure:
-	if (th && (!dangle || (retval && !th->t_trans_id))) {
-		int err;
-		if (th->t_trans_id)
-			reiserfs_update_sd(th, inode);
-		err = reiserfs_end_persistent_transaction(th);
-		if (err)
-			retval = err;
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-	reiserfs_check_path(&path);
-	return retval;
-}
-
-static void reiserfs_readahead(struct readahead_control *rac)
-{
-	mpage_readahead(rac, reiserfs_get_block);
-}
-
-/*
- * Compute real number of used bytes by file
- * Following three functions can go away when we'll have enough space in
- * stat item
- */
-static int real_space_diff(struct inode *inode, int sd_size)
-{
-	int bytes;
-	loff_t blocksize = inode->i_sb->s_blocksize;
-
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
-		return sd_size;
-
-	/*
-	 * End of file is also in full block with indirect reference, so round
-	 * up to the next block.
-	 *
-	 * there is just no way to know if the tail is actually packed
-	 * on the file, so we have to assume it isn't.  When we pack the
-	 * tail, we add 4 bytes to pretend there really is an unformatted
-	 * node pointer
-	 */
-	bytes =
-	    ((inode->i_size +
-	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
-	    sd_size;
-	return bytes;
-}
-
-static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
-					int sd_size)
-{
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-		return inode->i_size +
-		    (loff_t) (real_space_diff(inode, sd_size));
-	}
-	return ((loff_t) real_space_diff(inode, sd_size)) +
-	    (((loff_t) blocks) << 9);
-}
-
-/* Compute number of blocks used by file in ReiserFS counting */
-static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
-{
-	loff_t bytes = inode_get_bytes(inode);
-	loff_t real_space = real_space_diff(inode, sd_size);
-
-	/* keeps fsck and non-quota versions of reiserfs happy */
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-		bytes += (loff_t) 511;
-	}
-
-	/*
-	 * files from before the quota patch might i_blocks such that
-	 * bytes < real_space.  Deal with that here to prevent it from
-	 * going negative.
-	 */
-	if (bytes < real_space)
-		return 0;
-	return (bytes - real_space) >> 9;
-}
-
-/*
- * BAD: new directories have stat data of new type and all other items
- * of old type. Version stored in the inode says about body items, so
- * in update_stat_data we can not rely on inode, but have to check
- * item version directly
- */
-
-/* called by read_locked_inode */
-static void init_inode(struct inode *inode, struct treepath *path)
-{
-	struct buffer_head *bh;
-	struct item_head *ih;
-	__u32 rdev;
-
-	bh = PATH_PLAST_BUFFER(path);
-	ih = tp_item_head(path);
-
-	copy_key(INODE_PKEY(inode), &ih->ih_key);
-
-	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
-	REISERFS_I(inode)->i_flags = 0;
-	REISERFS_I(inode)->i_prealloc_block = 0;
-	REISERFS_I(inode)->i_prealloc_count = 0;
-	REISERFS_I(inode)->i_trans_id = 0;
-	REISERFS_I(inode)->i_jl = NULL;
-	reiserfs_init_xattr_rwsem(inode);
-
-	if (stat_data_v1(ih)) {
-		struct stat_data_v1 *sd =
-		    (struct stat_data_v1 *)ih_item_body(bh, ih);
-		unsigned long blocks;
-
-		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-		set_inode_sd_version(inode, STAT_DATA_V1);
-		inode->i_mode = sd_v1_mode(sd);
-		set_nlink(inode, sd_v1_nlink(sd));
-		i_uid_write(inode, sd_v1_uid(sd));
-		i_gid_write(inode, sd_v1_gid(sd));
-		inode->i_size = sd_v1_size(sd);
-		inode_set_atime(inode, sd_v1_atime(sd), 0);
-		inode_set_mtime(inode, sd_v1_mtime(sd), 0);
-		inode_set_ctime(inode, sd_v1_ctime(sd), 0);
-
-		inode->i_blocks = sd_v1_blocks(sd);
-		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-		blocks = (inode->i_size + 511) >> 9;
-		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
-
-		/*
-		 * there was a bug in <=3.5.23 when i_blocks could take
-		 * negative values. Starting from 3.5.17 this value could
-		 * even be stored in stat data. For such files we set
-		 * i_blocks based on file size. Just 2 notes: this can be
-		 * wrong for sparse files. On-disk value will be only
-		 * updated if file's inode will ever change
-		 */
-		if (inode->i_blocks > blocks) {
-			inode->i_blocks = blocks;
-		}
-
-		rdev = sd_v1_rdev(sd);
-		REISERFS_I(inode)->i_first_direct_byte =
-		    sd_v1_first_direct_byte(sd);
-
-		/*
-		 * an early bug in the quota code can give us an odd
-		 * number for the block count.  This is incorrect, fix it here.
-		 */
-		if (inode->i_blocks & 1) {
-			inode->i_blocks++;
-		}
-		inode_set_bytes(inode,
-				to_real_used_space(inode, inode->i_blocks,
-						   SD_V1_SIZE));
-		/*
-		 * nopack is initially zero for v1 objects. For v2 objects,
-		 * nopack is initialised from sd_attrs
-		 */
-		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
-	} else {
-		/*
-		 * new stat data found, but object may have old items
-		 * (directories and symlinks)
-		 */
-		struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
-
-		inode->i_mode = sd_v2_mode(sd);
-		set_nlink(inode, sd_v2_nlink(sd));
-		i_uid_write(inode, sd_v2_uid(sd));
-		inode->i_size = sd_v2_size(sd);
-		i_gid_write(inode, sd_v2_gid(sd));
-		inode_set_mtime(inode, sd_v2_mtime(sd), 0);
-		inode_set_atime(inode, sd_v2_atime(sd), 0);
-		inode_set_ctime(inode, sd_v2_ctime(sd), 0);
-		inode->i_blocks = sd_v2_blocks(sd);
-		rdev = sd_v2_rdev(sd);
-		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			inode->i_generation =
-			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-		else
-			inode->i_generation = sd_v2_generation(sd);
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-		else
-			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
-		REISERFS_I(inode)->i_first_direct_byte = 0;
-		set_inode_sd_version(inode, STAT_DATA_V2);
-		inode_set_bytes(inode,
-				to_real_used_space(inode, inode->i_blocks,
-						   SD_V2_SIZE));
-		/*
-		 * read persistent inode attributes from sd and initialise
-		 * generic inode flags from them
-		 */
-		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
-		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
-	}
-
-	pathrelse(path);
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &reiserfs_file_inode_operations;
-		inode->i_fop = &reiserfs_file_operations;
-		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &reiserfs_dir_inode_operations;
-		inode->i_fop = &reiserfs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &reiserfs_symlink_inode_operations;
-		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-	} else {
-		inode->i_blocks = 0;
-		inode->i_op = &reiserfs_special_inode_operations;
-		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
-	}
-}
-
-/* update new stat data with inode fields */
-static void inode2sd(void *sd, struct inode *inode, loff_t size)
-{
-	struct stat_data *sd_v2 = (struct stat_data *)sd;
-
-	set_sd_v2_mode(sd_v2, inode->i_mode);
-	set_sd_v2_nlink(sd_v2, inode->i_nlink);
-	set_sd_v2_uid(sd_v2, i_uid_read(inode));
-	set_sd_v2_size(sd_v2, size);
-	set_sd_v2_gid(sd_v2, i_gid_read(inode));
-	set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
-	set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
-	set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
-	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
-	else
-		set_sd_v2_generation(sd_v2, inode->i_generation);
-	set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
-}
-
-/* used to copy inode's fields to old stat data */
-static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
-{
-	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
-
-	set_sd_v1_mode(sd_v1, inode->i_mode);
-	set_sd_v1_uid(sd_v1, i_uid_read(inode));
-	set_sd_v1_gid(sd_v1, i_gid_read(inode));
-	set_sd_v1_nlink(sd_v1, inode->i_nlink);
-	set_sd_v1_size(sd_v1, size);
-	set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
-	set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
-	set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
-
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
-	else
-		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
-
-	/* Sigh. i_first_direct_byte is back */
-	set_sd_v1_first_direct_byte(sd_v1,
-				    REISERFS_I(inode)->i_first_direct_byte);
-}
-
-/*
- * NOTE, you must prepare the buffer head before sending it here,
- * and then log it after the call
- */
-static void update_stat_data(struct treepath *path, struct inode *inode,
-			     loff_t size)
-{
-	struct buffer_head *bh;
-	struct item_head *ih;
-
-	bh = PATH_PLAST_BUFFER(path);
-	ih = tp_item_head(path);
-
-	if (!is_statdata_le_ih(ih))
-		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
-			       INODE_PKEY(inode), ih);
-
-	/* path points to old stat data */
-	if (stat_data_v1(ih)) {
-		inode2sd_v1(ih_item_body(bh, ih), inode, size);
-	} else {
-		inode2sd(ih_item_body(bh, ih), inode, size);
-	}
-
-	return;
-}
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
-			     struct inode *inode, loff_t size)
-{
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	struct buffer_head *bh;
-	int fs_gen;
-	struct item_head *ih, tmp_ih;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* key type is unimportant */
-	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
-
-	for (;;) {
-		int pos;
-		/* look for the object's stat data */
-		retval = search_item(inode->i_sb, &key, &path);
-		if (retval == IO_ERROR) {
-			reiserfs_error(inode->i_sb, "vs-13050",
-				       "i/o failure occurred trying to "
-				       "update %K stat data", &key);
-			return;
-		}
-		if (retval == ITEM_NOT_FOUND) {
-			pos = PATH_LAST_POSITION(&path);
-			pathrelse(&path);
-			if (inode->i_nlink == 0) {
-				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
-				return;
-			}
-			reiserfs_warning(inode->i_sb, "vs-13060",
-					 "stat data of object %k (nlink == %d) "
-					 "not found (pos %d)",
-					 INODE_PKEY(inode), inode->i_nlink,
-					 pos);
-			reiserfs_check_path(&path);
-			return;
-		}
-
-		/*
-		 * sigh, prepare_for_journal might schedule.  When it
-		 * schedules the FS might change.  We have to detect that,
-		 * and loop back to the search if the stat data item has moved
-		 */
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-		copy_item_head(&tmp_ih, ih);
-		fs_gen = get_generation(inode->i_sb);
-		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
-		/* Stat_data item has been moved after scheduling. */
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
-			continue;
-		}
-		break;
-	}
-	update_stat_data(&path, inode, size);
-	journal_mark_dirty(th, bh);
-	pathrelse(&path);
-	return;
-}
-
-/*
- * reiserfs_read_locked_inode is called to read the inode off disk, and it
- * does a make_bad_inode when things go wrong.  But, we need to make sure
- * and clear the key in the private portion of the inode, otherwise a
- * corresponding iput might try to delete whatever object the inode last
- * represented.
- */
-static void reiserfs_make_bad_inode(struct inode *inode)
-{
-	memset(INODE_PKEY(inode), 0, KEY_SIZE);
-	make_bad_inode(inode);
-}
-
-/*
- * initially this function was derived from minix or ext2's analog and
- * evolved as the prototype did
- */
-int reiserfs_init_locked_inode(struct inode *inode, void *p)
-{
-	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
-	inode->i_ino = args->objectid;
-	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
-	return 0;
-}
-
-/*
- * looks for stat data in the tree, and fills up the fields of in-core
- * inode stat data fields
- */
-void reiserfs_read_locked_inode(struct inode *inode,
-				struct reiserfs_iget_args *args)
-{
-	INITIALIZE_PATH(path_to_sd);
-	struct cpu_key key;
-	unsigned long dirino;
-	int retval;
-
-	dirino = args->dirid;
-
-	/*
-	 * set version 1, version 2 could be used too, because stat data
-	 * key is the same in both versions
-	 */
-	_make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
-
-	/* look for the object's stat data */
-	retval = search_item(inode->i_sb, &key, &path_to_sd);
-	if (retval == IO_ERROR) {
-		reiserfs_error(inode->i_sb, "vs-13070",
-			       "i/o failure occurred trying to find "
-			       "stat data of %K", &key);
-		reiserfs_make_bad_inode(inode);
-		return;
-	}
-
-	/* a stale NFS handle can trigger this without it being an error */
-	if (retval != ITEM_FOUND) {
-		pathrelse(&path_to_sd);
-		reiserfs_make_bad_inode(inode);
-		clear_nlink(inode);
-		return;
-	}
-
-	init_inode(inode, &path_to_sd);
-
-	/*
-	 * It is possible that knfsd is trying to access inode of a file
-	 * that is being removed from the disk by some other thread. As we
-	 * update sd on unlink all that is required is to check for nlink
-	 * here. This bug was first found by Sizif when debugging
-	 * SquidNG/Butterfly, forgotten, and found again after Philippe
-	 * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
-
-	 * More logical fix would require changes in fs/inode.c:iput() to
-	 * remove inode from hash-table _after_ fs cleaned disk stuff up and
-	 * in iget() to return NULL if I_FREEING inode is found in
-	 * hash-table.
-	 */
-
-	/*
-	 * Currently there is one place where it's ok to meet inode with
-	 * nlink==0: processing of open-unlinked and half-truncated files
-	 * during mount (fs/reiserfs/super.c:finish_unfinished()).
-	 */
-	if ((inode->i_nlink == 0) &&
-	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
-		reiserfs_warning(inode->i_sb, "vs-13075",
-				 "dead inode read from disk %K. "
-				 "This is likely to be race with knfsd. Ignore",
-				 &key);
-		reiserfs_make_bad_inode(inode);
-	}
-
-	/* init inode should be relsing */
-	reiserfs_check_path(&path_to_sd);
-
-	/*
-	 * Stat data v1 doesn't support ACLs.
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		cache_no_acl(inode);
-}
-
-/*
- * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
- *
- * @inode:    inode from hash table to check
- * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
- *
- * This function is called by iget5_locked() to distinguish reiserfs inodes
- * having the same inode numbers. Such inodes can only exist due to some
- * error condition. One of them should be bad. Inodes with identical
- * inode numbers (objectids) are distinguished by parent directory ids.
- *
- */
-int reiserfs_find_actor(struct inode *inode, void *opaque)
-{
-	struct reiserfs_iget_args *args;
-
-	args = opaque;
-	/* args is already in CPU order */
-	return (inode->i_ino == args->objectid) &&
-	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
-}
-
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
-{
-	struct inode *inode;
-	struct reiserfs_iget_args args;
-	int depth;
-
-	args.objectid = key->on_disk_key.k_objectid;
-	args.dirid = key->on_disk_key.k_dir_id;
-	depth = reiserfs_write_unlock_nested(s);
-	inode = iget5_locked(s, key->on_disk_key.k_objectid,
-			     reiserfs_find_actor, reiserfs_init_locked_inode,
-			     (void *)(&args));
-	reiserfs_write_lock_nested(s, depth);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	if (inode->i_state & I_NEW) {
-		reiserfs_read_locked_inode(inode, &args);
-		unlock_new_inode(inode);
-	}
-
-	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
-		/* either due to i/o error or a stale NFS handle */
-		iput(inode);
-		inode = NULL;
-	}
-	return inode;
-}
-
-static struct dentry *reiserfs_get_dentry(struct super_block *sb,
-	u32 objectid, u32 dir_id, u32 generation)
-
-{
-	struct cpu_key key;
-	struct inode *inode;
-
-	key.on_disk_key.k_objectid = objectid;
-	key.on_disk_key.k_dir_id = dir_id;
-	reiserfs_write_lock(sb);
-	inode = reiserfs_iget(sb, &key);
-	if (inode && !IS_ERR(inode) && generation != 0 &&
-	    generation != inode->i_generation) {
-		iput(inode);
-		inode = NULL;
-	}
-	reiserfs_write_unlock(sb);
-
-	return d_obtain_alias(inode);
-}
-
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	/*
-	 * fhtype happens to reflect the number of u32s encoded.
-	 * due to a bug in earlier code, fhtype might indicate there
-	 * are more u32s then actually fitted.
-	 * so if fhtype seems to be more than len, reduce fhtype.
-	 * Valid types are:
-	 *   2 - objectid + dir_id - legacy support
-	 *   3 - objectid + dir_id + generation
-	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
-	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
-	 *   6 - as above plus generation of directory
-	 * 6 does not fit in NFSv2 handles
-	 */
-	if (fh_type > fh_len) {
-		if (fh_type != 6 || fh_len != 5)
-			reiserfs_warning(sb, "reiserfs-13077",
-				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
-				fh_type, fh_len);
-		fh_type = fh_len;
-	}
-	if (fh_len < 2)
-		return NULL;
-
-	return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
-		(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
-}
-
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	if (fh_type > fh_len)
-		fh_type = fh_len;
-	if (fh_type < 4)
-		return NULL;
-
-	return reiserfs_get_dentry(sb,
-		(fh_type >= 5) ? fid->raw[3] : fid->raw[2],
-		(fh_type >= 5) ? fid->raw[4] : fid->raw[3],
-		(fh_type == 6) ? fid->raw[5] : 0);
-}
-
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
-		       struct inode *parent)
-{
-	int maxlen = *lenp;
-
-	if (parent && (maxlen < 5)) {
-		*lenp = 5;
-		return FILEID_INVALID;
-	} else if (maxlen < 3) {
-		*lenp = 3;
-		return FILEID_INVALID;
-	}
-
-	data[0] = inode->i_ino;
-	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-	data[2] = inode->i_generation;
-	*lenp = 3;
-	if (parent) {
-		data[3] = parent->i_ino;
-		data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
-		*lenp = 5;
-		if (maxlen >= 6) {
-			data[5] = parent->i_generation;
-			*lenp = 6;
-		}
-	}
-	return *lenp;
-}
-
-/*
- * looks for stat data, then copies fields to it, marks the buffer
- * containing stat data as dirty
- */
-/*
- * reiserfs inodes are never really dirty, since the dirty inode call
- * always logs them.  This call allows the VFS inode marking routines
- * to properly mark inodes for datasync and such, but only actually
- * does something when called for a synchronous update.
- */
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	struct reiserfs_transaction_handle th;
-	int jbegin_count = 1;
-
-	if (sb_rdonly(inode->i_sb))
-		return -EROFS;
-	/*
-	 * memory pressure can sometimes initiate write_inode calls with
-	 * sync == 1,
-	 * these cases are just when the system needs ram, not when the
-	 * inode needs to reach disk for safety, and they can safely be
-	 * ignored because the altered inode has already been logged.
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
-		reiserfs_write_lock(inode->i_sb);
-		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
-			reiserfs_update_sd(&th, inode);
-			journal_end_sync(&th);
-		}
-		reiserfs_write_unlock(inode->i_sb);
-	}
-	return 0;
-}
-
-/*
- * stat data of new object is inserted already, this inserts the item
- * containing "." and ".." entries
- */
-static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
-				  struct inode *inode,
-				  struct item_head *ih, struct treepath *path,
-				  struct inode *dir)
-{
-	struct super_block *sb = th->t_super;
-	char empty_dir[EMPTY_DIR_SIZE];
-	char *body = empty_dir;
-	struct cpu_key key;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
-		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
-		      TYPE_DIRENTRY, 3 /*key length */ );
-
-	/*
-	 * compose item head for new item. Directories consist of items of
-	 * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
-	 * is done by reiserfs_new_inode
-	 */
-	if (old_format_only(sb)) {
-		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
-				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
-
-		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
-				       ih->ih_key.k_objectid,
-				       INODE_PKEY(dir)->k_dir_id,
-				       INODE_PKEY(dir)->k_objectid);
-	} else {
-		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
-				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
-
-		make_empty_dir_item(body, ih->ih_key.k_dir_id,
-				    ih->ih_key.k_objectid,
-				    INODE_PKEY(dir)->k_dir_id,
-				    INODE_PKEY(dir)->k_objectid);
-	}
-
-	/* look for place in the tree for new item */
-	retval = search_item(sb, &key, path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(sb, "vs-13080",
-			       "i/o failure occurred creating new directory");
-		return -EIO;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(path);
-		reiserfs_warning(sb, "vs-13070",
-				 "object with this key exists (%k)",
-				 &(ih->ih_key));
-		return -EEXIST;
-	}
-
-	/* insert item, that is empty directory item */
-	return reiserfs_insert_item(th, path, &key, ih, inode, body);
-}
-
-/*
- * stat data of object has been inserted, this inserts the item
- * containing the body of symlink
- */
-static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
-				struct inode *inode,
-				struct item_head *ih,
-				struct treepath *path, const char *symname,
-				int item_len)
-{
-	struct super_block *sb = th->t_super;
-	struct cpu_key key;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	_make_cpu_key(&key, KEY_FORMAT_3_5,
-		      le32_to_cpu(ih->ih_key.k_dir_id),
-		      le32_to_cpu(ih->ih_key.k_objectid),
-		      1, TYPE_DIRECT, 3 /*key length */ );
-
-	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
-			  0 /*free_space */ );
-
-	/* look for place in the tree for new item */
-	retval = search_item(sb, &key, path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(sb, "vs-13080",
-			       "i/o failure occurred creating new symlink");
-		return -EIO;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(path);
-		reiserfs_warning(sb, "vs-13080",
-				 "object with this key exists (%k)",
-				 &(ih->ih_key));
-		return -EEXIST;
-	}
-
-	/* insert item, that is body of symlink */
-	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
-}
-
-/*
- * inserts the stat data into the tree, and then calls
- * reiserfs_new_directory (to insert ".", ".." item if new object is
- * directory) or reiserfs_new_symlink (to insert symlink body if new
- * object is symlink) or nothing (if new object is regular file)
-
- * NOTE! uid and gid must already be set in the inode.  If we return
- * non-zero due to an error, we have to drop the quota previously allocated
- * for the fresh inode.  This can only be done outside a transaction, so
- * if we return non-zero, we also end the transaction.
- *
- * @th: active transaction handle
- * @dir: parent directory for new inode
- * @mode: mode of new inode
- * @symname: symlink contents if inode is symlink
- * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
- *         symlinks
- * @inode: inode to be filled
- * @security: optional security context to associate with this inode
- */
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
-		       struct inode *dir, umode_t mode, const char *symname,
-		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
-		          strlen (symname) for symlinks) */
-		       loff_t i_size, struct dentry *dentry,
-		       struct inode *inode,
-		       struct reiserfs_security_handle *security)
-{
-	struct super_block *sb = dir->i_sb;
-	struct reiserfs_iget_args args;
-	INITIALIZE_PATH(path_to_key);
-	struct cpu_key key;
-	struct item_head ih;
-	struct stat_data sd;
-	int retval;
-	int err;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	depth = reiserfs_write_unlock_nested(sb);
-	err = dquot_alloc_inode(inode);
-	reiserfs_write_lock_nested(sb, depth);
-	if (err)
-		goto out_end_trans;
-	if (!dir->i_nlink) {
-		err = -EPERM;
-		goto out_bad_inode;
-	}
-
-	/* item head of new item */
-	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
-	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
-	if (!ih.ih_key.k_objectid) {
-		err = -ENOMEM;
-		goto out_bad_inode;
-	}
-	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-	memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
-	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	err = insert_inode_locked4(inode, args.objectid,
-			     reiserfs_find_actor, &args);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-	if (err) {
-		err = -EINVAL;
-		goto out_bad_inode;
-	}
-
-	if (old_format_only(sb))
-		/*
-		 * not a perfect generation count, as object ids can be reused,
-		 * but this is as good as reiserfs can do right now.
-		 * note that the private part of inode isn't filled in yet,
-		 * we have to use the directory.
-		 */
-		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
-	else
-#if defined( USE_INODE_GENERATION_COUNTER )
-		inode->i_generation =
-		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
-#else
-		inode->i_generation = ++event;
-#endif
-
-	/* fill stat data */
-	set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
-
-	/* uid and gid must already be set by the caller for quota init */
-
-	simple_inode_init_ts(inode);
-	inode->i_size = i_size;
-	inode->i_blocks = 0;
-	inode->i_bytes = 0;
-	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
-	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
-
-	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
-	REISERFS_I(inode)->i_flags = 0;
-	REISERFS_I(inode)->i_prealloc_block = 0;
-	REISERFS_I(inode)->i_prealloc_count = 0;
-	REISERFS_I(inode)->i_trans_id = 0;
-	REISERFS_I(inode)->i_jl = NULL;
-	REISERFS_I(inode)->i_attrs =
-	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
-	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
-	reiserfs_init_xattr_rwsem(inode);
-
-	/* key to search for correct place for new stat data */
-	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
-		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
-		      TYPE_STAT_DATA, 3 /*key length */ );
-
-	/* find proper place for inserting of stat data */
-	retval = search_item(sb, &key, &path_to_key);
-	if (retval == IO_ERROR) {
-		err = -EIO;
-		goto out_bad_inode;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(&path_to_key);
-		err = -EEXIST;
-		goto out_bad_inode;
-	}
-	if (old_format_only(sb)) {
-		/* i_uid or i_gid is too big to be stored in stat data v3.5 */
-		if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
-			pathrelse(&path_to_key);
-			err = -EINVAL;
-			goto out_bad_inode;
-		}
-		inode2sd_v1(&sd, inode, inode->i_size);
-	} else {
-		inode2sd(&sd, inode, inode->i_size);
-	}
-	/*
-	 * store in in-core inode the key of stat data and version all
-	 * object items will have (directory items will have old offset
-	 * format, other new objects will consist of new items)
-	 */
-	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
-		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-	else
-		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
-	if (old_format_only(sb))
-		set_inode_sd_version(inode, STAT_DATA_V1);
-	else
-		set_inode_sd_version(inode, STAT_DATA_V2);
-
-	/* insert the stat data into the tree */
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (REISERFS_I(dir)->new_packing_locality)
-		th->displace_new_blocks = 1;
-#endif
-	retval =
-	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
-				 (char *)(&sd));
-	if (retval) {
-		err = retval;
-		reiserfs_check_path(&path_to_key);
-		goto out_bad_inode;
-	}
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (!th->displace_new_blocks)
-		REISERFS_I(dir)->new_packing_locality = 0;
-#endif
-	if (S_ISDIR(mode)) {
-		/* insert item with "." and ".." */
-		retval =
-		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
-	}
-
-	if (S_ISLNK(mode)) {
-		/* insert body of symlink */
-		if (!old_format_only(sb))
-			i_size = ROUND_UP(i_size);
-		retval =
-		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
-					 i_size);
-	}
-	if (retval) {
-		err = retval;
-		reiserfs_check_path(&path_to_key);
-		journal_end(th);
-		goto out_inserted_sd;
-	}
-
-	/*
-	 * Mark it private if we're creating the privroot
-	 * or something under it.
-	 */
-	if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root)
-		reiserfs_init_priv_inode(inode);
-
-	if (reiserfs_posixacl(inode->i_sb)) {
-		reiserfs_write_unlock(inode->i_sb);
-		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
-		reiserfs_write_lock(inode->i_sb);
-		if (retval) {
-			err = retval;
-			reiserfs_check_path(&path_to_key);
-			journal_end(th);
-			goto out_inserted_sd;
-		}
-	} else if (inode->i_sb->s_flags & SB_POSIXACL) {
-		reiserfs_warning(inode->i_sb, "jdm-13090",
-				 "ACLs aren't enabled in the fs, "
-				 "but vfs thinks they are!");
-	}
-
-	if (security->name) {
-		reiserfs_write_unlock(inode->i_sb);
-		retval = reiserfs_security_write(th, inode, security);
-		reiserfs_write_lock(inode->i_sb);
-		if (retval) {
-			err = retval;
-			reiserfs_check_path(&path_to_key);
-			retval = journal_end(th);
-			if (retval)
-				err = retval;
-			goto out_inserted_sd;
-		}
-	}
-
-	reiserfs_update_sd(th, inode);
-	reiserfs_check_path(&path_to_key);
-
-	return 0;
-
-out_bad_inode:
-	/* Invalidate the object, nothing was inserted yet */
-	INODE_PKEY(inode)->k_objectid = 0;
-
-	/* Quota change must be inside a transaction for journaling */
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_free_inode(inode);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-out_end_trans:
-	journal_end(th);
-	/*
-	 * Drop can be outside and it needs more credits so it's better
-	 * to have it outside
-	 */
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_drop(inode);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-	inode->i_flags |= S_NOQUOTA;
-	make_bad_inode(inode);
-
-out_inserted_sd:
-	clear_nlink(inode);
-	th->t_trans_id = 0;	/* so the caller can't use this handle later */
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	iput(inode);
-	return err;
-}
-
-/*
- * finds the tail page in the page cache,
- * reads the last block in.
- *
- * On success, page_result is set to a locked, pinned page, and bh_result
- * is set to an up to date buffer for the last block in the file.  returns 0.
- *
- * tail conversion is not done, so bh_result might not be valid for writing
- * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
- * trying to write the block.
- *
- * on failure, nonzero is returned, page_result and bh_result are untouched.
- */
-static int grab_tail_page(struct inode *inode,
-			  struct page **page_result,
-			  struct buffer_head **bh_result)
-{
-
-	/*
-	 * we want the page with the last byte in the file,
-	 * not the page that will hold the next byte for appending
-	 */
-	unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
-	unsigned long pos = 0;
-	unsigned long start = 0;
-	unsigned long blocksize = inode->i_sb->s_blocksize;
-	unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct folio *folio;
-	int error;
-
-	/*
-	 * we know that we are only called with inode->i_size > 0.
-	 * we also know that a file tail can never be as big as a block
-	 * If i_size % blocksize == 0, our file is currently block aligned
-	 * and it won't need converting or zeroing after a truncate.
-	 */
-	if ((offset & (blocksize - 1)) == 0) {
-		return -ENOENT;
-	}
-	folio = __filemap_get_folio(inode->i_mapping, index,
-			FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
-			mapping_gfp_mask(inode->i_mapping));
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
-	/* start within the page of the last block in the file */
-	start = (offset / blocksize) * blocksize;
-
-	error = __block_write_begin(folio, start, offset - start,
-				    reiserfs_get_block_create_0);
-	if (error)
-		goto unlock;
-
-	head = folio_buffers(folio);
-	bh = head;
-	do {
-		if (pos >= start) {
-			break;
-		}
-		bh = bh->b_this_page;
-		pos += blocksize;
-	} while (bh != head);
-
-	if (!buffer_uptodate(bh)) {
-		/*
-		 * note, this should never happen, prepare_write should be
-		 * taking care of this for us.  If the buffer isn't up to
-		 * date, I've screwed up the code to find the buffer, or the
-		 * code to call prepare_write
-		 */
-		reiserfs_error(inode->i_sb, "clm-6000",
-			       "error reading block %lu", bh->b_blocknr);
-		error = -EIO;
-		goto unlock;
-	}
-	*bh_result = bh;
-	*page_result = &folio->page;
-
-	return error;
-
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-	return error;
-}
-
-/*
- * vfs version of truncate file.  Must NOT be called with
- * a transaction already started.
- *
- * some code taken from block_truncate_page
- */
-int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
-{
-	struct reiserfs_transaction_handle th;
-	/* we want the offset for the first byte after the end of the file */
-	unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
-	unsigned blocksize = inode->i_sb->s_blocksize;
-	unsigned length;
-	struct page *page = NULL;
-	int error;
-	struct buffer_head *bh = NULL;
-	int err2;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	if (inode->i_size > 0) {
-		error = grab_tail_page(inode, &page, &bh);
-		if (error) {
-			/*
-			 * -ENOENT means we truncated past the end of the
-			 * file, and get_block_create_0 could not find a
-			 * block to read in, which is ok.
-			 */
-			if (error != -ENOENT)
-				reiserfs_error(inode->i_sb, "clm-6001",
-					       "grab_tail_page failed %d",
-					       error);
-			page = NULL;
-			bh = NULL;
-		}
-	}
-
-	/*
-	 * so, if page != NULL, we have a buffer head for the offset at
-	 * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
-	 * then we have an unformatted node.  Otherwise, we have a direct item,
-	 * and no zeroing is required on disk.  We zero after the truncate,
-	 * because the truncate might pack the item anyway
-	 * (it will unmap bh if it packs).
-	 *
-	 * it is enough to reserve space in transaction for 2 balancings:
-	 * one for "save" link adding and another for the first
-	 * cut_from_item. 1 is for update_sd
-	 */
-	error = journal_begin(&th, inode->i_sb,
-			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
-	if (error)
-		goto out;
-	reiserfs_update_inode_transaction(inode);
-	if (update_timestamps)
-		/*
-		 * we are doing real truncate: if the system crashes
-		 * before the last transaction of truncating gets committed
-		 * - on reboot the file either appears truncated properly
-		 * or not truncated at all
-		 */
-		add_save_link(&th, inode, 1);
-	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
-	error = journal_end(&th);
-	if (error)
-		goto out;
-
-	/* check reiserfs_do_truncate after ending the transaction */
-	if (err2) {
-		error = err2;
-  		goto out;
-	}
-	
-	if (update_timestamps) {
-		error = remove_save_link(inode, 1 /* truncate */);
-		if (error)
-			goto out;
-	}
-
-	if (page) {
-		length = offset & (blocksize - 1);
-		/* if we are not on a block boundary */
-		if (length) {
-			length = blocksize - length;
-			zero_user(page, offset, length);
-			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-				mark_buffer_dirty(bh);
-			}
-		}
-		unlock_page(page);
-		put_page(page);
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return 0;
-out:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return error;
-}
-
-static int map_block_for_writepage(struct inode *inode,
-				   struct buffer_head *bh_result,
-				   unsigned long block)
-{
-	struct reiserfs_transaction_handle th;
-	int fs_gen;
-	struct item_head tmp_ih;
-	struct item_head *ih;
-	struct buffer_head *bh;
-	__le32 *item;
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	int pos_in_item;
-	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
-	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
-	int retval;
-	int use_get_block = 0;
-	int bytes_copied = 0;
-	int copy_size;
-	int trans_running = 0;
-
-	/*
-	 * catch places below that try to log something without
-	 * starting a trans
-	 */
-	th.t_trans_id = 0;
-
-	if (!buffer_uptodate(bh_result)) {
-		return -EIO;
-	}
-
-	kmap(bh_result->b_page);
-start_over:
-	reiserfs_write_lock(inode->i_sb);
-	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
-
-research:
-	retval = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (retval != POSITION_FOUND) {
-		use_get_block = 1;
-		goto out;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	item = tp_item_body(&path);
-	pos_in_item = path.pos_in_item;
-
-	/* we've found an unformatted node */
-	if (indirect_item_found(retval, ih)) {
-		if (bytes_copied > 0) {
-			reiserfs_warning(inode->i_sb, "clm-6002",
-					 "bytes_copied %d", bytes_copied);
-		}
-		if (!get_block_num(item, pos_in_item)) {
-			/* crap, we are writing to a hole */
-			use_get_block = 1;
-			goto out;
-		}
-		set_block_dev_mapped(bh_result,
-				     get_block_num(item, pos_in_item), inode);
-	} else if (is_direct_le_ih(ih)) {
-		char *p;
-		p = page_address(bh_result->b_page);
-		p += (byte_offset - 1) & (PAGE_SIZE - 1);
-		copy_size = ih_item_len(ih) - pos_in_item;
-
-		fs_gen = get_generation(inode->i_sb);
-		copy_item_head(&tmp_ih, ih);
-
-		if (!trans_running) {
-			/* vs-3050 is gone, no need to drop the path */
-			retval = journal_begin(&th, inode->i_sb, jbegin_count);
-			if (retval)
-				goto out;
-			reiserfs_update_inode_transaction(inode);
-			trans_running = 1;
-			if (fs_changed(fs_gen, inode->i_sb)
-			    && item_moved(&tmp_ih, &path)) {
-				reiserfs_restore_prepared_buffer(inode->i_sb,
-								 bh);
-				goto research;
-			}
-		}
-
-		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
-			goto research;
-		}
-
-		memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
-		       copy_size);
-
-		journal_mark_dirty(&th, bh);
-		bytes_copied += copy_size;
-		set_block_dev_mapped(bh_result, 0, inode);
-
-		/* are there still bytes left? */
-		if (bytes_copied < bh_result->b_size &&
-		    (byte_offset + bytes_copied) < inode->i_size) {
-			set_cpu_key_k_offset(&key,
-					     cpu_key_k_offset(&key) +
-					     copy_size);
-			goto research;
-		}
-	} else {
-		reiserfs_warning(inode->i_sb, "clm-6003",
-				 "bad item inode %lu", inode->i_ino);
-		retval = -EIO;
-		goto out;
-	}
-	retval = 0;
-
-out:
-	pathrelse(&path);
-	if (trans_running) {
-		int err = journal_end(&th);
-		if (err)
-			retval = err;
-		trans_running = 0;
-	}
-	reiserfs_write_unlock(inode->i_sb);
-
-	/* this is where we fill in holes in the file. */
-	if (use_get_block) {
-		retval = reiserfs_get_block(inode, block, bh_result,
-					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
-					    | GET_BLOCK_NO_DANGLE);
-		if (!retval) {
-			if (!buffer_mapped(bh_result)
-			    || bh_result->b_blocknr == 0) {
-				/* get_block failed to find a mapped unformatted node. */
-				use_get_block = 0;
-				goto start_over;
-			}
-		}
-	}
-	kunmap(bh_result->b_page);
-
-	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-		/*
-		 * we've copied data from the page into the direct item, so the
-		 * buffer in the page is now clean, mark it to reflect that.
-		 */
-		lock_buffer(bh_result);
-		clear_buffer_dirty(bh_result);
-		unlock_buffer(bh_result);
-	}
-	return retval;
-}
-
-/*
- * mason@suse.com: updated in 2.5.54 to follow the same general io
- * start/recovery path as __block_write_full_folio, along with special
- * code to handle reiserfs tails.
- */
-static int reiserfs_write_folio(struct folio *folio,
-		struct writeback_control *wbc, void *data)
-{
-	struct inode *inode = folio->mapping->host;
-	unsigned long end_index = inode->i_size >> PAGE_SHIFT;
-	int error = 0;
-	unsigned long block;
-	sector_t last_block;
-	struct buffer_head *head, *bh;
-	int partial = 0;
-	int nr = 0;
-	int checked = folio_test_checked(folio);
-	struct reiserfs_transaction_handle th;
-	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_SIZE / s->s_blocksize;
-	th.t_trans_id = 0;
-
-	/* no logging allowed when nonblocking or from PF_MEMALLOC */
-	if (checked && (current->flags & PF_MEMALLOC)) {
-		folio_redirty_for_writepage(wbc, folio);
-		folio_unlock(folio);
-		return 0;
-	}
-
-	/*
-	 * The folio dirty bit is cleared before writepage is called, which
-	 * means we have to tell create_empty_buffers to make dirty buffers
-	 * The folio really should be up to date at this point, so tossing
-	 * in the BH_Uptodate is just a sanity check.
-	 */
-	head = folio_buffers(folio);
-	if (!head)
-		head = create_empty_buffers(folio, s->s_blocksize,
-				     (1 << BH_Dirty) | (1 << BH_Uptodate));
-
-	/*
-	 * last folio in the file, zero out any contents past the
-	 * last byte in the file
-	 */
-	if (folio->index >= end_index) {
-		unsigned last_offset;
-
-		last_offset = inode->i_size & (PAGE_SIZE - 1);
-		/* no file contents in this folio */
-		if (folio->index >= end_index + 1 || !last_offset) {
-			folio_unlock(folio);
-			return 0;
-		}
-		folio_zero_segment(folio, last_offset, folio_size(folio));
-	}
-	bh = head;
-	block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-	/* first map all the buffers, logging any direct items we find */
-	do {
-		if (block > last_block) {
-			/*
-			 * This can happen when the block size is less than
-			 * the folio size.  The corresponding bytes in the folio
-			 * were zero filled above
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-		} else if ((checked || buffer_dirty(bh)) &&
-			   (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
-			/*
-			 * not mapped yet, or it points to a direct item, search
-			 * the btree for the mapping info, and log any direct
-			 * items found
-			 */
-			if ((error = map_block_for_writepage(inode, bh, block))) {
-				goto fail;
-			}
-		}
-		bh = bh->b_this_page;
-		block++;
-	} while (bh != head);
-
-	/*
-	 * we start the transaction after map_block_for_writepage,
-	 * because it can create holes in the file (an unbounded operation).
-	 * starting it here, we can make a reliable estimate for how many
-	 * blocks we're going to log
-	 */
-	if (checked) {
-		folio_clear_checked(folio);
-		reiserfs_write_lock(s);
-		error = journal_begin(&th, s, bh_per_page + 1);
-		if (error) {
-			reiserfs_write_unlock(s);
-			goto fail;
-		}
-		reiserfs_update_inode_transaction(inode);
-	}
-	/* now go through and lock any dirty buffers on the folio */
-	do {
-		get_bh(bh);
-		if (!buffer_mapped(bh))
-			continue;
-		if (buffer_mapped(bh) && bh->b_blocknr == 0)
-			continue;
-
-		if (checked) {
-			reiserfs_prepare_for_journal(s, bh, 1);
-			journal_mark_dirty(&th, bh);
-			continue;
-		}
-		/*
-		 * from this point on, we know the buffer is mapped to a
-		 * real block and not a direct item
-		 */
-		if (wbc->sync_mode != WB_SYNC_NONE) {
-			lock_buffer(bh);
-		} else {
-			if (!trylock_buffer(bh)) {
-				folio_redirty_for_writepage(wbc, folio);
-				continue;
-			}
-		}
-		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write(bh);
-		} else {
-			unlock_buffer(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	if (checked) {
-		error = journal_end(&th);
-		reiserfs_write_unlock(s);
-		if (error)
-			goto fail;
-	}
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-
-	/*
-	 * since any buffer might be the only dirty buffer on the folio,
-	 * the first submit_bh can bring the folio out of writeback.
-	 * be careful with the buffers.
-	 */
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, bh);
-			nr++;
-		}
-		put_bh(bh);
-		bh = next;
-	} while (bh != head);
-
-	error = 0;
-done:
-	if (nr == 0) {
-		/*
-		 * if this folio only had a direct item, it is very possible for
-		 * no io to be required without there being an error.  Or,
-		 * someone else could have locked them and sent them down the
-		 * pipe without locking the folio
-		 */
-		bh = head;
-		do {
-			if (!buffer_uptodate(bh)) {
-				partial = 1;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (!partial)
-			folio_mark_uptodate(folio);
-		folio_end_writeback(folio);
-	}
-	return error;
-
-fail:
-	/*
-	 * catches various errors, we need to make sure any valid dirty blocks
-	 * get to the media.  The folio is currently locked and not marked for
-	 * writeback
-	 */
-	folio_clear_uptodate(folio);
-	bh = head;
-	do {
-		get_bh(bh);
-		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
-			lock_buffer(bh);
-			mark_buffer_async_write(bh);
-		} else {
-			/*
-			 * clear any dirty bits that might have come from
-			 * getting attached to a dirty folio
-			 */
-			clear_buffer_dirty(bh);
-		}
-		bh = bh->b_this_page;
-	} while (bh != head);
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			clear_buffer_dirty(bh);
-			submit_bh(REQ_OP_WRITE, bh);
-			nr++;
-		}
-		put_bh(bh);
-		bh = next;
-	} while (bh != head);
-	goto done;
-}
-
-static int reiserfs_read_folio(struct file *f, struct folio *folio)
-{
-	return block_read_full_folio(folio, reiserfs_get_block);
-}
-
-static int reiserfs_writepages(struct address_space *mapping,
-		struct writeback_control *wbc)
-{
-	reiserfs_wait_on_write_block(mapping->host->i_sb);
-	return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
-}
-
-static void reiserfs_truncate_failed_write(struct inode *inode)
-{
-	truncate_inode_pages(inode->i_mapping, inode->i_size);
-	reiserfs_truncate_file(inode, 0);
-}
-
-static int reiserfs_write_begin(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len,
-				struct folio **foliop, void **fsdata)
-{
-	struct inode *inode;
-	struct folio *folio;
-	pgoff_t index;
-	int ret;
-	int old_ref = 0;
-
- 	inode = mapping->host;
-	index = pos >> PAGE_SHIFT;
-	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
-			mapping_gfp_mask(mapping));
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
-	*foliop = folio;
-
-	reiserfs_wait_on_write_block(inode->i_sb);
-	fix_tail_page_for_writing(&folio->page);
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th;
-		th = (struct reiserfs_transaction_handle *)current->
-		    journal_info;
-		BUG_ON(!th->t_refcount);
-		BUG_ON(!th->t_trans_id);
-		old_ref = th->t_refcount;
-		th->t_refcount++;
-	}
-	ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
-	if (ret && reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly.  If reiserfs_get_block returned an
-		 * error and left a transacstion running, we've got to close
-		 * it, and we've got to free handle if it was a persistent
-		 * transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans.  Otherwise, it was nested
-		 * above.
-		 */
-		if (th->t_refcount > old_ref) {
-			if (old_ref)
-				th->t_refcount--;
-			else {
-				int err;
-				reiserfs_write_lock(inode->i_sb);
-				err = reiserfs_end_persistent_transaction(th);
-				reiserfs_write_unlock(inode->i_sb);
-				if (err)
-					ret = err;
-			}
-		}
-	}
-	if (ret) {
-		folio_unlock(folio);
-		folio_put(folio);
-		/* Truncate allocated blocks */
-		reiserfs_truncate_failed_write(inode);
-	}
-	return ret;
-}
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-{
-	struct inode *inode = page->mapping->host;
-	int ret;
-	int old_ref = 0;
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	reiserfs_wait_on_write_block(inode->i_sb);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	fix_tail_page_for_writing(page);
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th;
-		th = (struct reiserfs_transaction_handle *)current->
-		    journal_info;
-		BUG_ON(!th->t_refcount);
-		BUG_ON(!th->t_trans_id);
-		old_ref = th->t_refcount;
-		th->t_refcount++;
-	}
-
-	ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
-	if (ret && reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly.  If reiserfs_get_block returned an
-		 * error and left a transacstion running, we've got to close
-		 * it, and we've got to free handle if it was a persistent
-		 * transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans.  Otherwise, it was nested
-		 * above.
-		 */
-		if (th->t_refcount > old_ref) {
-			if (old_ref)
-				th->t_refcount--;
-			else {
-				int err;
-				reiserfs_write_lock(inode->i_sb);
-				err = reiserfs_end_persistent_transaction(th);
-				reiserfs_write_unlock(inode->i_sb);
-				if (err)
-					ret = err;
-			}
-		}
-	}
-	return ret;
-
-}
-
-static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
-{
-	return generic_block_bmap(as, block, reiserfs_bmap);
-}
-
-static int reiserfs_write_end(struct file *file, struct address_space *mapping,
-			      loff_t pos, unsigned len, unsigned copied,
-			      struct folio *folio, void *fsdata)
-{
-	struct inode *inode = folio->mapping->host;
-	int ret = 0;
-	int update_sd = 0;
-	struct reiserfs_transaction_handle *th;
-	unsigned start;
-	bool locked = false;
-
-	reiserfs_wait_on_write_block(inode->i_sb);
-	if (reiserfs_transaction_running(inode->i_sb))
-		th = current->journal_info;
-	else
-		th = NULL;
-
-	start = pos & (PAGE_SIZE - 1);
-	if (unlikely(copied < len)) {
-		if (!folio_test_uptodate(folio))
-			copied = 0;
-
-		folio_zero_new_buffers(folio, start + copied, start + len);
-	}
-	flush_dcache_folio(folio);
-
-	reiserfs_commit_page(inode, &folio->page, start, start + copied);
-
-	/*
-	 * generic_commit_write does this for us, but does not update the
-	 * transaction tracking stuff when the size changes.  So, we have
-	 * to do the i_size updates here.
-	 */
-	if (pos + copied > inode->i_size) {
-		struct reiserfs_transaction_handle myth;
-		reiserfs_write_lock(inode->i_sb);
-		locked = true;
-		/*
-		 * If the file have grown beyond the border where it
-		 * can have a tail, unmark it as needing a tail
-		 * packing
-		 */
-		if ((have_large_tails(inode->i_sb)
-		     && inode->i_size > i_block_size(inode) * 4)
-		    || (have_small_tails(inode->i_sb)
-			&& inode->i_size > i_block_size(inode)))
-			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret)
-			goto journal_error;
-
-		reiserfs_update_inode_transaction(inode);
-		inode->i_size = pos + copied;
-		/*
-		 * this will just nest into our transaction.  It's important
-		 * to use mark_inode_dirty so the inode gets pushed around on
-		 * the dirty lists, and so that O_SYNC works as expected
-		 */
-		mark_inode_dirty(inode);
-		reiserfs_update_sd(&myth, inode);
-		update_sd = 1;
-		ret = journal_end(&myth);
-		if (ret)
-			goto journal_error;
-	}
-	if (th) {
-		if (!locked) {
-			reiserfs_write_lock(inode->i_sb);
-			locked = true;
-		}
-		if (!update_sd)
-			mark_inode_dirty(inode);
-		ret = reiserfs_end_persistent_transaction(th);
-		if (ret)
-			goto out;
-	}
-
-out:
-	if (locked)
-		reiserfs_write_unlock(inode->i_sb);
-	folio_unlock(folio);
-	folio_put(folio);
-
-	if (pos + len > inode->i_size)
-		reiserfs_truncate_failed_write(inode);
-
-	return ret == 0 ? copied : ret;
-
-journal_error:
-	reiserfs_write_unlock(inode->i_sb);
-	locked = false;
-	if (th) {
-		if (!update_sd)
-			reiserfs_update_sd(th, inode);
-		ret = reiserfs_end_persistent_transaction(th);
-	}
-	goto out;
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
-	int ret = 0;
-	int update_sd = 0;
-	struct reiserfs_transaction_handle *th = NULL;
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	reiserfs_wait_on_write_block(inode->i_sb);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		th = current->journal_info;
-	}
-	reiserfs_commit_page(inode, page, from, to);
-
-	/*
-	 * generic_commit_write does this for us, but does not update the
-	 * transaction tracking stuff when the size changes.  So, we have
-	 * to do the i_size updates here.
-	 */
-	if (pos > inode->i_size) {
-		struct reiserfs_transaction_handle myth;
-		/*
-		 * If the file have grown beyond the border where it
-		 * can have a tail, unmark it as needing a tail
-		 * packing
-		 */
-		if ((have_large_tails(inode->i_sb)
-		     && inode->i_size > i_block_size(inode) * 4)
-		    || (have_small_tails(inode->i_sb)
-			&& inode->i_size > i_block_size(inode)))
-			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret)
-			goto journal_error;
-
-		reiserfs_update_inode_transaction(inode);
-		inode->i_size = pos;
-		/*
-		 * this will just nest into our transaction.  It's important
-		 * to use mark_inode_dirty so the inode gets pushed around
-		 * on the dirty lists, and so that O_SYNC works as expected
-		 */
-		mark_inode_dirty(inode);
-		reiserfs_update_sd(&myth, inode);
-		update_sd = 1;
-		ret = journal_end(&myth);
-		if (ret)
-			goto journal_error;
-	}
-	if (th) {
-		if (!update_sd)
-			mark_inode_dirty(inode);
-		ret = reiserfs_end_persistent_transaction(th);
-		if (ret)
-			goto out;
-	}
-
-out:
-	return ret;
-
-journal_error:
-	if (th) {
-		if (!update_sd)
-			reiserfs_update_sd(th, inode);
-		ret = reiserfs_end_persistent_transaction(th);
-	}
-
-	return ret;
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
-{
-	if (reiserfs_attrs(inode->i_sb)) {
-		if (sd_attrs & REISERFS_SYNC_FL)
-			inode->i_flags |= S_SYNC;
-		else
-			inode->i_flags &= ~S_SYNC;
-		if (sd_attrs & REISERFS_IMMUTABLE_FL)
-			inode->i_flags |= S_IMMUTABLE;
-		else
-			inode->i_flags &= ~S_IMMUTABLE;
-		if (sd_attrs & REISERFS_APPEND_FL)
-			inode->i_flags |= S_APPEND;
-		else
-			inode->i_flags &= ~S_APPEND;
-		if (sd_attrs & REISERFS_NOATIME_FL)
-			inode->i_flags |= S_NOATIME;
-		else
-			inode->i_flags &= ~S_NOATIME;
-		if (sd_attrs & REISERFS_NOTAIL_FL)
-			REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		else
-			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
-	}
-}
-
-/*
- * decide if this buffer needs to stay around for data logging or ordered
- * write purposes
- */
-static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
-{
-	int ret = 1;
-	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-
-	lock_buffer(bh);
-	spin_lock(&j->j_dirty_buffers_lock);
-	if (!buffer_mapped(bh)) {
-		goto free_jh;
-	}
-	/*
-	 * the page is locked, and the only places that log a data buffer
-	 * also lock the page.
-	 */
-	if (reiserfs_file_data_log(inode)) {
-		/*
-		 * very conservative, leave the buffer pinned if
-		 * anyone might need it.
-		 */
-		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-			ret = 0;
-		}
-	} else  if (buffer_dirty(bh)) {
-		struct reiserfs_journal_list *jl;
-		struct reiserfs_jh *jh = bh->b_private;
-
-		/*
-		 * why is this safe?
-		 * reiserfs_setattr updates i_size in the on disk
-		 * stat data before allowing vmtruncate to be called.
-		 *
-		 * If buffer was put onto the ordered list for this
-		 * transaction, we know for sure either this transaction
-		 * or an older one already has updated i_size on disk,
-		 * and this ordered data won't be referenced in the file
-		 * if we crash.
-		 *
-		 * if the buffer was put onto the ordered list for an older
-		 * transaction, we need to leave it around
-		 */
-		if (jh && (jl = jh->jl)
-		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
-			ret = 0;
-	}
-free_jh:
-	if (ret && bh->b_private) {
-		reiserfs_free_jh(bh);
-	}
-	spin_unlock(&j->j_dirty_buffers_lock);
-	unlock_buffer(bh);
-	return ret;
-}
-
-/* clm -- taken from fs/buffer.c:block_invalidate_folio */
-static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
-				    size_t length)
-{
-	struct buffer_head *head, *bh, *next;
-	struct inode *inode = folio->mapping->host;
-	unsigned int curr_off = 0;
-	unsigned int stop = offset + length;
-	int partial_page = (offset || length < folio_size(folio));
-	int ret = 1;
-
-	BUG_ON(!folio_test_locked(folio));
-
-	if (!partial_page)
-		folio_clear_checked(folio);
-
-	head = folio_buffers(folio);
-	if (!head)
-		goto out;
-
-	bh = head;
-	do {
-		unsigned int next_off = curr_off + bh->b_size;
-		next = bh->b_this_page;
-
-		if (next_off > stop)
-			goto out;
-
-		/*
-		 * is this block fully invalidated?
-		 */
-		if (offset <= curr_off) {
-			if (invalidate_folio_can_drop(inode, bh))
-				reiserfs_unmap_buffer(bh);
-			else
-				ret = 0;
-		}
-		curr_off = next_off;
-		bh = next;
-	} while (bh != head);
-
-	/*
-	 * We release buffers only if the entire page is being invalidated.
-	 * The get_block cached value has been unconditionally invalidated,
-	 * so real IO is not possible anymore.
-	 */
-	if (!partial_page && ret) {
-		ret = filemap_release_folio(folio, 0);
-		/* maybe should BUG_ON(!ret); - neilb */
-	}
-out:
-	return;
-}
-
-static bool reiserfs_dirty_folio(struct address_space *mapping,
-		struct folio *folio)
-{
-	if (reiserfs_file_data_log(mapping->host)) {
-		folio_set_checked(folio);
-		return filemap_dirty_folio(mapping, folio);
-	}
-	return block_dirty_folio(mapping, folio);
-}
-
-/*
- * Returns true if the folio's buffers were dropped.  The folio is locked.
- *
- * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
- * in the buffers at folio_buffers(folio).
- *
- * even in -o notail mode, we can't be sure an old mount without -o notail
- * didn't create files with tails.
- */
-static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
-{
-	struct inode *inode = folio->mapping->host;
-	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-	struct buffer_head *head;
-	struct buffer_head *bh;
-	bool ret = true;
-
-	WARN_ON(folio_test_checked(folio));
-	spin_lock(&j->j_dirty_buffers_lock);
-	head = folio_buffers(folio);
-	bh = head;
-	do {
-		if (bh->b_private) {
-			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
-				reiserfs_free_jh(bh);
-			} else {
-				ret = false;
-				break;
-			}
-		}
-		bh = bh->b_this_page;
-	} while (bh != head);
-	if (ret)
-		ret = try_to_free_buffers(folio);
-	spin_unlock(&j->j_dirty_buffers_lock);
-	return ret;
-}
-
-/*
- * We thank Mingming Cao for helping us understand in great detail what
- * to do in this section of the code.
- */
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-
-	ret = blockdev_direct_IO(iocb, inode, iter,
-				 reiserfs_get_blocks_direct_io);
-
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again.
-	 */
-	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-		loff_t isize = i_size_read(inode);
-		loff_t end = iocb->ki_pos + count;
-
-		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
-			truncate_setsize(inode, isize);
-			reiserfs_vfs_truncate_file(inode);
-		}
-	}
-
-	return ret;
-}
-
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct iattr *attr)
-{
-	struct inode *inode = d_inode(dentry);
-	unsigned int ia_valid;
-	int error;
-
-	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
-	if (error)
-		return error;
-
-	/* must be turned off for recursive notify_change calls */
-	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
-
-	if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
-		error = dquot_initialize(inode);
-		if (error)
-			return error;
-	}
-	reiserfs_write_lock(inode->i_sb);
-	if (attr->ia_valid & ATTR_SIZE) {
-		/*
-		 * version 2 items will be caught by the s_maxbytes check
-		 * done for us in vmtruncate
-		 */
-		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
-		    attr->ia_size > MAX_NON_LFS) {
-			reiserfs_write_unlock(inode->i_sb);
-			error = -EFBIG;
-			goto out;
-		}
-
-		inode_dio_wait(inode);
-
-		/* fill in hole pointers in the expanding truncate case. */
-		if (attr->ia_size > inode->i_size) {
-			loff_t pos = attr->ia_size;
-
-			if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
-				pos++;
-			error = generic_cont_expand_simple(inode, pos);
-			if (REISERFS_I(inode)->i_prealloc_count > 0) {
-				int err;
-				struct reiserfs_transaction_handle th;
-				/* we're changing at most 2 bitmaps, inode + super */
-				err = journal_begin(&th, inode->i_sb, 4);
-				if (!err) {
-					reiserfs_discard_prealloc(&th, inode);
-					err = journal_end(&th);
-				}
-				if (err)
-					error = err;
-			}
-			if (error) {
-				reiserfs_write_unlock(inode->i_sb);
-				goto out;
-			}
-			/*
-			 * file size is changed, ctime and mtime are
-			 * to be updated
-			 */
-			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
-		}
-	}
-	reiserfs_write_unlock(inode->i_sb);
-
-	if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
-	     ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
-	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
-		/* stat data of format v3.5 has 16 bit uid and gid */
-		error = -EINVAL;
-		goto out;
-	}
-
-	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
-	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
-		struct reiserfs_transaction_handle th;
-		int jbegin_count =
-		    2 *
-		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
-		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
-		    2;
-
-		error = reiserfs_chown_xattrs(inode, attr);
-
-		if (error)
-			return error;
-
-		/*
-		 * (user+group)*(old+new) structure - we count quota
-		 * info and , inode write (sb, inode)
-		 */
-		reiserfs_write_lock(inode->i_sb);
-		error = journal_begin(&th, inode->i_sb, jbegin_count);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error)
-			goto out;
-		error = dquot_transfer(&nop_mnt_idmap, inode, attr);
-		reiserfs_write_lock(inode->i_sb);
-		if (error) {
-			journal_end(&th);
-			reiserfs_write_unlock(inode->i_sb);
-			goto out;
-		}
-
-		/*
-		 * Update corresponding info in inode so that everything
-		 * is in one transaction
-		 */
-		if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
-		mark_inode_dirty(inode);
-		error = journal_end(&th);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error)
-			goto out;
-	}
-
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		error = inode_newsize_ok(inode, attr->ia_size);
-		if (!error) {
-			/*
-			 * Could race against reiserfs_file_release
-			 * if called from NFS, so take tailpack mutex.
-			 */
-			mutex_lock(&REISERFS_I(inode)->tailpack);
-			truncate_setsize(inode, attr->ia_size);
-			reiserfs_truncate_file(inode, 1);
-			mutex_unlock(&REISERFS_I(inode)->tailpack);
-		}
-	}
-
-	if (!error) {
-		setattr_copy(&nop_mnt_idmap, inode, attr);
-		mark_inode_dirty(inode);
-	}
-
-	if (!error && reiserfs_posixacl(inode->i_sb)) {
-		if (attr->ia_valid & ATTR_MODE)
-			error = reiserfs_acl_chmod(dentry);
-	}
-
-out:
-	return error;
-}
-
-const struct address_space_operations reiserfs_address_space_operations = {
-	.writepages = reiserfs_writepages,
-	.read_folio = reiserfs_read_folio,
-	.readahead = reiserfs_readahead,
-	.release_folio = reiserfs_release_folio,
-	.invalidate_folio = reiserfs_invalidate_folio,
-	.write_begin = reiserfs_write_begin,
-	.write_end = reiserfs_write_end,
-	.bmap = reiserfs_aop_bmap,
-	.direct_IO = reiserfs_direct_IO,
-	.dirty_folio = reiserfs_dirty_folio,
-	.migrate_folio = buffer_migrate_folio,
-};
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
deleted file mode 100644
index dd33f8cc6eda..000000000000
--- a/fs/reiserfs/ioctl.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include "reiserfs.h"
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/compat.h>
-#include <linux/fileattr.h>
-
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-
-	if (!reiserfs_attrs(inode->i_sb))
-		return -ENOTTY;
-
-	fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
-
-	return 0;
-}
-
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
-			  struct dentry *dentry, struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-	unsigned int flags = fa->flags;
-	int err;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	err = -ENOTTY;
-	if (!reiserfs_attrs(inode->i_sb))
-		goto unlock;
-
-	err = -EOPNOTSUPP;
-	if (fileattr_has_fsx(fa))
-		goto unlock;
-
-	/*
-	 * Is it quota file? Do not allow user to mess with it
-	 */
-	err = -EPERM;
-	if (IS_NOQUOTA(inode))
-		goto unlock;
-
-	if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
-		err = reiserfs_unpack(inode);
-		if (err)
-			goto unlock;
-	}
-	sd_attrs_to_i_attrs(flags, inode);
-	REISERFS_I(inode)->i_attrs = flags;
-	inode_set_ctime_current(inode);
-	mark_inode_dirty(inode);
-	err = 0;
-unlock:
-	reiserfs_write_unlock(inode->i_sb);
-
-	return err;
-}
-
-/*
- * reiserfs_ioctl - handler for ioctl for inode
- * supported commands:
- *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
- *                           and prevent packing file (argument arg has t
- *			      be non-zero)
- *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
- *  3) That's all for a while ...
- */
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = file_inode(filp);
-	int err = 0;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	switch (cmd) {
-	case REISERFS_IOC_UNPACK:
-		if (S_ISREG(inode->i_mode)) {
-			if (arg)
-				err = reiserfs_unpack(inode);
-		} else
-			err = -ENOTTY;
-		break;
-		/*
-		 * following two cases are taken from fs/ext2/ioctl.c by Remy
-		 * Card (card@masi.ibp.fr)
-		 */
-	case REISERFS_IOC_GETVERSION:
-		err = put_user(inode->i_generation, (int __user *)arg);
-		break;
-	case REISERFS_IOC_SETVERSION:
-		if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) {
-			err = -EPERM;
-			break;
-		}
-		err = mnt_want_write_file(filp);
-		if (err)
-			break;
-		if (get_user(inode->i_generation, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setversion_out;
-		}
-		inode_set_ctime_current(inode);
-		mark_inode_dirty(inode);
-setversion_out:
-		mnt_drop_write_file(filp);
-		break;
-	default:
-		err = -ENOTTY;
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
-				unsigned long arg)
-{
-	/*
-	 * These are just misnamed, they actually
-	 * get/put from/to user an int
-	 */
-	switch (cmd) {
-	case REISERFS_IOC32_UNPACK:
-		cmd = REISERFS_IOC_UNPACK;
-		break;
-	case REISERFS_IOC32_GETVERSION:
-		cmd = REISERFS_IOC_GETVERSION;
-		break;
-	case REISERFS_IOC32_SETVERSION:
-		cmd = REISERFS_IOC_SETVERSION;
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-/*
- * reiserfs_unpack
- * Function try to convert tail from direct item into indirect.
- * It set up nopack attribute in the REISERFS_I(inode)->nopack
- */
-int reiserfs_unpack(struct inode *inode)
-{
-	int retval = 0;
-	int index;
-	struct page *page;
-	struct address_space *mapping;
-	unsigned long write_from;
-	unsigned long blocksize = inode->i_sb->s_blocksize;
-
-	if (inode->i_size == 0) {
-		REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		return 0;
-	}
-	/* ioctl already done */
-	if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
-		return 0;
-	}
-
-	/* we need to make sure nobody is changing the file size beneath us */
-	{
-		int depth = reiserfs_write_unlock_nested(inode->i_sb);
-
-		inode_lock(inode);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-	}
-
-	reiserfs_write_lock(inode->i_sb);
-
-	write_from = inode->i_size & (blocksize - 1);
-	/* if we are on a block boundary, we are already unpacked.  */
-	if (write_from == 0) {
-		REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		goto out;
-	}
-
-	/*
-	 * we unpack by finding the page with the tail, and calling
-	 * __reiserfs_write_begin on that page.  This will force a
-	 * reiserfs_get_block to unpack the tail for us.
-	 */
-	index = inode->i_size >> PAGE_SHIFT;
-	mapping = inode->i_mapping;
-	page = grab_cache_page(mapping, index);
-	retval = -ENOMEM;
-	if (!page) {
-		goto out;
-	}
-	retval = __reiserfs_write_begin(page, write_from, 0);
-	if (retval)
-		goto out_unlock;
-
-	/* conversion can change page contents, must flush */
-	flush_dcache_page(page);
-	retval = reiserfs_commit_write(NULL, page, write_from, write_from);
-	REISERFS_I(inode)->i_flags |= i_nopack_mask;
-
-out_unlock:
-	unlock_page(page);
-	put_page(page);
-
-out:
-	inode_unlock(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	return retval;
-}
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
deleted file mode 100644
index 5011c10287c6..000000000000
--- a/fs/reiserfs/item_ops.c
+++ /dev/null
@@ -1,737 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-
-/*
- * this contains item handlers for old item types: sd, direct,
- * indirect, directory
- */
-
-/*
- * and where are the comments? how about saying where we can find an
- * explanation of each item handler method? -Hans
- */
-
-/* stat data functions */
-static int sd_bytes_number(struct item_head *ih, int block_size)
-{
-	return 0;
-}
-
-static void sd_decrement_key(struct cpu_key *key)
-{
-	key->on_disk_key.k_objectid--;
-	set_cpu_key_k_type(key, TYPE_ANY);
-	set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
-}
-
-static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
-{
-	return 0;
-}
-
-static void sd_print_item(struct item_head *ih, char *item)
-{
-	printk("\tmode | size | nlinks | first direct | mtime\n");
-	if (stat_data_v1(ih)) {
-		struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
-
-		printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
-		       sd_v1_size(sd), sd_v1_nlink(sd),
-		       sd_v1_first_direct_byte(sd),
-		       sd_v1_mtime(sd));
-	} else {
-		struct stat_data *sd = (struct stat_data *)item;
-
-		printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
-		       (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
-		       sd_v2_rdev(sd), sd_v2_mtime(sd));
-	}
-}
-
-static void sd_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int sd_create_vi(struct virtual_node *vn,
-			struct virtual_item *vi,
-			int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_STAT_DATA;
-	return 0;
-}
-
-static int sd_check_left(struct virtual_item *vi, int free,
-			 int start_skip, int end_skip)
-{
-	BUG_ON(start_skip || end_skip);
-	return -1;
-}
-
-static int sd_check_right(struct virtual_item *vi, int free)
-{
-	return -1;
-}
-
-static int sd_part_size(struct virtual_item *vi, int first, int count)
-{
-	BUG_ON(count);
-	return 0;
-}
-
-static int sd_unit_num(struct virtual_item *vi)
-{
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void sd_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16100",
-			 "STATDATA, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations stat_data_ops = {
-	.bytes_number = sd_bytes_number,
-	.decrement_key = sd_decrement_key,
-	.is_left_mergeable = sd_is_left_mergeable,
-	.print_item = sd_print_item,
-	.check_item = sd_check_item,
-
-	.create_vi = sd_create_vi,
-	.check_left = sd_check_left,
-	.check_right = sd_check_right,
-	.part_size = sd_part_size,
-	.unit_num = sd_unit_num,
-	.print_vi = sd_print_vi
-};
-
-/* direct item functions */
-static int direct_bytes_number(struct item_head *ih, int block_size)
-{
-	return ih_item_len(ih);
-}
-
-/* FIXME: this should probably switch to indirect as well */
-static void direct_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direct_is_left_mergeable(struct reiserfs_key *key,
-				    unsigned long bsize)
-{
-	int version = le_key_version(key);
-	return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
-}
-
-static void direct_print_item(struct item_head *ih, char *item)
-{
-	int j = 0;
-
-/*    return; */
-	printk("\"");
-	while (j < ih_item_len(ih))
-		printk("%c", item[j++]);
-	printk("\"\n");
-}
-
-static void direct_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int direct_create_vi(struct virtual_node *vn,
-			    struct virtual_item *vi,
-			    int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_DIRECT;
-	return 0;
-}
-
-static int direct_check_left(struct virtual_item *vi, int free,
-			     int start_skip, int end_skip)
-{
-	int bytes;
-
-	bytes = free - free % 8;
-	return bytes ? : -1;
-}
-
-static int direct_check_right(struct virtual_item *vi, int free)
-{
-	return direct_check_left(vi, free, 0, 0);
-}
-
-static int direct_part_size(struct virtual_item *vi, int first, int count)
-{
-	return count;
-}
-
-static int direct_unit_num(struct virtual_item *vi)
-{
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void direct_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16101",
-			 "DIRECT, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations direct_ops = {
-	.bytes_number = direct_bytes_number,
-	.decrement_key = direct_decrement_key,
-	.is_left_mergeable = direct_is_left_mergeable,
-	.print_item = direct_print_item,
-	.check_item = direct_check_item,
-
-	.create_vi = direct_create_vi,
-	.check_left = direct_check_left,
-	.check_right = direct_check_right,
-	.part_size = direct_part_size,
-	.unit_num = direct_unit_num,
-	.print_vi = direct_print_vi
-};
-
-/* indirect item functions */
-static int indirect_bytes_number(struct item_head *ih, int block_size)
-{
-	return ih_item_len(ih) / UNFM_P_SIZE * block_size;
-}
-
-/* decrease offset, if it becomes 0, change type to stat data */
-static void indirect_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-/* if it is not first item of the body, then it is mergeable */
-static int indirect_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	int version = le_key_version(key);
-	return (le_key_k_offset(version, key) != 1);
-}
-
-/* printing of indirect item */
-static void start_new_sequence(__u32 * start, int *len, __u32 new)
-{
-	*start = new;
-	*len = 1;
-}
-
-static int sequence_finished(__u32 start, int *len, __u32 new)
-{
-	if (start == INT_MAX)
-		return 1;
-
-	if (start == 0 && new == 0) {
-		(*len)++;
-		return 0;
-	}
-	if (start != 0 && (start + *len) == new) {
-		(*len)++;
-		return 0;
-	}
-	return 1;
-}
-
-static void print_sequence(__u32 start, int len)
-{
-	if (start == INT_MAX)
-		return;
-
-	if (len == 1)
-		printk(" %d", start);
-	else
-		printk(" %d(%d)", start, len);
-}
-
-static void indirect_print_item(struct item_head *ih, char *item)
-{
-	int j;
-	__le32 *unp;
-	__u32 prev = INT_MAX;
-	int num = 0;
-
-	unp = (__le32 *) item;
-
-	if (ih_item_len(ih) % UNFM_P_SIZE)
-		reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
-
-	printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
-	for (j = 0; j < I_UNFM_NUM(ih); j++) {
-		if (sequence_finished(prev, &num, get_block_num(unp, j))) {
-			print_sequence(prev, num);
-			start_new_sequence(&prev, &num, get_block_num(unp, j));
-		}
-	}
-	print_sequence(prev, num);
-	printk("]\n");
-}
-
-static void indirect_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int indirect_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_INDIRECT;
-	return 0;
-}
-
-static int indirect_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	int bytes;
-
-	bytes = free - free % UNFM_P_SIZE;
-	return bytes ? : -1;
-}
-
-static int indirect_check_right(struct virtual_item *vi, int free)
-{
-	return indirect_check_left(vi, free, 0, 0);
-}
-
-/*
- * return size in bytes of 'units' units. If first == 0 - calculate
- * from the head (left), otherwise - from tail (right)
- */
-static int indirect_part_size(struct virtual_item *vi, int first, int units)
-{
-	/* unit of indirect item is byte (yet) */
-	return units;
-}
-
-static int indirect_unit_num(struct virtual_item *vi)
-{
-	/* unit of indirect item is byte (yet) */
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void indirect_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16103",
-			 "INDIRECT, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations indirect_ops = {
-	.bytes_number = indirect_bytes_number,
-	.decrement_key = indirect_decrement_key,
-	.is_left_mergeable = indirect_is_left_mergeable,
-	.print_item = indirect_print_item,
-	.check_item = indirect_check_item,
-
-	.create_vi = indirect_create_vi,
-	.check_left = indirect_check_left,
-	.check_right = indirect_check_right,
-	.part_size = indirect_part_size,
-	.unit_num = indirect_unit_num,
-	.print_vi = indirect_print_vi
-};
-
-/* direntry functions */
-static int direntry_bytes_number(struct item_head *ih, int block_size)
-{
-	reiserfs_warning(NULL, "vs-16090",
-			 "bytes number is asked for direntry");
-	return 0;
-}
-
-static void direntry_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direntry_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
-		return 0;
-	return 1;
-
-}
-
-static void direntry_print_item(struct item_head *ih, char *item)
-{
-	int i;
-	int namelen;
-	struct reiserfs_de_head *deh;
-	char *name;
-	static char namebuf[80];
-
-	printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
-	       "Key of pointed object", "Hash", "Gen number", "Status");
-
-	deh = (struct reiserfs_de_head *)item;
-
-	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
-		namelen =
-		    (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
-		    deh_location(deh);
-		name = item + deh_location(deh);
-		if (name[namelen - 1] == 0)
-			namelen = strlen(name);
-
-		scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"",
-			  (int)sizeof(namebuf)-3, name);
-
-		printk("%d:  %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
-		       i, namebuf,
-		       deh_dir_id(deh), deh_objectid(deh),
-		       GET_HASH_VALUE(deh_offset(deh)),
-		       GET_GENERATION_NUMBER((deh_offset(deh))),
-		       (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
-	}
-}
-
-static void direntry_check_item(struct item_head *ih, char *item)
-{
-	int i;
-	struct reiserfs_de_head *deh;
-
-	/* unused */
-	deh = (struct reiserfs_de_head *)item;
-	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
-		;
-	}
-}
-
-#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
-
-/*
- * function returns old entry number in directory item in real node
- * using new entry number in virtual item in virtual node
- */
-static inline int old_entry_num(int is_affected, int virtual_entry_num,
-				int pos_in_item, int mode)
-{
-	if (mode == M_INSERT || mode == M_DELETE)
-		return virtual_entry_num;
-
-	if (!is_affected)
-		/* cut or paste is applied to another item */
-		return virtual_entry_num;
-
-	if (virtual_entry_num < pos_in_item)
-		return virtual_entry_num;
-
-	if (mode == M_CUT)
-		return virtual_entry_num + 1;
-
-	RFALSE(mode != M_PASTE || virtual_entry_num == 0,
-	       "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
-	       mode);
-
-	return virtual_entry_num - 1;
-}
-
-/*
- * Create an array of sizes of directory entries for virtual
- * item. Return space used by an item. FIXME: no control over
- * consuming of space used by this item handler
- */
-static int direntry_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-	int i, j;
-	int size = sizeof(struct direntry_uarea);
-	struct reiserfs_de_head *deh;
-
-	vi->vi_index = TYPE_DIRENTRY;
-
-	BUG_ON(!(vi->vi_ih) || !vi->vi_item);
-
-	dir_u->flags = 0;
-	if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
-		dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
-
-	deh = (struct reiserfs_de_head *)(vi->vi_item);
-
-	/* virtual directory item have this amount of entry after */
-	dir_u->entry_count = ih_entry_count(vi->vi_ih) +
-	    ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
-			      (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
-
-	for (i = 0; i < dir_u->entry_count; i++) {
-		j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
-				  vn->vn_mode);
-		dir_u->entry_sizes[i] =
-		    (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
-		    deh_location(&deh[j]) + DEH_SIZE;
-	}
-
-	size += (dir_u->entry_count * sizeof(short));
-
-	/* set size of pasted entry */
-	if (is_affected && vn->vn_mode == M_PASTE)
-		dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
-
-#ifdef CONFIG_REISERFS_CHECK
-	/* compare total size of entries with item length */
-	{
-		int k, l;
-
-		l = 0;
-		for (k = 0; k < dir_u->entry_count; k++)
-			l += dir_u->entry_sizes[k];
-
-		if (l + IH_SIZE != vi->vi_item_len +
-		    ((is_affected
-		      && (vn->vn_mode == M_PASTE
-			  || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
-			reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
-				       "insert_size==%d), invalid length of "
-				       "directory item",
-				       vn->vn_mode, insert_size);
-		}
-	}
-#endif
-
-	return size;
-
-}
-
-/*
- * return number of entries which may fit into specified amount of
- * free space, or -1 if free space is not enough even for 1 entry
- */
-static int direntry_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	int i;
-	int entries = 0;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
-		/* i-th entry doesn't fit into the remaining free space */
-		if (dir_u->entry_sizes[i] > free)
-			break;
-
-		free -= dir_u->entry_sizes[i];
-		entries++;
-	}
-
-	if (entries == dir_u->entry_count) {
-		reiserfs_panic(NULL, "item_ops-1",
-			       "free space %d, entry_count %d", free,
-			       dir_u->entry_count);
-	}
-
-	/* "." and ".." can not be separated from each other */
-	if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
-	    && entries < 2)
-		entries = 0;
-
-	return entries ? : -1;
-}
-
-static int direntry_check_right(struct virtual_item *vi, int free)
-{
-	int i;
-	int entries = 0;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	for (i = dir_u->entry_count - 1; i >= 0; i--) {
-		/* i-th entry doesn't fit into the remaining free space */
-		if (dir_u->entry_sizes[i] > free)
-			break;
-
-		free -= dir_u->entry_sizes[i];
-		entries++;
-	}
-	BUG_ON(entries == dir_u->entry_count);
-
-	/* "." and ".." can not be separated from each other */
-	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
-	    && entries > dir_u->entry_count - 2)
-		entries = dir_u->entry_count - 2;
-
-	return entries ? : -1;
-}
-
-/* sum of entry sizes between from-th and to-th entries including both edges */
-static int direntry_part_size(struct virtual_item *vi, int first, int count)
-{
-	int i, retval;
-	int from, to;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	retval = 0;
-	if (first == 0)
-		from = 0;
-	else
-		from = dir_u->entry_count - count;
-	to = from + count - 1;
-
-	for (i = from; i <= to; i++)
-		retval += dir_u->entry_sizes[i];
-
-	return retval;
-}
-
-static int direntry_unit_num(struct virtual_item *vi)
-{
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	return dir_u->entry_count;
-}
-
-static void direntry_print_vi(struct virtual_item *vi)
-{
-	int i;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	reiserfs_warning(NULL, "reiserfs-16104",
-			 "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
-			 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
-	printk("%d entries: ", dir_u->entry_count);
-	for (i = 0; i < dir_u->entry_count; i++)
-		printk("%d ", dir_u->entry_sizes[i]);
-	printk("\n");
-}
-
-static struct item_operations direntry_ops = {
-	.bytes_number = direntry_bytes_number,
-	.decrement_key = direntry_decrement_key,
-	.is_left_mergeable = direntry_is_left_mergeable,
-	.print_item = direntry_print_item,
-	.check_item = direntry_check_item,
-
-	.create_vi = direntry_create_vi,
-	.check_left = direntry_check_left,
-	.check_right = direntry_check_right,
-	.part_size = direntry_part_size,
-	.unit_num = direntry_unit_num,
-	.print_vi = direntry_print_vi
-};
-
-/* Error catching functions to catch errors caused by incorrect item types. */
-static int errcatch_bytes_number(struct item_head *ih, int block_size)
-{
-	reiserfs_warning(NULL, "green-16001",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_decrement_key(struct cpu_key *key)
-{
-	reiserfs_warning(NULL, "green-16002",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	reiserfs_warning(NULL, "green-16003",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_print_item(struct item_head *ih, char *item)
-{
-	reiserfs_warning(NULL, "green-16004",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static void errcatch_check_item(struct item_head *ih, char *item)
-{
-	reiserfs_warning(NULL, "green-16005",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	reiserfs_warning(NULL, "green-16006",
-			 "Invalid item type observed, run fsck ASAP");
-	/*
-	 * We might return -1 here as well, but it won't help as
-	 * create_virtual_node() from where this operation is called
-	 * from is of return type void.
-	 */
-	return 0;
-}
-
-static int errcatch_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	reiserfs_warning(NULL, "green-16007",
-			 "Invalid item type observed, run fsck ASAP");
-	return -1;
-}
-
-static int errcatch_check_right(struct virtual_item *vi, int free)
-{
-	reiserfs_warning(NULL, "green-16008",
-			 "Invalid item type observed, run fsck ASAP");
-	return -1;
-}
-
-static int errcatch_part_size(struct virtual_item *vi, int first, int count)
-{
-	reiserfs_warning(NULL, "green-16009",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static int errcatch_unit_num(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "green-16010",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "green-16011",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static struct item_operations errcatch_ops = {
-	.bytes_number = errcatch_bytes_number,
-	.decrement_key = errcatch_decrement_key,
-	.is_left_mergeable = errcatch_is_left_mergeable,
-	.print_item = errcatch_print_item,
-	.check_item = errcatch_check_item,
-
-	.create_vi = errcatch_create_vi,
-	.check_left = errcatch_check_left,
-	.check_right = errcatch_check_right,
-	.part_size = errcatch_part_size,
-	.unit_num = errcatch_unit_num,
-	.print_vi = errcatch_print_vi
-};
-
-#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
-#error Item types must use disk-format assigned values.
-#endif
-
-struct item_operations *item_ops[TYPE_ANY + 1] = {
-	&stat_data_ops,
-	&indirect_ops,
-	&direct_ops,
-	&direntry_ops,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	&errcatch_ops		/* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
-};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
deleted file mode 100644
index e477ee0ff35d..000000000000
--- a/fs/reiserfs/journal.c
+++ /dev/null
@@ -1,4404 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Write ahead logging implementation copyright Chris Mason 2000
- *
- * The background commits make this code very interrelated, and
- * overly complex.  I need to rethink things a bit....The major players:
- *
- * journal_begin -- call with the number of blocks you expect to log.
- *                  If the current transaction is too
- *		    old, it will block until the current transaction is
- *		    finished, and then start a new one.
- *		    Usually, your transaction will get joined in with
- *                  previous ones for speed.
- *
- * journal_join  -- same as journal_begin, but won't block on the current
- *                  transaction regardless of age.  Don't ever call
- *                  this.  Ever.  There are only two places it should be
- *                  called from, and they are both inside this file.
- *
- * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
- *                       that might make them get sent to disk
- *                       and then marks them BH_JDirty.  Puts the buffer head
- *                       into the current transaction hash.
- *
- * journal_end -- if the current transaction is batchable, it does nothing
- *                   otherwise, it could do an async/synchronous commit, or
- *                   a full flush of all log and real blocks in the
- *                   transaction.
- *
- * flush_old_commits -- if the current transaction is too old, it is ended and
- *                      commit blocks are sent to disk.  Forces commit blocks
- *                      to disk for all backgrounded commits that have been
- *                      around too long.
- *		     -- Note, if you call this as an immediate flush from
- *		        within kupdate, it will ignore the immediate flag
- */
-
-#include <linux/time.h>
-#include <linux/semaphore.h>
-#include <linux/vmalloc.h>
-#include "reiserfs.h"
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/workqueue.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-
-
-/* gets a struct reiserfs_journal_list * from a list head */
-#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
-                               j_list))
-
-/* must be correct to keep the desc and commit structs at 4k */
-#define JOURNAL_TRANS_HALF 1018
-#define BUFNR 64		/*read ahead */
-
-/* cnode stat bits.  Move these into reiserfs_fs.h */
-
-/* this block was freed, and can't be written.  */
-#define BLOCK_FREED 2
-/* this block was freed during this transaction, and can't be written */
-#define BLOCK_FREED_HOLDER 3
-
-/* used in flush_journal_list */
-#define BLOCK_NEEDS_FLUSH 4
-#define BLOCK_DIRTIED 5
-
-/* journal list state bits */
-#define LIST_TOUCHED 1
-#define LIST_DIRTY   2
-#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
-
-/* flags for do_journal_end */
-#define FLUSH_ALL   1		/* flush commit and real blocks */
-#define COMMIT_NOW  2		/* end and commit this transaction */
-#define WAIT        4		/* wait for the log blocks to hit the disk */
-
-static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
-static int flush_journal_list(struct super_block *s,
-			      struct reiserfs_journal_list *jl, int flushall);
-static int flush_commit_list(struct super_block *s,
-			     struct reiserfs_journal_list *jl, int flushall);
-static int can_dirty(struct reiserfs_journal_cnode *cn);
-static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *sb);
-static void release_journal_dev(struct reiserfs_journal *journal);
-static void dirty_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl);
-static void flush_async_commits(struct work_struct *work);
-static void queue_log_writer(struct super_block *s);
-
-/* values for join in do_journal_begin_r */
-enum {
-	JBEGIN_REG = 0,		/* regular journal begin */
-	/* join the running transaction if at all possible */
-	JBEGIN_JOIN = 1,
-	/* called from cleanup code, ignores aborted flag */
-	JBEGIN_ABORT = 2,
-};
-
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb,
-			      unsigned long nblocks, int join);
-
-static void init_journal_hash(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	memset(journal->j_hash_table, 0,
-	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-}
-
-/*
- * clears BH_Dirty and sticks the buffer on the clean list.  Called because
- * I can't allow refile_buffer to make schedule happen after I've freed a
- * block.  Look at remove_from_transaction and journal_mark_freed for
- * more details.
- */
-static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
-{
-	if (bh) {
-		clear_buffer_dirty(bh);
-		clear_buffer_journal_test(bh);
-	}
-	return 0;
-}
-
-static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
-							 *sb)
-{
-	struct reiserfs_bitmap_node *bn;
-	static int id;
-
-	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
-	if (!bn) {
-		return NULL;
-	}
-	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
-	if (!bn->data) {
-		kfree(bn);
-		return NULL;
-	}
-	bn->id = id++;
-	INIT_LIST_HEAD(&bn->list);
-	return bn;
-}
-
-static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_bitmap_node *bn = NULL;
-	struct list_head *entry = journal->j_bitmap_nodes.next;
-
-	journal->j_used_bitmap_nodes++;
-repeat:
-
-	if (entry != &journal->j_bitmap_nodes) {
-		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
-		list_del(entry);
-		memset(bn->data, 0, sb->s_blocksize);
-		journal->j_free_bitmap_nodes--;
-		return bn;
-	}
-	bn = allocate_bitmap_node(sb);
-	if (!bn) {
-		yield();
-		goto repeat;
-	}
-	return bn;
-}
-static inline void free_bitmap_node(struct super_block *sb,
-				    struct reiserfs_bitmap_node *bn)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	journal->j_used_bitmap_nodes--;
-	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
-		kfree(bn->data);
-		kfree(bn);
-	} else {
-		list_add(&bn->list, &journal->j_bitmap_nodes);
-		journal->j_free_bitmap_nodes++;
-	}
-}
-
-static void allocate_bitmap_nodes(struct super_block *sb)
-{
-	int i;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_bitmap_node *bn = NULL;
-	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
-		bn = allocate_bitmap_node(sb);
-		if (bn) {
-			list_add(&bn->list, &journal->j_bitmap_nodes);
-			journal->j_free_bitmap_nodes++;
-		} else {
-			/* this is ok, we'll try again when more are needed */
-			break;
-		}
-	}
-}
-
-static int set_bit_in_list_bitmap(struct super_block *sb,
-				  b_blocknr_t block,
-				  struct reiserfs_list_bitmap *jb)
-{
-	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
-	unsigned int bit_nr = block % (sb->s_blocksize << 3);
-
-	if (!jb->bitmaps[bmap_nr]) {
-		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
-	}
-	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
-	return 0;
-}
-
-static void cleanup_bitmap_list(struct super_block *sb,
-				struct reiserfs_list_bitmap *jb)
-{
-	int i;
-	if (jb->bitmaps == NULL)
-		return;
-
-	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
-		if (jb->bitmaps[i]) {
-			free_bitmap_node(sb, jb->bitmaps[i]);
-			jb->bitmaps[i] = NULL;
-		}
-	}
-}
-
-/*
- * only call this on FS unmount.
- */
-static int free_list_bitmaps(struct super_block *sb,
-			     struct reiserfs_list_bitmap *jb_array)
-{
-	int i;
-	struct reiserfs_list_bitmap *jb;
-	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-		jb = jb_array + i;
-		jb->journal_list = NULL;
-		cleanup_bitmap_list(sb, jb);
-		vfree(jb->bitmaps);
-		jb->bitmaps = NULL;
-	}
-	return 0;
-}
-
-static int free_bitmap_nodes(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct list_head *next = journal->j_bitmap_nodes.next;
-	struct reiserfs_bitmap_node *bn;
-
-	while (next != &journal->j_bitmap_nodes) {
-		bn = list_entry(next, struct reiserfs_bitmap_node, list);
-		list_del(next);
-		kfree(bn->data);
-		kfree(bn);
-		next = journal->j_bitmap_nodes.next;
-		journal->j_free_bitmap_nodes--;
-	}
-
-	return 0;
-}
-
-/*
- * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
- * jb_array is the array to be filled in.
- */
-int reiserfs_allocate_list_bitmaps(struct super_block *sb,
-				   struct reiserfs_list_bitmap *jb_array,
-				   unsigned int bmap_nr)
-{
-	int i;
-	int failed = 0;
-	struct reiserfs_list_bitmap *jb;
-	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
-
-	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-		jb = jb_array + i;
-		jb->journal_list = NULL;
-		jb->bitmaps = vzalloc(mem);
-		if (!jb->bitmaps) {
-			reiserfs_warning(sb, "clm-2000", "unable to "
-					 "allocate bitmaps for journal lists");
-			failed = 1;
-			break;
-		}
-	}
-	if (failed) {
-		free_list_bitmaps(sb, jb_array);
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * find an available list bitmap.  If you can't find one, flush a commit list
- * and try again
- */
-static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
-						    struct reiserfs_journal_list
-						    *jl)
-{
-	int i, j;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_list_bitmap *jb = NULL;
-
-	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
-		i = journal->j_list_bitmap_index;
-		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
-		jb = journal->j_list_bitmap + i;
-		if (journal->j_list_bitmap[i].journal_list) {
-			flush_commit_list(sb,
-					  journal->j_list_bitmap[i].
-					  journal_list, 1);
-			if (!journal->j_list_bitmap[i].journal_list) {
-				break;
-			}
-		} else {
-			break;
-		}
-	}
-	/* double check to make sure if flushed correctly */
-	if (jb->journal_list)
-		return NULL;
-	jb->journal_list = jl;
-	return jb;
-}
-
-/*
- * allocates a new chunk of X nodes, and links them all together as a list.
- * Uses the cnode->next and cnode->prev pointers
- * returns NULL on failure
- */
-static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
-{
-	struct reiserfs_journal_cnode *head;
-	int i;
-	if (num_cnodes <= 0) {
-		return NULL;
-	}
-	head = vzalloc(array_size(num_cnodes,
-				  sizeof(struct reiserfs_journal_cnode)));
-	if (!head) {
-		return NULL;
-	}
-	head[0].prev = NULL;
-	head[0].next = head + 1;
-	for (i = 1; i < num_cnodes; i++) {
-		head[i].prev = head + (i - 1);
-		head[i].next = head + (i + 1);	/* if last one, overwrite it after the if */
-	}
-	head[num_cnodes - 1].next = NULL;
-	return head;
-}
-
-/* pulls a cnode off the free list, or returns NULL on failure */
-static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
-{
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	reiserfs_check_lock_depth(sb, "get_cnode");
-
-	if (journal->j_cnode_free <= 0) {
-		return NULL;
-	}
-	journal->j_cnode_used++;
-	journal->j_cnode_free--;
-	cn = journal->j_cnode_free_list;
-	if (!cn) {
-		return cn;
-	}
-	if (cn->next) {
-		cn->next->prev = NULL;
-	}
-	journal->j_cnode_free_list = cn->next;
-	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
-	return cn;
-}
-
-/*
- * returns a cnode to the free list
- */
-static void free_cnode(struct super_block *sb,
-		       struct reiserfs_journal_cnode *cn)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	reiserfs_check_lock_depth(sb, "free_cnode");
-
-	journal->j_cnode_used--;
-	journal->j_cnode_free++;
-	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
-	cn->next = journal->j_cnode_free_list;
-	if (journal->j_cnode_free_list) {
-		journal->j_cnode_free_list->prev = cn;
-	}
-	cn->prev = NULL;	/* not needed with the memset, but I might kill the memset, and forget to do this */
-	journal->j_cnode_free_list = cn;
-}
-
-static void clear_prepared_bits(struct buffer_head *bh)
-{
-	clear_buffer_journal_prepared(bh);
-	clear_buffer_journal_restore_dirty(bh);
-}
-
-/*
- * return a cnode with same dev, block number and size in table,
- * or null if not found
- */
-static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
-								  super_block
-								  *sb,
-								  struct
-								  reiserfs_journal_cnode
-								  **table,
-								  long bl)
-{
-	struct reiserfs_journal_cnode *cn;
-	cn = journal_hash(table, sb, bl);
-	while (cn) {
-		if (cn->blocknr == bl && cn->sb == sb)
-			return cn;
-		cn = cn->hnext;
-	}
-	return (struct reiserfs_journal_cnode *)0;
-}
-
-/*
- * this actually means 'can this block be reallocated yet?'.  If you set
- * search_all, a block can only be allocated if it is not in the current
- * transaction, was not freed by the current transaction, and has no chance
- * of ever being overwritten by a replay after crashing.
- *
- * If you don't set search_all, a block can only be allocated if it is not
- * in the current transaction.  Since deleting a block removes it from the
- * current transaction, this case should never happen.  If you don't set
- * search_all, make sure you never write the block without logging it.
- *
- * next_zero_bit is a suggestion about the next block to try for find_forward.
- * when bl is rejected because it is set in a journal list bitmap, we search
- * for the next zero bit in the bitmap that rejected bl.  Then, we return
- * that through next_zero_bit for find_forward to try.
- *
- * Just because we return something in next_zero_bit does not mean we won't
- * reject it on the next call to reiserfs_in_journal
- */
-int reiserfs_in_journal(struct super_block *sb,
-			unsigned int bmap_nr, int bit_nr, int search_all,
-			b_blocknr_t * next_zero_bit)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_list_bitmap *jb;
-	int i;
-	unsigned long bl;
-
-	*next_zero_bit = 0;	/* always start this at zero. */
-
-	PROC_INFO_INC(sb, journal.in_journal);
-	/*
-	 * If we aren't doing a search_all, this is a metablock, and it
-	 * will be logged before use.  if we crash before the transaction
-	 * that freed it commits,  this transaction won't have committed
-	 * either, and the block will never be written
-	 */
-	if (search_all) {
-		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-			PROC_INFO_INC(sb, journal.in_journal_bitmap);
-			jb = journal->j_list_bitmap + i;
-			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
-			    test_bit(bit_nr,
-				     (unsigned long *)jb->bitmaps[bmap_nr]->
-				     data)) {
-				*next_zero_bit =
-				    find_next_zero_bit((unsigned long *)
-						       (jb->bitmaps[bmap_nr]->
-							data),
-						       sb->s_blocksize << 3,
-						       bit_nr + 1);
-				return 1;
-			}
-		}
-	}
-
-	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
-	/* is it in any old transactions? */
-	if (search_all
-	    && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
-		return 1;
-	}
-
-	/* is it in the current transaction.  This should never happen */
-	if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
-		BUG();
-		return 1;
-	}
-
-	PROC_INFO_INC(sb, journal.in_journal_reusable);
-	/* safe for reuse */
-	return 0;
-}
-
-/* insert cn into table */
-static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
-				       struct reiserfs_journal_cnode *cn)
-{
-	struct reiserfs_journal_cnode *cn_orig;
-
-	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
-	cn->hnext = cn_orig;
-	cn->hprev = NULL;
-	if (cn_orig) {
-		cn_orig->hprev = cn;
-	}
-	journal_hash(table, cn->sb, cn->blocknr) = cn;
-}
-
-/* lock the current transaction */
-static inline void lock_journal(struct super_block *sb)
-{
-	PROC_INFO_INC(sb, journal.lock_journal);
-
-	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
-}
-
-/* unlock the current transaction */
-static inline void unlock_journal(struct super_block *sb)
-{
-	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
-}
-
-static inline void get_journal_list(struct reiserfs_journal_list *jl)
-{
-	jl->j_refcount++;
-}
-
-static inline void put_journal_list(struct super_block *s,
-				    struct reiserfs_journal_list *jl)
-{
-	if (jl->j_refcount < 1) {
-		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
-			       jl->j_trans_id, jl->j_refcount);
-	}
-	if (--jl->j_refcount == 0)
-		kfree(jl);
-}
-
-/*
- * this used to be much more involved, and I'm keeping it just in case
- * things get ugly again.  it gets called by flush_commit_list, and
- * cleans up any data stored about blocks freed during a transaction.
- */
-static void cleanup_freed_for_journal_list(struct super_block *sb,
-					   struct reiserfs_journal_list *jl)
-{
-
-	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
-	if (jb) {
-		cleanup_bitmap_list(sb, jb);
-	}
-	jl->j_list_bitmap->journal_list = NULL;
-	jl->j_list_bitmap = NULL;
-}
-
-static int journal_list_still_alive(struct super_block *s,
-				    unsigned int trans_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	struct list_head *entry = &journal->j_journal_list;
-	struct reiserfs_journal_list *jl;
-
-	if (!list_empty(entry)) {
-		jl = JOURNAL_LIST_ENTRY(entry->next);
-		if (jl->j_trans_id <= trans_id) {
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * If page->mapping was null, we failed to truncate this page for
- * some reason.  Most likely because it was truncated after being
- * logged via data=journal.
- *
- * This does a check to see if the buffer belongs to one of these
- * lost pages before doing the final put_bh.  If page->mapping was
- * null, it tries to free buffers on the page, which should make the
- * final put_page drop the page from the lru.
- */
-static void release_buffer_page(struct buffer_head *bh)
-{
-	struct folio *folio = bh->b_folio;
-	if (!folio->mapping && folio_trylock(folio)) {
-		folio_get(folio);
-		put_bh(bh);
-		if (!folio->mapping)
-			try_to_free_buffers(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-	} else {
-		put_bh(bh);
-	}
-}
-
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-{
-	if (buffer_journaled(bh)) {
-		reiserfs_warning(NULL, "clm-2084",
-				 "pinned buffer %lu:%pg sent to disk",
-				 bh->b_blocknr, bh->b_bdev);
-	}
-	if (uptodate)
-		set_buffer_uptodate(bh);
-	else
-		clear_buffer_uptodate(bh);
-
-	unlock_buffer(bh);
-	release_buffer_page(bh);
-}
-
-static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
-{
-	if (uptodate)
-		set_buffer_uptodate(bh);
-	else
-		clear_buffer_uptodate(bh);
-	unlock_buffer(bh);
-	put_bh(bh);
-}
-
-static void submit_logged_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_buffer_io_sync;
-	clear_buffer_journal_new(bh);
-	clear_buffer_dirty(bh);
-	if (!test_clear_buffer_journal_test(bh))
-		BUG();
-	if (!buffer_uptodate(bh))
-		BUG();
-	submit_bh(REQ_OP_WRITE, bh);
-}
-
-static void submit_ordered_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	submit_bh(REQ_OP_WRITE, bh);
-}
-
-#define CHUNK_SIZE 32
-struct buffer_chunk {
-	struct buffer_head *bh[CHUNK_SIZE];
-	int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk)
-{
-	int i;
-	for (i = 0; i < chunk->nr; i++) {
-		submit_logged_buffer(chunk->bh[i]);
-	}
-	chunk->nr = 0;
-}
-
-static void write_ordered_chunk(struct buffer_chunk *chunk)
-{
-	int i;
-	for (i = 0; i < chunk->nr; i++) {
-		submit_ordered_buffer(chunk->bh[i]);
-	}
-	chunk->nr = 0;
-}
-
-static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
-			spinlock_t * lock, void (fn) (struct buffer_chunk *))
-{
-	int ret = 0;
-	BUG_ON(chunk->nr >= CHUNK_SIZE);
-	chunk->bh[chunk->nr++] = bh;
-	if (chunk->nr >= CHUNK_SIZE) {
-		ret = 1;
-		if (lock) {
-			spin_unlock(lock);
-			fn(chunk);
-			spin_lock(lock);
-		} else {
-			fn(chunk);
-		}
-	}
-	return ret;
-}
-
-static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
-static struct reiserfs_jh *alloc_jh(void)
-{
-	struct reiserfs_jh *jh;
-	while (1) {
-		jh = kmalloc(sizeof(*jh), GFP_NOFS);
-		if (jh) {
-			atomic_inc(&nr_reiserfs_jh);
-			return jh;
-		}
-		yield();
-	}
-}
-
-/*
- * we want to free the jh when the buffer has been written
- * and waited on
- */
-void reiserfs_free_jh(struct buffer_head *bh)
-{
-	struct reiserfs_jh *jh;
-
-	jh = bh->b_private;
-	if (jh) {
-		bh->b_private = NULL;
-		jh->bh = NULL;
-		list_del_init(&jh->list);
-		kfree(jh);
-		if (atomic_read(&nr_reiserfs_jh) <= 0)
-			BUG();
-		atomic_dec(&nr_reiserfs_jh);
-		put_bh(bh);
-	}
-}
-
-static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
-			   int tail)
-{
-	struct reiserfs_jh *jh;
-
-	if (bh->b_private) {
-		spin_lock(&j->j_dirty_buffers_lock);
-		if (!bh->b_private) {
-			spin_unlock(&j->j_dirty_buffers_lock);
-			goto no_jh;
-		}
-		jh = bh->b_private;
-		list_del_init(&jh->list);
-	} else {
-no_jh:
-		get_bh(bh);
-		jh = alloc_jh();
-		spin_lock(&j->j_dirty_buffers_lock);
-		/*
-		 * buffer must be locked for __add_jh, should be able to have
-		 * two adds at the same time
-		 */
-		BUG_ON(bh->b_private);
-		jh->bh = bh;
-		bh->b_private = jh;
-	}
-	jh->jl = j->j_current_jl;
-	if (tail)
-		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
-	else {
-		list_add_tail(&jh->list, &jh->jl->j_bh_list);
-	}
-	spin_unlock(&j->j_dirty_buffers_lock);
-	return 0;
-}
-
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
-{
-	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
-}
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
-{
-	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
-}
-
-#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
-static int write_ordered_buffers(spinlock_t * lock,
-				 struct reiserfs_journal *j,
-				 struct reiserfs_journal_list *jl,
-				 struct list_head *list)
-{
-	struct buffer_head *bh;
-	struct reiserfs_jh *jh;
-	int ret = j->j_errno;
-	struct buffer_chunk chunk;
-	struct list_head tmp;
-	INIT_LIST_HEAD(&tmp);
-
-	chunk.nr = 0;
-	spin_lock(lock);
-	while (!list_empty(list)) {
-		jh = JH_ENTRY(list->next);
-		bh = jh->bh;
-		get_bh(bh);
-		if (!trylock_buffer(bh)) {
-			if (!buffer_dirty(bh)) {
-				list_move(&jh->list, &tmp);
-				goto loop_next;
-			}
-			spin_unlock(lock);
-			if (chunk.nr)
-				write_ordered_chunk(&chunk);
-			wait_on_buffer(bh);
-			cond_resched();
-			spin_lock(lock);
-			goto loop_next;
-		}
-		/*
-		 * in theory, dirty non-uptodate buffers should never get here,
-		 * but the upper layer io error paths still have a few quirks.
-		 * Handle them here as gracefully as we can
-		 */
-		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
-			clear_buffer_dirty(bh);
-			ret = -EIO;
-		}
-		if (buffer_dirty(bh)) {
-			list_move(&jh->list, &tmp);
-			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
-		} else {
-			reiserfs_free_jh(bh);
-			unlock_buffer(bh);
-		}
-loop_next:
-		put_bh(bh);
-		cond_resched_lock(lock);
-	}
-	if (chunk.nr) {
-		spin_unlock(lock);
-		write_ordered_chunk(&chunk);
-		spin_lock(lock);
-	}
-	while (!list_empty(&tmp)) {
-		jh = JH_ENTRY(tmp.prev);
-		bh = jh->bh;
-		get_bh(bh);
-		reiserfs_free_jh(bh);
-
-		if (buffer_locked(bh)) {
-			spin_unlock(lock);
-			wait_on_buffer(bh);
-			spin_lock(lock);
-		}
-		if (!buffer_uptodate(bh)) {
-			ret = -EIO;
-		}
-		/*
-		 * ugly interaction with invalidate_folio here.
-		 * reiserfs_invalidate_folio will pin any buffer that has a
-		 * valid journal head from an older transaction.  If someone
-		 * else sets our buffer dirty after we write it in the first
-		 * loop, and then someone truncates the page away, nobody
-		 * will ever write the buffer. We're safe if we write the
-		 * page one last time after freeing the journal header.
-		 */
-		if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
-			spin_unlock(lock);
-			write_dirty_buffer(bh, 0);
-			spin_lock(lock);
-		}
-		put_bh(bh);
-		cond_resched_lock(lock);
-	}
-	spin_unlock(lock);
-	return ret;
-}
-
-static int flush_older_commits(struct super_block *s,
-			       struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	struct reiserfs_journal_list *other_jl;
-	struct reiserfs_journal_list *first_jl;
-	struct list_head *entry;
-	unsigned int trans_id = jl->j_trans_id;
-	unsigned int other_trans_id;
-
-find_first:
-	/*
-	 * first we walk backwards to find the oldest uncommitted transation
-	 */
-	first_jl = jl;
-	entry = jl->j_list.prev;
-	while (1) {
-		other_jl = JOURNAL_LIST_ENTRY(entry);
-		if (entry == &journal->j_journal_list ||
-		    atomic_read(&other_jl->j_older_commits_done))
-			break;
-
-		first_jl = other_jl;
-		entry = other_jl->j_list.prev;
-	}
-
-	/* if we didn't find any older uncommitted transactions, return now */
-	if (first_jl == jl) {
-		return 0;
-	}
-
-	entry = &first_jl->j_list;
-	while (1) {
-		other_jl = JOURNAL_LIST_ENTRY(entry);
-		other_trans_id = other_jl->j_trans_id;
-
-		if (other_trans_id < trans_id) {
-			if (atomic_read(&other_jl->j_commit_left) != 0) {
-				flush_commit_list(s, other_jl, 0);
-
-				/* list we were called with is gone, return */
-				if (!journal_list_still_alive(s, trans_id))
-					return 1;
-
-				/*
-				 * the one we just flushed is gone, this means
-				 * all older lists are also gone, so first_jl
-				 * is no longer valid either.  Go back to the
-				 * beginning.
-				 */
-				if (!journal_list_still_alive
-				    (s, other_trans_id)) {
-					goto find_first;
-				}
-			}
-			entry = entry->next;
-			if (entry == &journal->j_journal_list)
-				return 0;
-		} else {
-			return 0;
-		}
-	}
-	return 0;
-}
-
-static int reiserfs_async_progress_wait(struct super_block *s)
-{
-	struct reiserfs_journal *j = SB_JOURNAL(s);
-
-	if (atomic_read(&j->j_async_throttle)) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(s);
-		wait_var_event_timeout(&j->j_async_throttle,
-				       atomic_read(&j->j_async_throttle) == 0,
-				       HZ / 10);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	return 0;
-}
-
-/*
- * if this journal list still has commit blocks unflushed, send them to disk.
- *
- * log areas must be flushed in order (transaction 2 can't commit before
- * transaction 1) Before the commit block can by written, every other log
- * block must be safely on disk
- */
-static int flush_commit_list(struct super_block *s,
-			     struct reiserfs_journal_list *jl, int flushall)
-{
-	int i;
-	b_blocknr_t bn;
-	struct buffer_head *tbh = NULL;
-	unsigned int trans_id = jl->j_trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int retval = 0;
-	int write_len;
-	int depth;
-
-	reiserfs_check_lock_depth(s, "flush_commit_list");
-
-	if (atomic_read(&jl->j_older_commits_done)) {
-		return 0;
-	}
-
-	/*
-	 * before we can put our commit blocks on disk, we have to make
-	 * sure everyone older than us is on disk too
-	 */
-	BUG_ON(jl->j_len <= 0);
-	BUG_ON(trans_id == journal->j_trans_id);
-
-	get_journal_list(jl);
-	if (flushall) {
-		if (flush_older_commits(s, jl) == 1) {
-			/*
-			 * list disappeared during flush_older_commits.
-			 * return
-			 */
-			goto put_jl;
-		}
-	}
-
-	/* make sure nobody is trying to flush this one at the same time */
-	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
-
-	if (!journal_list_still_alive(s, trans_id)) {
-		mutex_unlock(&jl->j_commit_mutex);
-		goto put_jl;
-	}
-	BUG_ON(jl->j_trans_id == 0);
-
-	/* this commit is done, exit */
-	if (atomic_read(&jl->j_commit_left) <= 0) {
-		if (flushall) {
-			atomic_set(&jl->j_older_commits_done, 1);
-		}
-		mutex_unlock(&jl->j_commit_mutex);
-		goto put_jl;
-	}
-
-	if (!list_empty(&jl->j_bh_list)) {
-		int ret;
-
-		/*
-		 * We might sleep in numerous places inside
-		 * write_ordered_buffers. Relax the write lock.
-		 */
-		depth = reiserfs_write_unlock_nested(s);
-		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
-					    journal, jl, &jl->j_bh_list);
-		if (ret < 0 && retval == 0)
-			retval = ret;
-		reiserfs_write_lock_nested(s, depth);
-	}
-	BUG_ON(!list_empty(&jl->j_bh_list));
-	/*
-	 * for the description block and all the log blocks, submit any buffers
-	 * that haven't already reached the disk.  Try to write at least 256
-	 * log blocks. later on, we will only wait on blocks that correspond
-	 * to this transaction, but while we're unplugging we might as well
-	 * get a chunk of data on there.
-	 */
-	atomic_inc(&journal->j_async_throttle);
-	write_len = jl->j_len + 1;
-	if (write_len < 256)
-		write_len = 256;
-	for (i = 0 ; i < write_len ; i++) {
-		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
-		    SB_ONDISK_JOURNAL_SIZE(s);
-		tbh = journal_find_get_block(s, bn);
-		if (tbh) {
-			if (buffer_dirty(tbh)) {
-		            depth = reiserfs_write_unlock_nested(s);
-			    write_dirty_buffer(tbh, 0);
-			    reiserfs_write_lock_nested(s, depth);
-			}
-			put_bh(tbh) ;
-		}
-	}
-	if (atomic_dec_and_test(&journal->j_async_throttle))
-		wake_up_var(&journal->j_async_throttle);
-
-	for (i = 0; i < (jl->j_len + 1); i++) {
-		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
-		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
-		tbh = journal_find_get_block(s, bn);
-
-		depth = reiserfs_write_unlock_nested(s);
-		__wait_on_buffer(tbh);
-		reiserfs_write_lock_nested(s, depth);
-		/*
-		 * since we're using ll_rw_blk above, it might have skipped
-		 * over a locked buffer.  Double check here
-		 */
-		/* redundant, sync_dirty_buffer() checks */
-		if (buffer_dirty(tbh)) {
-			depth = reiserfs_write_unlock_nested(s);
-			sync_dirty_buffer(tbh);
-			reiserfs_write_lock_nested(s, depth);
-		}
-		if (unlikely(!buffer_uptodate(tbh))) {
-#ifdef CONFIG_REISERFS_CHECK
-			reiserfs_warning(s, "journal-601",
-					 "buffer write failed");
-#endif
-			retval = -EIO;
-		}
-		/* once for journal_find_get_block */
-		put_bh(tbh);
-		/* once due to original getblk in do_journal_end */
-		put_bh(tbh);
-		atomic_dec(&jl->j_commit_left);
-	}
-
-	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
-
-	/*
-	 * If there was a write error in the journal - we can't commit
-	 * this transaction - it will be invalid and, if successful,
-	 * will just end up propagating the write error out to
-	 * the file system.
-	 */
-	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-		if (buffer_dirty(jl->j_commit_bh))
-			BUG();
-		mark_buffer_dirty(jl->j_commit_bh) ;
-		depth = reiserfs_write_unlock_nested(s);
-		if (reiserfs_barrier_flush(s))
-			__sync_dirty_buffer(jl->j_commit_bh,
-					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
-		else
-			sync_dirty_buffer(jl->j_commit_bh);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	/*
-	 * If there was a write error in the journal - we can't commit this
-	 * transaction - it will be invalid and, if successful, will just end
-	 * up propagating the write error out to the filesystem.
-	 */
-	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-		reiserfs_warning(s, "journal-615", "buffer write failed");
-#endif
-		retval = -EIO;
-	}
-	bforget(jl->j_commit_bh);
-	if (journal->j_last_commit_id != 0 &&
-	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
-		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
-				 journal->j_last_commit_id, jl->j_trans_id);
-	}
-	journal->j_last_commit_id = jl->j_trans_id;
-
-	/*
-	 * now, every commit block is on the disk.  It is safe to allow
-	 * blocks freed during this transaction to be reallocated
-	 */
-	cleanup_freed_for_journal_list(s, jl);
-
-	retval = retval ? retval : journal->j_errno;
-
-	/* mark the metadata dirty */
-	if (!retval)
-		dirty_one_transaction(s, jl);
-	atomic_dec(&jl->j_commit_left);
-
-	if (flushall) {
-		atomic_set(&jl->j_older_commits_done, 1);
-	}
-	mutex_unlock(&jl->j_commit_mutex);
-put_jl:
-	put_journal_list(s, jl);
-
-	if (retval)
-		reiserfs_abort(s, retval, "Journal write error in %s",
-			       __func__);
-	return retval;
-}
-
-/*
- * flush_journal_list frequently needs to find a newer transaction for a
- * given block.  This does that, or returns NULL if it can't find anything
- */
-static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
-							  reiserfs_journal_cnode
-							  *cn)
-{
-	struct super_block *sb = cn->sb;
-	b_blocknr_t blocknr = cn->blocknr;
-
-	cn = cn->hprev;
-	while (cn) {
-		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
-			return cn->jlist;
-		}
-		cn = cn->hprev;
-	}
-	return NULL;
-}
-
-static void remove_journal_hash(struct super_block *,
-				struct reiserfs_journal_cnode **,
-				struct reiserfs_journal_list *, unsigned long,
-				int);
-
-/*
- * once all the real blocks have been flushed, it is safe to remove them
- * from the journal list for this transaction.  Aside from freeing the
- * cnode, this also allows the block to be reallocated for data blocks
- * if it had been deleted.
- */
-static void remove_all_from_journal_list(struct super_block *sb,
-					 struct reiserfs_journal_list *jl,
-					 int debug)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn, *last;
-	cn = jl->j_realblock;
-
-	/*
-	 * which is better, to lock once around the whole loop, or
-	 * to lock for each call to remove_journal_hash?
-	 */
-	while (cn) {
-		if (cn->blocknr != 0) {
-			if (debug) {
-				reiserfs_warning(sb, "reiserfs-2201",
-						 "block %u, bh is %d, state %ld",
-						 cn->blocknr, cn->bh ? 1 : 0,
-						 cn->state);
-			}
-			cn->state = 0;
-			remove_journal_hash(sb, journal->j_list_hash_table,
-					    jl, cn->blocknr, 1);
-		}
-		last = cn;
-		cn = cn->next;
-		free_cnode(sb, last);
-	}
-	jl->j_realblock = NULL;
-}
-
-/*
- * if this timestamp is greater than the timestamp we wrote last to the
- * header block, write it to the header block.  once this is done, I can
- * safely say the log area for this transaction won't ever be replayed,
- * and I can start releasing blocks in this transaction for reuse as data
- * blocks.  called by flush_journal_list, before it calls
- * remove_all_from_journal_list
- */
-static int _update_journal_header_block(struct super_block *sb,
-					unsigned long offset,
-					unsigned int trans_id)
-{
-	struct reiserfs_journal_header *jh;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int depth;
-
-	if (reiserfs_is_journal_aborted(journal))
-		return -EIO;
-
-	if (trans_id >= journal->j_last_flush_trans_id) {
-		if (buffer_locked((journal->j_header_bh))) {
-			depth = reiserfs_write_unlock_nested(sb);
-			__wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock_nested(sb, depth);
-			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-				reiserfs_warning(sb, "journal-699",
-						 "buffer write failed");
-#endif
-				return -EIO;
-			}
-		}
-		journal->j_last_flush_trans_id = trans_id;
-		journal->j_first_unflushed_offset = offset;
-		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
-							b_data);
-		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
-		jh->j_first_unflushed_offset = cpu_to_le32(offset);
-		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
-
-		set_buffer_dirty(journal->j_header_bh);
-		depth = reiserfs_write_unlock_nested(sb);
-
-		if (reiserfs_barrier_flush(sb))
-			__sync_dirty_buffer(journal->j_header_bh,
-					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
-		else
-			sync_dirty_buffer(journal->j_header_bh);
-
-		reiserfs_write_lock_nested(sb, depth);
-		if (!buffer_uptodate(journal->j_header_bh)) {
-			reiserfs_warning(sb, "journal-837",
-					 "IO error during journal replay");
-			return -EIO;
-		}
-	}
-	return 0;
-}
-
-static int update_journal_header_block(struct super_block *sb,
-				       unsigned long offset,
-				       unsigned int trans_id)
-{
-	return _update_journal_header_block(sb, offset, trans_id);
-}
-
-/*
-** flush any and all journal lists older than you are
-** can only be called from flush_journal_list
-*/
-static int flush_older_journal_lists(struct super_block *sb,
-				     struct reiserfs_journal_list *jl)
-{
-	struct list_head *entry;
-	struct reiserfs_journal_list *other_jl;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	unsigned int trans_id = jl->j_trans_id;
-
-	/*
-	 * we know we are the only ones flushing things, no extra race
-	 * protection is required.
-	 */
-restart:
-	entry = journal->j_journal_list.next;
-	/* Did we wrap? */
-	if (entry == &journal->j_journal_list)
-		return 0;
-	other_jl = JOURNAL_LIST_ENTRY(entry);
-	if (other_jl->j_trans_id < trans_id) {
-		BUG_ON(other_jl->j_refcount <= 0);
-		/* do not flush all */
-		flush_journal_list(sb, other_jl, 0);
-
-		/* other_jl is now deleted from the list */
-		goto restart;
-	}
-	return 0;
-}
-
-static void del_from_work_list(struct super_block *s,
-			       struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	if (!list_empty(&jl->j_working_list)) {
-		list_del_init(&jl->j_working_list);
-		journal->j_num_work_lists--;
-	}
-}
-
-/*
- * flush a journal list, both commit and real blocks
- *
- * always set flushall to 1, unless you are calling from inside
- * flush_journal_list
- *
- * IMPORTANT.  This can only be called while there are no journal writers,
- * and the journal is locked.  That means it can only be called from
- * do_journal_end, or by journal_release
- */
-static int flush_journal_list(struct super_block *s,
-			      struct reiserfs_journal_list *jl, int flushall)
-{
-	struct reiserfs_journal_list *pjl;
-	struct reiserfs_journal_cnode *cn;
-	int count;
-	int was_jwait = 0;
-	int was_dirty = 0;
-	struct buffer_head *saved_bh;
-	unsigned long j_len_saved = jl->j_len;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int err = 0;
-	int depth;
-
-	BUG_ON(j_len_saved <= 0);
-
-	if (atomic_read(&journal->j_wcount) != 0) {
-		reiserfs_warning(s, "clm-2048", "called with wcount %d",
-				 atomic_read(&journal->j_wcount));
-	}
-
-	/* if flushall == 0, the lock is already held */
-	if (flushall) {
-		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
-	} else if (mutex_trylock(&journal->j_flush_mutex)) {
-		BUG();
-	}
-
-	count = 0;
-	if (j_len_saved > journal->j_trans_max) {
-		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
-			       j_len_saved, jl->j_trans_id);
-		return 0;
-	}
-
-	/* if all the work is already done, get out of here */
-	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
-	    atomic_read(&jl->j_commit_left) <= 0) {
-		goto flush_older_and_return;
-	}
-
-	/*
-	 * start by putting the commit list on disk.  This will also flush
-	 * the commit lists of any olders transactions
-	 */
-	flush_commit_list(s, jl, 1);
-
-	if (!(jl->j_state & LIST_DIRTY)
-	    && !reiserfs_is_journal_aborted(journal))
-		BUG();
-
-	/* are we done now? */
-	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
-	    atomic_read(&jl->j_commit_left) <= 0) {
-		goto flush_older_and_return;
-	}
-
-	/*
-	 * loop through each cnode, see if we need to write it,
-	 * or wait on a more recent transaction, or just ignore it
-	 */
-	if (atomic_read(&journal->j_wcount) != 0) {
-		reiserfs_panic(s, "journal-844", "journal list is flushing, "
-			       "wcount is not 0");
-	}
-	cn = jl->j_realblock;
-	while (cn) {
-		was_jwait = 0;
-		was_dirty = 0;
-		saved_bh = NULL;
-		/* blocknr of 0 is no longer in the hash, ignore it */
-		if (cn->blocknr == 0) {
-			goto free_cnode;
-		}
-
-		/*
-		 * This transaction failed commit.
-		 * Don't write out to the disk
-		 */
-		if (!(jl->j_state & LIST_DIRTY))
-			goto free_cnode;
-
-		pjl = find_newer_jl_for_cn(cn);
-		/*
-		 * the order is important here.  We check pjl to make sure we
-		 * don't clear BH_JDirty_wait if we aren't the one writing this
-		 * block to disk
-		 */
-		if (!pjl && cn->bh) {
-			saved_bh = cn->bh;
-
-			/*
-			 * we do this to make sure nobody releases the
-			 * buffer while we are working with it
-			 */
-			get_bh(saved_bh);
-
-			if (buffer_journal_dirty(saved_bh)) {
-				BUG_ON(!can_dirty(cn));
-				was_jwait = 1;
-				was_dirty = 1;
-			} else if (can_dirty(cn)) {
-				/*
-				 * everything with !pjl && jwait
-				 * should be writable
-				 */
-				BUG();
-			}
-		}
-
-		/*
-		 * if someone has this block in a newer transaction, just make
-		 * sure they are committed, and don't try writing it to disk
-		 */
-		if (pjl) {
-			if (atomic_read(&pjl->j_commit_left))
-				flush_commit_list(s, pjl, 1);
-			goto free_cnode;
-		}
-
-		/*
-		 * bh == NULL when the block got to disk on its own, OR,
-		 * the block got freed in a future transaction
-		 */
-		if (saved_bh == NULL) {
-			goto free_cnode;
-		}
-
-		/*
-		 * this should never happen.  kupdate_one_transaction has
-		 * this list locked while it works, so we should never see a
-		 * buffer here that is not marked JDirty_wait
-		 */
-		if ((!was_jwait) && !buffer_locked(saved_bh)) {
-			reiserfs_warning(s, "journal-813",
-					 "BAD! buffer %llu %cdirty %cjwait, "
-					 "not in a newer transaction",
-					 (unsigned long long)saved_bh->
-					 b_blocknr, was_dirty ? ' ' : '!',
-					 was_jwait ? ' ' : '!');
-		}
-		if (was_dirty) {
-			/*
-			 * we inc again because saved_bh gets decremented
-			 * at free_cnode
-			 */
-			get_bh(saved_bh);
-			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
-			lock_buffer(saved_bh);
-			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
-			if (buffer_dirty(saved_bh))
-				submit_logged_buffer(saved_bh);
-			else
-				unlock_buffer(saved_bh);
-			count++;
-		} else {
-			reiserfs_warning(s, "clm-2082",
-					 "Unable to flush buffer %llu in %s",
-					 (unsigned long long)saved_bh->
-					 b_blocknr, __func__);
-		}
-free_cnode:
-		cn = cn->next;
-		if (saved_bh) {
-			/*
-			 * we incremented this to keep others from
-			 * taking the buffer head away
-			 */
-			put_bh(saved_bh);
-			if (atomic_read(&saved_bh->b_count) < 0) {
-				reiserfs_warning(s, "journal-945",
-						 "saved_bh->b_count < 0");
-			}
-		}
-	}
-	if (count > 0) {
-		cn = jl->j_realblock;
-		while (cn) {
-			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
-				if (!cn->bh) {
-					reiserfs_panic(s, "journal-1011",
-						       "cn->bh is NULL");
-				}
-
-				depth = reiserfs_write_unlock_nested(s);
-				__wait_on_buffer(cn->bh);
-				reiserfs_write_lock_nested(s, depth);
-
-				if (!cn->bh) {
-					reiserfs_panic(s, "journal-1012",
-						       "cn->bh is NULL");
-				}
-				if (unlikely(!buffer_uptodate(cn->bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-					reiserfs_warning(s, "journal-949",
-							 "buffer write failed");
-#endif
-					err = -EIO;
-				}
-				/*
-				 * note, we must clear the JDirty_wait bit
-				 * after the up to date check, otherwise we
-				 * race against our flushpage routine
-				 */
-				BUG_ON(!test_clear_buffer_journal_dirty
-				       (cn->bh));
-
-				/* drop one ref for us */
-				put_bh(cn->bh);
-				/* drop one ref for journal_mark_dirty */
-				release_buffer_page(cn->bh);
-			}
-			cn = cn->next;
-		}
-	}
-
-	if (err)
-		reiserfs_abort(s, -EIO,
-			       "Write error while pushing transaction to disk in %s",
-			       __func__);
-flush_older_and_return:
-
-	/*
-	 * before we can update the journal header block, we _must_ flush all
-	 * real blocks from all older transactions to disk.  This is because
-	 * once the header block is updated, this transaction will not be
-	 * replayed after a crash
-	 */
-	if (flushall) {
-		flush_older_journal_lists(s, jl);
-	}
-
-	err = journal->j_errno;
-	/*
-	 * before we can remove everything from the hash tables for this
-	 * transaction, we must make sure it can never be replayed
-	 *
-	 * since we are only called from do_journal_end, we know for sure there
-	 * are no allocations going on while we are flushing journal lists.  So,
-	 * we only need to update the journal header block for the last list
-	 * being flushed
-	 */
-	if (!err && flushall) {
-		err =
-		    update_journal_header_block(s,
-						(jl->j_start + jl->j_len +
-						 2) % SB_ONDISK_JOURNAL_SIZE(s),
-						jl->j_trans_id);
-		if (err)
-			reiserfs_abort(s, -EIO,
-				       "Write error while updating journal header in %s",
-				       __func__);
-	}
-	remove_all_from_journal_list(s, jl, 0);
-	list_del_init(&jl->j_list);
-	journal->j_num_lists--;
-	del_from_work_list(s, jl);
-
-	if (journal->j_last_flush_id != 0 &&
-	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
-		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
-				 journal->j_last_flush_id, jl->j_trans_id);
-	}
-	journal->j_last_flush_id = jl->j_trans_id;
-
-	/*
-	 * not strictly required since we are freeing the list, but it should
-	 * help find code using dead lists later on
-	 */
-	jl->j_len = 0;
-	atomic_set(&jl->j_nonzerolen, 0);
-	jl->j_start = 0;
-	jl->j_realblock = NULL;
-	jl->j_commit_bh = NULL;
-	jl->j_trans_id = 0;
-	jl->j_state = 0;
-	put_journal_list(s, jl);
-	if (flushall)
-		mutex_unlock(&journal->j_flush_mutex);
-	return err;
-}
-
-static int write_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl,
-				 struct buffer_chunk *chunk)
-{
-	struct reiserfs_journal_cnode *cn;
-	int ret = 0;
-
-	jl->j_state |= LIST_TOUCHED;
-	del_from_work_list(s, jl);
-	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
-		return 0;
-	}
-
-	cn = jl->j_realblock;
-	while (cn) {
-		/*
-		 * if the blocknr == 0, this has been cleared from the hash,
-		 * skip it
-		 */
-		if (cn->blocknr == 0) {
-			goto next;
-		}
-		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
-			struct buffer_head *tmp_bh;
-			/*
-			 * we can race against journal_mark_freed when we try
-			 * to lock_buffer(cn->bh), so we have to inc the buffer
-			 * count, and recheck things after locking
-			 */
-			tmp_bh = cn->bh;
-			get_bh(tmp_bh);
-			lock_buffer(tmp_bh);
-			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
-				if (!buffer_journal_dirty(tmp_bh) ||
-				    buffer_journal_prepared(tmp_bh))
-					BUG();
-				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
-				ret++;
-			} else {
-				/* note, cn->bh might be null now */
-				unlock_buffer(tmp_bh);
-			}
-			put_bh(tmp_bh);
-		}
-next:
-		cn = cn->next;
-		cond_resched();
-	}
-	return ret;
-}
-
-/* used by flush_commit_list */
-static void dirty_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal_list *pjl;
-
-	jl->j_state |= LIST_DIRTY;
-	cn = jl->j_realblock;
-	while (cn) {
-		/*
-		 * look for a more recent transaction that logged this
-		 * buffer.  Only the most recent transaction with a buffer in
-		 * it is allowed to send that buffer to disk
-		 */
-		pjl = find_newer_jl_for_cn(cn);
-		if (!pjl && cn->blocknr && cn->bh
-		    && buffer_journal_dirty(cn->bh)) {
-			BUG_ON(!can_dirty(cn));
-			/*
-			 * if the buffer is prepared, it will either be logged
-			 * or restored.  If restored, we need to make sure
-			 * it actually gets marked dirty
-			 */
-			clear_buffer_journal_new(cn->bh);
-			if (buffer_journal_prepared(cn->bh)) {
-				set_buffer_journal_restore_dirty(cn->bh);
-			} else {
-				set_buffer_journal_test(cn->bh);
-				mark_buffer_dirty(cn->bh);
-			}
-		}
-		cn = cn->next;
-	}
-}
-
-static int kupdate_transactions(struct super_block *s,
-				struct reiserfs_journal_list *jl,
-				struct reiserfs_journal_list **next_jl,
-				unsigned int *next_trans_id,
-				int num_blocks, int num_trans)
-{
-	int ret = 0;
-	int written = 0;
-	int transactions_flushed = 0;
-	unsigned int orig_trans_id = jl->j_trans_id;
-	struct buffer_chunk chunk;
-	struct list_head *entry;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	chunk.nr = 0;
-
-	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
-	if (!journal_list_still_alive(s, orig_trans_id)) {
-		goto done;
-	}
-
-	/*
-	 * we've got j_flush_mutex held, nobody is going to delete any
-	 * of these lists out from underneath us
-	 */
-	while ((num_trans && transactions_flushed < num_trans) ||
-	       (!num_trans && written < num_blocks)) {
-
-		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
-		    atomic_read(&jl->j_commit_left)
-		    || !(jl->j_state & LIST_DIRTY)) {
-			del_from_work_list(s, jl);
-			break;
-		}
-		ret = write_one_transaction(s, jl, &chunk);
-
-		if (ret < 0)
-			goto done;
-		transactions_flushed++;
-		written += ret;
-		entry = jl->j_list.next;
-
-		/* did we wrap? */
-		if (entry == &journal->j_journal_list) {
-			break;
-		}
-		jl = JOURNAL_LIST_ENTRY(entry);
-
-		/* don't bother with older transactions */
-		if (jl->j_trans_id <= orig_trans_id)
-			break;
-	}
-	if (chunk.nr) {
-		write_chunk(&chunk);
-	}
-
-done:
-	mutex_unlock(&journal->j_flush_mutex);
-	return ret;
-}
-
-/*
- * for o_sync and fsync heavy applications, they tend to use
- * all the journa list slots with tiny transactions.  These
- * trigger lots and lots of calls to update the header block, which
- * adds seeks and slows things down.
- *
- * This function tries to clear out a large chunk of the journal lists
- * at once, which makes everything faster since only the newest journal
- * list updates the header block
- */
-static int flush_used_journal_lists(struct super_block *s,
-				    struct reiserfs_journal_list *jl)
-{
-	unsigned long len = 0;
-	unsigned long cur_len;
-	int i;
-	int limit = 256;
-	struct reiserfs_journal_list *tjl;
-	struct reiserfs_journal_list *flush_jl;
-	unsigned int trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-
-	flush_jl = tjl = jl;
-
-	/* in data logging mode, try harder to flush a lot of blocks */
-	if (reiserfs_data_log(s))
-		limit = 1024;
-	/* flush for 256 transactions or limit blocks, whichever comes first */
-	for (i = 0; i < 256 && len < limit; i++) {
-		if (atomic_read(&tjl->j_commit_left) ||
-		    tjl->j_trans_id < jl->j_trans_id) {
-			break;
-		}
-		cur_len = atomic_read(&tjl->j_nonzerolen);
-		if (cur_len > 0) {
-			tjl->j_state &= ~LIST_TOUCHED;
-		}
-		len += cur_len;
-		flush_jl = tjl;
-		if (tjl->j_list.next == &journal->j_journal_list)
-			break;
-		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
-	}
-	get_journal_list(jl);
-	get_journal_list(flush_jl);
-
-	/*
-	 * try to find a group of blocks we can flush across all the
-	 * transactions, but only bother if we've actually spanned
-	 * across multiple lists
-	 */
-	if (flush_jl != jl)
-		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-
-	flush_journal_list(s, flush_jl, 1);
-	put_journal_list(s, flush_jl);
-	put_journal_list(s, jl);
-	return 0;
-}
-
-/*
- * removes any nodes in table with name block and dev as bh.
- * only touchs the hnext and hprev pointers.
- */
-static void remove_journal_hash(struct super_block *sb,
-			 struct reiserfs_journal_cnode **table,
-			 struct reiserfs_journal_list *jl,
-			 unsigned long block, int remove_freed)
-{
-	struct reiserfs_journal_cnode *cur;
-	struct reiserfs_journal_cnode **head;
-
-	head = &(journal_hash(table, sb, block));
-	if (!head) {
-		return;
-	}
-	cur = *head;
-	while (cur) {
-		if (cur->blocknr == block && cur->sb == sb
-		    && (jl == NULL || jl == cur->jlist)
-		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
-			if (cur->hnext) {
-				cur->hnext->hprev = cur->hprev;
-			}
-			if (cur->hprev) {
-				cur->hprev->hnext = cur->hnext;
-			} else {
-				*head = cur->hnext;
-			}
-			cur->blocknr = 0;
-			cur->sb = NULL;
-			cur->state = 0;
-			/*
-			 * anybody who clears the cur->bh will also
-			 * dec the nonzerolen
-			 */
-			if (cur->bh && cur->jlist)
-				atomic_dec(&cur->jlist->j_nonzerolen);
-			cur->bh = NULL;
-			cur->jlist = NULL;
-		}
-		cur = cur->hnext;
-	}
-}
-
-static void free_journal_ram(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	kfree(journal->j_current_jl);
-	journal->j_num_lists--;
-
-	vfree(journal->j_cnode_free_orig);
-	free_list_bitmaps(sb, journal->j_list_bitmap);
-	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
-	if (journal->j_header_bh) {
-		brelse(journal->j_header_bh);
-	}
-	/*
-	 * j_header_bh is on the journal dev, make sure
-	 * not to release the journal dev until we brelse j_header_bh
-	 */
-	release_journal_dev(journal);
-	vfree(journal);
-}
-
-/*
- * call on unmount.  Only set error to 1 if you haven't made your way out
- * of read_super() yet.  Any other caller must keep error at 0.
- */
-static int do_journal_release(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb, int error)
-{
-	struct reiserfs_transaction_handle myth;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	/*
-	 * we only want to flush out transactions if we were
-	 * called with error == 0
-	 */
-	if (!error && !sb_rdonly(sb)) {
-		/* end the current trans */
-		BUG_ON(!th->t_trans_id);
-		do_journal_end(th, FLUSH_ALL);
-
-		/*
-		 * make sure something gets logged to force
-		 * our way into the flush code
-		 */
-		if (!journal_join(&myth, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
-			do_journal_end(&myth, FLUSH_ALL);
-		}
-	}
-
-	/* this also catches errors during the do_journal_end above */
-	if (!error && reiserfs_is_journal_aborted(journal)) {
-		memset(&myth, 0, sizeof(myth));
-		if (!journal_join_abort(&myth, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
-			do_journal_end(&myth, FLUSH_ALL);
-		}
-	}
-
-
-	/*
-	 * We must release the write lock here because
-	 * the workqueue job (flush_async_commit) needs this lock
-	 */
-	reiserfs_write_unlock(sb);
-
-	/*
-	 * Cancel flushing of old commits. Note that neither of these works
-	 * will be requeued because superblock is being shutdown and doesn't
-	 * have SB_ACTIVE set.
-	 */
-	reiserfs_cancel_old_flush(sb);
-	/* wait for all commits to finish */
-	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
-
-	free_journal_ram(sb);
-
-	reiserfs_write_lock(sb);
-
-	return 0;
-}
-
-/* * call on unmount.  flush all journal trans, release all alloc'd ram */
-int journal_release(struct reiserfs_transaction_handle *th,
-		    struct super_block *sb)
-{
-	return do_journal_release(th, sb, 0);
-}
-
-/* only call from an error condition inside reiserfs_read_super!  */
-int journal_release_error(struct reiserfs_transaction_handle *th,
-			  struct super_block *sb)
-{
-	return do_journal_release(th, sb, 1);
-}
-
-/*
- * compares description block with commit block.
- * returns 1 if they differ, 0 if they are the same
- */
-static int journal_compare_desc_commit(struct super_block *sb,
-				       struct reiserfs_journal_desc *desc,
-				       struct reiserfs_journal_commit *commit)
-{
-	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
-	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
-	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
-	    get_commit_trans_len(commit) <= 0) {
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * returns 0 if it did not find a description block
- * returns -1 if it found a corrupt commit block
- * returns 1 if both desc and commit were valid
- * NOTE: only called during fs mount
- */
-static int journal_transaction_is_valid(struct super_block *sb,
-					struct buffer_head *d_bh,
-					unsigned int *oldest_invalid_trans_id,
-					unsigned long *newest_mount_id)
-{
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	struct buffer_head *c_bh;
-	unsigned long offset;
-
-	if (!d_bh)
-		return 0;
-
-	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-	if (get_desc_trans_len(desc) > 0
-	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
-		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
-		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal-986: transaction "
-				       "is valid returning because trans_id %d is greater than "
-				       "oldest_invalid %lu",
-				       get_desc_trans_id(desc),
-				       *oldest_invalid_trans_id);
-			return 0;
-		}
-		if (newest_mount_id
-		    && *newest_mount_id > get_desc_mount_id(desc)) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal-1087: transaction "
-				       "is valid returning because mount_id %d is less than "
-				       "newest_mount_id %lu",
-				       get_desc_mount_id(desc),
-				       *newest_mount_id);
-			return -1;
-		}
-		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
-			reiserfs_warning(sb, "journal-2018",
-					 "Bad transaction length %d "
-					 "encountered, ignoring transaction",
-					 get_desc_trans_len(desc));
-			return -1;
-		}
-		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-
-		/*
-		 * ok, we have a journal description block,
-		 * let's see if the transaction was valid
-		 */
-		c_bh =
-		    journal_bread(sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				  ((offset + get_desc_trans_len(desc) +
-				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
-		if (!c_bh)
-			return 0;
-		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-		if (journal_compare_desc_commit(sb, desc, commit)) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal_transaction_is_valid, commit offset %ld had bad "
-				       "time %d or length %d",
-				       c_bh->b_blocknr -
-				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-				       get_commit_trans_id(commit),
-				       get_commit_trans_len(commit));
-			brelse(c_bh);
-			if (oldest_invalid_trans_id) {
-				*oldest_invalid_trans_id =
-				    get_desc_trans_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1004: "
-					       "transaction_is_valid setting oldest invalid trans_id "
-					       "to %d",
-					       get_desc_trans_id(desc));
-			}
-			return -1;
-		}
-		brelse(c_bh);
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1006: found valid "
-			       "transaction start offset %llu, len %d id %d",
-			       d_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       get_desc_trans_len(desc),
-			       get_desc_trans_id(desc));
-		return 1;
-	} else {
-		return 0;
-	}
-}
-
-static void brelse_array(struct buffer_head **heads, int num)
-{
-	int i;
-	for (i = 0; i < num; i++) {
-		brelse(heads[i]);
-	}
-}
-
-/*
- * given the start, and values for the oldest acceptable transactions,
- * this either reads in a replays a transaction, or returns because the
- * transaction is invalid, or too old.
- * NOTE: only called during fs mount
- */
-static int journal_read_transaction(struct super_block *sb,
-				    unsigned long cur_dblock,
-				    unsigned long oldest_start,
-				    unsigned int oldest_trans_id,
-				    unsigned long newest_mount_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	unsigned int trans_id = 0;
-	struct buffer_head *c_bh;
-	struct buffer_head *d_bh;
-	struct buffer_head **log_blocks = NULL;
-	struct buffer_head **real_blocks = NULL;
-	unsigned int trans_offset;
-	int i;
-	int trans_half;
-
-	d_bh = journal_bread(sb, cur_dblock);
-	if (!d_bh)
-		return 1;
-	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
-		       "journal_read_transaction, offset %llu, len %d mount_id %d",
-		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-		       get_desc_trans_len(desc), get_desc_mount_id(desc));
-	if (get_desc_trans_id(desc) < oldest_trans_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
-			       "journal_read_trans skipping because %lu is too old",
-			       cur_dblock -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-		brelse(d_bh);
-		return 1;
-	}
-	if (get_desc_mount_id(desc) != newest_mount_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
-			       "journal_read_trans skipping because %d is != "
-			       "newest_mount_id %lu", get_desc_mount_id(desc),
-			       newest_mount_id);
-		brelse(d_bh);
-		return 1;
-	}
-	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			     ((trans_offset + get_desc_trans_len(desc) + 1) %
-			      SB_ONDISK_JOURNAL_SIZE(sb)));
-	if (!c_bh) {
-		brelse(d_bh);
-		return 1;
-	}
-	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-	if (journal_compare_desc_commit(sb, desc, commit)) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal_read_transaction, "
-			       "commit offset %llu had bad time %d or length %d",
-			       c_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       get_commit_trans_id(commit),
-			       get_commit_trans_len(commit));
-		brelse(c_bh);
-		brelse(d_bh);
-		return 1;
-	}
-
-	if (bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		brelse(c_bh);
-		brelse(d_bh);
-		return -EROFS;
-	}
-
-	trans_id = get_desc_trans_id(desc);
-	/*
-	 * now we know we've got a good transaction, and it was
-	 * inside the valid time ranges
-	 */
-	log_blocks = kmalloc_array(get_desc_trans_len(desc),
-				   sizeof(struct buffer_head *),
-				   GFP_NOFS);
-	real_blocks = kmalloc_array(get_desc_trans_len(desc),
-				    sizeof(struct buffer_head *),
-				    GFP_NOFS);
-	if (!log_blocks || !real_blocks) {
-		brelse(c_bh);
-		brelse(d_bh);
-		kfree(log_blocks);
-		kfree(real_blocks);
-		reiserfs_warning(sb, "journal-1169",
-				 "kmalloc failed, unable to mount FS");
-		return -1;
-	}
-	/* get all the buffer heads */
-	trans_half = journal_trans_half(sb->s_blocksize);
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		log_blocks[i] =
-		    journal_getblk(sb,
-				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				   (trans_offset + 1 +
-				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
-		if (i < trans_half) {
-			real_blocks[i] =
-			    sb_getblk(sb,
-				      le32_to_cpu(desc->j_realblock[i]));
-		} else {
-			real_blocks[i] =
-			    sb_getblk(sb,
-				      le32_to_cpu(commit->
-						  j_realblock[i - trans_half]));
-		}
-		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
-			reiserfs_warning(sb, "journal-1207",
-					 "REPLAY FAILURE fsck required! "
-					 "Block to replay is outside of "
-					 "filesystem");
-			goto abort_replay;
-		}
-		/* make sure we don't try to replay onto log or reserved area */
-		if (is_block_in_log_or_reserved_area
-		    (sb, real_blocks[i]->b_blocknr)) {
-			reiserfs_warning(sb, "journal-1204",
-					 "REPLAY FAILURE fsck required! "
-					 "Trying to replay onto a log block");
-abort_replay:
-			brelse_array(log_blocks, i);
-			brelse_array(real_blocks, i);
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-	}
-	/* read in the log blocks, memcpy to the corresponding real block */
-	bh_read_batch(get_desc_trans_len(desc), log_blocks);
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-
-		wait_on_buffer(log_blocks[i]);
-		if (!buffer_uptodate(log_blocks[i])) {
-			reiserfs_warning(sb, "journal-1212",
-					 "REPLAY FAILURE fsck required! "
-					 "buffer write failed");
-			brelse_array(log_blocks + i,
-				     get_desc_trans_len(desc) - i);
-			brelse_array(real_blocks, get_desc_trans_len(desc));
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
-		       real_blocks[i]->b_size);
-		set_buffer_uptodate(real_blocks[i]);
-		brelse(log_blocks[i]);
-	}
-	/* flush out the real blocks */
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		set_buffer_dirty(real_blocks[i]);
-		write_dirty_buffer(real_blocks[i], 0);
-	}
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		wait_on_buffer(real_blocks[i]);
-		if (!buffer_uptodate(real_blocks[i])) {
-			reiserfs_warning(sb, "journal-1226",
-					 "REPLAY FAILURE, fsck required! "
-					 "buffer write failed");
-			brelse_array(real_blocks + i,
-				     get_desc_trans_len(desc) - i);
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-		brelse(real_blocks[i]);
-	}
-	cur_dblock =
-	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-	    ((trans_offset + get_desc_trans_len(desc) +
-	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-		       "journal-1095: setting journal " "start to offset %ld",
-		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-
-	/*
-	 * init starting values for the first transaction, in case
-	 * this is the last transaction to be replayed.
-	 */
-	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	journal->j_last_flush_trans_id = trans_id;
-	journal->j_trans_id = trans_id + 1;
-	/* check for trans_id overflow */
-	if (journal->j_trans_id == 0)
-		journal->j_trans_id = 10;
-	brelse(c_bh);
-	brelse(d_bh);
-	kfree(log_blocks);
-	kfree(real_blocks);
-	return 0;
-}
-
-/*
- * This function reads blocks starting from block and to max_block of bufsize
- * size (but no more than BUFNR blocks at a time). This proved to improve
- * mounting speed on self-rebuilding raid5 arrays at least.
- * Right now it is only used from journal code. But later we might use it
- * from other places.
- * Note: Do not use journal_getblk/sb_getblk functions here!
- */
-static struct buffer_head *reiserfs_breada(struct block_device *dev,
-					   b_blocknr_t block, int bufsize,
-					   b_blocknr_t max_block)
-{
-	struct buffer_head *bhlist[BUFNR];
-	unsigned int blocks = BUFNR;
-	struct buffer_head *bh;
-	int i, j;
-
-	bh = __getblk(dev, block, bufsize);
-	if (!bh || buffer_uptodate(bh))
-		return (bh);
-
-	if (block + BUFNR > max_block) {
-		blocks = max_block - block;
-	}
-	bhlist[0] = bh;
-	j = 1;
-	for (i = 1; i < blocks; i++) {
-		bh = __getblk(dev, block + i, bufsize);
-		if (!bh)
-			break;
-		if (buffer_uptodate(bh)) {
-			brelse(bh);
-			break;
-		} else
-			bhlist[j++] = bh;
-	}
-	bh = bhlist[0];
-	bh_read_nowait(bh, 0);
-	bh_readahead_batch(j - 1, &bhlist[1], 0);
-	for (i = 1; i < j; i++)
-		brelse(bhlist[i]);
-	wait_on_buffer(bh);
-	if (buffer_uptodate(bh))
-		return bh;
-	brelse(bh);
-	return NULL;
-}
-
-/*
- * read and replay the log
- * on a clean unmount, the journal header's next unflushed pointer will be
- * to an invalid transaction.  This tests that before finding all the
- * transactions in the log, which makes normal mount times fast.
- *
- * After a crash, this starts with the next unflushed transaction, and
- * replays until it finds one too old, or invalid.
- *
- * On exit, it sets things up so the first transaction will work correctly.
- * NOTE: only called during fs mount
- */
-static int journal_read(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_desc *desc;
-	unsigned int oldest_trans_id = 0;
-	unsigned int oldest_invalid_trans_id = 0;
-	time64_t start;
-	unsigned long oldest_start = 0;
-	unsigned long cur_dblock = 0;
-	unsigned long newest_mount_id = 9;
-	struct buffer_head *d_bh;
-	struct reiserfs_journal_header *jh;
-	int valid_journal_header = 0;
-	int replay_count = 0;
-	int continue_replay = 1;
-	int ret;
-
-	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	reiserfs_info(sb, "checking transaction log (%pg)\n",
-		      file_bdev(journal->j_bdev_file));
-	start = ktime_get_seconds();
-
-	/*
-	 * step 1, read in the journal header block.  Check the transaction
-	 * it says is the first unflushed, and if that transaction is not
-	 * valid, replay is done
-	 */
-	journal->j_header_bh = journal_bread(sb,
-					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
-					     + SB_ONDISK_JOURNAL_SIZE(sb));
-	if (!journal->j_header_bh) {
-		return 1;
-	}
-	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
-	if (le32_to_cpu(jh->j_first_unflushed_offset) <
-	    SB_ONDISK_JOURNAL_SIZE(sb)
-	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
-		oldest_start =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-		    le32_to_cpu(jh->j_first_unflushed_offset);
-		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-		newest_mount_id = le32_to_cpu(jh->j_mount_id);
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1153: found in "
-			       "header: first_unflushed_offset %d, last_flushed_trans_id "
-			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
-			       le32_to_cpu(jh->j_last_flush_trans_id));
-		valid_journal_header = 1;
-
-		/*
-		 * now, we try to read the first unflushed offset.  If it
-		 * is not valid, there is nothing more we can do, and it
-		 * makes no sense to read through the whole log.
-		 */
-		d_bh =
-		    journal_bread(sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				  le32_to_cpu(jh->j_first_unflushed_offset));
-		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
-		if (!ret) {
-			continue_replay = 0;
-		}
-		brelse(d_bh);
-		goto start_log_replay;
-	}
-
-	/*
-	 * ok, there are transactions that need to be replayed.  start
-	 * with the first log block, find all the valid transactions, and
-	 * pick out the oldest.
-	 */
-	while (continue_replay
-	       && cur_dblock <
-	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-		SB_ONDISK_JOURNAL_SIZE(sb))) {
-		/*
-		 * Note that it is required for blocksize of primary fs
-		 * device and journal device to be the same
-		 */
-		d_bh =
-		    reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
-				    sb->s_blocksize,
-				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				    SB_ONDISK_JOURNAL_SIZE(sb));
-		ret =
-		    journal_transaction_is_valid(sb, d_bh,
-						 &oldest_invalid_trans_id,
-						 &newest_mount_id);
-		if (ret == 1) {
-			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-			if (oldest_start == 0) {	/* init all oldest_ values */
-				oldest_trans_id = get_desc_trans_id(desc);
-				oldest_start = d_bh->b_blocknr;
-				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1179: Setting "
-					       "oldest_start to offset %llu, trans_id %lu",
-					       oldest_start -
-					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (sb), oldest_trans_id);
-			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
-				/* one we just read was older */
-				oldest_trans_id = get_desc_trans_id(desc);
-				oldest_start = d_bh->b_blocknr;
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1180: Resetting "
-					       "oldest_start to offset %lu, trans_id %lu",
-					       oldest_start -
-					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (sb), oldest_trans_id);
-			}
-			if (newest_mount_id < get_desc_mount_id(desc)) {
-				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1299: Setting "
-					       "newest_mount_id to %d",
-					       get_desc_mount_id(desc));
-			}
-			cur_dblock += get_desc_trans_len(desc) + 2;
-		} else {
-			cur_dblock++;
-		}
-		brelse(d_bh);
-	}
-
-start_log_replay:
-	cur_dblock = oldest_start;
-	if (oldest_trans_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1206: Starting replay "
-			       "from offset %llu, trans_id %lu",
-			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       oldest_trans_id);
-
-	}
-	replay_count = 0;
-	while (continue_replay && oldest_trans_id > 0) {
-		ret =
-		    journal_read_transaction(sb, cur_dblock, oldest_start,
-					     oldest_trans_id, newest_mount_id);
-		if (ret < 0) {
-			return ret;
-		} else if (ret != 0) {
-			break;
-		}
-		cur_dblock =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
-		replay_count++;
-		if (cur_dblock == oldest_start)
-			break;
-	}
-
-	if (oldest_trans_id == 0) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1225: No valid " "transactions found");
-	}
-	/*
-	 * j_start does not get set correctly if we don't replay any
-	 * transactions.  if we had a valid journal_header, set j_start
-	 * to the first unflushed transaction value, copy the trans_id
-	 * from the header
-	 */
-	if (valid_journal_header && replay_count == 0) {
-		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
-		journal->j_trans_id =
-		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-		/* check for trans_id overflow */
-		if (journal->j_trans_id == 0)
-			journal->j_trans_id = 10;
-		journal->j_last_flush_trans_id =
-		    le32_to_cpu(jh->j_last_flush_trans_id);
-		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
-	} else {
-		journal->j_mount_id = newest_mount_id + 1;
-	}
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
-		       "newest_mount_id to %lu", journal->j_mount_id);
-	journal->j_first_unflushed_offset = journal->j_start;
-	if (replay_count > 0) {
-		reiserfs_info(sb,
-			      "replayed %d transactions in %lu seconds\n",
-			      replay_count, ktime_get_seconds() - start);
-	}
-	/* needed to satisfy the locking in _update_journal_header_block */
-	reiserfs_write_lock(sb);
-	if (!bdev_read_only(sb->s_bdev) &&
-	    _update_journal_header_block(sb, journal->j_start,
-					 journal->j_last_flush_trans_id)) {
-		reiserfs_write_unlock(sb);
-		/*
-		 * replay failed, caller must call free_journal_ram and abort
-		 * the mount
-		 */
-		return -1;
-	}
-	reiserfs_write_unlock(sb);
-	return 0;
-}
-
-static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
-{
-	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
-	INIT_LIST_HEAD(&jl->j_list);
-	INIT_LIST_HEAD(&jl->j_working_list);
-	INIT_LIST_HEAD(&jl->j_tail_bh_list);
-	INIT_LIST_HEAD(&jl->j_bh_list);
-	mutex_init(&jl->j_commit_mutex);
-	SB_JOURNAL(s)->j_num_lists++;
-	get_journal_list(jl);
-	return jl;
-}
-
-static void journal_list_init(struct super_block *sb)
-{
-	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
-}
-
-static void release_journal_dev(struct reiserfs_journal *journal)
-{
-	if (journal->j_bdev_file) {
-		bdev_fput(journal->j_bdev_file);
-		journal->j_bdev_file = NULL;
-	}
-}
-
-static int journal_init_dev(struct super_block *super,
-			    struct reiserfs_journal *journal,
-			    const char *jdev_name)
-{
-	blk_mode_t blkdev_mode = BLK_OPEN_READ;
-	void *holder = journal;
-	int result;
-	dev_t jdev;
-
-	result = 0;
-
-	journal->j_bdev_file = NULL;
-	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
-	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
-
-	if (!bdev_read_only(super->s_bdev))
-		blkdev_mode |= BLK_OPEN_WRITE;
-
-	/* there is no "jdev" option and journal is on separate device */
-	if ((!jdev_name || !jdev_name[0])) {
-		if (jdev == super->s_dev)
-			holder = NULL;
-		journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
-							  holder, NULL);
-		if (IS_ERR(journal->j_bdev_file)) {
-			result = PTR_ERR(journal->j_bdev_file);
-			journal->j_bdev_file = NULL;
-			reiserfs_warning(super, "sh-458",
-					 "cannot init journal device unknown-block(%u,%u): %i",
-					 MAJOR(jdev), MINOR(jdev), result);
-			return result;
-		} else if (jdev != super->s_dev)
-			set_blocksize(journal->j_bdev_file, super->s_blocksize);
-
-		return 0;
-	}
-
-	journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
-						   holder, NULL);
-	if (IS_ERR(journal->j_bdev_file)) {
-		result = PTR_ERR(journal->j_bdev_file);
-		journal->j_bdev_file = NULL;
-		reiserfs_warning(super, "sh-457",
-				 "journal_init_dev: Cannot open '%s': %i",
-				 jdev_name, result);
-		return result;
-	}
-
-	set_blocksize(journal->j_bdev_file, super->s_blocksize);
-	reiserfs_info(super,
-		      "journal_init_dev: journal device: %pg\n",
-		      file_bdev(journal->j_bdev_file));
-	return 0;
-}
-
-/*
- * When creating/tuning a file system user can assign some
- * journal params within boundaries which depend on the ratio
- * blocksize/standard_blocksize.
- *
- * For blocks >= standard_blocksize transaction size should
- * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more
- * then JOURNAL_TRANS_MAX_DEFAULT.
- *
- * For blocks < standard_blocksize these boundaries should be
- * decreased proportionally.
- */
-#define REISERFS_STANDARD_BLKSIZE (4096)
-
-static int check_advise_trans_params(struct super_block *sb,
-				     struct reiserfs_journal *journal)
-{
-        if (journal->j_trans_max) {
-		/* Non-default journal params.  Do sanity check for them. */
-	        int ratio = 1;
-		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
-		        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
-
-		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
-		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
-		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
-		    JOURNAL_MIN_RATIO) {
-			reiserfs_warning(sb, "sh-462",
-					 "bad transaction max size (%u). "
-					 "FSCK?", journal->j_trans_max);
-			return 1;
-		}
-		if (journal->j_max_batch != (journal->j_trans_max) *
-		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
-			reiserfs_warning(sb, "sh-463",
-					 "bad transaction max batch (%u). "
-					 "FSCK?", journal->j_max_batch);
-			return 1;
-		}
-	} else {
-		/*
-		 * Default journal params.
-		 * The file system was created by old version
-		 * of mkreiserfs, so some fields contain zeros,
-		 * and we need to advise proper values for them
-		 */
-		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
-			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
-					 sb->s_blocksize);
-			return 1;
-		}
-		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
-		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
-		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
-	}
-	return 0;
-}
-
-/* must be called once on fs mount.  calls journal_read for you */
-int journal_init(struct super_block *sb, const char *j_dev_name,
-		 int old_format, unsigned int commit_max_age)
-{
-	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
-	struct buffer_head *bhjh;
-	struct reiserfs_super_block *rs;
-	struct reiserfs_journal_header *jh;
-	struct reiserfs_journal *journal;
-	struct reiserfs_journal_list *jl;
-	int ret;
-
-	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
-	if (!journal) {
-		reiserfs_warning(sb, "journal-1256",
-				 "unable to get memory for journal structure");
-		return 1;
-	}
-	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
-	INIT_LIST_HEAD(&journal->j_prealloc_list);
-	INIT_LIST_HEAD(&journal->j_working_list);
-	INIT_LIST_HEAD(&journal->j_journal_list);
-	journal->j_persistent_trans = 0;
-	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
-					   reiserfs_bmap_count(sb)))
-		goto free_and_return;
-
-	allocate_bitmap_nodes(sb);
-
-	/* reserved for journal area support */
-	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
-						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
-						 / sb->s_blocksize +
-						 reiserfs_bmap_count(sb) +
-						 1 :
-						 REISERFS_DISK_OFFSET_IN_BYTES /
-						 sb->s_blocksize + 2);
-
-	/*
-	 * Sanity check to see is the standard journal fitting
-	 * within first bitmap (actual for small blocksizes)
-	 */
-	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
-	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
-	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
-		reiserfs_warning(sb, "journal-1393",
-				 "journal does not fit for area addressed "
-				 "by first of bitmap blocks. It starts at "
-				 "%u and its size is %u. Block size %ld",
-				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
-				 SB_ONDISK_JOURNAL_SIZE(sb),
-				 sb->s_blocksize);
-		goto free_and_return;
-	}
-
-	/*
-	 * Sanity check to see if journal first block is correct.
-	 * If journal first block is invalid it can cause
-	 * zeroing important superblock members.
-	 */
-	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
-	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
-		reiserfs_warning(sb, "journal-1393",
-				 "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
-				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
-				 SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-		goto free_and_return;
-	}
-
-	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
-		reiserfs_warning(sb, "sh-462",
-				 "unable to initialize journal device");
-		goto free_and_return;
-	}
-
-	rs = SB_DISK_SUPER_BLOCK(sb);
-
-	/* read journal header */
-	bhjh = journal_bread(sb,
-			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			     SB_ONDISK_JOURNAL_SIZE(sb));
-	if (!bhjh) {
-		reiserfs_warning(sb, "sh-459",
-				 "unable to read journal header");
-		goto free_and_return;
-	}
-	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
-
-	/* make sure that journal matches to the super block */
-	if (is_reiserfs_jr(rs)
-	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
-		sb_jp_journal_magic(rs))) {
-		reiserfs_warning(sb, "sh-460",
-				 "journal header magic %x (device %pg) does "
-				 "not match to magic found in super block %x",
-				 jh->jh_journal.jp_journal_magic,
-				 file_bdev(journal->j_bdev_file),
-				 sb_jp_journal_magic(rs));
-		brelse(bhjh);
-		goto free_and_return;
-	}
-
-	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
-	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
-	journal->j_max_commit_age =
-	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
-	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-
-	if (check_advise_trans_params(sb, journal) != 0)
-	        goto free_and_return;
-	journal->j_default_max_commit_age = journal->j_max_commit_age;
-
-	if (commit_max_age != 0) {
-		journal->j_max_commit_age = commit_max_age;
-		journal->j_max_trans_age = commit_max_age;
-	}
-
-	reiserfs_info(sb, "journal params: device %pg, size %u, "
-		      "journal first block %u, max trans len %u, max batch %u, "
-		      "max commit age %u, max trans age %u\n",
-		      file_bdev(journal->j_bdev_file),
-		      SB_ONDISK_JOURNAL_SIZE(sb),
-		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-		      journal->j_trans_max,
-		      journal->j_max_batch,
-		      journal->j_max_commit_age, journal->j_max_trans_age);
-
-	brelse(bhjh);
-
-	journal->j_list_bitmap_index = 0;
-	journal_list_init(sb);
-
-	memset(journal->j_list_hash_table, 0,
-	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-
-	INIT_LIST_HEAD(&journal->j_dirty_buffers);
-	spin_lock_init(&journal->j_dirty_buffers_lock);
-
-	journal->j_start = 0;
-	journal->j_len = 0;
-	journal->j_len_alloc = 0;
-	atomic_set(&journal->j_wcount, 0);
-	atomic_set(&journal->j_async_throttle, 0);
-	journal->j_bcount = 0;
-	journal->j_trans_start_time = 0;
-	journal->j_last = NULL;
-	journal->j_first = NULL;
-	init_waitqueue_head(&journal->j_join_wait);
-	mutex_init(&journal->j_mutex);
-	mutex_init(&journal->j_flush_mutex);
-
-	journal->j_trans_id = 10;
-	journal->j_mount_id = 10;
-	journal->j_state = 0;
-	atomic_set(&journal->j_jlock, 0);
-	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
-	journal->j_cnode_free_orig = journal->j_cnode_free_list;
-	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
-	journal->j_cnode_used = 0;
-	journal->j_must_wait = 0;
-
-	if (journal->j_cnode_free == 0) {
-		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
-		                 "allocation failed (%ld bytes). Journal is "
-		                 "too large for available memory. Usually "
-		                 "this is due to a journal that is too large.",
-		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
-        	goto free_and_return;
-	}
-
-	init_journal_hash(sb);
-	jl = journal->j_current_jl;
-
-	/*
-	 * get_list_bitmap() may call flush_commit_list() which
-	 * requires the lock. Calling flush_commit_list() shouldn't happen
-	 * this early but I like to be paranoid.
-	 */
-	reiserfs_write_lock(sb);
-	jl->j_list_bitmap = get_list_bitmap(sb, jl);
-	reiserfs_write_unlock(sb);
-	if (!jl->j_list_bitmap) {
-		reiserfs_warning(sb, "journal-2005",
-				 "get_list_bitmap failed for journal list 0");
-		goto free_and_return;
-	}
-
-	ret = journal_read(sb);
-	if (ret < 0) {
-		reiserfs_warning(sb, "reiserfs-2006",
-				 "Replay Failure, unable to mount");
-		goto free_and_return;
-	}
-
-	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
-	journal->j_work_sb = sb;
-	return 0;
-free_and_return:
-	free_journal_ram(sb);
-	return 1;
-}
-
-/*
- * test for a polite end of the current transaction.  Used by file_write,
- * and should be used by delete to make sure they don't write more than
- * can fit inside a single transaction
- */
-int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
-				   int new_alloc)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
-	time64_t now = ktime_get_seconds();
-	/* cannot restart while nested */
-	BUG_ON(!th->t_trans_id);
-	if (th->t_refcount > 1)
-		return 0;
-	if (journal->j_must_wait > 0 ||
-	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
-	    atomic_read(&journal->j_jlock) ||
-	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
-	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
-		return 1;
-	}
-
-	journal->j_len_alloc += new_alloc;
-	th->t_blocks_allocated += new_alloc ;
-	return 0;
-}
-
-/* this must be called inside a transaction */
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
-	BUG_ON(!th->t_trans_id);
-	journal->j_must_wait = 1;
-	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
-	return;
-}
-
-/* this must be called without a transaction started */
-void reiserfs_allow_writes(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
-	wake_up(&journal->j_join_wait);
-}
-
-/* this must be called without a transaction started */
-void reiserfs_wait_on_write_block(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	wait_event(journal->j_join_wait,
-		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
-}
-
-static void queue_log_writer(struct super_block *s)
-{
-	wait_queue_entry_t wait;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	set_bit(J_WRITERS_QUEUED, &journal->j_state);
-
-	/*
-	 * we don't want to use wait_event here because
-	 * we only want to wait once.
-	 */
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&journal->j_join_wait, &wait);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
-		int depth = reiserfs_write_unlock_nested(s);
-		schedule();
-		reiserfs_write_lock_nested(s, depth);
-	}
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&journal->j_join_wait, &wait);
-}
-
-static void wake_queued_writers(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
-		wake_up(&journal->j_join_wait);
-}
-
-static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	unsigned long bcount = journal->j_bcount;
-	while (1) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(sb);
-		schedule_timeout_uninterruptible(1);
-		reiserfs_write_lock_nested(sb, depth);
-
-		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
-		while ((atomic_read(&journal->j_wcount) > 0 ||
-			atomic_read(&journal->j_jlock)) &&
-		       journal->j_trans_id == trans_id) {
-			queue_log_writer(sb);
-		}
-		if (journal->j_trans_id != trans_id)
-			break;
-		if (bcount == journal->j_bcount)
-			break;
-		bcount = journal->j_bcount;
-	}
-}
-
-/*
- * join == true if you must join an existing transaction.
- * join == false if you can deal with waiting for others to finish
- *
- * this will block until the transaction is joinable.  send the number of
- * blocks you expect to use in nblocks.
-*/
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb, unsigned long nblocks,
-			      int join)
-{
-	time64_t now = ktime_get_seconds();
-	unsigned int old_trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_transaction_handle myth;
-	int retval;
-	int depth;
-
-	reiserfs_check_lock_depth(sb, "journal_begin");
-	BUG_ON(nblocks > journal->j_trans_max);
-
-	PROC_INFO_INC(sb, journal.journal_being);
-	/* set here for journal_join */
-	th->t_refcount = 1;
-	th->t_super = sb;
-
-relock:
-	lock_journal(sb);
-	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
-		unlock_journal(sb);
-		retval = journal->j_errno;
-		goto out_fail;
-	}
-	journal->j_bcount++;
-
-	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
-		unlock_journal(sb);
-		depth = reiserfs_write_unlock_nested(sb);
-		reiserfs_wait_on_write_block(sb);
-		reiserfs_write_lock_nested(sb, depth);
-		PROC_INFO_INC(sb, journal.journal_relock_writers);
-		goto relock;
-	}
-	now = ktime_get_seconds();
-
-	/*
-	 * if there is no room in the journal OR
-	 * if this transaction is too old, and we weren't called joinable,
-	 * wait for it to finish before beginning we don't sleep if there
-	 * aren't other writers
-	 */
-
-	if ((!join && journal->j_must_wait > 0) ||
-	    (!join
-	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
-	    || (!join && atomic_read(&journal->j_wcount) > 0
-		&& journal->j_trans_start_time > 0
-		&& (now - journal->j_trans_start_time) >
-		journal->j_max_trans_age) || (!join
-					      && atomic_read(&journal->j_jlock))
-	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
-
-		old_trans_id = journal->j_trans_id;
-		/* allow others to finish this transaction */
-		unlock_journal(sb);
-
-		if (!join && (journal->j_len_alloc + nblocks + 2) >=
-		    journal->j_max_batch &&
-		    ((journal->j_len + nblocks + 2) * 100) <
-		    (journal->j_len_alloc * 75)) {
-			if (atomic_read(&journal->j_wcount) > 10) {
-				queue_log_writer(sb);
-				goto relock;
-			}
-		}
-		/*
-		 * don't mess with joining the transaction if all we
-		 * have to do is wait for someone else to do a commit
-		 */
-		if (atomic_read(&journal->j_jlock)) {
-			while (journal->j_trans_id == old_trans_id &&
-			       atomic_read(&journal->j_jlock)) {
-				queue_log_writer(sb);
-			}
-			goto relock;
-		}
-		retval = journal_join(&myth, sb);
-		if (retval)
-			goto out_fail;
-
-		/* someone might have ended the transaction while we joined */
-		if (old_trans_id != journal->j_trans_id) {
-			retval = do_journal_end(&myth, 0);
-		} else {
-			retval = do_journal_end(&myth, COMMIT_NOW);
-		}
-
-		if (retval)
-			goto out_fail;
-
-		PROC_INFO_INC(sb, journal.journal_relock_wcount);
-		goto relock;
-	}
-	/* we are the first writer, set trans_id */
-	if (journal->j_trans_start_time == 0) {
-		journal->j_trans_start_time = ktime_get_seconds();
-	}
-	atomic_inc(&journal->j_wcount);
-	journal->j_len_alloc += nblocks;
-	th->t_blocks_logged = 0;
-	th->t_blocks_allocated = nblocks;
-	th->t_trans_id = journal->j_trans_id;
-	unlock_journal(sb);
-	INIT_LIST_HEAD(&th->t_list);
-	return 0;
-
-out_fail:
-	memset(th, 0, sizeof(*th));
-	/*
-	 * Re-set th->t_super, so we can properly keep track of how many
-	 * persistent transactions there are. We need to do this so if this
-	 * call is part of a failed restart_transaction, we can free it later
-	 */
-	th->t_super = sb;
-	return retval;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
-								    super_block
-								    *s,
-								    int nblocks)
-{
-	int ret;
-	struct reiserfs_transaction_handle *th;
-
-	/*
-	 * if we're nesting into an existing transaction.  It will be
-	 * persistent on its own
-	 */
-	if (reiserfs_transaction_running(s)) {
-		th = current->journal_info;
-		th->t_refcount++;
-		BUG_ON(th->t_refcount < 2);
-
-		return th;
-	}
-	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
-	if (!th)
-		return NULL;
-	ret = journal_begin(th, s, nblocks);
-	if (ret) {
-		kfree(th);
-		return NULL;
-	}
-
-	SB_JOURNAL(s)->j_persistent_trans++;
-	return th;
-}
-
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *s = th->t_super;
-	int ret = 0;
-	if (th->t_trans_id)
-		ret = journal_end(th);
-	else
-		ret = -EIO;
-	if (th->t_refcount == 0) {
-		SB_JOURNAL(s)->j_persistent_trans--;
-		kfree(th);
-	}
-	return ret;
-}
-
-static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *sb)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-	/*
-	 * this keeps do_journal_end from NULLing out the
-	 * current->journal_info pointer
-	 */
-	th->t_handle_save = cur_th;
-	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
-}
-
-int journal_join_abort(struct reiserfs_transaction_handle *th,
-		       struct super_block *sb)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-	/*
-	 * this keeps do_journal_end from NULLing out the
-	 * current->journal_info pointer
-	 */
-	th->t_handle_save = cur_th;
-	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
-}
-
-int journal_begin(struct reiserfs_transaction_handle *th,
-		  struct super_block *sb, unsigned long nblocks)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-	int ret;
-
-	th->t_handle_save = NULL;
-	if (cur_th) {
-		/* we are nesting into the current transaction */
-		if (cur_th->t_super == sb) {
-			BUG_ON(!cur_th->t_refcount);
-			cur_th->t_refcount++;
-			memcpy(th, cur_th, sizeof(*th));
-			if (th->t_refcount <= 1)
-				reiserfs_warning(sb, "reiserfs-2005",
-						 "BAD: refcount <= 1, but "
-						 "journal_info != 0");
-			return 0;
-		} else {
-			/*
-			 * we've ended up with a handle from a different
-			 * filesystem.  save it and restore on journal_end.
-			 * This should never really happen...
-			 */
-			reiserfs_warning(sb, "clm-2100",
-					 "nesting info a different FS");
-			th->t_handle_save = current->journal_info;
-			current->journal_info = th;
-		}
-	} else {
-		current->journal_info = th;
-	}
-	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
-	BUG_ON(current->journal_info != th);
-
-	/*
-	 * I guess this boils down to being the reciprocal of clm-2100 above.
-	 * If do_journal_begin_r fails, we need to put it back, since
-	 * journal_end won't be called to do it. */
-	if (ret)
-		current->journal_info = th->t_handle_save;
-	else
-		BUG_ON(!th->t_refcount);
-
-	return ret;
-}
-
-/*
- * puts bh into the current transaction.  If it was already there, reorders
- * removes the old pointers from the hash, and puts new ones in (to make
- * sure replay happen in the right order).
- *
- * if it was dirty, cleans and files onto the clean list.  I can't let it
- * be dirty again until the transaction is committed.
- *
- * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
- */
-int journal_mark_dirty(struct reiserfs_transaction_handle *th,
-		       struct buffer_head *bh)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn = NULL;
-	int count_already_incd = 0;
-	int prepared = 0;
-	BUG_ON(!th->t_trans_id);
-
-	PROC_INFO_INC(sb, journal.mark_dirty);
-	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super, "journal-1577",
-			       "handle trans id %ld != current trans id %ld",
-			       th->t_trans_id, journal->j_trans_id);
-	}
-
-	prepared = test_clear_buffer_journal_prepared(bh);
-	clear_buffer_journal_restore_dirty(bh);
-	/* already in this transaction, we are done */
-	if (buffer_journaled(bh)) {
-		PROC_INFO_INC(sb, journal.mark_dirty_already);
-		return 0;
-	}
-
-	/*
-	 * this must be turned into a panic instead of a warning.  We can't
-	 * allow a dirty or journal_dirty or locked buffer to be logged, as
-	 * some changes could get to disk too early.  NOT GOOD.
-	 */
-	if (!prepared || buffer_dirty(bh)) {
-		reiserfs_warning(sb, "journal-1777",
-				 "buffer %llu bad state "
-				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
-				 (unsigned long long)bh->b_blocknr,
-				 prepared ? ' ' : '!',
-				 buffer_locked(bh) ? ' ' : '!',
-				 buffer_dirty(bh) ? ' ' : '!',
-				 buffer_journal_dirty(bh) ? ' ' : '!');
-	}
-
-	if (atomic_read(&journal->j_wcount) <= 0) {
-		reiserfs_warning(sb, "journal-1409",
-				 "returning because j_wcount was %d",
-				 atomic_read(&journal->j_wcount));
-		return 1;
-	}
-	/*
-	 * this error means I've screwed up, and we've overflowed
-	 * the transaction.  Nothing can be done here, except make the
-	 * FS readonly or panic.
-	 */
-	if (journal->j_len >= journal->j_trans_max) {
-		reiserfs_panic(th->t_super, "journal-1413",
-			       "j_len (%lu) is too big",
-			       journal->j_len);
-	}
-
-	if (buffer_journal_dirty(bh)) {
-		count_already_incd = 1;
-		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
-		clear_buffer_journal_dirty(bh);
-	}
-
-	if (journal->j_len > journal->j_len_alloc) {
-		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
-	}
-
-	set_buffer_journaled(bh);
-
-	/* now put this guy on the end */
-	if (!cn) {
-		cn = get_cnode(sb);
-		if (!cn) {
-			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
-		}
-
-		if (th->t_blocks_logged == th->t_blocks_allocated) {
-			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
-			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
-		}
-		th->t_blocks_logged++;
-		journal->j_len++;
-
-		cn->bh = bh;
-		cn->blocknr = bh->b_blocknr;
-		cn->sb = sb;
-		cn->jlist = NULL;
-		insert_journal_hash(journal->j_hash_table, cn);
-		if (!count_already_incd) {
-			get_bh(bh);
-		}
-	}
-	cn->next = NULL;
-	cn->prev = journal->j_last;
-	cn->bh = bh;
-	if (journal->j_last) {
-		journal->j_last->next = cn;
-		journal->j_last = cn;
-	} else {
-		journal->j_first = cn;
-		journal->j_last = cn;
-	}
-	reiserfs_schedule_old_flush(sb);
-	return 0;
-}
-
-int journal_end(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *sb = th->t_super;
-	if (!current->journal_info && th->t_refcount > 1)
-		reiserfs_warning(sb, "REISER-NESTING",
-				 "th NULL, refcount %d", th->t_refcount);
-
-	if (!th->t_trans_id) {
-		WARN_ON(1);
-		return -EIO;
-	}
-
-	th->t_refcount--;
-	if (th->t_refcount > 0) {
-		struct reiserfs_transaction_handle *cur_th =
-		    current->journal_info;
-
-		/*
-		 * we aren't allowed to close a nested transaction on a
-		 * different filesystem from the one in the task struct
-		 */
-		BUG_ON(cur_th->t_super != th->t_super);
-
-		if (th != cur_th) {
-			memcpy(current->journal_info, th, sizeof(*th));
-			th->t_trans_id = 0;
-		}
-		return 0;
-	} else {
-		return do_journal_end(th, 0);
-	}
-}
-
-/*
- * removes from the current transaction, relsing and descrementing any counters.
- * also files the removed buffer directly onto the clean list
- *
- * called by journal_mark_freed when a block has been deleted
- *
- * returns 1 if it cleaned and relsed the buffer. 0 otherwise
- */
-static int remove_from_transaction(struct super_block *sb,
-				   b_blocknr_t blocknr, int already_cleaned)
-{
-	struct buffer_head *bh;
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int ret = 0;
-
-	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
-	if (!cn || !cn->bh) {
-		return ret;
-	}
-	bh = cn->bh;
-	if (cn->prev) {
-		cn->prev->next = cn->next;
-	}
-	if (cn->next) {
-		cn->next->prev = cn->prev;
-	}
-	if (cn == journal->j_first) {
-		journal->j_first = cn->next;
-	}
-	if (cn == journal->j_last) {
-		journal->j_last = cn->prev;
-	}
-	remove_journal_hash(sb, journal->j_hash_table, NULL,
-			    bh->b_blocknr, 0);
-	clear_buffer_journaled(bh);	/* don't log this one */
-
-	if (!already_cleaned) {
-		clear_buffer_journal_dirty(bh);
-		clear_buffer_dirty(bh);
-		clear_buffer_journal_test(bh);
-		put_bh(bh);
-		if (atomic_read(&bh->b_count) < 0) {
-			reiserfs_warning(sb, "journal-1752",
-					 "b_count < 0");
-		}
-		ret = 1;
-	}
-	journal->j_len--;
-	journal->j_len_alloc--;
-	free_cnode(sb, cn);
-	return ret;
-}
-
-/*
- * for any cnode in a journal list, it can only be dirtied of all the
- * transactions that include it are committed to disk.
- * this checks through each transaction, and returns 1 if you are allowed
- * to dirty, and 0 if you aren't
- *
- * it is called by dirty_journal_list, which is called after
- * flush_commit_list has gotten all the log blocks for a given
- * transaction on disk
- *
- */
-static int can_dirty(struct reiserfs_journal_cnode *cn)
-{
-	struct super_block *sb = cn->sb;
-	b_blocknr_t blocknr = cn->blocknr;
-	struct reiserfs_journal_cnode *cur = cn->hprev;
-	int can_dirty = 1;
-
-	/*
-	 * first test hprev.  These are all newer than cn, so any node here
-	 * with the same block number and dev means this node can't be sent
-	 * to disk right now.
-	 */
-	while (cur && can_dirty) {
-		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
-		    cur->blocknr == blocknr) {
-			can_dirty = 0;
-		}
-		cur = cur->hprev;
-	}
-	/*
-	 * then test hnext.  These are all older than cn.  As long as they
-	 * are committed to the log, it is safe to write cn to disk
-	 */
-	cur = cn->hnext;
-	while (cur && can_dirty) {
-		if (cur->jlist && cur->jlist->j_len > 0 &&
-		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
-		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
-			can_dirty = 0;
-		}
-		cur = cur->hnext;
-	}
-	return can_dirty;
-}
-
-/*
- * syncs the commit blocks, but does not force the real buffers to disk
- * will wait until the current transaction is done/committed before returning
- */
-int journal_end_sync(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	BUG_ON(!th->t_trans_id);
-	/* you can sync while nested, very, very bad */
-	BUG_ON(th->t_refcount > 1);
-	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-					     1);
-		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
-	}
-	return do_journal_end(th, COMMIT_NOW | WAIT);
-}
-
-/* writeback the pending async commits to disk */
-static void flush_async_commits(struct work_struct *work)
-{
-	struct reiserfs_journal *journal =
-		container_of(work, struct reiserfs_journal, j_work.work);
-	struct super_block *sb = journal->j_work_sb;
-	struct reiserfs_journal_list *jl;
-	struct list_head *entry;
-
-	reiserfs_write_lock(sb);
-	if (!list_empty(&journal->j_journal_list)) {
-		/* last entry is the youngest, commit it and you get everything */
-		entry = journal->j_journal_list.prev;
-		jl = JOURNAL_LIST_ENTRY(entry);
-		flush_commit_list(sb, jl, 1);
-	}
-	reiserfs_write_unlock(sb);
-}
-
-/*
- * flushes any old transactions to disk
- * ends the current transaction if it is too old
- */
-void reiserfs_flush_old_commits(struct super_block *sb)
-{
-	time64_t now;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	now = ktime_get_seconds();
-	/*
-	 * safety check so we don't flush while we are replaying the log during
-	 * mount
-	 */
-	if (list_empty(&journal->j_journal_list))
-		return;
-
-	/*
-	 * check the current transaction.  If there are no writers, and it is
-	 * too old, finish it, and force the commit blocks to disk
-	 */
-	if (atomic_read(&journal->j_wcount) <= 0 &&
-	    journal->j_trans_start_time > 0 &&
-	    journal->j_len > 0 &&
-	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-		if (!journal_join(&th, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
-
-			/*
-			 * we're only being called from kreiserfsd, it makes
-			 * no sense to do an async commit so that kreiserfsd
-			 * can do it later
-			 */
-			do_journal_end(&th, COMMIT_NOW | WAIT);
-		}
-	}
-}
-
-/*
- * returns 0 if do_journal_end should return right away, returns 1 if
- * do_journal_end should finish the commit
- *
- * if the current transaction is too old, but still has writers, this will
- * wait on j_join_wait until all the writers are done.  By the time it
- * wakes up, the transaction it was called has already ended, so it just
- * flushes the commit list and returns 0.
- *
- * Won't batch when flush or commit_now is set.  Also won't batch when
- * others are waiting on j_join_wait.
- *
- * Note, we can't allow the journal_end to proceed while there are still
- * writers in the log.
- */
-static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
-
-	time64_t now;
-	int flush = flags & FLUSH_ALL;
-	int commit_now = flags & COMMIT_NOW;
-	int wait_on_commit = flags & WAIT;
-	struct reiserfs_journal_list *jl;
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	BUG_ON(!th->t_trans_id);
-
-	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super, "journal-1577",
-			       "handle trans id %ld != current trans id %ld",
-			       th->t_trans_id, journal->j_trans_id);
-	}
-
-	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
-	/* <= 0 is allowed.  unmounting might not call begin */
-	if (atomic_read(&journal->j_wcount) > 0)
-		atomic_dec(&journal->j_wcount);
-
-	/*
-	 * BUG, deal with case where j_len is 0, but people previously
-	 * freed blocks need to be released will be dealt with by next
-	 * transaction that actually writes something, but should be taken
-	 * care of in this trans
-	 */
-	BUG_ON(journal->j_len == 0);
-
-	/*
-	 * if wcount > 0, and we are called to with flush or commit_now,
-	 * we wait on j_join_wait.  We will wake up when the last writer has
-	 * finished the transaction, and started it on its way to the disk.
-	 * Then, we flush the commit or journal list, and just return 0
-	 * because the rest of journal end was already done for this
-	 * transaction.
-	 */
-	if (atomic_read(&journal->j_wcount) > 0) {
-		if (flush || commit_now) {
-			unsigned trans_id;
-
-			jl = journal->j_current_jl;
-			trans_id = jl->j_trans_id;
-			if (wait_on_commit)
-				jl->j_state |= LIST_COMMIT_PENDING;
-			atomic_set(&journal->j_jlock, 1);
-			if (flush) {
-				journal->j_next_full_flush = 1;
-			}
-			unlock_journal(sb);
-
-			/*
-			 * sleep while the current transaction is
-			 * still j_jlocked
-			 */
-			while (journal->j_trans_id == trans_id) {
-				if (atomic_read(&journal->j_jlock)) {
-					queue_log_writer(sb);
-				} else {
-					lock_journal(sb);
-					if (journal->j_trans_id == trans_id) {
-						atomic_set(&journal->j_jlock,
-							   1);
-					}
-					unlock_journal(sb);
-				}
-			}
-			BUG_ON(journal->j_trans_id == trans_id);
-
-			if (commit_now
-			    && journal_list_still_alive(sb, trans_id)
-			    && wait_on_commit) {
-				flush_commit_list(sb, jl, 1);
-			}
-			return 0;
-		}
-		unlock_journal(sb);
-		return 0;
-	}
-
-	/* deal with old transactions where we are the last writers */
-	now = ktime_get_seconds();
-	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-		commit_now = 1;
-		journal->j_next_async_flush = 1;
-	}
-	/* don't batch when someone is waiting on j_join_wait */
-	/* don't batch when syncing the commit or flushing the whole trans */
-	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
-	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
-	    && journal->j_len_alloc < journal->j_max_batch
-	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
-		journal->j_bcount++;
-		unlock_journal(sb);
-		return 0;
-	}
-
-	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
-		reiserfs_panic(sb, "journal-003",
-			       "j_start (%ld) is too high",
-			       journal->j_start);
-	}
-	return 1;
-}
-
-/*
- * Does all the work that makes deleting blocks safe.
- * when deleting a block mark BH_JNew, just remove it from the current
- * transaction, clean it's buffer_head and move on.
- *
- * otherwise:
- * set a bit for the block in the journal bitmap.  That will prevent it from
- * being allocated for unformatted nodes before this transaction has finished.
- *
- * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
- * That will prevent any old transactions with this block from trying to flush
- * to the real location.  Since we aren't removing the cnode from the
- * journal_list_hash, *the block can't be reallocated yet.
- *
- * Then remove it from the current transaction, decrementing any counters and
- * filing it on the clean list.
- */
-int journal_mark_freed(struct reiserfs_transaction_handle *th,
-		       struct super_block *sb, b_blocknr_t blocknr)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn = NULL;
-	struct buffer_head *bh = NULL;
-	struct reiserfs_list_bitmap *jb = NULL;
-	int cleaned = 0;
-	BUG_ON(!th->t_trans_id);
-
-	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
-	if (cn && cn->bh) {
-		bh = cn->bh;
-		get_bh(bh);
-	}
-	/* if it is journal new, we just remove it from this transaction */
-	if (bh && buffer_journal_new(bh)) {
-		clear_buffer_journal_new(bh);
-		clear_prepared_bits(bh);
-		reiserfs_clean_and_file_buffer(bh);
-		cleaned = remove_from_transaction(sb, blocknr, cleaned);
-	} else {
-		/*
-		 * set the bit for this block in the journal bitmap
-		 * for this transaction
-		 */
-		jb = journal->j_current_jl->j_list_bitmap;
-		if (!jb) {
-			reiserfs_panic(sb, "journal-1702",
-				       "journal_list_bitmap is NULL");
-		}
-		set_bit_in_list_bitmap(sb, blocknr, jb);
-
-		/* Note, the entire while loop is not allowed to schedule.  */
-
-		if (bh) {
-			clear_prepared_bits(bh);
-			reiserfs_clean_and_file_buffer(bh);
-		}
-		cleaned = remove_from_transaction(sb, blocknr, cleaned);
-
-		/*
-		 * find all older transactions with this block,
-		 * make sure they don't try to write it out
-		 */
-		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
-					  blocknr);
-		while (cn) {
-			if (sb == cn->sb && blocknr == cn->blocknr) {
-				set_bit(BLOCK_FREED, &cn->state);
-				if (cn->bh) {
-					/*
-					 * remove_from_transaction will brelse
-					 * the buffer if it was in the current
-					 * trans
-					 */
-					if (!cleaned) {
-						clear_buffer_journal_dirty(cn->
-									   bh);
-						clear_buffer_dirty(cn->bh);
-						clear_buffer_journal_test(cn->
-									  bh);
-						cleaned = 1;
-						put_bh(cn->bh);
-						if (atomic_read
-						    (&cn->bh->b_count) < 0) {
-							reiserfs_warning(sb,
-								 "journal-2138",
-								 "cn->bh->b_count < 0");
-						}
-					}
-					/*
-					 * since we are clearing the bh,
-					 * we MUST dec nonzerolen
-					 */
-					if (cn->jlist) {
-						atomic_dec(&cn->jlist->
-							   j_nonzerolen);
-					}
-					cn->bh = NULL;
-				}
-			}
-			cn = cn->hnext;
-		}
-	}
-
-	if (bh)
-		release_buffer_page(bh); /* get_hash grabs the buffer */
-	return 0;
-}
-
-void reiserfs_update_inode_transaction(struct inode *inode)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
-	REISERFS_I(inode)->i_jl = journal->j_current_jl;
-	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
-}
-
-/*
- * returns -1 on error, 0 if no commits/barriers were done and 1
- * if a transaction was actually committed and the barrier was done
- */
-static int __commit_trans_jl(struct inode *inode, unsigned long id,
-			     struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_transaction_handle th;
-	struct super_block *sb = inode->i_sb;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int ret = 0;
-
-	/*
-	 * is it from the current transaction,
-	 * or from an unknown transaction?
-	 */
-	if (id == journal->j_trans_id) {
-		jl = journal->j_current_jl;
-		/*
-		 * try to let other writers come in and
-		 * grow this transaction
-		 */
-		let_transaction_grow(sb, id);
-		if (journal->j_trans_id != id) {
-			goto flush_commit_only;
-		}
-
-		ret = journal_begin(&th, sb, 1);
-		if (ret)
-			return ret;
-
-		/* someone might have ended this transaction while we joined */
-		if (journal->j_trans_id != id) {
-			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
-			ret = journal_end(&th);
-			goto flush_commit_only;
-		}
-
-		ret = journal_end_sync(&th);
-		if (!ret)
-			ret = 1;
-
-	} else {
-		/*
-		 * this gets tricky, we have to make sure the journal list in
-		 * the inode still exists.  We know the list is still around
-		 * if we've got a larger transaction id than the oldest list
-		 */
-flush_commit_only:
-		if (journal_list_still_alive(inode->i_sb, id)) {
-			/*
-			 * we only set ret to 1 when we know for sure
-			 * the barrier hasn't been started yet on the commit
-			 * block.
-			 */
-			if (atomic_read(&jl->j_commit_left) > 1)
-				ret = 1;
-			flush_commit_list(sb, jl, 1);
-			if (journal->j_errno)
-				ret = journal->j_errno;
-		}
-	}
-	/* otherwise the list is gone, and long since committed */
-	return ret;
-}
-
-int reiserfs_commit_for_inode(struct inode *inode)
-{
-	unsigned int id = REISERFS_I(inode)->i_trans_id;
-	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
-
-	/*
-	 * for the whole inode, assume unset id means it was
-	 * changed in the current transaction.  More conservative
-	 */
-	if (!id || !jl) {
-		reiserfs_update_inode_transaction(inode);
-		id = REISERFS_I(inode)->i_trans_id;
-		/* jl will be updated in __commit_trans_jl */
-	}
-
-	return __commit_trans_jl(inode, id, jl);
-}
-
-void reiserfs_restore_prepared_buffer(struct super_block *sb,
-				      struct buffer_head *bh)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	PROC_INFO_INC(sb, journal.restore_prepared);
-	if (!bh) {
-		return;
-	}
-	if (test_clear_buffer_journal_restore_dirty(bh) &&
-	    buffer_journal_dirty(bh)) {
-		struct reiserfs_journal_cnode *cn;
-		reiserfs_write_lock(sb);
-		cn = get_journal_hash_dev(sb,
-					  journal->j_list_hash_table,
-					  bh->b_blocknr);
-		if (cn && can_dirty(cn)) {
-			set_buffer_journal_test(bh);
-			mark_buffer_dirty(bh);
-		}
-		reiserfs_write_unlock(sb);
-	}
-	clear_buffer_journal_prepared(bh);
-}
-
-extern struct tree_balance *cur_tb;
-/*
- * before we can change a metadata block, we have to make sure it won't
- * be written to disk while we are altering it.  So, we must:
- * clean it
- * wait on it.
- */
-int reiserfs_prepare_for_journal(struct super_block *sb,
-				 struct buffer_head *bh, int wait)
-{
-	PROC_INFO_INC(sb, journal.prepare);
-
-	if (!trylock_buffer(bh)) {
-		if (!wait)
-			return 0;
-		lock_buffer(bh);
-	}
-	set_buffer_journal_prepared(bh);
-	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
-		clear_buffer_journal_test(bh);
-		set_buffer_journal_restore_dirty(bh);
-	}
-	unlock_buffer(bh);
-	return 1;
-}
-
-/*
- * long and ugly.  If flush, will not return until all commit
- * blocks and all real buffers in the trans are on disk.
- * If no_async, won't return until all commit blocks are on disk.
- *
- * keep reading, there are comments as you go along
- *
- * If the journal is aborted, we just clean up. Things like flushing
- * journal lists, etc just won't happen.
- */
-static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
-	struct reiserfs_journal_cnode *last_cn = NULL;
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	struct buffer_head *c_bh;	/* commit bh */
-	struct buffer_head *d_bh;	/* desc bh */
-	int cur_write_start = 0;	/* start index of current log write */
-	int i;
-	int flush;
-	int wait_on_commit;
-	struct reiserfs_journal_list *jl, *temp_jl;
-	struct list_head *entry, *safe;
-	unsigned long jindex;
-	unsigned int commit_trans_id;
-	int trans_half;
-	int depth;
-
-	BUG_ON(th->t_refcount > 1);
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(!th->t_super);
-
-	/*
-	 * protect flush_older_commits from doing mistakes if the
-	 * transaction ID counter gets overflowed.
-	 */
-	if (th->t_trans_id == ~0U)
-		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
-	flush = flags & FLUSH_ALL;
-	wait_on_commit = flags & WAIT;
-
-	current->journal_info = th->t_handle_save;
-	reiserfs_check_lock_depth(sb, "journal end");
-	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-					     1);
-		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
-	}
-
-	lock_journal(sb);
-	if (journal->j_next_full_flush) {
-		flags |= FLUSH_ALL;
-		flush = 1;
-	}
-	if (journal->j_next_async_flush) {
-		flags |= COMMIT_NOW | WAIT;
-		wait_on_commit = 1;
-	}
-
-	/*
-	 * check_journal_end locks the journal, and unlocks if it does
-	 * not return 1 it tells us if we should continue with the
-	 * journal_end, or just return
-	 */
-	if (!check_journal_end(th, flags)) {
-		reiserfs_schedule_old_flush(sb);
-		wake_queued_writers(sb);
-		reiserfs_async_progress_wait(sb);
-		goto out;
-	}
-
-	/* check_journal_end might set these, check again */
-	if (journal->j_next_full_flush) {
-		flush = 1;
-	}
-
-	/*
-	 * j must wait means we have to flush the log blocks, and the
-	 * real blocks for this transaction
-	 */
-	if (journal->j_must_wait > 0) {
-		flush = 1;
-	}
-#ifdef REISERFS_PREALLOCATE
-	/*
-	 * quota ops might need to nest, setup the journal_info pointer
-	 * for them and raise the refcount so that it is > 0.
-	 */
-	current->journal_info = th;
-	th->t_refcount++;
-
-	/* it should not involve new blocks into the transaction */
-	reiserfs_discard_all_prealloc(th);
-
-	th->t_refcount--;
-	current->journal_info = th->t_handle_save;
-#endif
-
-	/* setup description block */
-	d_bh =
-	    journal_getblk(sb,
-			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			   journal->j_start);
-	set_buffer_uptodate(d_bh);
-	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
-	memset(d_bh->b_data, 0, d_bh->b_size);
-	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
-	set_desc_trans_id(desc, journal->j_trans_id);
-
-	/*
-	 * setup commit block.  Don't write (keep it clean too) this one
-	 * until after everyone else is written
-	 */
-	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			      ((journal->j_start + journal->j_len +
-				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
-	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-	memset(c_bh->b_data, 0, c_bh->b_size);
-	set_commit_trans_id(commit, journal->j_trans_id);
-	set_buffer_uptodate(c_bh);
-
-	/* init this journal list */
-	jl = journal->j_current_jl;
-
-	/*
-	 * we lock the commit before doing anything because
-	 * we want to make sure nobody tries to run flush_commit_list until
-	 * the new transaction is fully setup, and we've already flushed the
-	 * ordered bh list
-	 */
-	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
-
-	/* save the transaction id in case we need to commit it later */
-	commit_trans_id = jl->j_trans_id;
-
-	atomic_set(&jl->j_older_commits_done, 0);
-	jl->j_trans_id = journal->j_trans_id;
-	jl->j_timestamp = journal->j_trans_start_time;
-	jl->j_commit_bh = c_bh;
-	jl->j_start = journal->j_start;
-	jl->j_len = journal->j_len;
-	atomic_set(&jl->j_nonzerolen, journal->j_len);
-	atomic_set(&jl->j_commit_left, journal->j_len + 2);
-	jl->j_realblock = NULL;
-
-	/*
-	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
-	 * for each real block, add it to the journal list hash,
-	 * copy into real block index array in the commit or desc block
-	 */
-	trans_half = journal_trans_half(sb->s_blocksize);
-	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
-		if (buffer_journaled(cn->bh)) {
-			jl_cn = get_cnode(sb);
-			if (!jl_cn) {
-				reiserfs_panic(sb, "journal-1676",
-					       "get_cnode returned NULL");
-			}
-			if (i == 0) {
-				jl->j_realblock = jl_cn;
-			}
-			jl_cn->prev = last_cn;
-			jl_cn->next = NULL;
-			if (last_cn) {
-				last_cn->next = jl_cn;
-			}
-			last_cn = jl_cn;
-			/*
-			 * make sure the block we are trying to log
-			 * is not a block of journal or reserved area
-			 */
-			if (is_block_in_log_or_reserved_area
-			    (sb, cn->bh->b_blocknr)) {
-				reiserfs_panic(sb, "journal-2332",
-					       "Trying to log block %lu, "
-					       "which is a log block",
-					       cn->bh->b_blocknr);
-			}
-			jl_cn->blocknr = cn->bh->b_blocknr;
-			jl_cn->state = 0;
-			jl_cn->sb = sb;
-			jl_cn->bh = cn->bh;
-			jl_cn->jlist = jl;
-			insert_journal_hash(journal->j_list_hash_table, jl_cn);
-			if (i < trans_half) {
-				desc->j_realblock[i] =
-				    cpu_to_le32(cn->bh->b_blocknr);
-			} else {
-				commit->j_realblock[i - trans_half] =
-				    cpu_to_le32(cn->bh->b_blocknr);
-			}
-		} else {
-			i--;
-		}
-	}
-	set_desc_trans_len(desc, journal->j_len);
-	set_desc_mount_id(desc, journal->j_mount_id);
-	set_desc_trans_id(desc, journal->j_trans_id);
-	set_commit_trans_len(commit, journal->j_len);
-
-	/*
-	 * special check in case all buffers in the journal
-	 * were marked for not logging
-	 */
-	BUG_ON(journal->j_len == 0);
-
-	/*
-	 * we're about to dirty all the log blocks, mark the description block
-	 * dirty now too.  Don't mark the commit block dirty until all the
-	 * others are on disk
-	 */
-	mark_buffer_dirty(d_bh);
-
-	/*
-	 * first data block is j_start + 1, so add one to
-	 * cur_write_start wherever you use it
-	 */
-	cur_write_start = journal->j_start;
-	cn = journal->j_first;
-	jindex = 1;	/* start at one so we don't get the desc again */
-	while (cn) {
-		clear_buffer_journal_new(cn->bh);
-		/* copy all the real blocks into log area.  dirty log blocks */
-		if (buffer_journaled(cn->bh)) {
-			struct buffer_head *tmp_bh;
-			char *addr;
-			struct page *page;
-			tmp_bh =
-			    journal_getblk(sb,
-					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-					   ((cur_write_start +
-					     jindex) %
-					    SB_ONDISK_JOURNAL_SIZE(sb)));
-			set_buffer_uptodate(tmp_bh);
-			page = cn->bh->b_page;
-			addr = kmap(page);
-			memcpy(tmp_bh->b_data,
-			       addr + offset_in_page(cn->bh->b_data),
-			       cn->bh->b_size);
-			kunmap(page);
-			mark_buffer_dirty(tmp_bh);
-			jindex++;
-			set_buffer_journal_dirty(cn->bh);
-			clear_buffer_journaled(cn->bh);
-		} else {
-			/*
-			 * JDirty cleared sometime during transaction.
-			 * don't log this one
-			 */
-			reiserfs_warning(sb, "journal-2048",
-					 "BAD, buffer in journal hash, "
-					 "but not JDirty!");
-			brelse(cn->bh);
-		}
-		next = cn->next;
-		free_cnode(sb, cn);
-		cn = next;
-		reiserfs_cond_resched(sb);
-	}
-
-	/*
-	 * we are done with both the c_bh and d_bh, but
-	 * c_bh must be written after all other commit blocks,
-	 * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-	 */
-
-	journal->j_current_jl = alloc_journal_list(sb);
-
-	/* now it is safe to insert this transaction on the main list */
-	list_add_tail(&jl->j_list, &journal->j_journal_list);
-	list_add_tail(&jl->j_working_list, &journal->j_working_list);
-	journal->j_num_work_lists++;
-
-	/* reset journal values for the next transaction */
-	journal->j_start =
-	    (journal->j_start + journal->j_len +
-	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
-	atomic_set(&journal->j_wcount, 0);
-	journal->j_bcount = 0;
-	journal->j_last = NULL;
-	journal->j_first = NULL;
-	journal->j_len = 0;
-	journal->j_trans_start_time = 0;
-	/* check for trans_id overflow */
-	if (++journal->j_trans_id == 0)
-		journal->j_trans_id = 10;
-	journal->j_current_jl->j_trans_id = journal->j_trans_id;
-	journal->j_must_wait = 0;
-	journal->j_len_alloc = 0;
-	journal->j_next_full_flush = 0;
-	journal->j_next_async_flush = 0;
-	init_journal_hash(sb);
-
-	/*
-	 * make sure reiserfs_add_jh sees the new current_jl before we
-	 * write out the tails
-	 */
-	smp_mb();
-
-	/*
-	 * tail conversion targets have to hit the disk before we end the
-	 * transaction.  Otherwise a later transaction might repack the tail
-	 * before this transaction commits, leaving the data block unflushed
-	 * and clean, if we crash before the later transaction commits, the
-	 * data block is lost.
-	 */
-	if (!list_empty(&jl->j_tail_bh_list)) {
-		depth = reiserfs_write_unlock_nested(sb);
-		write_ordered_buffers(&journal->j_dirty_buffers_lock,
-				      journal, jl, &jl->j_tail_bh_list);
-		reiserfs_write_lock_nested(sb, depth);
-	}
-	BUG_ON(!list_empty(&jl->j_tail_bh_list));
-	mutex_unlock(&jl->j_commit_mutex);
-
-	/*
-	 * honor the flush wishes from the caller, simple commits can
-	 * be done outside the journal lock, they are done below
-	 *
-	 * if we don't flush the commit list right now, we put it into
-	 * the work queue so the people waiting on the async progress work
-	 * queue don't wait for this proc to flush journal lists and such.
-	 */
-	if (flush) {
-		flush_commit_list(sb, jl, 1);
-		flush_journal_list(sb, jl, 1);
-	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
-		/*
-		 * Avoid queueing work when sb is being shut down. Transaction
-		 * will be flushed on journal shutdown.
-		 */
-		if (sb->s_flags & SB_ACTIVE)
-			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
-					   &journal->j_work, HZ / 10);
-	}
-
-	/*
-	 * if the next transaction has any chance of wrapping, flush
-	 * transactions that might get overwritten.  If any journal lists
-	 * are very old flush them as well.
-	 */
-first_jl:
-	list_for_each_safe(entry, safe, &journal->j_journal_list) {
-		temp_jl = JOURNAL_LIST_ENTRY(entry);
-		if (journal->j_start <= temp_jl->j_start) {
-			if ((journal->j_start + journal->j_trans_max + 1) >=
-			    temp_jl->j_start) {
-				flush_used_journal_lists(sb, temp_jl);
-				goto first_jl;
-			} else if ((journal->j_start +
-				    journal->j_trans_max + 1) <
-				   SB_ONDISK_JOURNAL_SIZE(sb)) {
-				/*
-				 * if we don't cross into the next
-				 * transaction and we don't wrap, there is
-				 * no way we can overlap any later transactions
-				 * break now
-				 */
-				break;
-			}
-		} else if ((journal->j_start +
-			    journal->j_trans_max + 1) >
-			   SB_ONDISK_JOURNAL_SIZE(sb)) {
-			if (((journal->j_start + journal->j_trans_max + 1) %
-			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
-			    temp_jl->j_start) {
-				flush_used_journal_lists(sb, temp_jl);
-				goto first_jl;
-			} else {
-				/*
-				* we don't overlap anything from out start
-				* to the end of the log, and our wrapped
-				* portion doesn't overlap anything at
-				* the start of the log.  We can break
-				*/
-				break;
-			}
-		}
-	}
-
-	journal->j_current_jl->j_list_bitmap =
-	    get_list_bitmap(sb, journal->j_current_jl);
-
-	if (!(journal->j_current_jl->j_list_bitmap)) {
-		reiserfs_panic(sb, "journal-1996",
-			       "could not get a list bitmap");
-	}
-
-	atomic_set(&journal->j_jlock, 0);
-	unlock_journal(sb);
-	/* wake up any body waiting to join. */
-	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
-	wake_up(&journal->j_join_wait);
-
-	if (!flush && wait_on_commit &&
-	    journal_list_still_alive(sb, commit_trans_id)) {
-		flush_commit_list(sb, jl, 1);
-	}
-out:
-	reiserfs_check_lock_depth(sb, "journal end2");
-
-	memset(th, 0, sizeof(*th));
-	/*
-	 * Re-set th->t_super, so we can properly keep track of how many
-	 * persistent transactions there are. We need to do this so if this
-	 * call is part of a failed restart_transaction, we can free it later
-	 */
-	th->t_super = sb;
-
-	return journal->j_errno;
-}
-
-/* Send the file system read only and refuse new transactions */
-void reiserfs_abort_journal(struct super_block *sb, int errno)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	if (test_bit(J_ABORTED, &journal->j_state))
-		return;
-
-	if (!journal->j_errno)
-		journal->j_errno = errno;
-
-	sb->s_flags |= SB_RDONLY;
-	set_bit(J_ABORTED, &journal->j_state);
-
-#ifdef CONFIG_REISERFS_CHECK
-	dump_stack();
-#endif
-}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
deleted file mode 100644
index 7f868569d4d0..000000000000
--- a/fs/reiserfs/lbalance.c
+++ /dev/null
@@ -1,1426 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * copy copy_count entries from source directory item to dest buffer
- * (creating new item if needed)
- */
-static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
-				  struct buffer_head *source, int last_first,
-				  int item_num, int from, int copy_count)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	/*
-	 * either the number of target item, or if we must create a
-	 * new item, the number of the item we will create it next to
-	 */
-	int item_num_in_dest;
-
-	struct item_head *ih;
-	struct reiserfs_de_head *deh;
-	int copy_records_len;	/* length of all records in item to be copied */
-	char *records;
-
-	ih = item_head(source, item_num);
-
-	RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
-
-	/*
-	 * length of all record to be copied and first byte of
-	 * the last of them
-	 */
-	deh = B_I_DEH(source, ih);
-	if (copy_count) {
-		copy_records_len = (from ? deh_location(&deh[from - 1]) :
-				    ih_item_len(ih)) -
-		    deh_location(&deh[from + copy_count - 1]);
-		records =
-		    source->b_data + ih_location(ih) +
-		    deh_location(&deh[from + copy_count - 1]);
-	} else {
-		copy_records_len = 0;
-		records = NULL;
-	}
-
-	/* when copy last to first, dest buffer can contain 0 items */
-	item_num_in_dest =
-	    (last_first ==
-	     LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
-							       - 1);
-
-	/*
-	 * if there are no items in dest or the first/last item in
-	 * dest is not item of the same directory
-	 */
-	if ((item_num_in_dest == -1) ||
-	    (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
-	    (last_first == LAST_TO_FIRST
-	     && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
-							 leaf_key(dest,
-								  item_num_in_dest))))
-	{
-		/* create new item in dest */
-		struct item_head new_ih;
-
-		/* form item header */
-		memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
-		put_ih_version(&new_ih, KEY_FORMAT_3_5);
-		/* calculate item len */
-		put_ih_item_len(&new_ih,
-				DEH_SIZE * copy_count + copy_records_len);
-		put_ih_entry_count(&new_ih, 0);
-
-		if (last_first == LAST_TO_FIRST) {
-			/* form key by the following way */
-			if (from < ih_entry_count(ih)) {
-				set_le_ih_k_offset(&new_ih,
-						   deh_offset(&deh[from]));
-			} else {
-				/*
-				 * no entries will be copied to this
-				 * item in this function
-				 */
-				set_le_ih_k_offset(&new_ih, U32_MAX);
-				/*
-				 * this item is not yet valid, but we
-				 * want I_IS_DIRECTORY_ITEM to return 1
-				 * for it, so we -1
-				 */
-			}
-			set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
-					  TYPE_DIRENTRY);
-		}
-
-		/* insert item into dest buffer */
-		leaf_insert_into_buf(dest_bi,
-				     (last_first ==
-				      LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
-				     &new_ih, NULL, 0);
-	} else {
-		/* prepare space for entries */
-		leaf_paste_in_buffer(dest_bi,
-				     (last_first ==
-				      FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
-							1) : 0, MAX_US_INT,
-				     DEH_SIZE * copy_count + copy_records_len,
-				     records, 0);
-	}
-
-	item_num_in_dest =
-	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
-
-	leaf_paste_entries(dest_bi, item_num_in_dest,
-			   (last_first ==
-			    FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
-									  item_num_in_dest))
-			   : 0, copy_count, deh + from, records,
-			   DEH_SIZE * copy_count + copy_records_len);
-}
-
-/*
- * Copy the first (if last_first == FIRST_TO_LAST) or last
- * (last_first == LAST_TO_FIRST) item or part of it or nothing
- * (see the return 0 below) from SOURCE to the end (if last_first)
- * or beginning (!last_first) of the DEST
- */
-/* returns 1 if anything was copied, else 0 */
-static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
-				   struct buffer_head *src, int last_first,
-				   int bytes_or_entries)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	/* number of items in the source and destination buffers */
-	int dest_nr_item, src_nr_item;
-	struct item_head *ih;
-	struct item_head *dih;
-
-	dest_nr_item = B_NR_ITEMS(dest);
-
-	/*
-	 * if ( DEST is empty or first item of SOURCE and last item of
-	 * DEST are the items of different objects or of different types )
-	 * then there is no need to treat this item differently from the
-	 * other items that we copy, so we return
-	 */
-	if (last_first == FIRST_TO_LAST) {
-		ih = item_head(src, 0);
-		dih = item_head(dest, dest_nr_item - 1);
-
-		/* there is nothing to merge */
-		if (!dest_nr_item
-		    || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
-			return 0;
-
-		RFALSE(!ih_item_len(ih),
-		       "vs-10010: item can not have empty length");
-
-		if (is_direntry_le_ih(ih)) {
-			if (bytes_or_entries == -1)
-				/* copy all entries to dest */
-				bytes_or_entries = ih_entry_count(ih);
-			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
-					      bytes_or_entries);
-			return 1;
-		}
-
-		/*
-		 * copy part of the body of the first item of SOURCE
-		 * to the end of the body of the last item of the DEST
-		 * part defined by 'bytes_or_entries'; if bytes_or_entries
-		 * == -1 copy whole body; don't create new item header
-		 */
-		if (bytes_or_entries == -1)
-			bytes_or_entries = ih_item_len(ih);
-
-#ifdef CONFIG_REISERFS_CHECK
-		else {
-			if (bytes_or_entries == ih_item_len(ih)
-			    && is_indirect_le_ih(ih))
-				if (get_ih_free_space(ih))
-					reiserfs_panic(sb_from_bi(dest_bi),
-						       "vs-10020",
-						       "last unformatted node "
-						       "must be filled "
-						       "entirely (%h)", ih);
-		}
-#endif
-
-		/*
-		 * merge first item (or its part) of src buffer with the last
-		 * item of dest buffer. Both are of the same file
-		 */
-		leaf_paste_in_buffer(dest_bi,
-				     dest_nr_item - 1, ih_item_len(dih),
-				     bytes_or_entries, ih_item_body(src, ih), 0);
-
-		if (is_indirect_le_ih(dih)) {
-			RFALSE(get_ih_free_space(dih),
-			       "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
-			       ih);
-			if (bytes_or_entries == ih_item_len(ih))
-				set_ih_free_space(dih, get_ih_free_space(ih));
-		}
-
-		return 1;
-	}
-
-	/* copy boundary item to right (last_first == LAST_TO_FIRST) */
-
-	/*
-	 * (DEST is empty or last item of SOURCE and first item of DEST
-	 * are the items of different object or of different types)
-	 */
-	src_nr_item = B_NR_ITEMS(src);
-	ih = item_head(src, src_nr_item - 1);
-	dih = item_head(dest, 0);
-
-	if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
-		return 0;
-
-	if (is_direntry_le_ih(ih)) {
-		/*
-		 * bytes_or_entries = entries number in last
-		 * item body of SOURCE
-		 */
-		if (bytes_or_entries == -1)
-			bytes_or_entries = ih_entry_count(ih);
-
-		leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
-				      src_nr_item - 1,
-				      ih_entry_count(ih) - bytes_or_entries,
-				      bytes_or_entries);
-		return 1;
-	}
-
-	/*
-	 * copy part of the body of the last item of SOURCE to the
-	 * begin of the body of the first item of the DEST; part defined
-	 * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body;
-	 * change first item key of the DEST; don't create new item header
-	 */
-
-	RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
-	       "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
-	       ih);
-
-	if (bytes_or_entries == -1) {
-		/* bytes_or_entries = length of last item body of SOURCE */
-		bytes_or_entries = ih_item_len(ih);
-
-		RFALSE(le_ih_k_offset(dih) !=
-		       le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
-		       "vs-10050: items %h and %h do not match", ih, dih);
-
-		/* change first item key of the DEST */
-		set_le_ih_k_offset(dih, le_ih_k_offset(ih));
-
-		/* item becomes non-mergeable */
-		/* or mergeable if left item was */
-		set_le_ih_k_type(dih, le_ih_k_type(ih));
-	} else {
-		/* merge to right only part of item */
-		RFALSE(ih_item_len(ih) <= bytes_or_entries,
-		       "vs-10060: no so much bytes %lu (needed %lu)",
-		       (unsigned long)ih_item_len(ih),
-		       (unsigned long)bytes_or_entries);
-
-		/* change first item key of the DEST */
-		if (is_direct_le_ih(dih)) {
-			RFALSE(le_ih_k_offset(dih) <=
-			       (unsigned long)bytes_or_entries,
-			       "vs-10070: dih %h, bytes_or_entries(%d)", dih,
-			       bytes_or_entries);
-			set_le_ih_k_offset(dih,
-					   le_ih_k_offset(dih) -
-					   bytes_or_entries);
-		} else {
-			RFALSE(le_ih_k_offset(dih) <=
-			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
-			       "vs-10080: dih %h, bytes_or_entries(%d)",
-			       dih,
-			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
-			set_le_ih_k_offset(dih,
-					   le_ih_k_offset(dih) -
-					   ((bytes_or_entries / UNFM_P_SIZE) *
-					    dest->b_size));
-		}
-	}
-
-	leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
-			     ih_item_body(src,
-				       ih) + ih_item_len(ih) - bytes_or_entries,
-			     0);
-	return 1;
-}
-
-/*
- * copy cpy_mun items from buffer src to buffer dest
- * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
- *                             from first-th item in src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning
- *                             from first-th item in src to head of dest
- */
-static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
-				     struct buffer_head *src, int last_first,
-				     int first, int cpy_num)
-{
-	struct buffer_head *dest;
-	int nr, free_space;
-	int dest_before;
-	int last_loc, last_inserted_loc, location;
-	int i, j;
-	struct block_head *blkh;
-	struct item_head *ih;
-
-	RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
-	       "vs-10090: bad last_first parameter %d", last_first);
-	RFALSE(B_NR_ITEMS(src) - first < cpy_num,
-	       "vs-10100: too few items in source %d, required %d from %d",
-	       B_NR_ITEMS(src), cpy_num, first);
-	RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
-	RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items");
-
-	dest = dest_bi->bi_bh;
-
-	RFALSE(!dest, "vs-10130: can not copy negative amount of items");
-
-	if (cpy_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(dest);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/*
-	 * we will insert items before 0-th or nr-th item in dest buffer.
-	 * It depends of last_first parameter
-	 */
-	dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
-
-	/* location of head of first new item */
-	ih = item_head(dest, dest_before);
-
-	RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
-	       "vs-10140: not enough free space for headers %d (needed %d)",
-	       B_FREE_SPACE(dest), cpy_num * IH_SIZE);
-
-	/* prepare space for headers */
-	memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
-
-	/* copy item headers */
-	memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
-
-	free_space -= (IH_SIZE * cpy_num);
-	set_blkh_free_space(blkh, free_space);
-
-	/* location of unmovable item */
-	j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
-	for (i = dest_before; i < nr + cpy_num; i++) {
-		location -= ih_item_len(ih + i - dest_before);
-		put_ih_location(ih + i - dest_before, location);
-	}
-
-	/* prepare space for items */
-	last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
-	last_inserted_loc = ih_location(&ih[cpy_num - 1]);
-
-	/* check free space */
-	RFALSE(free_space < j - last_inserted_loc,
-	       "vs-10150: not enough free space for items %d (needed %d)",
-	       free_space, j - last_inserted_loc);
-
-	memmove(dest->b_data + last_loc,
-		dest->b_data + last_loc + j - last_inserted_loc,
-		last_inserted_loc - last_loc);
-
-	/* copy items */
-	memcpy(dest->b_data + last_inserted_loc,
-	       item_body(src, (first + cpy_num - 1)),
-	       j - last_inserted_loc);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, nr + cpy_num);
-	set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
-
-	do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
-		       "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
-		       (long unsigned)dest->b_blocknr,
-		       (long unsigned)dc_block_number(t_dc));
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (j - last_inserted_loc +
-					     IH_SIZE * cpy_num));
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-	}
-}
-
-/*
- * This function splits the (liquid) item into two items (useful when
- * shifting part of an item into another node.)
- */
-static void leaf_item_bottle(struct buffer_info *dest_bi,
-			     struct buffer_head *src, int last_first,
-			     int item_num, int cpy_bytes)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	struct item_head *ih;
-
-	RFALSE(cpy_bytes == -1,
-	       "vs-10170: bytes == - 1 means: do not split item");
-
-	if (last_first == FIRST_TO_LAST) {
-		/*
-		 * if ( if item in position item_num in buffer SOURCE
-		 * is directory item )
-		 */
-		ih = item_head(src, item_num);
-		if (is_direntry_le_ih(ih))
-			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
-					      item_num, 0, cpy_bytes);
-		else {
-			struct item_head n_ih;
-
-			/*
-			 * copy part of the body of the item number 'item_num'
-			 * of SOURCE to the end of the DEST part defined by
-			 * 'cpy_bytes'; create new item header; change old
-			 * item_header (????); n_ih = new item_header;
-			 */
-			memcpy(&n_ih, ih, IH_SIZE);
-			put_ih_item_len(&n_ih, cpy_bytes);
-			if (is_indirect_le_ih(ih)) {
-				RFALSE(cpy_bytes == ih_item_len(ih)
-				       && get_ih_free_space(ih),
-				       "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
-				       (long unsigned)get_ih_free_space(ih));
-				set_ih_free_space(&n_ih, 0);
-			}
-
-			RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
-			       "vs-10190: bad mergeability of item %h", ih);
-			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
-			leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
-					     item_body(src, item_num), 0);
-		}
-	} else {
-		/*
-		 * if ( if item in position item_num in buffer
-		 * SOURCE is directory item )
-		 */
-		ih = item_head(src, item_num);
-		if (is_direntry_le_ih(ih))
-			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
-					      item_num,
-					      ih_entry_count(ih) - cpy_bytes,
-					      cpy_bytes);
-		else {
-			struct item_head n_ih;
-
-			/*
-			 * copy part of the body of the item number 'item_num'
-			 * of SOURCE to the begin of the DEST part defined by
-			 * 'cpy_bytes'; create new item header;
-			 * n_ih = new item_header;
-			 */
-			memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
-
-			/* Endian safe, both le */
-			n_ih.ih_version = ih->ih_version;
-
-			if (is_direct_le_ih(ih)) {
-				set_le_ih_k_offset(&n_ih,
-						   le_ih_k_offset(ih) +
-						   ih_item_len(ih) - cpy_bytes);
-				set_le_ih_k_type(&n_ih, TYPE_DIRECT);
-				set_ih_free_space(&n_ih, MAX_US_INT);
-			} else {
-				/* indirect item */
-				RFALSE(!cpy_bytes && get_ih_free_space(ih),
-				       "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
-				set_le_ih_k_offset(&n_ih,
-						   le_ih_k_offset(ih) +
-						   (ih_item_len(ih) -
-						    cpy_bytes) / UNFM_P_SIZE *
-						   dest->b_size);
-				set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
-				set_ih_free_space(&n_ih, get_ih_free_space(ih));
-			}
-
-			/* set item length */
-			put_ih_item_len(&n_ih, cpy_bytes);
-
-			/* Endian safe, both le */
-			n_ih.ih_version = ih->ih_version;
-
-			leaf_insert_into_buf(dest_bi, 0, &n_ih,
-					     item_body(src, item_num) +
-						ih_item_len(ih) - cpy_bytes, 0);
-		}
-	}
-}
-
-/*
- * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE
- * to DEST.  If cpy_bytes not equal to minus one than copy cpy_num-1 whole
- * items from SOURCE to DEST.  From last item copy cpy_num bytes for regular
- * item and cpy_num directory entries for directory item.
- */
-static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
-			   int last_first, int cpy_num, int cpy_bytes)
-{
-	struct buffer_head *dest;
-	int pos, i, src_nr_item, bytes;
-
-	dest = dest_bi->bi_bh;
-	RFALSE(!dest || !src, "vs-10210: !dest || !src");
-	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	       "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
-	RFALSE(B_NR_ITEMS(src) < cpy_num,
-	       "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
-	       cpy_num);
-	RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
-
-	if (cpy_num == 0)
-		return 0;
-
-	if (last_first == FIRST_TO_LAST) {
-		/* copy items to left */
-		pos = 0;
-		if (cpy_num == 1)
-			bytes = cpy_bytes;
-		else
-			bytes = -1;
-
-		/*
-		 * copy the first item or it part or nothing to the end of
-		 * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
-		 */
-		i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
-		cpy_num -= i;
-		if (cpy_num == 0)
-			return i;
-		pos += i;
-		if (cpy_bytes == -1)
-			/*
-			 * copy first cpy_num items starting from position
-			 * 'pos' of SOURCE to end of DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
-						 pos, cpy_num);
-		else {
-			/*
-			 * copy first cpy_num-1 items starting from position
-			 * 'pos-1' of the SOURCE to the end of the DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
-						 pos, cpy_num - 1);
-
-			/*
-			 * copy part of the item which number is
-			 * cpy_num+pos-1 to the end of the DEST
-			 */
-			leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
-					 cpy_num + pos - 1, cpy_bytes);
-		}
-	} else {
-		/* copy items to right */
-		src_nr_item = B_NR_ITEMS(src);
-		if (cpy_num == 1)
-			bytes = cpy_bytes;
-		else
-			bytes = -1;
-
-		/*
-		 * copy the last item or it part or nothing to the
-		 * begin of the DEST
-		 * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
-		 */
-		i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
-
-		cpy_num -= i;
-		if (cpy_num == 0)
-			return i;
-
-		pos = src_nr_item - cpy_num - i;
-		if (cpy_bytes == -1) {
-			/*
-			 * starting from position 'pos' copy last cpy_num
-			 * items of SOURCE to begin of DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
-						 pos, cpy_num);
-		} else {
-			/*
-			 * copy last cpy_num-1 items starting from position
-			 * 'pos+1' of the SOURCE to the begin of the DEST;
-			 */
-			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
-						 pos + 1, cpy_num - 1);
-
-			/*
-			 * copy part of the item which number is pos to
-			 * the begin of the DEST
-			 */
-			leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
-					 cpy_bytes);
-		}
-	}
-	return i;
-}
-
-/*
- * there are types of coping: from S[0] to L[0], from S[0] to R[0],
- * from R[0] to L[0]. for each of these we have to define parent and
- * positions of destination and source buffers
- */
-static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
-				       struct buffer_info *dest_bi,
-				       struct buffer_info *src_bi,
-				       int *first_last,
-				       struct buffer_head *Snew)
-{
-	memset(dest_bi, 0, sizeof(struct buffer_info));
-	memset(src_bi, 0, sizeof(struct buffer_info));
-
-	/* define dest, src, dest parent, dest position */
-	switch (shift_mode) {
-	case LEAF_FROM_S_TO_L:	/* it is used in leaf_shift_left */
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-
-		/* src->b_item_order */
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[0];
-		dest_bi->bi_parent = tb->FL[0];
-		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
-		*first_last = FIRST_TO_LAST;
-		break;
-
-	case LEAF_FROM_S_TO_R:	/* it is used in leaf_shift_right */
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[0];
-		dest_bi->bi_parent = tb->FR[0];
-		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	case LEAF_FROM_R_TO_L:	/* it is used in balance_leaf_when_delete */
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->R[0];
-		src_bi->bi_parent = tb->FR[0];
-		src_bi->bi_position = get_right_neighbor_position(tb, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[0];
-		dest_bi->bi_parent = tb->FL[0];
-		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
-		*first_last = FIRST_TO_LAST;
-		break;
-
-	case LEAF_FROM_L_TO_R:	/* it is used in balance_leaf_when_delete */
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->L[0];
-		src_bi->bi_parent = tb->FL[0];
-		src_bi->bi_position = get_left_neighbor_position(tb, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[0];
-		dest_bi->bi_parent = tb->FR[0];
-		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	case LEAF_FROM_S_TO_SNEW:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = Snew;
-		dest_bi->bi_parent = NULL;
-		dest_bi->bi_position = 0;
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	default:
-		reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
-			       "shift type is unknown (%d)", shift_mode);
-	}
-	RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
-	       "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
-	       shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
-}
-
-/*
- * copy mov_num items and mov_bytes of the (mov_num-1)th item to
- * neighbor. Delete them from source
- */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
-		    int mov_bytes, struct buffer_head *Snew)
-{
-	int ret_value;
-	struct buffer_info dest_bi, src_bi;
-	int first_last;
-
-	leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
-				   &first_last, Snew);
-
-	ret_value =
-	    leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
-			    mov_bytes);
-
-	leaf_delete_items(&src_bi, first_last,
-			  (first_last ==
-			   FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
-						 mov_num), mov_num, mov_bytes);
-
-	return ret_value;
-}
-
-/*
- * Shift shift_num items (and shift_bytes of last shifted item if
- * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
- */
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
-	struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int i;
-
-	/*
-	 * move shift_num (and shift_bytes bytes) items from S[0]
-	 * to left neighbor L[0]
-	 */
-	i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
-
-	if (shift_num) {
-		/* number of items in S[0] == 0 */
-		if (B_NR_ITEMS(S0) == 0) {
-
-			RFALSE(shift_bytes != -1,
-			       "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
-			       shift_bytes);
-#ifdef CONFIG_REISERFS_CHECK
-			if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
-				print_cur_tb("vs-10275");
-				reiserfs_panic(tb->tb_sb, "vs-10275",
-					       "balance condition corrupted "
-					       "(%c)", tb->tb_mode);
-			}
-#endif
-
-			if (PATH_H_POSITION(tb->tb_path, 1) == 0)
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    PATH_H_PPARENT(tb->tb_path, 0), 0);
-
-		} else {
-			/* replace lkey in CFL[0] by 0-th key from S[0]; */
-			replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
-
-			RFALSE((shift_bytes != -1 &&
-				!(is_direntry_le_ih(item_head(S0, 0))
-				  && !ih_entry_count(item_head(S0, 0)))) &&
-			       (!op_is_left_mergeable
-				(leaf_key(S0, 0), S0->b_size)),
-			       "vs-10280: item must be mergeable");
-		}
-	}
-
-	return i;
-}
-
-/* CLEANING STOPPED HERE */
-
-/*
- * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
- * and replace the delimiting key
- */
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
-	int ret_value;
-
-	/*
-	 * move shift_num (and shift_bytes) items from S[0] to
-	 * right neighbor R[0]
-	 */
-	ret_value =
-	    leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
-
-	/* replace rkey in CFR[0] by the 0-th key from R[0] */
-	if (shift_num) {
-		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-	}
-
-	return ret_value;
-}
-
-static void leaf_delete_items_entirely(struct buffer_info *bi,
-				       int first, int del_num);
-/*
- * If del_bytes == -1, starting from position 'first' delete del_num
- * items in whole in buffer CUR.
- *   If not.
- *   If last_first == 0. Starting from position 'first' delete del_num-1
- *   items in whole. Delete part of body of the first item. Part defined by
- *   del_bytes. Don't delete first item header
- *   If last_first == 1. Starting from position 'first+1' delete del_num-1
- *   items in whole. Delete part of body of the last item . Part defined by
- *   del_bytes. Don't delete last item header.
-*/
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
-		       int first, int del_num, int del_bytes)
-{
-	struct buffer_head *bh;
-	int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
-
-	RFALSE(!bh, "10155: bh is not defined");
-	RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
-	       del_num);
-	RFALSE(first < 0
-	       || first + del_num > item_amount,
-	       "10165: invalid number of first item to be deleted (%d) or "
-	       "no so much items (%d) to delete (only %d)", first,
-	       first + del_num, item_amount);
-
-	if (del_num == 0)
-		return;
-
-	if (first == 0 && del_num == item_amount && del_bytes == -1) {
-		make_empty_node(cur_bi);
-		do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
-		return;
-	}
-
-	if (del_bytes == -1)
-		/* delete del_num items beginning from item in position first */
-		leaf_delete_items_entirely(cur_bi, first, del_num);
-	else {
-		if (last_first == FIRST_TO_LAST) {
-			/*
-			 * delete del_num-1 items beginning from
-			 * item in position first
-			 */
-			leaf_delete_items_entirely(cur_bi, first, del_num - 1);
-
-			/*
-			 * delete the part of the first item of the bh
-			 * do not delete item header
-			 */
-			leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
-		} else {
-			struct item_head *ih;
-			int len;
-
-			/*
-			 * delete del_num-1 items beginning from
-			 * item in position first+1
-			 */
-			leaf_delete_items_entirely(cur_bi, first + 1,
-						   del_num - 1);
-
-			ih = item_head(bh, B_NR_ITEMS(bh) - 1);
-			if (is_direntry_le_ih(ih))
-				/* the last item is directory  */
-				/*
-				 * len = numbers of directory entries
-				 * in this item
-				 */
-				len = ih_entry_count(ih);
-			else
-				/* len = body len of item */
-				len = ih_item_len(ih);
-
-			/*
-			 * delete the part of the last item of the bh
-			 * do not delete item header
-			 */
-			leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
-					     len - del_bytes, del_bytes);
-		}
-	}
-}
-
-/* insert item into the leaf node in position before */
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head * const inserted_item_ih,
-			  const char * const inserted_item_body,
-			  int zeros_number)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr, free_space;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i;
-	int last_loc, unmoved_loc;
-	char *to;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/* check free space */
-	RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
-	       "vs-10170: not enough free space in block %z, new item %h",
-	       bh, inserted_item_ih);
-	RFALSE(zeros_number > ih_item_len(inserted_item_ih),
-	       "vs-10172: zero number == %d, item length == %d",
-	       zeros_number, ih_item_len(inserted_item_ih));
-
-	/* get item new item must be inserted before */
-	ih = item_head(bh, before);
-
-	/* prepare space for the body of new item */
-	last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
-	unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
-
-	memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
-		bh->b_data + last_loc, unmoved_loc - last_loc);
-
-	to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
-	memset(to, 0, zeros_number);
-	to += zeros_number;
-
-	/* copy body to prepared space */
-	if (inserted_item_body)
-		memmove(to, inserted_item_body,
-			ih_item_len(inserted_item_ih) - zeros_number);
-	else
-		memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
-
-	/* insert item header */
-	memmove(ih + 1, ih, IH_SIZE * (nr - before));
-	memmove(ih, inserted_item_ih, IH_SIZE);
-
-	/* change locations */
-	for (i = before; i < nr + 1; i++) {
-		unmoved_loc -= ih_item_len(&ih[i - before]);
-		put_ih_location(&ih[i - before], unmoved_loc);
-	}
-
-	/* sizes, free space, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
-	set_blkh_free_space(blkh,
-			    free_space - (IH_SIZE +
-					  ih_item_len(inserted_item_ih)));
-	do_balance_mark_leaf_dirty(bi->tb, bh, 1);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (IH_SIZE +
-					     ih_item_len(inserted_item_ih)));
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * paste paste_size bytes to affected_item_num-th item.
- * When item is a directory, this only prepare space for new entries
- */
-void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
-			  int pos_in_item, int paste_size,
-			  const char *body, int zeros_number)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr, free_space;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i;
-	int last_loc, unmoved_loc;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/* check free space */
-	RFALSE(free_space < paste_size,
-	       "vs-10175: not enough free space: needed %d, available %d",
-	       paste_size, free_space);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (zeros_number > paste_size) {
-		struct super_block *sb = NULL;
-		if (bi && bi->tb)
-			sb = bi->tb->tb_sb;
-		print_cur_tb("10177");
-		reiserfs_panic(sb, "vs-10177",
-			       "zeros_number == %d, paste_size == %d",
-			       zeros_number, paste_size);
-	}
-#endif				/* CONFIG_REISERFS_CHECK */
-
-	/* item to be appended */
-	ih = item_head(bh, affected_item_num);
-
-	last_loc = ih_location(&ih[nr - affected_item_num - 1]);
-	unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
-
-	/* prepare space */
-	memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
-		unmoved_loc - last_loc);
-
-	/* change locations */
-	for (i = affected_item_num; i < nr; i++)
-		put_ih_location(&ih[i - affected_item_num],
-				ih_location(&ih[i - affected_item_num]) -
-				paste_size);
-
-	if (body) {
-		if (!is_direntry_le_ih(ih)) {
-			if (!pos_in_item) {
-				/* shift data to right */
-				memmove(bh->b_data + ih_location(ih) +
-					paste_size,
-					bh->b_data + ih_location(ih),
-					ih_item_len(ih));
-				/* paste data in the head of item */
-				memset(bh->b_data + ih_location(ih), 0,
-				       zeros_number);
-				memcpy(bh->b_data + ih_location(ih) +
-				       zeros_number, body,
-				       paste_size - zeros_number);
-			} else {
-				memset(bh->b_data + unmoved_loc - paste_size, 0,
-				       zeros_number);
-				memcpy(bh->b_data + unmoved_loc - paste_size +
-				       zeros_number, body,
-				       paste_size - zeros_number);
-			}
-		}
-	} else
-		memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
-
-	put_ih_item_len(ih, ih_item_len(ih) + paste_size);
-
-	/* change free space */
-	set_blkh_free_space(blkh, free_space - paste_size);
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) + paste_size);
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
- * does not have free space, so it moves DEHs and remaining records as
- * necessary. Return value is size of removed part of directory item
- * in bytes.
- */
-static int leaf_cut_entries(struct buffer_head *bh,
-			    struct item_head *ih, int from, int del_count)
-{
-	char *item;
-	struct reiserfs_de_head *deh;
-	int prev_record_offset;	/* offset of record, that is (from-1)th */
-	char *prev_record;	/* */
-	int cut_records_len;	/* length of all removed records */
-	int i;
-
-	/*
-	 * make sure that item is directory and there are enough entries to
-	 * remove
-	 */
-	RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
-	RFALSE(ih_entry_count(ih) < from + del_count,
-	       "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
-	       ih_entry_count(ih), from, del_count);
-
-	if (del_count == 0)
-		return 0;
-
-	/* first byte of item */
-	item = bh->b_data + ih_location(ih);
-
-	/* entry head array */
-	deh = B_I_DEH(bh, ih);
-
-	/*
-	 * first byte of remaining entries, those are BEFORE cut entries
-	 * (prev_record) and length of all removed records (cut_records_len)
-	 */
-	prev_record_offset =
-	    (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
-	cut_records_len = prev_record_offset /*from_record */  -
-	    deh_location(&deh[from + del_count - 1]);
-	prev_record = item + prev_record_offset;
-
-	/* adjust locations of remaining entries */
-	for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) -
-				 (DEH_SIZE * del_count));
-
-	for (i = 0; i < from; i++)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) - (DEH_SIZE * del_count +
-							  cut_records_len));
-
-	put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
-
-	/* shift entry head array and entries those are AFTER removed entries */
-	memmove((char *)(deh + from),
-		deh + from + del_count,
-		prev_record - cut_records_len - (char *)(deh + from +
-							 del_count));
-
-	/* shift records, those are BEFORE removed entries */
-	memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
-		prev_record, item + ih_item_len(ih) - prev_record);
-
-	return DEH_SIZE * del_count + cut_records_len;
-}
-
-/*
- * when cut item is part of regular file
- *      pos_in_item - first byte that must be cut
- *      cut_size - number of bytes to be cut beginning from pos_in_item
- *
- * when cut item is part of directory
- *      pos_in_item - number of first deleted entry
- *      cut_size - count of deleted entries
- */
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
-			  int pos_in_item, int cut_size)
-{
-	int nr;
-	struct buffer_head *bh = bi->bi_bh;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int last_loc, unmoved_loc;
-	int i;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-
-	/* item head of truncated item */
-	ih = item_head(bh, cut_item_num);
-
-	if (is_direntry_le_ih(ih)) {
-		/* first cut entry () */
-		cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
-		if (pos_in_item == 0) {
-			/* change key */
-			RFALSE(cut_item_num,
-			       "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
-			       cut_item_num);
-			/* change item key by key of first entry in the item */
-			set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
-		}
-	} else {
-		/* item is direct or indirect */
-		RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
-		RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
-		       "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
-		       (long unsigned)pos_in_item, (long unsigned)cut_size,
-		       (long unsigned)ih_item_len(ih));
-
-		/* shift item body to left if cut is from the head of item */
-		if (pos_in_item == 0) {
-			memmove(bh->b_data + ih_location(ih),
-				bh->b_data + ih_location(ih) + cut_size,
-				ih_item_len(ih) - cut_size);
-
-			/* change key of item */
-			if (is_direct_le_ih(ih))
-				set_le_ih_k_offset(ih,
-						   le_ih_k_offset(ih) +
-						   cut_size);
-			else {
-				set_le_ih_k_offset(ih,
-						   le_ih_k_offset(ih) +
-						   (cut_size / UNFM_P_SIZE) *
-						   bh->b_size);
-				RFALSE(ih_item_len(ih) == cut_size
-				       && get_ih_free_space(ih),
-				       "10205: invalid ih_free_space (%h)", ih);
-			}
-		}
-	}
-
-	/* location of the last item */
-	last_loc = ih_location(&ih[nr - cut_item_num - 1]);
-
-	/* location of the item, which is remaining at the same place */
-	unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
-
-	/* shift */
-	memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
-		unmoved_loc - last_loc - cut_size);
-
-	/* change item length */
-	put_ih_item_len(ih, ih_item_len(ih) - cut_size);
-
-	if (is_indirect_le_ih(ih)) {
-		if (pos_in_item)
-			set_ih_free_space(ih, 0);
-	}
-
-	/* change locations */
-	for (i = cut_item_num; i < nr; i++)
-		put_ih_location(&ih[i - cut_item_num],
-				ih_location(&ih[i - cut_item_num]) + cut_size);
-
-	/* size, free space */
-	set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) - cut_size);
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/* delete del_num items from buffer starting from the first'th item */
-static void leaf_delete_items_entirely(struct buffer_info *bi,
-				       int first, int del_num)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr;
-	int i, j;
-	int last_loc, last_removed_loc;
-	struct block_head *blkh;
-	struct item_head *ih;
-
-	RFALSE(bh == NULL, "10210: buffer is 0");
-	RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
-
-	if (del_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-
-	RFALSE(first < 0 || first + del_num > nr,
-	       "10220: first=%d, number=%d, there is %d items", first, del_num,
-	       nr);
-
-	if (first == 0 && del_num == nr) {
-		/* this does not work */
-		make_empty_node(bi);
-
-		do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-		return;
-	}
-
-	ih = item_head(bh, first);
-
-	/* location of unmovable item */
-	j = (first == 0) ? bh->b_size : ih_location(ih - 1);
-
-	/* delete items */
-	last_loc = ih_location(&ih[nr - 1 - first]);
-	last_removed_loc = ih_location(&ih[del_num - 1]);
-
-	memmove(bh->b_data + last_loc + j - last_removed_loc,
-		bh->b_data + last_loc, last_removed_loc - last_loc);
-
-	/* delete item headers */
-	memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
-
-	/* change item location */
-	for (i = first; i < nr - del_num; i++)
-		put_ih_location(&ih[i - first],
-				ih_location(&ih[i - first]) + (j -
-								 last_removed_loc));
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) + (j - last_removed_loc +
-						     IH_SIZE * del_num));
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) - (j - last_removed_loc +
-					     IH_SIZE * del_num));
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * paste new_entry_count entries (new_dehs, records) into position
- * before to item_num-th item
- */
-void leaf_paste_entries(struct buffer_info *bi,
-			int item_num,
-			int before,
-			int new_entry_count,
-			struct reiserfs_de_head *new_dehs,
-			const char *records, int paste_size)
-{
-	struct item_head *ih;
-	char *item;
-	struct reiserfs_de_head *deh;
-	char *insert_point;
-	int i;
-	struct buffer_head *bh = bi->bi_bh;
-
-	if (new_entry_count == 0)
-		return;
-
-	ih = item_head(bh, item_num);
-
-	/*
-	 * make sure, that item is directory, and there are enough
-	 * records in it
-	 */
-	RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
-	RFALSE(ih_entry_count(ih) < before,
-	       "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
-	       ih_entry_count(ih), before);
-
-	/* first byte of dest item */
-	item = bh->b_data + ih_location(ih);
-
-	/* entry head array */
-	deh = B_I_DEH(bh, ih);
-
-	/* new records will be pasted at this point */
-	insert_point =
-	    item +
-	    (before ? deh_location(&deh[before - 1])
-	     : (ih_item_len(ih) - paste_size));
-
-	/* adjust locations of records that will be AFTER new records */
-	for (i = ih_entry_count(ih) - 1; i >= before; i--)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) +
-				 (DEH_SIZE * new_entry_count));
-
-	/* adjust locations of records that will be BEFORE new records */
-	for (i = 0; i < before; i++)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) + paste_size);
-
-	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
-
-	/* prepare space for pasted records */
-	memmove(insert_point + paste_size, insert_point,
-		item + (ih_item_len(ih) - paste_size) - insert_point);
-
-	/* copy new records */
-	memcpy(insert_point + DEH_SIZE * new_entry_count, records,
-	       paste_size - DEH_SIZE * new_entry_count);
-
-	/* prepare space for new entry heads */
-	deh += before;
-	memmove((char *)(deh + new_entry_count), deh,
-		insert_point - (char *)deh);
-
-	/* copy new entry heads */
-	deh = (struct reiserfs_de_head *)((char *)deh);
-	memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
-
-	/* set locations of new records */
-	for (i = 0; i < new_entry_count; i++) {
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) +
-				 (-deh_location
-				  (&new_dehs[new_entry_count - 1]) +
-				  insert_point + DEH_SIZE * new_entry_count -
-				  item));
-	}
-
-	/* change item key if necessary (when we paste before 0-th entry */
-	if (!before) {
-		set_le_ih_k_offset(ih, deh_offset(new_dehs));
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	{
-		int prev, next;
-		/* check record locations */
-		deh = B_I_DEH(bh, ih);
-		for (i = 0; i < ih_entry_count(ih); i++) {
-			next =
-			    (i <
-			     ih_entry_count(ih) -
-			     1) ? deh_location(&deh[i + 1]) : 0;
-			prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
-
-			if (prev && prev <= deh_location(&deh[i]))
-				reiserfs_error(sb_from_bi(bi), "vs-10240",
-					       "directory item (%h) "
-					       "corrupted (prev %a, "
-					       "cur(%d) %a)",
-					       ih, deh + i - 1, i, deh + i);
-			if (next && next >= deh_location(&deh[i]))
-				reiserfs_error(sb_from_bi(bi), "vs-10250",
-					       "directory item (%h) "
-					       "corrupted (cur(%d) %a, "
-					       "next %a)",
-					       ih, i, deh + i, deh + i + 1);
-		}
-	}
-#endif
-
-}
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
deleted file mode 100644
index 46bd7bd63a71..000000000000
--- a/fs/reiserfs/lock.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/mutex.h>
-
-/*
- * The previous reiserfs locking scheme was heavily based on
- * the tricky properties of the Bkl:
- *
- * - it was acquired recursively by a same task
- * - the performances relied on the release-while-schedule() property
- *
- * Now that we replace it by a mutex, we still want to keep the same
- * recursive property to avoid big changes in the code structure.
- * We use our own lock_owner here because the owner field on a mutex
- * is only available in SMP or mutex debugging, also we only need this field
- * for this mutex, no need for a system wide mutex facility.
- *
- * Also this lock is often released before a call that could block because
- * reiserfs performances were partially based on the release while schedule()
- * property of the Bkl.
- */
-void reiserfs_write_lock(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	if (sb_i->lock_owner != current) {
-		mutex_lock(&sb_i->lock);
-		sb_i->lock_owner = current;
-	}
-
-	/* No need to protect it, only the current task touches it */
-	sb_i->lock_depth++;
-}
-
-void reiserfs_write_unlock(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	/*
-	 * Are we unlocking without even holding the lock?
-	 * Such a situation must raise a BUG() if we don't want
-	 * to corrupt the data.
-	 */
-	BUG_ON(sb_i->lock_owner != current);
-
-	if (--sb_i->lock_depth == -1) {
-		sb_i->lock_owner = NULL;
-		mutex_unlock(&sb_i->lock);
-	}
-}
-
-int __must_check reiserfs_write_unlock_nested(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-	int depth;
-
-	/* this can happen when the lock isn't always held */
-	if (sb_i->lock_owner != current)
-		return -1;
-
-	depth = sb_i->lock_depth;
-
-	sb_i->lock_depth = -1;
-	sb_i->lock_owner = NULL;
-	mutex_unlock(&sb_i->lock);
-
-	return depth;
-}
-
-void reiserfs_write_lock_nested(struct super_block *s, int depth)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	/* this can happen when the lock isn't always held */
-	if (depth == -1)
-		return;
-
-	mutex_lock(&sb_i->lock);
-	sb_i->lock_owner = current;
-	sb_i->lock_depth = depth;
-}
-
-/*
- * Utility function to force a BUG if it is called without the superblock
- * write lock held.  caller is the string printed just before calling BUG()
- */
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
-	WARN_ON(sb_i->lock_depth < 0);
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *sb)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
-	WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
-}
-#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
deleted file mode 100644
index 7e7b531fcc49..000000000000
--- a/fs/reiserfs/namei.c
+++ /dev/null
@@ -1,1725 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/time.h>
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/quotaops.h>
-
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
-#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
-
-/*
- * directory item contains array of entry headers. This performs
- * binary search through that array
- */
-static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
-{
-	struct item_head *ih = de->de_ih;
-	struct reiserfs_de_head *deh = de->de_deh;
-	int rbound, lbound, j;
-
-	lbound = 0;
-	rbound = ih_entry_count(ih) - 1;
-
-	for (j = (rbound + lbound) / 2; lbound <= rbound;
-	     j = (rbound + lbound) / 2) {
-		if (off < deh_offset(deh + j)) {
-			rbound = j - 1;
-			continue;
-		}
-		if (off > deh_offset(deh + j)) {
-			lbound = j + 1;
-			continue;
-		}
-		/* this is not name found, but matched third key component */
-		de->de_entry_num = j;
-		return NAME_FOUND;
-	}
-
-	de->de_entry_num = lbound;
-	return NAME_NOT_FOUND;
-}
-
-/*
- * comment?  maybe something like set de to point to what the path points to?
- */
-static inline void set_de_item_location(struct reiserfs_dir_entry *de,
-					struct treepath *path)
-{
-	de->de_bh = get_last_bh(path);
-	de->de_ih = tp_item_head(path);
-	de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
-	de->de_item_num = PATH_LAST_POSITION(path);
-}
-
-/*
- * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
- */
-inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
-	de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
-	de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
-	de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
-	if (de->de_name[de->de_namelen - 1] == 0)
-		de->de_namelen = strlen(de->de_name);
-}
-
-/* what entry points to */
-static inline void set_de_object_key(struct reiserfs_dir_entry *de)
-{
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-	de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
-	de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
-}
-
-static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
-	/* store key of the found entry */
-	de->de_entry_key.version = KEY_FORMAT_3_5;
-	de->de_entry_key.on_disk_key.k_dir_id =
-	    le32_to_cpu(de->de_ih->ih_key.k_dir_id);
-	de->de_entry_key.on_disk_key.k_objectid =
-	    le32_to_cpu(de->de_ih->ih_key.k_objectid);
-	set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
-	set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
-}
-
-/*
- * We assign a key to each directory item, and place multiple entries in a
- * single directory item.  A directory item has a key equal to the key of
- * the first directory entry in it.
-
- * This function first calls search_by_key, then, if item whose first entry
- * matches is not found it looks for the entry inside directory item found
- * by search_by_key. Fills the path to the entry, and to the entry position
- * in the item
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
-			struct treepath *path, struct reiserfs_dir_entry *de)
-{
-	int retval;
-
-	retval = search_item(sb, key, path);
-	switch (retval) {
-	case ITEM_NOT_FOUND:
-		if (!PATH_LAST_POSITION(path)) {
-			reiserfs_error(sb, "vs-7000", "search_by_key "
-				       "returned item position == 0");
-			pathrelse(path);
-			return IO_ERROR;
-		}
-		PATH_LAST_POSITION(path)--;
-		break;
-
-	case ITEM_FOUND:
-		break;
-
-	case IO_ERROR:
-		return retval;
-
-	default:
-		pathrelse(path);
-		reiserfs_error(sb, "vs-7002", "no path to here");
-		return IO_ERROR;
-	}
-
-	set_de_item_location(de, path);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (!is_direntry_le_ih(de->de_ih) ||
-	    COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
-		print_block(de->de_bh, 0, -1, -1);
-		reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
-			       "item or does not belong to the same directory "
-			       "as key %K", de->de_ih, key);
-	}
-#endif				/* CONFIG_REISERFS_CHECK */
-
-	/*
-	 * binary search in directory item by third component of the
-	 * key. sets de->de_entry_num of de
-	 */
-	retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
-	path->pos_in_item = de->de_entry_num;
-	if (retval != NAME_NOT_FOUND) {
-		/*
-		 * ugly, but rename needs de_bh, de_deh, de_name,
-		 * de_namelen, de_objectid set
-		 */
-		set_de_name_and_namelen(de);
-		set_de_object_key(de);
-	}
-	return retval;
-}
-
-/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
-
-/*
- * The third component is hashed, and you can choose from more than
- * one hash function.  Per directory hashes are not yet implemented
- * but are thought about. This function should be moved to hashes.c
- * Jedi, please do so.  -Hans
- */
-static __u32 get_third_component(struct super_block *s,
-				 const char *name, int len)
-{
-	__u32 res;
-
-	if (!len || (len == 1 && name[0] == '.'))
-		return DOT_OFFSET;
-	if (len == 2 && name[0] == '.' && name[1] == '.')
-		return DOT_DOT_OFFSET;
-
-	res = REISERFS_SB(s)->s_hash_function(name, len);
-
-	/* take bits from 7-th to 30-th including both bounds */
-	res = GET_HASH_VALUE(res);
-	if (res == 0)
-		/*
-		 * needed to have no names before "." and ".." those have hash
-		 * value == 0 and generation conters 1 and 2 accordingly
-		 */
-		res = 128;
-	return res + MAX_GENERATION_NUMBER;
-}
-
-static int reiserfs_match(struct reiserfs_dir_entry *de,
-			  const char *name, int namelen)
-{
-	int retval = NAME_NOT_FOUND;
-
-	if ((namelen == de->de_namelen) &&
-	    !memcmp(de->de_name, name, de->de_namelen))
-		retval =
-		    (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
-		     NAME_FOUND_INVISIBLE);
-
-	return retval;
-}
-
-/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
-
-/* used when hash collisions exist */
-
-static int linear_search_in_dir_item(struct cpu_key *key,
-				     struct reiserfs_dir_entry *de,
-				     const char *name, int namelen)
-{
-	struct reiserfs_de_head *deh = de->de_deh;
-	int retval;
-	int i;
-
-	i = de->de_entry_num;
-
-	if (i == ih_entry_count(de->de_ih) ||
-	    GET_HASH_VALUE(deh_offset(deh + i)) !=
-	    GET_HASH_VALUE(cpu_key_k_offset(key))) {
-		i--;
-	}
-
-	RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
-	       "vs-7010: array of entry headers not found");
-
-	deh += i;
-
-	for (; i >= 0; i--, deh--) {
-		/* hash value does not match, no need to check whole name */
-		if (GET_HASH_VALUE(deh_offset(deh)) !=
-		    GET_HASH_VALUE(cpu_key_k_offset(key))) {
-			return NAME_NOT_FOUND;
-		}
-
-		/* mark that this generation number is used */
-		if (de->de_gen_number_bit_string)
-			set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
-				de->de_gen_number_bit_string);
-
-		/* calculate pointer to name and namelen */
-		de->de_entry_num = i;
-		set_de_name_and_namelen(de);
-
-		/*
-		 * de's de_name, de_namelen, de_recordlen are set.
-		 * Fill the rest.
-		 */
-		if ((retval =
-		     reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
-
-			/* key of pointed object */
-			set_de_object_key(de);
-
-			store_de_entry_key(de);
-
-			/* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
-			return retval;
-		}
-	}
-
-	if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
-		/*
-		 * we have reached left most entry in the node. In common we
-		 * have to go to the left neighbor, but if generation counter
-		 * is 0 already, we know for sure, that there is no name with
-		 * the same hash value
-		 */
-		/*
-		 * FIXME: this work correctly only because hash value can not
-		 *  be 0. Btw, in case of Yura's hash it is probably possible,
-		 * so, this is a bug
-		 */
-		return NAME_NOT_FOUND;
-
-	RFALSE(de->de_item_num,
-	       "vs-7015: two diritems of the same directory in one node?");
-
-	return GOTO_PREVIOUS_ITEM;
-}
-
-/*
- * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
- * FIXME: should add something like IOERROR
- */
-static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
-			       struct treepath *path_to_entry,
-			       struct reiserfs_dir_entry *de)
-{
-	struct cpu_key key_to_search;
-	int retval;
-
-	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
-		return NAME_NOT_FOUND;
-
-	/* we will search for this key in the tree */
-	make_cpu_key(&key_to_search, dir,
-		     get_third_component(dir->i_sb, name, namelen),
-		     TYPE_DIRENTRY, 3);
-
-	while (1) {
-		retval =
-		    search_by_entry_key(dir->i_sb, &key_to_search,
-					path_to_entry, de);
-		if (retval == IO_ERROR) {
-			reiserfs_error(dir->i_sb, "zam-7001", "io error");
-			return IO_ERROR;
-		}
-
-		/* compare names for all entries having given hash value */
-		retval =
-		    linear_search_in_dir_item(&key_to_search, de, name,
-					      namelen);
-		/*
-		 * there is no need to scan directory anymore.
-		 * Given entry found or does not exist
-		 */
-		if (retval != GOTO_PREVIOUS_ITEM) {
-			path_to_entry->pos_in_item = de->de_entry_num;
-			return retval;
-		}
-
-		/*
-		 * there is left neighboring item of this directory
-		 * and given entry can be there
-		 */
-		set_cpu_key_k_offset(&key_to_search,
-				     le_ih_k_offset(de->de_ih) - 1);
-		pathrelse(path_to_entry);
-
-	}			/* while (1) */
-}
-
-static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
-				      unsigned int flags)
-{
-	int retval;
-	struct inode *inode = NULL;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path_to_entry);
-
-	if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	reiserfs_write_lock(dir->i_sb);
-
-	de.de_gen_number_bit_string = NULL;
-	retval =
-	    reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				&path_to_entry, &de);
-	pathrelse(&path_to_entry);
-	if (retval == NAME_FOUND) {
-		inode = reiserfs_iget(dir->i_sb,
-				      (struct cpu_key *)&de.de_dir_id);
-		if (!inode || IS_ERR(inode)) {
-			reiserfs_write_unlock(dir->i_sb);
-			return ERR_PTR(-EACCES);
-		}
-
-		/*
-		 * Propagate the private flag so we know we're
-		 * in the priv tree.  Also clear xattr support
-		 * since we don't have xattrs on xattr files.
-		 */
-		if (IS_PRIVATE(dir))
-			reiserfs_init_priv_inode(inode);
-	}
-	reiserfs_write_unlock(dir->i_sb);
-	if (retval == IO_ERROR) {
-		return ERR_PTR(-EIO);
-	}
-
-	return d_splice_alias(inode, dentry);
-}
-
-/*
- * looks up the dentry of the parent directory for child.
- * taken from ext2_get_parent
- */
-struct dentry *reiserfs_get_parent(struct dentry *child)
-{
-	int retval;
-	struct inode *inode = NULL;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path_to_entry);
-	struct inode *dir = d_inode(child);
-
-	if (dir->i_nlink == 0) {
-		return ERR_PTR(-ENOENT);
-	}
-	de.de_gen_number_bit_string = NULL;
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
-	pathrelse(&path_to_entry);
-	if (retval != NAME_FOUND) {
-		reiserfs_write_unlock(dir->i_sb);
-		return ERR_PTR(-ENOENT);
-	}
-	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
-	reiserfs_write_unlock(dir->i_sb);
-
-	return d_obtain_alias(inode);
-}
-
-/* add entry to the directory (entry can be hidden).
-
-insert definition of when hidden directories are used here -Hans
-
- Does not mark dir   inode dirty, do it after successesfull call to it */
-
-static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
-			      struct inode *dir, const char *name, int namelen,
-			      struct inode *inode, int visible)
-{
-	struct cpu_key entry_key;
-	struct reiserfs_de_head *deh;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-	DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
-	int gen_number;
-
-	/*
-	 * 48 bytes now and we avoid kmalloc if we
-	 * create file with short name
-	 */
-	char small_buf[32 + DEH_SIZE];
-
-	char *buffer;
-	int buflen, paste_size;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* each entry has unique key. compose it */
-	make_cpu_key(&entry_key, dir,
-		     get_third_component(dir->i_sb, name, namelen),
-		     TYPE_DIRENTRY, 3);
-
-	/* get memory for composing the entry */
-	buflen = DEH_SIZE + ROUND_UP(namelen);
-	if (buflen > sizeof(small_buf)) {
-		buffer = kmalloc(buflen, GFP_NOFS);
-		if (!buffer)
-			return -ENOMEM;
-	} else
-		buffer = small_buf;
-
-	paste_size =
-	    (get_inode_sd_version(dir) ==
-	     STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
-
-	/*
-	 * fill buffer : directory entry head, name[, dir objectid | ,
-	 * stat data | ,stat data, dir objectid ]
-	 */
-	deh = (struct reiserfs_de_head *)buffer;
-	deh->deh_location = 0;	/* JDM Endian safe if 0 */
-	put_deh_offset(deh, cpu_key_k_offset(&entry_key));
-	deh->deh_state = 0;	/* JDM Endian safe if 0 */
-	/* put key (ino analog) to de */
-
-	/* safe: k_dir_id is le */
-	deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
-	/* safe: k_objectid is le */
-	deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
-
-	/* copy name */
-	memcpy((char *)(deh + 1), name, namelen);
-	/* padd by 0s to the 4 byte boundary */
-	padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
-
-	/*
-	 * entry is ready to be pasted into tree, set 'visibility'
-	 * and 'stat data in entry' attributes
-	 */
-	mark_de_without_sd(deh);
-	visible ? mark_de_visible(deh) : mark_de_hidden(deh);
-
-	/* find the proper place for the new entry */
-	memset(bit_string, 0, sizeof(bit_string));
-	de.de_gen_number_bit_string = bit_string;
-	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
-	if (retval != NAME_NOT_FOUND) {
-		if (buffer != small_buf)
-			kfree(buffer);
-		pathrelse(&path);
-
-		if (retval == IO_ERROR) {
-			return -EIO;
-		}
-
-		if (retval != NAME_FOUND) {
-			reiserfs_error(dir->i_sb, "zam-7002",
-				       "reiserfs_find_entry() returned "
-				       "unexpected value (%d)", retval);
-		}
-
-		return -EEXIST;
-	}
-
-	gen_number =
-	    find_first_zero_bit(bit_string,
-				MAX_GENERATION_NUMBER + 1);
-	if (gen_number > MAX_GENERATION_NUMBER) {
-		/* there is no free generation number */
-		reiserfs_warning(dir->i_sb, "reiserfs-7010",
-				 "Congratulations! we have got hash function "
-				 "screwed up");
-		if (buffer != small_buf)
-			kfree(buffer);
-		pathrelse(&path);
-		return -EBUSY;
-	}
-	/* adjust offset of directory enrty */
-	put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
-	set_cpu_key_k_offset(&entry_key, deh_offset(deh));
-
-	/* update max-hash-collisions counter in reiserfs_sb_info */
-	PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
-
-	/* we need to re-search for the insertion point */
-	if (gen_number != 0) {
-		if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
-		    NAME_NOT_FOUND) {
-			reiserfs_warning(dir->i_sb, "vs-7032",
-					 "entry with this key (%K) already "
-					 "exists", &entry_key);
-
-			if (buffer != small_buf)
-				kfree(buffer);
-			pathrelse(&path);
-			return -EBUSY;
-		}
-	}
-
-	/* perform the insertion of the entry that we have prepared */
-	retval =
-	    reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
-				     paste_size);
-	if (buffer != small_buf)
-		kfree(buffer);
-	if (retval) {
-		reiserfs_check_path(&path);
-		return retval;
-	}
-
-	dir->i_size += paste_size;
-	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	if (!S_ISDIR(inode->i_mode) && visible)
-		/* reiserfs_mkdir or reiserfs_rename will do that by itself */
-		reiserfs_update_sd(th, dir);
-
-	reiserfs_check_path(&path);
-	return 0;
-}
-
-/*
- * quota utility function, call if you've had to abort after calling
- * new_inode_init, and have not called reiserfs_new_inode yet.
- * This should only be called on inodes that do not have stat data
- * inserted into the tree yet.
- */
-static int drop_new_inode(struct inode *inode)
-{
-	dquot_drop(inode);
-	make_bad_inode(inode);
-	inode->i_flags |= S_NOQUOTA;
-	iput(inode);
-	return 0;
-}
-
-/*
- * utility function that does setup for reiserfs_new_inode.
- * dquot_initialize needs lots of credits so it's better to have it
- * outside of a transaction, so we had to pull some bits of
- * reiserfs_new_inode out into this func.
- */
-static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
-{
-	/*
-	 * Make inode invalid - just in case we are going to drop it before
-	 * the initialization happens
-	 */
-	INODE_PKEY(inode)->k_objectid = 0;
-
-	/*
-	 * the quota init calls have to know who to charge the quota to, so
-	 * we have to set uid and gid here
-	 */
-	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
-	return dquot_initialize(inode);
-}
-
-static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir,
-			   struct dentry *dentry, umode_t mode, bool excl)
-{
-	int retval;
-	struct inode *inode;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode, &security);
-	if (retval)
-		goto out_failed;
-
-	inode->i_op = &reiserfs_file_inode_operations;
-	inode->i_fop = &reiserfs_file_operations;
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode, dev_t rdev)
-{
-	int retval;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode, &security);
-	if (retval) {
-		goto out_failed;
-	}
-
-	inode->i_op = &reiserfs_special_inode_operations;
-	init_special_inode(inode, inode->i_mode, rdev);
-
-	/* FIXME: needed for block and char devices only */
-	reiserfs_update_sd(&th, inode);
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode)
-{
-	int retval;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * set flag that new packing locality created and new blocks
-	 * for the content of that directory are not displaced yet
-	 */
-	REISERFS_I(dir)->new_packing_locality = 1;
-#endif
-	mode = S_IFDIR | mode;
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	/*
-	 * inc the link count now, so another writer doesn't overflow
-	 * it while we sleep later on.
-	 */
-	INC_DIR_INODE_NLINK(dir)
-
-	retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
-				    old_format_only(dir->i_sb) ?
-				    EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
-				    dentry, inode, &security);
-	if (retval) {
-		DEC_DIR_INODE_NLINK(dir)
-		goto out_failed;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	inode->i_op = &reiserfs_dir_inode_operations;
-	inode->i_fop = &reiserfs_dir_operations;
-
-	/* note, _this_ add_entry will not update dir's stat data */
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		clear_nlink(inode);
-		DEC_DIR_INODE_NLINK(dir);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-	/* the above add_entry did not update dir's stat data */
-	reiserfs_update_sd(&th, dir);
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static inline int reiserfs_empty_dir(struct inode *inode)
-{
-	/*
-	 * we can cheat because an old format dir cannot have
-	 * EMPTY_DIR_SIZE, and a new format dir cannot have
-	 * EMPTY_DIR_SIZE_V1.  So, if the inode is either size,
-	 * regardless of disk format version, the directory is empty.
-	 */
-	if (inode->i_size != EMPTY_DIR_SIZE &&
-	    inode->i_size != EMPTY_DIR_SIZE_V1) {
-		return 0;
-	}
-	return 1;
-}
-
-static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int retval, err;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-
-	/*
-	 * we will be doing 2 balancings and update 2 stat data, we
-	 * change quotas of the owner of the directory and of the owner
-	 * of the parent directory.  The quota structure is possibly
-	 * deleted only on last iput => outside of this transaction
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval)
-		goto out_rmdir;
-
-	de.de_gen_number_bit_string = NULL;
-	if ((retval =
-	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				 &path, &de)) == NAME_NOT_FOUND) {
-		retval = -ENOENT;
-		goto end_rmdir;
-	} else if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto end_rmdir;
-	}
-
-	inode = d_inode(dentry);
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (de.de_objectid != inode->i_ino) {
-		/*
-		 * FIXME: compare key of an object and a key found in the entry
-		 */
-		retval = -EIO;
-		goto end_rmdir;
-	}
-	if (!reiserfs_empty_dir(inode)) {
-		retval = -ENOTEMPTY;
-		goto end_rmdir;
-	}
-
-	/* cut entry from dir directory */
-	retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
-					dir, NULL,	/* page */
-					0 /*new file size - not used here */ );
-	if (retval < 0)
-		goto end_rmdir;
-
-	if (inode->i_nlink != 2 && inode->i_nlink != 1)
-		reiserfs_error(inode->i_sb, "reiserfs-7040",
-			       "empty directory has nlink != 2 (%d)",
-			       inode->i_nlink);
-
-	clear_nlink(inode);
-	inode_set_mtime_to_ts(dir,
-			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
-	reiserfs_update_sd(&th, inode);
-
-	DEC_DIR_INODE_NLINK(dir)
-	dir->i_size -= (DEH_SIZE + de.de_entrylen);
-	reiserfs_update_sd(&th, dir);
-
-	/* prevent empty directory from getting lost */
-	add_save_link(&th, inode, 0 /* not truncate */ );
-
-	retval = journal_end(&th);
-	reiserfs_check_path(&path);
-out_rmdir:
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-
-end_rmdir:
-	/*
-	 * we must release path, because we did not call
-	 * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
-	 * release path if operation was not complete
-	 */
-	pathrelse(&path);
-	err = journal_end(&th);
-	reiserfs_write_unlock(dir->i_sb);
-	return err ? err : retval;
-}
-
-static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int retval, err;
-	struct inode *inode;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path);
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	unsigned long savelink;
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	inode = d_inode(dentry);
-
-	/*
-	 * in this transaction we can be doing at max two balancings and
-	 * update two stat datas, we change quotas of the owner of the
-	 * directory and of the owner of the parent directory. The quota
-	 * structure is possibly deleted only on iput => outside of
-	 * this transaction
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval)
-		goto out_unlink;
-
-	de.de_gen_number_bit_string = NULL;
-	if ((retval =
-	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				 &path, &de)) == NAME_NOT_FOUND) {
-		retval = -ENOENT;
-		goto end_unlink;
-	} else if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto end_unlink;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (de.de_objectid != inode->i_ino) {
-		/*
-		 * FIXME: compare key of an object and a key found in the entry
-		 */
-		retval = -EIO;
-		goto end_unlink;
-	}
-
-	if (!inode->i_nlink) {
-		reiserfs_warning(inode->i_sb, "reiserfs-7042",
-				 "deleting nonexistent file (%lu), %d",
-				 inode->i_ino, inode->i_nlink);
-		set_nlink(inode, 1);
-	}
-
-	drop_nlink(inode);
-
-	/*
-	 * we schedule before doing the add_save_link call, save the link
-	 * count so we don't race
-	 */
-	savelink = inode->i_nlink;
-
-	retval =
-	    reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
-				   0);
-	if (retval < 0) {
-		inc_nlink(inode);
-		goto end_unlink;
-	}
-	inode_set_ctime_current(inode);
-	reiserfs_update_sd(&th, inode);
-
-	dir->i_size -= (de.de_entrylen + DEH_SIZE);
-	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	reiserfs_update_sd(&th, dir);
-
-	if (!savelink)
-		/* prevent file from getting lost */
-		add_save_link(&th, inode, 0 /* not truncate */ );
-
-	retval = journal_end(&th);
-	reiserfs_check_path(&path);
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-
-end_unlink:
-	pathrelse(&path);
-	err = journal_end(&th);
-	reiserfs_check_path(&path);
-	if (err)
-		retval = err;
-out_unlink:
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-}
-
-static int reiserfs_symlink(struct mnt_idmap *idmap,
-			    struct inode *parent_dir, struct dentry *dentry,
-			    const char *symname)
-{
-	int retval;
-	struct inode *inode;
-	char *name;
-	int item_len;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	int mode = S_IFLNK | S_IRWXUGO;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas for
-	 * new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
-
-	retval = dquot_initialize(parent_dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(parent_dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, parent_dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
-					&security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-
-	reiserfs_write_lock(parent_dir->i_sb);
-	item_len = ROUND_UP(strlen(symname));
-	if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
-		retval = -ENAMETOOLONG;
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	name = kmalloc(item_len, GFP_NOFS);
-	if (!name) {
-		drop_new_inode(inode);
-		retval = -ENOMEM;
-		goto out_failed;
-	}
-	memcpy(name, symname, strlen(symname));
-	padd_item(name, item_len, strlen(symname));
-
-	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		kfree(name);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
-			       dentry, inode, &security);
-	kfree(name);
-	if (retval) {		/* reiserfs_new_inode iputs for us */
-		goto out_failed;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(parent_dir);
-
-	inode->i_op = &reiserfs_symlink_inode_operations;
-	inode_nohighmem(inode);
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
-	retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
-				    dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-out_failed:
-	reiserfs_write_unlock(parent_dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
-			 struct dentry *dentry)
-{
-	int retval;
-	struct inode *inode = d_inode(old_dentry);
-	struct reiserfs_transaction_handle th;
-	/*
-	 * We need blocks for transaction + update of quotas for
-	 * the owners of the directory
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	reiserfs_write_lock(dir->i_sb);
-	if (inode->i_nlink >= REISERFS_LINK_MAX) {
-		/* FIXME: sd_nlink is 32 bit for new files */
-		reiserfs_write_unlock(dir->i_sb);
-		return -EMLINK;
-	}
-
-	/* inc before scheduling so reiserfs_unlink knows we are here */
-	inc_nlink(inode);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_nlink(inode);
-		reiserfs_write_unlock(dir->i_sb);
-		return retval;
-	}
-
-	/* create new entry */
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		err = journal_end(&th);
-		reiserfs_write_unlock(dir->i_sb);
-		return err ? err : retval;
-	}
-
-	inode_set_ctime_current(inode);
-	reiserfs_update_sd(&th, inode);
-
-	ihold(inode);
-	d_instantiate(dentry, inode);
-	retval = journal_end(&th);
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-}
-
-/* de contains information pointing to an entry which */
-static int de_still_valid(const char *name, int len,
-			  struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_dir_entry tmp = *de;
-
-	/* recalculate pointer to name and name length */
-	set_de_name_and_namelen(&tmp);
-	/* FIXME: could check more */
-	if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
-		return 0;
-	return 1;
-}
-
-static int entry_points_to_object(const char *name, int len,
-				  struct reiserfs_dir_entry *de,
-				  struct inode *inode)
-{
-	if (!de_still_valid(name, len, de))
-		return 0;
-
-	if (inode) {
-		if (!de_visible(de->de_deh + de->de_entry_num))
-			reiserfs_panic(inode->i_sb, "vs-7042",
-				       "entry must be visible");
-		return (de->de_objectid == inode->i_ino) ? 1 : 0;
-	}
-
-	/* this must be added hidden entry */
-	if (de_visible(de->de_deh + de->de_entry_num))
-		reiserfs_panic(NULL, "vs-7043", "entry must be visible");
-
-	return 1;
-}
-
-/* sets key of objectid the entry has to point to */
-static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
-				 struct reiserfs_key *key)
-{
-	/* JDM These operations are endian safe - both are le */
-	de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
-	de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
-}
-
-/*
- * process, that is going to call fix_nodes/do_balance must hold only
- * one path. If it holds 2 or more, it can get into endless waiting in
- * get_empty_nodes or its clones
- */
-static int reiserfs_rename(struct mnt_idmap *idmap,
-			   struct inode *old_dir, struct dentry *old_dentry,
-			   struct inode *new_dir, struct dentry *new_dentry,
-			   unsigned int flags)
-{
-	int retval;
-	INITIALIZE_PATH(old_entry_path);
-	INITIALIZE_PATH(new_entry_path);
-	INITIALIZE_PATH(dot_dot_entry_path);
-	struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
-	struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
-	struct inode *old_inode, *new_dentry_inode;
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	unsigned long savelink = 1;
-	bool update_dir_parent = false;
-
-	if (flags & ~RENAME_NOREPLACE)
-		return -EINVAL;
-
-	/*
-	 * three balancings: (1) old name removal, (2) new name insertion
-	 * and (3) maybe "save" link insertion
-	 * stat data updates: (1) old directory,
-	 * (2) new directory and (3) maybe old object stat data (when it is
-	 * directory) and (4) maybe stat data of object to which new entry
-	 * pointed initially and (5) maybe block containing ".." of
-	 * renamed directory
-	 * quota updates: two parent directories
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 + 5 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
-
-	retval = dquot_initialize(old_dir);
-	if (retval)
-		return retval;
-	retval = dquot_initialize(new_dir);
-	if (retval)
-		return retval;
-
-	old_inode = d_inode(old_dentry);
-	new_dentry_inode = d_inode(new_dentry);
-
-	/*
-	 * make sure that oldname still exists and points to an object we
-	 * are going to rename
-	 */
-	old_de.de_gen_number_bit_string = NULL;
-	reiserfs_write_lock(old_dir->i_sb);
-	retval =
-	    reiserfs_find_entry(old_dir, old_dentry->d_name.name,
-				old_dentry->d_name.len, &old_entry_path,
-				&old_de);
-	pathrelse(&old_entry_path);
-	if (retval == IO_ERROR) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return -EIO;
-	}
-
-	if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return -ENOENT;
-	}
-
-	if (S_ISDIR(old_inode->i_mode)) {
-		/*
-		 * make sure that directory being renamed has correct ".."
-		 * and that its new parent directory has not too many links
-		 * already
-		 */
-		if (new_dentry_inode) {
-			if (!reiserfs_empty_dir(new_dentry_inode)) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -ENOTEMPTY;
-			}
-		}
-
-		if (old_dir != new_dir) {
-			/*
-			 * directory is renamed, its parent directory will be
-			 * changed, so find ".." entry
-			 */
-			dot_dot_de.de_gen_number_bit_string = NULL;
-			retval =
-			    reiserfs_find_entry(old_inode, "..", 2,
-					&dot_dot_entry_path,
-					&dot_dot_de);
-			pathrelse(&dot_dot_entry_path);
-			if (retval != NAME_FOUND) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-
-			/* inode number of .. must equal old_dir->i_ino */
-			if (dot_dot_de.de_objectid != old_dir->i_ino) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-			update_dir_parent = true;
-		}
-	}
-
-	retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
-	if (retval) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return retval;
-	}
-
-	/* add new entry (or find the existing one) */
-	retval =
-	    reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
-			       new_dentry->d_name.len, old_inode, 0);
-	if (retval == -EEXIST) {
-		if (!new_dentry_inode) {
-			reiserfs_panic(old_dir->i_sb, "vs-7050",
-				       "new entry is found, new inode == 0");
-		}
-	} else if (retval) {
-		int err = journal_end(&th);
-		reiserfs_write_unlock(old_dir->i_sb);
-		return err ? err : retval;
-	}
-
-	reiserfs_update_inode_transaction(old_dir);
-	reiserfs_update_inode_transaction(new_dir);
-
-	/*
-	 * this makes it so an fsync on an open fd for the old name will
-	 * commit the rename operation
-	 */
-	reiserfs_update_inode_transaction(old_inode);
-
-	if (new_dentry_inode)
-		reiserfs_update_inode_transaction(new_dentry_inode);
-
-	while (1) {
-		/*
-		 * look for old name using corresponding entry key
-		 * (found by reiserfs_find_entry)
-		 */
-		if ((retval =
-		     search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
-					 &old_entry_path,
-					 &old_de)) != NAME_FOUND) {
-			pathrelse(&old_entry_path);
-			journal_end(&th);
-			reiserfs_write_unlock(old_dir->i_sb);
-			return -EIO;
-		}
-
-		copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
-
-		reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
-
-		/* look for new name by reiserfs_find_entry */
-		new_de.de_gen_number_bit_string = NULL;
-		retval =
-		    reiserfs_find_entry(new_dir, new_dentry->d_name.name,
-					new_dentry->d_name.len, &new_entry_path,
-					&new_de);
-		/*
-		 * reiserfs_add_entry should not return IO_ERROR,
-		 * because it is called with essentially same parameters from
-		 * reiserfs_add_entry above, and we'll catch any i/o errors
-		 * before we get here.
-		 */
-		if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
-			pathrelse(&new_entry_path);
-			pathrelse(&old_entry_path);
-			journal_end(&th);
-			reiserfs_write_unlock(old_dir->i_sb);
-			return -EIO;
-		}
-
-		copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
-
-		reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
-
-		if (update_dir_parent) {
-			if ((retval =
-			     search_by_entry_key(new_dir->i_sb,
-						 &dot_dot_de.de_entry_key,
-						 &dot_dot_entry_path,
-						 &dot_dot_de)) != NAME_FOUND) {
-				pathrelse(&dot_dot_entry_path);
-				pathrelse(&new_entry_path);
-				pathrelse(&old_entry_path);
-				journal_end(&th);
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-			copy_item_head(&dot_dot_ih,
-				       tp_item_head(&dot_dot_entry_path));
-			/* node containing ".." gets into transaction */
-			reiserfs_prepare_for_journal(old_inode->i_sb,
-						     dot_dot_de.de_bh, 1);
-		}
-		/*
-		 * we should check seals here, not do
-		 * this stuff, yes? Then, having
-		 * gathered everything into RAM we
-		 * should lock the buffers, yes?  -Hans
-		 */
-		/*
-		 * probably.  our rename needs to hold more
-		 * than one path at once.  The seals would
-		 * have to be written to deal with multi-path
-		 * issues -chris
-		 */
-		/*
-		 * sanity checking before doing the rename - avoid races many
-		 * of the above checks could have scheduled.  We have to be
-		 * sure our items haven't been shifted by another process.
-		 */
-		if (item_moved(&new_entry_ih, &new_entry_path) ||
-		    !entry_points_to_object(new_dentry->d_name.name,
-					    new_dentry->d_name.len,
-					    &new_de, new_dentry_inode) ||
-		    item_moved(&old_entry_ih, &old_entry_path) ||
-		    !entry_points_to_object(old_dentry->d_name.name,
-					    old_dentry->d_name.len,
-					    &old_de, old_inode)) {
-			reiserfs_restore_prepared_buffer(old_inode->i_sb,
-							 new_de.de_bh);
-			reiserfs_restore_prepared_buffer(old_inode->i_sb,
-							 old_de.de_bh);
-			if (update_dir_parent)
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 dot_dot_de.
-								 de_bh);
-			continue;
-		}
-		if (update_dir_parent) {
-			if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
-			    !entry_points_to_object("..", 2, &dot_dot_de,
-						    old_dir)) {
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 old_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 new_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 dot_dot_de.
-								 de_bh);
-				continue;
-			}
-		}
-
-		RFALSE(update_dir_parent &&
-		       !buffer_journal_prepared(dot_dot_de.de_bh), "");
-
-		break;
-	}
-
-	/*
-	 * ok, all the changes can be done in one fell swoop when we
-	 * have claimed all the buffers needed.
-	 */
-
-	mark_de_visible(new_de.de_deh + new_de.de_entry_num);
-	set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
-	journal_mark_dirty(&th, new_de.de_bh);
-
-	mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
-	journal_mark_dirty(&th, old_de.de_bh);
-	/*
-	 * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
-	 * which adds ctime update of renamed object
-	 */
-	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
-
-	if (new_dentry_inode) {
-		/* adjust link number of the victim */
-		if (S_ISDIR(new_dentry_inode->i_mode)) {
-			clear_nlink(new_dentry_inode);
-		} else {
-			drop_nlink(new_dentry_inode);
-		}
-		savelink = new_dentry_inode->i_nlink;
-	}
-
-	if (update_dir_parent) {
-		/* adjust ".." of renamed directory */
-		set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
-		journal_mark_dirty(&th, dot_dot_de.de_bh);
-	}
-	if (S_ISDIR(old_inode->i_mode)) {
-		/*
-		 * there (in new_dir) was no directory, so it got new link
-		 * (".."  of renamed directory)
-		 */
-		if (!new_dentry_inode)
-			INC_DIR_INODE_NLINK(new_dir);
-
-		/* old directory lost one link - ".. " of renamed directory */
-		DEC_DIR_INODE_NLINK(old_dir);
-	}
-	/*
-	 * looks like in 2.3.99pre3 brelse is atomic.
-	 * so we can use pathrelse
-	 */
-	pathrelse(&new_entry_path);
-	pathrelse(&dot_dot_entry_path);
-
-	/*
-	 * FIXME: this reiserfs_cut_from_item's return value may screw up
-	 * anybody, but it will panic if will not be able to find the
-	 * entry. This needs one more clean up
-	 */
-	if (reiserfs_cut_from_item
-	    (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
-	     0) < 0)
-		reiserfs_error(old_dir->i_sb, "vs-7060",
-			       "couldn't not cut old name. Fsck later?");
-
-	old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
-
-	reiserfs_update_sd(&th, old_dir);
-	reiserfs_update_sd(&th, new_dir);
-	reiserfs_update_sd(&th, old_inode);
-
-	if (new_dentry_inode) {
-		if (savelink == 0)
-			add_save_link(&th, new_dentry_inode,
-				      0 /* not truncate */ );
-		reiserfs_update_sd(&th, new_dentry_inode);
-	}
-
-	retval = journal_end(&th);
-	reiserfs_write_unlock(old_dir->i_sb);
-	return retval;
-}
-
-static const struct inode_operations reiserfs_priv_dir_inode_operations = {
-	.create = reiserfs_create,
-	.lookup = reiserfs_lookup,
-	.link = reiserfs_link,
-	.unlink = reiserfs_unlink,
-	.symlink = reiserfs_symlink,
-	.mkdir = reiserfs_mkdir,
-	.rmdir = reiserfs_rmdir,
-	.mknod = reiserfs_mknod,
-	.rename = reiserfs_rename,
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-static const struct inode_operations reiserfs_priv_symlink_inode_operations = {
-	.get_link	= page_get_link,
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-};
-
-static const struct inode_operations reiserfs_priv_special_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-};
-
-void reiserfs_init_priv_inode(struct inode *inode)
-{
-	inode->i_flags |= S_PRIVATE;
-	inode->i_opflags &= ~IOP_XATTR;
-
-	if (S_ISREG(inode->i_mode))
-		inode->i_op = &reiserfs_priv_file_inode_operations;
-	else if (S_ISDIR(inode->i_mode))
-		inode->i_op = &reiserfs_priv_dir_inode_operations;
-	else if (S_ISLNK(inode->i_mode))
-		inode->i_op = &reiserfs_priv_symlink_inode_operations;
-	else
-		inode->i_op = &reiserfs_priv_special_inode_operations;
-}
-
-/* directories can handle most operations...  */
-const struct inode_operations reiserfs_dir_inode_operations = {
-	.create = reiserfs_create,
-	.lookup = reiserfs_lookup,
-	.link = reiserfs_link,
-	.unlink = reiserfs_unlink,
-	.symlink = reiserfs_symlink,
-	.mkdir = reiserfs_mkdir,
-	.rmdir = reiserfs_rmdir,
-	.mknod = reiserfs_mknod,
-	.rename = reiserfs_rename,
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-/*
- * symlink operations.. same as page_symlink_inode_operations, with xattr
- * stuff added
- */
-const struct inode_operations reiserfs_symlink_inode_operations = {
-	.get_link	= page_get_link,
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-};
-
-/*
- * special file operations.. just xattr/acl stuff
- */
-const struct inode_operations reiserfs_special_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-};
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
deleted file mode 100644
index 34baf5c0f265..000000000000
--- a/fs/reiserfs/objectid.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/uuid.h>
-#include "reiserfs.h"
-
-/* find where objectid map starts */
-#define objectid_map(s,rs) (old_format_only (s) ? \
-                         (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
-			 (__le32 *)((rs) + 1))
-
-#ifdef CONFIG_REISERFS_CHECK
-
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{
-	if (le32_to_cpu(map[0]) != 1)
-		reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
-			       (long unsigned int)le32_to_cpu(map[0]));
-
-	/* FIXME: add something else here */
-}
-
-#else
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{;
-}
-#endif
-
-/*
- * When we allocate objectids we allocate the first unused objectid.
- * Each sequence of objectids in use (the odd sequences) is followed
- * by a sequence of objectids not in use (the even sequences).  We
- * only need to record the last objectid in each of these sequences
- * (both the odd and even sequences) in order to fully define the
- * boundaries of the sequences.  A consequence of allocating the first
- * objectid not in use is that under most conditions this scheme is
- * extremely compact.  The exception is immediately after a sequence
- * of operations which deletes a large number of objects of
- * non-sequential objectids, and even then it will become compact
- * again as soon as more objects are created.  Note that many
- * interesting optimizations of layout could result from complicating
- * objectid assignment, but we have deferred making them for now.
- */
-
-/* get unique object identifier */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-	__le32 *map = objectid_map(s, rs);
-	__u32 unused_objectid;
-
-	BUG_ON(!th->t_trans_id);
-
-	check_objectid_map(s, map);
-
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	/* comment needed -Hans */
-	unused_objectid = le32_to_cpu(map[1]);
-	if (unused_objectid == U32_MAX) {
-		reiserfs_warning(s, "reiserfs-15100", "no more object ids");
-		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
-		return 0;
-	}
-
-	/*
-	 * This incrementation allocates the first unused objectid. That
-	 * is to say, the first entry on the objectid map is the first
-	 * unused objectid, and by incrementing it we use it.  See below
-	 * where we check to see if we eliminated a sequence of unused
-	 * objectids....
-	 */
-	map[1] = cpu_to_le32(unused_objectid + 1);
-
-	/*
-	 * Now we check to see if we eliminated the last remaining member of
-	 * the first even sequence (and can eliminate the sequence by
-	 * eliminating its last objectid from oids), and can collapse the
-	 * first two odd sequences into one sequence.  If so, then the net
-	 * result is to eliminate a pair of objectids from oids.  We do this
-	 * by shifting the entire map to the left.
-	 */
-	if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
-		memmove(map + 1, map + 3,
-			(sb_oid_cursize(rs) - 3) * sizeof(__u32));
-		set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
-	}
-
-	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-	return unused_objectid;
-}
-
-/* makes object identifier unused */
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
-			       __u32 objectid_to_release)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-	__le32 *map = objectid_map(s, rs);
-	int i = 0;
-
-	BUG_ON(!th->t_trans_id);
-	/*return; */
-	check_objectid_map(s, map);
-
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
-	/*
-	 * start at the beginning of the objectid map (i = 0) and go to
-	 * the end of it (i = disk_sb->s_oid_cursize).  Linear search is
-	 * what we use, though it is possible that binary search would be
-	 * more efficient after performing lots of deletions (which is
-	 * when oids is large.)  We only check even i's.
-	 */
-	while (i < sb_oid_cursize(rs)) {
-		if (objectid_to_release == le32_to_cpu(map[i])) {
-			/* This incrementation unallocates the objectid. */
-			le32_add_cpu(&map[i], 1);
-
-			/*
-			 * Did we unallocate the last member of an
-			 * odd sequence, and can shrink oids?
-			 */
-			if (map[i] == map[i + 1]) {
-				/* shrink objectid map */
-				memmove(map + i, map + i + 2,
-					(sb_oid_cursize(rs) - i -
-					 2) * sizeof(__u32));
-				set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
-
-				RFALSE(sb_oid_cursize(rs) < 2 ||
-				       sb_oid_cursize(rs) > sb_oid_maxsize(rs),
-				       "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
-				       sb_oid_cursize(rs), sb_oid_maxsize(rs));
-			}
-			return;
-		}
-
-		if (objectid_to_release > le32_to_cpu(map[i]) &&
-		    objectid_to_release < le32_to_cpu(map[i + 1])) {
-			/* size of objectid map is not changed */
-			if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
-				le32_add_cpu(&map[i + 1], -1);
-				return;
-			}
-
-			/*
-			 * JDM comparing two little-endian values for
-			 * equality -- safe
-			 */
-			/*
-			 * objectid map must be expanded, but
-			 * there is no space
-			 */
-			if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
-				PROC_INFO_INC(s, leaked_oid);
-				return;
-			}
-
-			/* expand the objectid map */
-			memmove(map + i + 3, map + i + 1,
-				(sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
-			map[i + 1] = cpu_to_le32(objectid_to_release);
-			map[i + 2] = cpu_to_le32(objectid_to_release + 1);
-			set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
-			return;
-		}
-		i += 2;
-	}
-
-	reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)",
-		       (long unsigned)objectid_to_release);
-}
-
-int reiserfs_convert_objectid_map_v1(struct super_block *s)
-{
-	struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
-	int cur_size = sb_oid_cursize(disk_sb);
-	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
-	int old_max = sb_oid_maxsize(disk_sb);
-	struct reiserfs_super_block_v1 *disk_sb_v1;
-	__le32 *objectid_map;
-	int i;
-
-	disk_sb_v1 =
-	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
-	objectid_map = (__le32 *) (disk_sb_v1 + 1);
-
-	if (cur_size > new_size) {
-		/*
-		 * mark everyone used that was listed as free at
-		 * the end of the objectid map
-		 */
-		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
-		set_sb_oid_cursize(disk_sb, new_size);
-	}
-	/* move the smaller objectid map past the end of the new super */
-	for (i = new_size - 1; i >= 0; i--) {
-		objectid_map[i + (old_max - new_size)] = objectid_map[i];
-	}
-
-	/* set the max size so we don't overflow later */
-	set_sb_oid_maxsize(disk_sb, new_size);
-
-	/* Zero out label and generate random UUID */
-	memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
-	generate_random_uuid(disk_sb->s_uuid);
-
-	/* finally, zero out the unused chunk of the new super */
-	memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
-	return 0;
-}
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
deleted file mode 100644
index 84a194b77f19..000000000000
--- a/fs/reiserfs/prints.c
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-
-#include <linux/stdarg.h>
-
-static char error_buf[1024];
-static char fmt_buf[1024];
-static char off_buf[80];
-
-static char *reiserfs_cpu_offset(struct cpu_key *key)
-{
-	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
-		sprintf(off_buf, "%llu(%llu)",
-			(unsigned long long)
-			GET_HASH_VALUE(cpu_key_k_offset(key)),
-			(unsigned long long)
-			GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
-	else
-		sprintf(off_buf, "0x%Lx",
-			(unsigned long long)cpu_key_k_offset(key));
-	return off_buf;
-}
-
-static char *le_offset(struct reiserfs_key *key)
-{
-	int version;
-
-	version = le_key_version(key);
-	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
-		sprintf(off_buf, "%llu(%llu)",
-			(unsigned long long)
-			GET_HASH_VALUE(le_key_k_offset(version, key)),
-			(unsigned long long)
-			GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
-	else
-		sprintf(off_buf, "0x%Lx",
-			(unsigned long long)le_key_k_offset(version, key));
-	return off_buf;
-}
-
-static char *cpu_type(struct cpu_key *key)
-{
-	if (cpu_key_k_type(key) == TYPE_STAT_DATA)
-		return "SD";
-	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
-		return "DIR";
-	if (cpu_key_k_type(key) == TYPE_DIRECT)
-		return "DIRECT";
-	if (cpu_key_k_type(key) == TYPE_INDIRECT)
-		return "IND";
-	return "UNKNOWN";
-}
-
-static char *le_type(struct reiserfs_key *key)
-{
-	int version;
-
-	version = le_key_version(key);
-
-	if (le_key_k_type(version, key) == TYPE_STAT_DATA)
-		return "SD";
-	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
-		return "DIR";
-	if (le_key_k_type(version, key) == TYPE_DIRECT)
-		return "DIRECT";
-	if (le_key_k_type(version, key) == TYPE_INDIRECT)
-		return "IND";
-	return "UNKNOWN";
-}
-
-/* %k */
-static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key)
-{
-	if (key)
-		return scnprintf(buf, size, "[%d %d %s %s]",
-				 le32_to_cpu(key->k_dir_id),
-				 le32_to_cpu(key->k_objectid), le_offset(key),
-				 le_type(key));
-	else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-/* %K */
-static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key)
-{
-	if (key)
-		return scnprintf(buf, size, "[%d %d %s %s]",
-				 key->on_disk_key.k_dir_id,
-				 key->on_disk_key.k_objectid,
-				 reiserfs_cpu_offset(key), cpu_type(key));
-	else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_de_head(char *buf, size_t size,
-			     struct reiserfs_de_head *deh)
-{
-	if (deh)
-		return scnprintf(buf, size,
-				 "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
-				 deh_offset(deh), deh_dir_id(deh),
-				 deh_objectid(deh), deh_location(deh),
-				 deh_state(deh));
-	else
-		return scnprintf(buf, size, "[NULL]");
-
-}
-
-static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih)
-{
-	if (ih) {
-		char *p = buf;
-		char * const end = buf + size;
-
-		p += scnprintf(p, end - p, "%s",
-			       (ih_version(ih) == KEY_FORMAT_3_6) ?
-			       "*3.6* " : "*3.5*");
-
-		p += scnprintf_le_key(p, end - p, &ih->ih_key);
-
-		p += scnprintf(p, end - p,
-			       ", item_len %d, item_location %d, free_space(entry_count) %d",
-			       ih_item_len(ih), ih_location(ih),
-			       ih_free_space(ih));
-		return p - buf;
-	} else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_direntry(char *buf, size_t size,
-			      struct reiserfs_dir_entry *de)
-{
-	char name[20];
-
-	memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
-	name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
-	return scnprintf(buf, size, "\"%s\"==>[%d %d]",
-			 name, de->de_dir_id, de->de_objectid);
-}
-
-static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh)
-{
-	return scnprintf(buf, size,
-			 "level=%d, nr_items=%d, free_space=%d rdkey ",
-			 B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
-}
-
-static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh)
-{
-	return scnprintf(buf, size,
-			 "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
-			 bh->b_bdev, bh->b_size,
-			 (unsigned long long)bh->b_blocknr,
-			 atomic_read(&(bh->b_count)),
-			 bh->b_state, bh->b_page,
-			 buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
-			 buffer_dirty(bh) ? "DIRTY" : "CLEAN",
-			 buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
-}
-
-static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc)
-{
-	return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]",
-			 dc_block_number(dc), dc_size(dc));
-}
-
-static char *is_there_reiserfs_struct(char *fmt, int *what)
-{
-	char *k = fmt;
-
-	while ((k = strchr(k, '%')) != NULL) {
-		if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
-		    k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
-			*what = k[1];
-			break;
-		}
-		k++;
-	}
-	return k;
-}
-
-/*
- * debugging reiserfs we used to print out a lot of different
- * variables, like keys, item headers, buffer heads etc. Values of
- * most fields matter. So it took a long time just to write
- * appropriative printk. With this reiserfs_warning you can use format
- * specification for complex structures like you used to do with
- * printfs for integers, doubles and pointers. For instance, to print
- * out key structure you have to write just:
- * reiserfs_warning ("bad key %k", key);
- * instead of
- * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
- *         key->k_offset, key->k_uniqueness);
- */
-static DEFINE_SPINLOCK(error_lock);
-static void prepare_error_buf(const char *fmt, va_list args)
-{
-	char *fmt1 = fmt_buf;
-	char *k;
-	char *p = error_buf;
-	char * const end = &error_buf[sizeof(error_buf)];
-	int what;
-
-	spin_lock(&error_lock);
-
-	if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) {
-		strscpy(error_buf, "format string too long", end - error_buf);
-		goto out_unlock;
-	}
-
-	while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
-		*k = 0;
-
-		p += vscnprintf(p, end - p, fmt1, args);
-
-		switch (what) {
-		case 'k':
-			p += scnprintf_le_key(p, end - p,
-					      va_arg(args, struct reiserfs_key *));
-			break;
-		case 'K':
-			p += scnprintf_cpu_key(p, end - p,
-					       va_arg(args, struct cpu_key *));
-			break;
-		case 'h':
-			p += scnprintf_item_head(p, end - p,
-						 va_arg(args, struct item_head *));
-			break;
-		case 't':
-			p += scnprintf_direntry(p, end - p,
-						va_arg(args, struct reiserfs_dir_entry *));
-			break;
-		case 'y':
-			p += scnprintf_disk_child(p, end - p,
-						  va_arg(args, struct disk_child *));
-			break;
-		case 'z':
-			p += scnprintf_block_head(p, end - p,
-						  va_arg(args, struct buffer_head *));
-			break;
-		case 'b':
-			p += scnprintf_buffer_head(p, end - p,
-						   va_arg(args, struct buffer_head *));
-			break;
-		case 'a':
-			p += scnprintf_de_head(p, end - p,
-					       va_arg(args, struct reiserfs_de_head *));
-			break;
-		}
-
-		fmt1 = k + 2;
-	}
-	p += vscnprintf(p, end - p, fmt1, args);
-out_unlock:
-	spin_unlock(&error_lock);
-
-}
-
-/*
- * in addition to usual conversion specifiers this accepts reiserfs
- * specific conversion specifiers:
- * %k to print little endian key,
- * %K to print cpu key,
- * %h to print item_head,
- * %t to print directory entry
- * %z to print block head (arg must be struct buffer_head *
- * %b to print buffer_head
- */
-
-#define do_reiserfs_warning(fmt)\
-{\
-    va_list args;\
-    va_start( args, fmt );\
-    prepare_error_buf( fmt, args );\
-    va_end( args );\
-}
-
-void __reiserfs_warning(struct super_block *sb, const char *id,
-			 const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	if (sb)
-		printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
-		       "%s\n", sb->s_id, id ? id : "", id ? " " : "",
-		       function, error_buf);
-	else
-		printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
-		       id ? id : "", id ? " " : "", function, error_buf);
-}
-
-/* No newline.. reiserfs_info calls can be followed by printk's */
-void reiserfs_info(struct super_block *sb, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	if (sb)
-		printk(KERN_NOTICE "REISERFS (device %s): %s",
-		       sb->s_id, error_buf);
-	else
-		printk(KERN_NOTICE "REISERFS %s:", error_buf);
-}
-
-/* No newline.. reiserfs_printk calls can be followed by printk's */
-static void reiserfs_printk(const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	printk(error_buf);
-}
-
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
-{
-#ifdef CONFIG_REISERFS_CHECK
-	do_reiserfs_warning(fmt);
-	if (s)
-		printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
-		       s->s_id, error_buf);
-	else
-		printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
-#endif
-}
-
-/*
- * The format:
- *
- *          maintainer-errorid: [function-name:] message
- *
- *   where errorid is unique to the maintainer and function-name is
- *   optional, is recommended, so that anyone can easily find the bug
- *   with a simple grep for the short to type string
- *   maintainer-errorid.  Don't bother with reusing errorids, there are
- *   lots of numbers out there.
- *
- *   Example:
- *
- *   reiserfs_panic(
- *     p_sb, "reiser-29: reiserfs_new_blocknrs: "
- *     "one of search_start or rn(%d) is equal to MAX_B_NUM,"
- *     "which means that we are optimizing location based on the "
- *     "bogus location of a temp buffer (%p).",
- *     rn, bh
- *   );
- *
- *   Regular panic()s sometimes clear the screen before the message can
- *   be read, thus the need for the while loop.
- *
- *   Numbering scheme for panic used by Vladimir and Anatoly( Hans completely
- *   ignores this scheme, and considers it pointless complexity):
- *
- *   panics in reiserfs_fs.h have numbers from 1000 to 1999
- *   super.c			2000 to 2999
- *   preserve.c (unused)	3000 to 3999
- *   bitmap.c			4000 to 4999
- *   stree.c			5000 to 5999
- *   prints.c			6000 to 6999
- *   namei.c			7000 to 7999
- *   fix_nodes.c		8000 to 8999
- *   dir.c			9000 to 9999
- *   lbalance.c			10000 to 10999
- *   ibalance.c			11000 to 11999 not ready
- *   do_balan.c			12000 to 12999
- *   inode.c			13000 to 13999
- *   file.c			14000 to 14999
- *   objectid.c			15000 - 15999
- *   buffer.c			16000 - 16999
- *   symlink.c			17000 - 17999
- *
- *  .  */
-
-void __reiserfs_panic(struct super_block *sb, const char *id,
-		      const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-#ifdef CONFIG_REISERFS_CHECK
-	dump_stack();
-#endif
-	if (sb)
-		printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
-		      sb->s_id, id ? id : "", id ? " " : "",
-		      function, error_buf);
-	else
-		printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
-		      id ? id : "", id ? " " : "", function, error_buf);
-	BUG();
-}
-
-void __reiserfs_error(struct super_block *sb, const char *id,
-		      const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-	BUG_ON(sb == NULL);
-
-	if (reiserfs_error_panic(sb))
-		__reiserfs_panic(sb, id, function, error_buf);
-
-	if (id && id[0])
-		printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
-		       sb->s_id, id, function, error_buf);
-	else
-		printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
-		       sb->s_id, function, error_buf);
-
-	if (sb_rdonly(sb))
-		return;
-
-	reiserfs_info(sb, "Remounting filesystem read-only\n");
-	sb->s_flags |= SB_RDONLY;
-	reiserfs_abort_journal(sb, -EIO);
-}
-
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-	if (reiserfs_error_panic(sb)) {
-		panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
-		      error_buf);
-	}
-
-	if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
-		return;
-
-	printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
-	       error_buf);
-
-	sb->s_flags |= SB_RDONLY;
-	reiserfs_abort_journal(sb, errno);
-}
-
-/*
- * this prints internal nodes (4 keys/items in line) (dc_number,
- * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
- * dc_size)...
- */
-static int print_internal(struct buffer_head *bh, int first, int last)
-{
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-	int i;
-	int from, to;
-
-	if (!B_IS_KEYS_LEVEL(bh))
-		return 1;
-
-	check_internal(bh);
-
-	if (first == -1) {
-		from = 0;
-		to = B_NR_ITEMS(bh);
-	} else {
-		from = first;
-		to = min_t(int, last, B_NR_ITEMS(bh));
-	}
-
-	reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
-	dc = B_N_CHILD(bh, from);
-	reiserfs_printk("PTR %d: %y ", from, dc);
-
-	for (i = from, key = internal_key(bh, from), dc++; i < to;
-	     i++, key++, dc++) {
-		reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
-		if (i && i % 4 == 0)
-			printk("\n");
-	}
-	printk("\n");
-	return 0;
-}
-
-static int print_leaf(struct buffer_head *bh, int print_mode, int first,
-		      int last)
-{
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i, nr;
-	int from, to;
-
-	if (!B_IS_ITEMS_LEVEL(bh))
-		return 1;
-
-	check_leaf(bh);
-
-	blkh = B_BLK_HEAD(bh);
-	ih = item_head(bh, 0);
-	nr = blkh_nr_item(blkh);
-
-	printk
-	    ("\n===================================================================\n");
-	reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
-	if (!(print_mode & PRINT_LEAF_ITEMS)) {
-		reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
-				&(ih->ih_key), &((ih + nr - 1)->ih_key));
-		return 0;
-	}
-
-	if (first < 0 || first > nr - 1)
-		from = 0;
-	else
-		from = first;
-
-	if (last < 0 || last > nr)
-		to = nr;
-	else
-		to = last;
-
-	ih += from;
-	printk
-	    ("-------------------------------------------------------------------------------\n");
-	printk
-	    ("|##|   type    |           key           | ilen | free_space | version | loc  |\n");
-	for (i = from; i < to; i++, ih++) {
-		printk
-		    ("-------------------------------------------------------------------------------\n");
-		reiserfs_printk("|%2d| %h |\n", i, ih);
-		if (print_mode & PRINT_LEAF_ITEMS)
-			op_print_item(ih, ih_item_body(bh, ih));
-	}
-
-	printk
-	    ("===================================================================\n");
-
-	return 0;
-}
-
-char *reiserfs_hashname(int code)
-{
-	if (code == YURA_HASH)
-		return "rupasov";
-	if (code == TEA_HASH)
-		return "tea";
-	if (code == R5_HASH)
-		return "r5";
-
-	return "unknown";
-}
-
-/* return 1 if this is not super block */
-static int print_super_block(struct buffer_head *bh)
-{
-	struct reiserfs_super_block *rs =
-	    (struct reiserfs_super_block *)(bh->b_data);
-	int skipped, data_blocks;
-	char *version;
-
-	if (is_reiserfs_3_5(rs)) {
-		version = "3.5";
-	} else if (is_reiserfs_3_6(rs)) {
-		version = "3.6";
-	} else if (is_reiserfs_jr(rs)) {
-		version = ((sb_version(rs) == REISERFS_VERSION_2) ?
-			   "3.6" : "3.5");
-	} else {
-		return 1;
-	}
-
-	printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
-	       (unsigned long long)bh->b_blocknr);
-	printk("Reiserfs version %s\n", version);
-	printk("Block count %u\n", sb_block_count(rs));
-	printk("Blocksize %d\n", sb_blocksize(rs));
-	printk("Free blocks %u\n", sb_free_blocks(rs));
-	/*
-	 * FIXME: this would be confusing if
-	 * someone stores reiserfs super block in some data block ;)
-//    skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
-	 */
-	skipped = bh->b_blocknr;
-	data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
-	    (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
-	     1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
-	printk
-	    ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
-	     "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
-	     (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
-	      sb_reserved_for_journal(rs)), data_blocks);
-	printk("Root block %u\n", sb_root_block(rs));
-	printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
-	printk("Journal dev %d\n", sb_jp_journal_dev(rs));
-	printk("Journal orig size %d\n", sb_jp_journal_size(rs));
-	printk("FS state %d\n", sb_fs_state(rs));
-	printk("Hash function \"%s\"\n",
-	       reiserfs_hashname(sb_hash_function_code(rs)));
-
-	printk("Tree height %d\n", sb_tree_height(rs));
-	return 0;
-}
-
-static int print_desc_block(struct buffer_head *bh)
-{
-	struct reiserfs_journal_desc *desc;
-
-	if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
-		return 1;
-
-	desc = (struct reiserfs_journal_desc *)(bh->b_data);
-	printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
-	       (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
-	       get_desc_mount_id(desc), get_desc_trans_len(desc));
-
-	return 0;
-}
-/* ..., int print_mode, int first, int last) */
-void print_block(struct buffer_head *bh, ...)
-{
-	va_list args;
-	int mode, first, last;
-
-	if (!bh) {
-		printk("print_block: buffer is NULL\n");
-		return;
-	}
-
-	va_start(args, bh);
-
-	mode = va_arg(args, int);
-	first = va_arg(args, int);
-	last = va_arg(args, int);
-	if (print_leaf(bh, mode, first, last))
-		if (print_internal(bh, first, last))
-			if (print_super_block(bh))
-				if (print_desc_block(bh))
-					printk
-					    ("Block %llu contains unformatted data\n",
-					     (unsigned long long)bh->b_blocknr);
-
-	va_end(args);
-}
-
-static char print_tb_buf[2048];
-
-/* this stores initial state of tree balance in the print_tb_buf */
-void store_print_tb(struct tree_balance *tb)
-{
-	int h = 0;
-	int i;
-	struct buffer_head *tbSh, *tbFh;
-
-	if (!tb)
-		return;
-
-	sprintf(print_tb_buf, "\n"
-		"BALANCING %d\n"
-		"MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
-		"=====================================================================\n"
-		"* h *    S    *    L    *    R    *   F   *   FL  *   FR  *  CFL  *  CFR  *\n",
-		REISERFS_SB(tb->tb_sb)->s_do_balance,
-		tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
-		tb->tb_path->pos_in_item);
-
-	for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
-		if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
-		    tb->tb_path->path_length
-		    && PATH_H_PATH_OFFSET(tb->tb_path,
-					  h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
-			tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-			tbFh = PATH_H_PPARENT(tb->tb_path, h);
-		} else {
-			tbSh = NULL;
-			tbFh = NULL;
-		}
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
-			h,
-			(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
-			(tbSh) ? atomic_read(&tbSh->b_count) : -1,
-			(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
-			(tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
-			(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
-			(tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
-			(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
-			(tb->FL[h]) ? (long long)(tb->FL[h]->
-						  b_blocknr) : (-1LL),
-			(tb->FR[h]) ? (long long)(tb->FR[h]->
-						  b_blocknr) : (-1LL),
-			(tb->CFL[h]) ? (long long)(tb->CFL[h]->
-						   b_blocknr) : (-1LL),
-			(tb->CFR[h]) ? (long long)(tb->CFR[h]->
-						   b_blocknr) : (-1LL));
-	}
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"=====================================================================\n"
-		"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
-		"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
-		tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
-		tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
-		tb->sbytes[0], tb->snum[1], tb->sbytes[1],
-		tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
-
-	/* this prints balance parameters for non-leaf levels */
-	h = 0;
-	do {
-		h++;
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"* %d * %4d * %2d *    * %2d *    * %2d *\n",
-			h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
-			tb->blknum[h]);
-	} while (tb->insert_size[h]);
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"=====================================================================\n"
-		"FEB list: ");
-
-	/* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
-	h = 0;
-	for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"%p (%llu %d)%s", tb->FEB[i],
-			tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
-			b_blocknr : 0ULL,
-			tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
-			(i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"======================== the end ====================================\n");
-}
-
-void print_cur_tb(char *mes)
-{
-	printk("%s\n%s", mes, print_tb_buf);
-}
-
-static void check_leaf_block_head(struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	int nr;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
-			       bh);
-	if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
-		reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
-			       bh);
-
-}
-
-static void check_internal_block_head(struct buffer_head *bh)
-{
-	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
-		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
-
-	if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
-
-	if (B_FREE_SPACE(bh) !=
-	    bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
-	    DC_SIZE * (B_NR_ITEMS(bh) + 1))
-		reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
-
-}
-
-void check_leaf(struct buffer_head *bh)
-{
-	int i;
-	struct item_head *ih;
-
-	if (!bh)
-		return;
-	check_leaf_block_head(bh);
-	for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
-		op_check_item(ih, ih_item_body(bh, ih));
-}
-
-void check_internal(struct buffer_head *bh)
-{
-	if (!bh)
-		return;
-	check_internal_block_head(bh);
-}
-
-void print_statistics(struct super_block *s)
-{
-
-	/*
-	   printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
-	   bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
-	   REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
-	   REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
-	   REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
-	 */
-
-}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
deleted file mode 100644
index 5c68a4a52d78..000000000000
--- a/fs/reiserfs/procfs.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/* -*- linux-c -*- */
-
-/* fs/reiserfs/procfs.c */
-
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/* proc info support a la one created by Sizif@Botik.RU for PGC */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/blkdev.h>
-
-/*
- * LOCKING:
- *
- * These guys are evicted from procfs as the very first step in ->kill_sb().
- *
- */
-
-static int show_version(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	char *format;
-
-	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
-		format = "3.6";
-	} else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
-		format = "3.5";
-	} else {
-		format = "unknown";
-	}
-
-	seq_printf(m, "%s format\twith checks %s\n", format,
-#if defined( CONFIG_REISERFS_CHECK )
-		   "on"
-#else
-		   "off"
-#endif
-	    );
-	return 0;
-}
-
-#define SF( x ) ( r -> x )
-#define SFP( x ) SF( s_proc_info_data.x )
-#define SFPL( x ) SFP( x[ level ] )
-#define SFPF( x ) SFP( scan_bitmap.x )
-#define SFPJ( x ) SFP( journal.x )
-
-#define D2C( x ) le16_to_cpu( x )
-#define D4C( x ) le32_to_cpu( x )
-#define DF( x ) D2C( rs -> s_v1.x )
-#define DFL( x ) D4C( rs -> s_v1.x )
-
-#define objectid_map( s, rs ) (old_format_only (s) ?				\
-                         (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) :	\
-			 (__le32 *)(rs + 1))
-#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
-
-#define DJF( x ) le32_to_cpu( rs -> x )
-#define DJP( x ) le32_to_cpu( jp -> x )
-#define JF( x ) ( r -> s_journal -> x )
-
-static int show_super(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
-	seq_printf(m, "state: \t%s\n"
-		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
-		   "gen. counter: \t%i\n"
-		   "s_disk_reads: \t%i\n"
-		   "s_disk_writes: \t%i\n"
-		   "s_fix_nodes: \t%i\n"
-		   "s_do_balance: \t%i\n"
-		   "s_unneeded_left_neighbor: \t%i\n"
-		   "s_good_search_by_key_reada: \t%i\n"
-		   "s_bmaps: \t%i\n"
-		   "s_bmaps_without_search: \t%i\n"
-		   "s_direct2indirect: \t%i\n"
-		   "s_indirect2direct: \t%i\n"
-		   "\n"
-		   "max_hash_collisions: \t%i\n"
-		   "breads: \t%lu\n"
-		   "bread_misses: \t%lu\n"
-		   "search_by_key: \t%lu\n"
-		   "search_by_key_fs_changed: \t%lu\n"
-		   "search_by_key_restarted: \t%lu\n"
-		   "insert_item_restarted: \t%lu\n"
-		   "paste_into_item_restarted: \t%lu\n"
-		   "cut_from_item_restarted: \t%lu\n"
-		   "delete_solid_item_restarted: \t%lu\n"
-		   "delete_item_restarted: \t%lu\n"
-		   "leaked_oid: \t%lu\n"
-		   "leaves_removable: \t%lu\n",
-		   SF(s_mount_state) == REISERFS_VALID_FS ?
-		   "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
-		   reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
-		   reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
-		   reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
-		   reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
-		   reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
-		   reiserfs_no_unhashed_relocation(sb) ?
-		   "NO_UNHASHED_RELOCATION " : "",
-		   reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
-		   reiserfs_test4(sb) ? "TEST4 " : "",
-		   have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
-		   "SMALL_TAILS " : "NO_TAILS ",
-		   replay_only(sb) ? "REPLAY_ONLY " : "",
-		   convert_reiserfs(sb) ? "CONV " : "",
-		   atomic_read(&r->s_generation_counter),
-		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
-		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
-		   SF(s_good_search_by_key_reada), SF(s_bmaps),
-		   SF(s_bmaps_without_search), SF(s_direct2indirect),
-		   SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
-		   SFP(bread_miss), SFP(search_by_key),
-		   SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
-		   SFP(insert_item_restarted), SFP(paste_into_item_restarted),
-		   SFP(cut_from_item_restarted),
-		   SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
-		   SFP(leaked_oid), SFP(leaves_removable));
-
-	return 0;
-}
-
-static int show_per_level(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-	int level;
-
-	seq_printf(m, "level\t"
-		   "     balances"
-		   " [sbk:  reads"
-		   "   fs_changed"
-		   "   restarted]"
-		   "   free space"
-		   "        items"
-		   "   can_remove"
-		   "         lnum"
-		   "         rnum"
-		   "       lbytes"
-		   "       rbytes"
-		   "     get_neig"
-		   " get_neig_res" "  need_l_neig" "  need_r_neig" "\n");
-
-	for (level = 0; level < MAX_HEIGHT; ++level) {
-		seq_printf(m, "%i\t"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12li"
-			   " %12li"
-			   " %12li"
-			   " %12li"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   "\n",
-			   level,
-			   SFPL(balance_at),
-			   SFPL(sbk_read_at),
-			   SFPL(sbk_fs_changed),
-			   SFPL(sbk_restarted),
-			   SFPL(free_at),
-			   SFPL(items_at),
-			   SFPL(can_node_be_removed),
-			   SFPL(lnum),
-			   SFPL(rnum),
-			   SFPL(lbytes),
-			   SFPL(rbytes),
-			   SFPL(get_neighbors),
-			   SFPL(get_neighbors_restart),
-			   SFPL(need_l_neighbor), SFPL(need_r_neighbor)
-		    );
-	}
-	return 0;
-}
-
-static int show_bitmap(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
-	seq_printf(m, "free_block: %lu\n"
-		   "  scan_bitmap:"
-		   "          wait"
-		   "          bmap"
-		   "         retry"
-		   "        stolen"
-		   "  journal_hint"
-		   "journal_nohint"
-		   "\n"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   "\n",
-		   SFP(free_block),
-		   SFPF(call),
-		   SFPF(wait),
-		   SFPF(bmap),
-		   SFPF(retry),
-		   SFPF(stolen),
-		   SFPF(in_journal_hint), SFPF(in_journal_nohint));
-
-	return 0;
-}
-
-static int show_on_disk_super(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info->s_rs;
-	int hash_code = DFL(s_hash_function_code);
-	__u32 flags = DJF(s_flags);
-
-	seq_printf(m, "block_count: \t%i\n"
-		   "free_blocks: \t%i\n"
-		   "root_block: \t%i\n"
-		   "blocksize: \t%i\n"
-		   "oid_maxsize: \t%i\n"
-		   "oid_cursize: \t%i\n"
-		   "umount_state: \t%i\n"
-		   "magic: \t%10.10s\n"
-		   "fs_state: \t%i\n"
-		   "hash: \t%s\n"
-		   "tree_height: \t%i\n"
-		   "bmap_nr: \t%i\n"
-		   "version: \t%i\n"
-		   "flags: \t%x[%s]\n"
-		   "reserved_for_journal: \t%i\n",
-		   DFL(s_block_count),
-		   DFL(s_free_blocks),
-		   DFL(s_root_block),
-		   DF(s_blocksize),
-		   DF(s_oid_maxsize),
-		   DF(s_oid_cursize),
-		   DF(s_umount_state),
-		   rs->s_v1.s_magic,
-		   DF(s_fs_state),
-		   hash_code == TEA_HASH ? "tea" :
-		   (hash_code == YURA_HASH) ? "rupasov" :
-		   (hash_code == R5_HASH) ? "r5" :
-		   (hash_code == UNSET_HASH) ? "unset" : "unknown",
-		   DF(s_tree_height),
-		   DF(s_bmap_nr),
-		   DF(s_version), flags, (flags & reiserfs_attrs_cleared)
-		   ? "attrs_cleared" : "", DF(s_reserved_for_journal));
-
-	return 0;
-}
-
-static int show_oidmap(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info->s_rs;
-	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
-	unsigned long total_used = 0;
-	int i;
-
-	for (i = 0; i < mapsize; ++i) {
-		__u32 right;
-
-		right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
-		seq_printf(m, "%s: [ %x .. %x )\n",
-			   (i & 1) ? "free" : "used", MAP(i), right);
-		if (!(i & 1)) {
-			total_used += right - MAP(i);
-		}
-	}
-#if defined( REISERFS_USE_OIDMAPF )
-	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
-		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
-		total_used += size / sizeof(reiserfs_oidinterval_d_t);
-	}
-#endif
-	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
-		   mapsize,
-		   mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
-	return 0;
-}
-
-static time64_t ktime_mono_to_real_seconds(time64_t mono)
-{
-	ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
-
-	return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
-}
-
-static int show_journal(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = r->s_rs;
-	struct journal_params *jp = &rs->s_v1.s_journal;
-
-	seq_printf(m,		/* on-disk fields */
-		   "jp_journal_1st_block: \t%i\n"
-		   "jp_journal_dev: \t%pg[%x]\n"
-		   "jp_journal_size: \t%i\n"
-		   "jp_journal_trans_max: \t%i\n"
-		   "jp_journal_magic: \t%i\n"
-		   "jp_journal_max_batch: \t%i\n"
-		   "jp_journal_max_commit_age: \t%i\n"
-		   "jp_journal_max_trans_age: \t%i\n"
-		   /* incore fields */
-		   "j_1st_reserved_block: \t%i\n"
-		   "j_state: \t%li\n"
-		   "j_trans_id: \t%u\n"
-		   "j_mount_id: \t%lu\n"
-		   "j_start: \t%lu\n"
-		   "j_len: \t%lu\n"
-		   "j_len_alloc: \t%lu\n"
-		   "j_wcount: \t%i\n"
-		   "j_bcount: \t%lu\n"
-		   "j_first_unflushed_offset: \t%lu\n"
-		   "j_last_flush_trans_id: \t%u\n"
-		   "j_trans_start_time: \t%lli\n"
-		   "j_list_bitmap_index: \t%i\n"
-		   "j_must_wait: \t%i\n"
-		   "j_next_full_flush: \t%i\n"
-		   "j_next_async_flush: \t%i\n"
-		   "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
-		   /* reiserfs_proc_info_data_t.journal fields */
-		   "in_journal: \t%12lu\n"
-		   "in_journal_bitmap: \t%12lu\n"
-		   "in_journal_reusable: \t%12lu\n"
-		   "lock_journal: \t%12lu\n"
-		   "lock_journal_wait: \t%12lu\n"
-		   "journal_begin: \t%12lu\n"
-		   "journal_relock_writers: \t%12lu\n"
-		   "journal_relock_wcount: \t%12lu\n"
-		   "mark_dirty: \t%12lu\n"
-		   "mark_dirty_already: \t%12lu\n"
-		   "mark_dirty_notjournal: \t%12lu\n"
-		   "restore_prepared: \t%12lu\n"
-		   "prepare: \t%12lu\n"
-		   "prepare_retry: \t%12lu\n",
-		   DJP(jp_journal_1st_block),
-		   file_bdev(SB_JOURNAL(sb)->j_bdev_file),
-		   DJP(jp_journal_dev),
-		   DJP(jp_journal_size),
-		   DJP(jp_journal_trans_max),
-		   DJP(jp_journal_magic),
-		   DJP(jp_journal_max_batch),
-		   SB_JOURNAL(sb)->j_max_commit_age,
-		   DJP(jp_journal_max_trans_age),
-		   JF(j_1st_reserved_block),
-		   JF(j_state),
-		   JF(j_trans_id),
-		   JF(j_mount_id),
-		   JF(j_start),
-		   JF(j_len),
-		   JF(j_len_alloc),
-		   atomic_read(&r->s_journal->j_wcount),
-		   JF(j_bcount),
-		   JF(j_first_unflushed_offset),
-		   JF(j_last_flush_trans_id),
-		   ktime_mono_to_real_seconds(JF(j_trans_start_time)),
-		   JF(j_list_bitmap_index),
-		   JF(j_must_wait),
-		   JF(j_next_full_flush),
-		   JF(j_next_async_flush),
-		   JF(j_cnode_used),
-		   JF(j_cnode_free),
-		   SFPJ(in_journal),
-		   SFPJ(in_journal_bitmap),
-		   SFPJ(in_journal_reusable),
-		   SFPJ(lock_journal),
-		   SFPJ(lock_journal_wait),
-		   SFPJ(journal_being),
-		   SFPJ(journal_relock_writers),
-		   SFPJ(journal_relock_wcount),
-		   SFPJ(mark_dirty),
-		   SFPJ(mark_dirty_already),
-		   SFPJ(mark_dirty_notjournal),
-		   SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
-	    );
-	return 0;
-}
-
-static struct proc_dir_entry *proc_info_root = NULL;
-static const char proc_info_root_name[] = "fs/reiserfs";
-
-static void add_file(struct super_block *sb, char *name,
-		     int (*func) (struct seq_file *, void *))
-{
-	proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
-}
-
-int reiserfs_proc_info_init(struct super_block *sb)
-{
-	char b[BDEVNAME_SIZE];
-	char *s;
-
-	/* Some block devices use /'s */
-	strscpy(b, sb->s_id, BDEVNAME_SIZE);
-	s = strchr(b, '/');
-	if (s)
-		*s = '!';
-
-	spin_lock_init(&__PINFO(sb).lock);
-	REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
-	if (REISERFS_SB(sb)->procdir) {
-		add_file(sb, "version", show_version);
-		add_file(sb, "super", show_super);
-		add_file(sb, "per-level", show_per_level);
-		add_file(sb, "bitmap", show_bitmap);
-		add_file(sb, "on-disk-super", show_on_disk_super);
-		add_file(sb, "oidmap", show_oidmap);
-		add_file(sb, "journal", show_journal);
-		return 0;
-	}
-	reiserfs_warning(sb, "cannot create /proc/%s/%s",
-			 proc_info_root_name, b);
-	return 1;
-}
-
-int reiserfs_proc_info_done(struct super_block *sb)
-{
-	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
-	if (de) {
-		char b[BDEVNAME_SIZE];
-		char *s;
-
-		/* Some block devices use /'s */
-		strscpy(b, sb->s_id, BDEVNAME_SIZE);
-		s = strchr(b, '/');
-		if (s)
-			*s = '!';
-
-		remove_proc_subtree(b, proc_info_root);
-		REISERFS_SB(sb)->procdir = NULL;
-	}
-	return 0;
-}
-
-int reiserfs_proc_info_global_init(void)
-{
-	if (proc_info_root == NULL) {
-		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
-		if (!proc_info_root) {
-			reiserfs_warning(NULL, "cannot create /proc/%s",
-					 proc_info_root_name);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-int reiserfs_proc_info_global_done(void)
-{
-	if (proc_info_root != NULL) {
-		proc_info_root = NULL;
-		remove_proc_entry(proc_info_root_name, NULL);
-	}
-	return 0;
-}
-/*
- * Revision 1.1.8.2  2001/07/15 17:08:42  god
- *  . use get_super() in procfs.c
- *  . remove remove_save_link() from reiserfs_do_truncate()
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- * Revision 1.1.8.1  2001/07/11 16:48:50  god
- * proc info support
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- */
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
deleted file mode 100644
index 12fc20af8e17..000000000000
--- a/fs/reiserfs/reiserfs.h
+++ /dev/null
@@ -1,3419 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
- * licensing and copyright details
- */
-
-#include <linux/reiserfs_fs.h>
-
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/bug.h>
-#include <linux/workqueue.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/proc_fs.h>
-#include <linux/buffer_head.h>
-
-/* the 32 bit compat definitions with int argument */
-#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int)
-#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION
-#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION
-
-struct reiserfs_journal_list;
-
-/* bitmasks for i_flags field in reiserfs-specific part of inode */
-typedef enum {
-	/*
-	 * this says what format of key do all items (but stat data) of
-	 * an object have.  If this is set, that format is 3.6 otherwise - 3.5
-	 */
-	i_item_key_version_mask = 0x0001,
-
-	/*
-	 * If this is unset, object has 3.5 stat data, otherwise,
-	 * it has 3.6 stat data with 64bit size, 32bit nlink etc.
-	 */
-	i_stat_data_version_mask = 0x0002,
-
-	/* file might need tail packing on close */
-	i_pack_on_close_mask = 0x0004,
-
-	/* don't pack tail of file */
-	i_nopack_mask = 0x0008,
-
-	/*
-	 * If either of these are set, "safe link" was created for this
-	 * file during truncate or unlink. Safe link is used to avoid
-	 * leakage of disk space on crash with some files open, but unlinked.
-	 */
-	i_link_saved_unlink_mask = 0x0010,
-	i_link_saved_truncate_mask = 0x0020,
-
-	i_has_xattr_dir = 0x0040,
-	i_data_log = 0x0080,
-} reiserfs_inode_flags;
-
-struct reiserfs_inode_info {
-	__u32 i_key[4];		/* key is still 4 32 bit integers */
-
-	/*
-	 * transient inode flags that are never stored on disk. Bitmasks
-	 * for this field are defined above.
-	 */
-	__u32 i_flags;
-
-	/* offset of first byte stored in direct item. */
-	__u32 i_first_direct_byte;
-
-	/* copy of persistent inode flags read from sd_attrs. */
-	__u32 i_attrs;
-
-	/* first unused block of a sequence of unused blocks */
-	int i_prealloc_block;
-	int i_prealloc_count;	/* length of that sequence */
-
-	/* per-transaction list of inodes which  have preallocated blocks */
-	struct list_head i_prealloc_list;
-
-	/*
-	 * new_packing_locality is created; new blocks for the contents
-	 * of this directory should be displaced
-	 */
-	unsigned new_packing_locality:1;
-
-	/*
-	 * we use these for fsync or O_SYNC to decide which transaction
-	 * needs to be committed in order for this inode to be properly
-	 * flushed
-	 */
-	unsigned int i_trans_id;
-
-	struct reiserfs_journal_list *i_jl;
-	atomic_t openers;
-	struct mutex tailpack;
-#ifdef CONFIG_REISERFS_FS_XATTR
-	struct rw_semaphore i_xattr_sem;
-#endif
-#ifdef CONFIG_QUOTA
-	struct dquot __rcu *i_dquot[MAXQUOTAS];
-#endif
-
-	struct inode vfs_inode;
-};
-
-typedef enum {
-	reiserfs_attrs_cleared = 0x00000001,
-} reiserfs_super_block_flags;
-
-/*
- * struct reiserfs_super_block accessors/mutators since this is a disk
- * structure, it will always be in little endian format.
- */
-#define sb_block_count(sbp)         (le32_to_cpu((sbp)->s_v1.s_block_count))
-#define set_sb_block_count(sbp,v)   ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
-#define sb_free_blocks(sbp)         (le32_to_cpu((sbp)->s_v1.s_free_blocks))
-#define set_sb_free_blocks(sbp,v)   ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
-#define sb_root_block(sbp)          (le32_to_cpu((sbp)->s_v1.s_root_block))
-#define set_sb_root_block(sbp,v)    ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
-
-#define sb_jp_journal_1st_block(sbp)  \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
-#define set_sb_jp_journal_1st_block(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
-#define sb_jp_journal_dev(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
-#define set_sb_jp_journal_dev(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
-#define sb_jp_journal_size(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
-#define set_sb_jp_journal_size(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
-#define sb_jp_journal_trans_max(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
-#define set_sb_jp_journal_trans_max(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
-#define sb_jp_journal_magic(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
-#define set_sb_jp_journal_magic(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
-#define sb_jp_journal_max_batch(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
-#define set_sb_jp_journal_max_batch(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
-#define sb_jp_jourmal_max_commit_age(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
-#define set_sb_jp_journal_max_commit_age(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
-
-#define sb_blocksize(sbp)          (le16_to_cpu((sbp)->s_v1.s_blocksize))
-#define set_sb_blocksize(sbp,v)    ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
-#define sb_oid_maxsize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
-#define set_sb_oid_maxsize(sbp,v)  ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
-#define sb_oid_cursize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
-#define set_sb_oid_cursize(sbp,v)  ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
-#define sb_umount_state(sbp)       (le16_to_cpu((sbp)->s_v1.s_umount_state))
-#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
-#define sb_fs_state(sbp)           (le16_to_cpu((sbp)->s_v1.s_fs_state))
-#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
-#define sb_hash_function_code(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
-#define set_sb_hash_function_code(sbp,v) \
-              ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
-#define sb_tree_height(sbp)        (le16_to_cpu((sbp)->s_v1.s_tree_height))
-#define set_sb_tree_height(sbp,v)  ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
-#define sb_bmap_nr(sbp)            (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
-#define set_sb_bmap_nr(sbp,v)      ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
-#define sb_version(sbp)            (le16_to_cpu((sbp)->s_v1.s_version))
-#define set_sb_version(sbp,v)      ((sbp)->s_v1.s_version = cpu_to_le16(v))
-
-#define sb_mnt_count(sbp)	   (le16_to_cpu((sbp)->s_mnt_count))
-#define set_sb_mnt_count(sbp, v)   ((sbp)->s_mnt_count = cpu_to_le16(v))
-
-#define sb_reserved_for_journal(sbp) \
-              (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
-#define set_sb_reserved_for_journal(sbp,v) \
-              ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
-
-/* LOGGING -- */
-
-/*
- * These all interelate for performance.
- *
- * If the journal block count is smaller than n transactions, you lose speed.
- * I don't know what n is yet, I'm guessing 8-16.
- *
- * typical transaction size depends on the application, how often fsync is
- * called, and how many metadata blocks you dirty in a 30 second period.
- * The more small files (<16k) you use, the larger your transactions will
- * be.
- *
- * If your journal fills faster than dirty buffers get flushed to disk, it
- * must flush them before allowing the journal to wrap, which slows things
- * down.  If you need high speed meta data updates, the journal should be
- * big enough to prevent wrapping before dirty meta blocks get to disk.
- *
- * If the batch max is smaller than the transaction max, you'll waste space
- * at the end of the journal because journal_end sets the next transaction
- * to start at 0 if the next transaction has any chance of wrapping.
- *
- * The large the batch max age, the better the speed, and the more meta
- * data changes you'll lose after a crash.
- */
-
-/* don't mess with these for a while */
-/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
-#define JOURNAL_BLOCK_SIZE  4096	/* BUG gotta get rid of this */
-#define JOURNAL_MAX_CNODE   1500	/* max cnodes to allocate. */
-#define JOURNAL_HASH_SIZE 8192
-
-/* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_NUM_BITMAPS 5
-
-/*
- * One of these for every block in every transaction
- * Each one is in two hash tables.  First, a hash of the current transaction,
- * and after journal_end, a hash of all the in memory transactions.
- * next and prev are used by the current transaction (journal_hash).
- * hnext and hprev are used by journal_list_hash.  If a block is in more
- * than one transaction, the journal_list_hash links it in multiple times.
- * This allows flush_journal_list to remove just the cnode belonging to a
- * given transaction.
- */
-struct reiserfs_journal_cnode {
-	struct buffer_head *bh;	/* real buffer head */
-	struct super_block *sb;	/* dev of real buffer head */
-
-	/* block number of real buffer head, == 0 when buffer on disk */
-	__u32 blocknr;
-
-	unsigned long state;
-
-	/* journal list this cnode lives in */
-	struct reiserfs_journal_list *jlist;
-
-	struct reiserfs_journal_cnode *next;	/* next in transaction list */
-	struct reiserfs_journal_cnode *prev;	/* prev in transaction list */
-	struct reiserfs_journal_cnode *hprev;	/* prev in hash list */
-	struct reiserfs_journal_cnode *hnext;	/* next in hash list */
-};
-
-struct reiserfs_bitmap_node {
-	int id;
-	char *data;
-	struct list_head list;
-};
-
-struct reiserfs_list_bitmap {
-	struct reiserfs_journal_list *journal_list;
-	struct reiserfs_bitmap_node **bitmaps;
-};
-
-/*
- * one of these for each transaction.  The most important part here is the
- * j_realblock.  this list of cnodes is used to hash all the blocks in all
- * the commits, to mark all the real buffer heads dirty once all the commits
- * hit the disk, and to make sure every real block in a transaction is on
- * disk before allowing the log area to be overwritten
- */
-struct reiserfs_journal_list {
-	unsigned long j_start;
-	unsigned long j_state;
-	unsigned long j_len;
-	atomic_t j_nonzerolen;
-	atomic_t j_commit_left;
-
-	/* all commits older than this on disk */
-	atomic_t j_older_commits_done;
-
-	struct mutex j_commit_mutex;
-	unsigned int j_trans_id;
-	time64_t j_timestamp; /* write-only but useful for crash dump analysis */
-	struct reiserfs_list_bitmap *j_list_bitmap;
-	struct buffer_head *j_commit_bh;	/* commit buffer head */
-	struct reiserfs_journal_cnode *j_realblock;
-	struct reiserfs_journal_cnode *j_freedlist;	/* list of buffers that were freed during this trans.  free each of these on flush */
-	/* time ordered list of all active transactions */
-	struct list_head j_list;
-
-	/*
-	 * time ordered list of all transactions we haven't tried
-	 * to flush yet
-	 */
-	struct list_head j_working_list;
-
-	/* list of tail conversion targets in need of flush before commit */
-	struct list_head j_tail_bh_list;
-
-	/* list of data=ordered buffers in need of flush before commit */
-	struct list_head j_bh_list;
-	int j_refcount;
-};
-
-struct reiserfs_journal {
-	struct buffer_head **j_ap_blocks;	/* journal blocks on disk */
-	/* newest journal block */
-	struct reiserfs_journal_cnode *j_last;
-
-	/* oldest journal block.  start here for traverse */
-	struct reiserfs_journal_cnode *j_first;
-
-	struct file *j_bdev_file;
-
-	/* first block on s_dev of reserved area journal */
-	int j_1st_reserved_block;
-
-	unsigned long j_state;
-	unsigned int j_trans_id;
-	unsigned long j_mount_id;
-
-	/* start of current waiting commit (index into j_ap_blocks) */
-	unsigned long j_start;
-	unsigned long j_len;	/* length of current waiting commit */
-
-	/* number of buffers requested by journal_begin() */
-	unsigned long j_len_alloc;
-
-	atomic_t j_wcount;	/* count of writers for current commit */
-
-	/* batch count. allows turning X transactions into 1 */
-	unsigned long j_bcount;
-
-	/* first unflushed transactions offset */
-	unsigned long j_first_unflushed_offset;
-
-	/* last fully flushed journal timestamp */
-	unsigned j_last_flush_trans_id;
-
-	struct buffer_head *j_header_bh;
-
-	time64_t j_trans_start_time;	/* time this transaction started */
-	struct mutex j_mutex;
-	struct mutex j_flush_mutex;
-
-	/* wait for current transaction to finish before starting new one */
-	wait_queue_head_t j_join_wait;
-
-	atomic_t j_jlock;		/* lock for j_join_wait */
-	int j_list_bitmap_index;	/* number of next list bitmap to use */
-
-	/* no more journal begins allowed. MUST sleep on j_join_wait */
-	int j_must_wait;
-
-	/* next journal_end will flush all journal list */
-	int j_next_full_flush;
-
-	/* next journal_end will flush all async commits */
-	int j_next_async_flush;
-
-	int j_cnode_used;	/* number of cnodes on the used list */
-	int j_cnode_free;	/* number of cnodes on the free list */
-
-	/* max number of blocks in a transaction.  */
-	unsigned int j_trans_max;
-
-	/* max number of blocks to batch into a trans */
-	unsigned int j_max_batch;
-
-	/* in seconds, how old can an async commit be */
-	unsigned int j_max_commit_age;
-
-	/* in seconds, how old can a transaction be */
-	unsigned int j_max_trans_age;
-
-	/* the default for the max commit age */
-	unsigned int j_default_max_commit_age;
-
-	struct reiserfs_journal_cnode *j_cnode_free_list;
-
-	/* orig pointer returned from vmalloc */
-	struct reiserfs_journal_cnode *j_cnode_free_orig;
-
-	struct reiserfs_journal_list *j_current_jl;
-	int j_free_bitmap_nodes;
-	int j_used_bitmap_nodes;
-
-	int j_num_lists;	/* total number of active transactions */
-	int j_num_work_lists;	/* number that need attention from kreiserfsd */
-
-	/* debugging to make sure things are flushed in order */
-	unsigned int j_last_flush_id;
-
-	/* debugging to make sure things are committed in order */
-	unsigned int j_last_commit_id;
-
-	struct list_head j_bitmap_nodes;
-	struct list_head j_dirty_buffers;
-	spinlock_t j_dirty_buffers_lock;	/* protects j_dirty_buffers */
-
-	/* list of all active transactions */
-	struct list_head j_journal_list;
-
-	/* lists that haven't been touched by writeback attempts */
-	struct list_head j_working_list;
-
-	/* hash table for real buffer heads in current trans */
-	struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
-
-	/* hash table for all the real buffer heads in all the transactions */
-	struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
-
-	/* array of bitmaps to record the deleted blocks */
-	struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
-
-	/* list of inodes which have preallocated blocks */
-	struct list_head j_prealloc_list;
-	int j_persistent_trans;
-	unsigned long j_max_trans_size;
-	unsigned long j_max_batch_size;
-
-	int j_errno;
-
-	/* when flushing ordered buffers, throttle new ordered writers */
-	struct delayed_work j_work;
-	struct super_block *j_work_sb;
-	atomic_t j_async_throttle;
-};
-
-enum journal_state_bits {
-	J_WRITERS_BLOCKED = 1,	/* set when new writers not allowed */
-	J_WRITERS_QUEUED,    /* set when log is full due to too many writers */
-	J_ABORTED,           /* set when log is aborted */
-};
-
-/* ick.  magic string to find desc blocks in the journal */
-#define JOURNAL_DESC_MAGIC "ReIsErLB"
-
-typedef __u32(*hashf_t) (const signed char *, int);
-
-struct reiserfs_bitmap_info {
-	__u32 free_count;
-};
-
-struct proc_dir_entry;
-
-#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
-typedef unsigned long int stat_cnt_t;
-typedef struct reiserfs_proc_info_data {
-	spinlock_t lock;
-	int exiting;
-	int max_hash_collisions;
-
-	stat_cnt_t breads;
-	stat_cnt_t bread_miss;
-	stat_cnt_t search_by_key;
-	stat_cnt_t search_by_key_fs_changed;
-	stat_cnt_t search_by_key_restarted;
-
-	stat_cnt_t insert_item_restarted;
-	stat_cnt_t paste_into_item_restarted;
-	stat_cnt_t cut_from_item_restarted;
-	stat_cnt_t delete_solid_item_restarted;
-	stat_cnt_t delete_item_restarted;
-
-	stat_cnt_t leaked_oid;
-	stat_cnt_t leaves_removable;
-
-	/*
-	 * balances per level.
-	 * Use explicit 5 as MAX_HEIGHT is not visible yet.
-	 */
-	stat_cnt_t balance_at[5];	/* XXX */
-	/* sbk == search_by_key */
-	stat_cnt_t sbk_read_at[5];	/* XXX */
-	stat_cnt_t sbk_fs_changed[5];
-	stat_cnt_t sbk_restarted[5];
-	stat_cnt_t items_at[5];	/* XXX */
-	stat_cnt_t free_at[5];	/* XXX */
-	stat_cnt_t can_node_be_removed[5];	/* XXX */
-	long int lnum[5];	/* XXX */
-	long int rnum[5];	/* XXX */
-	long int lbytes[5];	/* XXX */
-	long int rbytes[5];	/* XXX */
-	stat_cnt_t get_neighbors[5];
-	stat_cnt_t get_neighbors_restart[5];
-	stat_cnt_t need_l_neighbor[5];
-	stat_cnt_t need_r_neighbor[5];
-
-	stat_cnt_t free_block;
-	struct __scan_bitmap_stats {
-		stat_cnt_t call;
-		stat_cnt_t wait;
-		stat_cnt_t bmap;
-		stat_cnt_t retry;
-		stat_cnt_t in_journal_hint;
-		stat_cnt_t in_journal_nohint;
-		stat_cnt_t stolen;
-	} scan_bitmap;
-	struct __journal_stats {
-		stat_cnt_t in_journal;
-		stat_cnt_t in_journal_bitmap;
-		stat_cnt_t in_journal_reusable;
-		stat_cnt_t lock_journal;
-		stat_cnt_t lock_journal_wait;
-		stat_cnt_t journal_being;
-		stat_cnt_t journal_relock_writers;
-		stat_cnt_t journal_relock_wcount;
-		stat_cnt_t mark_dirty;
-		stat_cnt_t mark_dirty_already;
-		stat_cnt_t mark_dirty_notjournal;
-		stat_cnt_t restore_prepared;
-		stat_cnt_t prepare;
-		stat_cnt_t prepare_retry;
-	} journal;
-} reiserfs_proc_info_data_t;
-#else
-typedef struct reiserfs_proc_info_data {
-} reiserfs_proc_info_data_t;
-#endif
-
-/* Number of quota types we support */
-#define REISERFS_MAXQUOTAS 2
-
-/* reiserfs union of in-core super block data */
-struct reiserfs_sb_info {
-	/* Buffer containing the super block */
-	struct buffer_head *s_sbh;
-
-	/* Pointer to the on-disk super block in the buffer */
-	struct reiserfs_super_block *s_rs;
-	struct reiserfs_bitmap_info *s_ap_bitmap;
-
-	/* pointer to journal information */
-	struct reiserfs_journal *s_journal;
-
-	unsigned short s_mount_state;	/* reiserfs state (valid, invalid) */
-
-	/* Serialize writers access, replace the old bkl */
-	struct mutex lock;
-
-	/* Owner of the lock (can be recursive) */
-	struct task_struct *lock_owner;
-
-	/* Depth of the lock, start from -1 like the bkl */
-	int lock_depth;
-
-	struct workqueue_struct *commit_wq;
-
-	/* Comment? -Hans */
-	void (*end_io_handler) (struct buffer_head *, int);
-
-	/*
-	 * pointer to function which is used to sort names in directory.
-	 * Set on mount
-	 */
-	hashf_t s_hash_function;
-
-	/* reiserfs's mount options are set here */
-	unsigned long s_mount_opt;
-
-	/* This is a structure that describes block allocator options */
-	struct {
-		/* Bitfield for enable/disable kind of options */
-		unsigned long bits;
-
-		/*
-		 * size started from which we consider file
-		 * to be a large one (in blocks)
-		 */
-		unsigned long large_file_size;
-
-		int border;	/* percentage of disk, border takes */
-
-		/*
-		 * Minimal file size (in blocks) starting
-		 * from which we do preallocations
-		 */
-		int preallocmin;
-
-		/*
-		 * Number of blocks we try to prealloc when file
-		 * reaches preallocmin size (in blocks) or prealloc_list
-		 is empty.
-		 */
-		int preallocsize;
-	} s_alloc_options;
-
-	/* Comment? -Hans */
-	wait_queue_head_t s_wait;
-	/* increased by one every time the  tree gets re-balanced */
-	atomic_t s_generation_counter;
-
-	/* File system properties. Currently holds on-disk FS format */
-	unsigned long s_properties;
-
-	/* session statistics */
-	int s_disk_reads;
-	int s_disk_writes;
-	int s_fix_nodes;
-	int s_do_balance;
-	int s_unneeded_left_neighbor;
-	int s_good_search_by_key_reada;
-	int s_bmaps;
-	int s_bmaps_without_search;
-	int s_direct2indirect;
-	int s_indirect2direct;
-
-	/*
-	 * set up when it's ok for reiserfs_read_inode2() to read from
-	 * disk inode with nlink==0. Currently this is only used during
-	 * finish_unfinished() processing at mount time
-	 */
-	int s_is_unlinked_ok;
-
-	reiserfs_proc_info_data_t s_proc_info_data;
-	struct proc_dir_entry *procdir;
-
-	/* amount of blocks reserved for further allocations */
-	int reserved_blocks;
-
-
-	/* this lock on now only used to protect reserved_blocks variable */
-	spinlock_t bitmap_lock;
-	struct dentry *priv_root;	/* root of /.reiserfs_priv */
-	struct dentry *xattr_root;	/* root of /.reiserfs_priv/xattrs */
-	int j_errno;
-
-	int work_queued;              /* non-zero delayed work is queued */
-	struct delayed_work old_work; /* old transactions flush delayed work */
-	spinlock_t old_work_lock;     /* protects old_work and work_queued */
-
-#ifdef CONFIG_QUOTA
-	char *s_qf_names[REISERFS_MAXQUOTAS];
-	int s_jquota_fmt;
-#endif
-	char *s_jdev;		/* Stored jdev for mount option showing */
-#ifdef CONFIG_REISERFS_CHECK
-
-	/*
-	 * Detects whether more than one copy of tb exists per superblock
-	 * as a means of checking whether do_balance is executing
-	 * concurrently against another tree reader/writer on a same
-	 * mount point.
-	 */
-	struct tree_balance *cur_tb;
-#endif
-};
-
-/* Definitions of reiserfs on-disk properties: */
-#define REISERFS_3_5 0
-#define REISERFS_3_6 1
-#define REISERFS_OLD_FORMAT 2
-
-/* Mount options */
-enum reiserfs_mount_options {
-	/* large tails will be created in a session */
-	REISERFS_LARGETAIL,
-	/*
-	 * small (for files less than block size) tails will
-	 * be created in a session
-	 */
-	REISERFS_SMALLTAIL,
-
-	/* replay journal and return 0. Use by fsck */
-	REPLAYONLY,
-
-	/*
-	 * -o conv: causes conversion of old format super block to the
-	 * new format. If not specified - old partition will be dealt
-	 * with in a manner of 3.5.x
-	 */
-	REISERFS_CONVERT,
-
-	/*
-	 * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
-	 * reiserfs disks from 3.5.19 or earlier.  99% of the time, this
-	 * option is not required.  If the normal autodection code can't
-	 * determine which hash to use (because both hashes had the same
-	 * value for a file) use this option to force a specific hash.
-	 * It won't allow you to override the existing hash on the FS, so
-	 * if you have a tea hash disk, and mount with -o hash=rupasov,
-	 * the mount will fail.
-	 */
-	FORCE_TEA_HASH,		/* try to force tea hash on mount */
-	FORCE_RUPASOV_HASH,	/* try to force rupasov hash on mount */
-	FORCE_R5_HASH,		/* try to force rupasov hash on mount */
-	FORCE_HASH_DETECT,	/* try to detect hash function on mount */
-
-	REISERFS_DATA_LOG,
-	REISERFS_DATA_ORDERED,
-	REISERFS_DATA_WRITEBACK,
-
-	/*
-	 * used for testing experimental features, makes benchmarking new
-	 * features with and without more convenient, should never be used by
-	 * users in any code shipped to users (ideally)
-	 */
-
-	REISERFS_NO_BORDER,
-	REISERFS_NO_UNHASHED_RELOCATION,
-	REISERFS_HASHED_RELOCATION,
-	REISERFS_ATTRS,
-	REISERFS_XATTRS_USER,
-	REISERFS_POSIXACL,
-	REISERFS_EXPOSE_PRIVROOT,
-	REISERFS_BARRIER_NONE,
-	REISERFS_BARRIER_FLUSH,
-
-	/* Actions on error */
-	REISERFS_ERROR_PANIC,
-	REISERFS_ERROR_RO,
-	REISERFS_ERROR_CONTINUE,
-
-	REISERFS_USRQUOTA,	/* User quota option specified */
-	REISERFS_GRPQUOTA,	/* Group quota option specified */
-
-	REISERFS_TEST1,
-	REISERFS_TEST2,
-	REISERFS_TEST3,
-	REISERFS_TEST4,
-	REISERFS_UNSUPPORTED_OPT,
-};
-
-#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
-#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
-#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
-#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
-#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
-#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
-#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
-#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
-
-#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
-#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
-#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
-#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
-#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
-#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
-#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
-#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
-#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
-#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
-#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
-#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
-#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
-
-#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
-#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
-
-void reiserfs_file_buffer(struct buffer_head *bh, int list);
-extern struct file_system_type reiserfs_fs_type;
-int reiserfs_resize(struct super_block *, unsigned long);
-
-#define CARRY_ON                0
-#define SCHEDULE_OCCURRED       1
-
-#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
-#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
-#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
-#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
-
-#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
-
-#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
-static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
-						*journal)
-{
-	return test_bit(J_ABORTED, &journal->j_state);
-}
-
-/*
- * Locking primitives. The write lock is a per superblock
- * special mutex that has properties close to the Big Kernel Lock
- * which was used in the previous locking scheme.
- */
-void reiserfs_write_lock(struct super_block *s);
-void reiserfs_write_unlock(struct super_block *s);
-int __must_check reiserfs_write_unlock_nested(struct super_block *s);
-void reiserfs_write_lock_nested(struct super_block *s, int depth);
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *s);
-#else
-static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
-#endif
-
-/*
- * Several mutexes depend on the write lock.
- * However sometimes we want to relax the write lock while we hold
- * these mutexes, according to the release/reacquire on schedule()
- * properties of the Bkl that were used.
- * Reiserfs performances and locking were based on this scheme.
- * Now that the write lock is a mutex and not the bkl anymore, doing so
- * may result in a deadlock:
- *
- * A acquire write_lock
- * A acquire j_commit_mutex
- * A release write_lock and wait for something
- * B acquire write_lock
- * B can't acquire j_commit_mutex and sleep
- * A can't acquire write lock anymore
- * deadlock
- *
- * What we do here is avoiding such deadlock by playing the same game
- * than the Bkl: if we can't acquire a mutex that depends on the write lock,
- * we release the write lock, wait a bit and then retry.
- *
- * The mutexes concerned by this hack are:
- * - The commit mutex of a journal list
- * - The flush mutex
- * - The journal lock
- * - The inode mutex
- */
-static inline void reiserfs_mutex_lock_safe(struct mutex *m,
-					    struct super_block *s)
-{
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(s);
-	mutex_lock(m);
-	reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
-				struct super_block *s)
-{
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(s);
-	mutex_lock_nested(m, subclass);
-	reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
-{
-       int depth;
-       depth = reiserfs_write_unlock_nested(s);
-       down_read(sem);
-       reiserfs_write_lock_nested(s, depth);
-}
-
-/*
- * When we schedule, we usually want to also release the write lock,
- * according to the previous bkl based locking scheme of reiserfs.
- */
-static inline void reiserfs_cond_resched(struct super_block *s)
-{
-	if (need_resched()) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(s);
-		schedule();
-		reiserfs_write_lock_nested(s, depth);
-	}
-}
-
-struct fid;
-
-/*
- * in reading the #defines, it may help to understand that they employ
- *  the following abbreviations:
- *
- *  B = Buffer
- *  I = Item header
- *  H = Height within the tree (should be changed to LEV)
- *  N = Number of the item in the node
- *  STAT = stat data
- *  DEH = Directory Entry Header
- *  EC = Entry Count
- *  E = Entry number
- *  UL = Unsigned Long
- *  BLKH = BLocK Header
- *  UNFM = UNForMatted node
- *  DC = Disk Child
- *  P = Path
- *
- *  These #defines are named by concatenating these abbreviations,
- *  where first comes the arguments, and last comes the return value,
- *  of the macro.
- */
-
-#define USE_INODE_GENERATION_COUNTER
-
-#define REISERFS_PREALLOCATE
-#define DISPLACE_NEW_PACKING_LOCALITIES
-#define PREALLOCATION_SIZE 9
-
-/* n must be power of 2 */
-#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
-
-/*
- * to be ok for alpha and others we have to align structures to 8 byte
- * boundary.
- * FIXME: do not change 4 by anything else: there is code which relies on that
- */
-#define ROUND_UP(x) _ROUND_UP(x,8LL)
-
-/*
- * debug levels.  Right now, CONFIG_REISERFS_CHECK means print all debug
- * messages.
- */
-#define REISERFS_DEBUG_CODE 5	/* extra messages to help find/debug errors */
-
-void __reiserfs_warning(struct super_block *s, const char *id,
-			 const char *func, const char *fmt, ...);
-#define reiserfs_warning(s, id, fmt, args...) \
-	 __reiserfs_warning(s, id, __func__, fmt, ##args)
-/* assertions handling */
-
-/* always check a condition and panic if it's false. */
-#define __RASSERT(cond, scond, format, args...)			\
-do {									\
-	if (!(cond))							\
-		reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
-			       __FILE__ ":%i:%s: " format "\n",		\
-			       __LINE__, __func__ , ##args);		\
-} while (0)
-
-#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
-
-#if defined( CONFIG_REISERFS_CHECK )
-#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
-#else
-#define RFALSE( cond, format, args... ) do {;} while( 0 )
-#endif
-
-#define CONSTF __attribute_const__
-/*
- * Disk Data Structures
- */
-
-/***************************************************************************
- *                             SUPER BLOCK                                 *
- ***************************************************************************/
-
-/*
- * Structure of super block on disk, a version of which in RAM is often
- * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
- * structure containing fields never written to disk.
- */
-#define UNSET_HASH 0	/* Detect hash on disk */
-#define TEA_HASH  1
-#define YURA_HASH 2
-#define R5_HASH   3
-#define DEFAULT_HASH R5_HASH
-
-struct journal_params {
-	/* where does journal start from on its * device */
-	__le32 jp_journal_1st_block;
-
-	/* journal device st_rdev */
-	__le32 jp_journal_dev;
-
-	/* size of the journal */
-	__le32 jp_journal_size;
-
-	/* max number of blocks in a transaction. */
-	__le32 jp_journal_trans_max;
-
-	/*
-	 * random value made on fs creation
-	 * (this was sb_journal_block_count)
-	 */
-	__le32 jp_journal_magic;
-
-	/* max number of blocks to batch into a trans */
-	__le32 jp_journal_max_batch;
-
-	/* in seconds, how old can an async  commit be */
-	__le32 jp_journal_max_commit_age;
-
-	/* in seconds, how old can a transaction be */
-	__le32 jp_journal_max_trans_age;
-};
-
-/* this is the super from 3.5.X, where X >= 10 */
-struct reiserfs_super_block_v1 {
-	__le32 s_block_count;	/* blocks count         */
-	__le32 s_free_blocks;	/* free blocks count    */
-	__le32 s_root_block;	/* root block number    */
-	struct journal_params s_journal;
-	__le16 s_blocksize;	/* block size */
-
-	/* max size of object id array, see get_objectid() commentary  */
-	__le16 s_oid_maxsize;
-	__le16 s_oid_cursize;	/* current size of object id array */
-
-	/* this is set to 1 when filesystem was umounted, to 2 - when not */
-	__le16 s_umount_state;
-
-	/*
-	 * reiserfs magic string indicates that file system is reiserfs:
-	 * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
-	 */
-	char s_magic[10];
-
-	/*
-	 * it is set to used by fsck to mark which
-	 * phase of rebuilding is done
-	 */
-	__le16 s_fs_state;
-	/*
-	 * indicate, what hash function is being use
-	 * to sort names in a directory
-	 */
-	__le32 s_hash_function_code;
-	__le16 s_tree_height;	/* height of disk tree */
-
-	/*
-	 * amount of bitmap blocks needed to address
-	 * each block of file system
-	 */
-	__le16 s_bmap_nr;
-
-	/*
-	 * this field is only reliable on filesystem with non-standard journal
-	 */
-	__le16 s_version;
-
-	/*
-	 * size in blocks of journal area on main device, we need to
-	 * keep after making fs with non-standard journal
-	 */
-	__le16 s_reserved_for_journal;
-} __attribute__ ((__packed__));
-
-#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
-
-/* this is the on disk super block */
-struct reiserfs_super_block {
-	struct reiserfs_super_block_v1 s_v1;
-	__le32 s_inode_generation;
-
-	/* Right now used only by inode-attributes, if enabled */
-	__le32 s_flags;
-
-	unsigned char s_uuid[16];	/* filesystem unique identifier */
-	unsigned char s_label[16];	/* filesystem volume label */
-	__le16 s_mnt_count;		/* Count of mounts since last fsck */
-	__le16 s_max_mnt_count;		/* Maximum mounts before check */
-	__le32 s_lastcheck;		/* Timestamp of last fsck */
-	__le32 s_check_interval;	/* Interval between checks */
-
-	/*
-	 * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
-	 * so any additions must be updated there as well. */
-	char s_unused[76];
-} __attribute__ ((__packed__));
-
-#define SB_SIZE (sizeof(struct reiserfs_super_block))
-
-#define REISERFS_VERSION_1 0
-#define REISERFS_VERSION_2 2
-
-/* on-disk super block fields converted to cpu form */
-#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
-#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
-#define SB_BLOCKSIZE(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
-#define SB_BLOCK_COUNT(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
-#define SB_FREE_BLOCKS(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
-#define SB_REISERFS_MAGIC(s) \
-        (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
-#define SB_ROOT_BLOCK(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
-#define SB_TREE_HEIGHT(s) \
-        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
-#define SB_REISERFS_STATE(s) \
-        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
-#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
-#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
-
-#define PUT_SB_BLOCK_COUNT(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
-#define PUT_SB_FREE_BLOCKS(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
-#define PUT_SB_ROOT_BLOCK(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
-#define PUT_SB_TREE_HEIGHT(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
-#define PUT_SB_REISERFS_STATE(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
-#define PUT_SB_VERSION(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
-#define PUT_SB_BMAP_NR(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
-
-#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
-#define SB_ONDISK_JOURNAL_SIZE(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
-#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
-#define SB_ONDISK_JOURNAL_DEVICE(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
-#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
-         le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
-
-#define is_block_in_log_or_reserved_area(s, block) \
-         block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
-         && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) +  \
-         ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
-         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs);
-int is_reiserfs_3_6(struct reiserfs_super_block *rs);
-int is_reiserfs_jr(struct reiserfs_super_block *rs);
-
-/*
- * ReiserFS leaves the first 64k unused, so that partition labels have
- * enough space.  If someone wants to write a fancy bootloader that
- * needs more than 64k, let us know, and this will be increased in size.
- * This number must be larger than the largest block size on any
- * platform, or code will break.  -Hans
- */
-#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
-#define REISERFS_FIRST_BLOCK unused_define
-#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
-
-/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
-#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
-
-/* reiserfs internal error code (used by search_by_key and fix_nodes)) */
-#define CARRY_ON      0
-#define REPEAT_SEARCH -1
-#define IO_ERROR      -2
-#define NO_DISK_SPACE -3
-#define NO_BALANCING_NEEDED  (-4)
-#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
-#define QUOTA_EXCEEDED -6
-
-typedef __u32 b_blocknr_t;
-typedef __le32 unp_t;
-
-struct unfm_nodeinfo {
-	unp_t unfm_nodenum;
-	unsigned short unfm_freespace;
-};
-
-/* there are two formats of keys: 3.5 and 3.6 */
-#define KEY_FORMAT_3_5 0
-#define KEY_FORMAT_3_6 1
-
-/* there are two stat datas */
-#define STAT_DATA_V1 0
-#define STAT_DATA_V2 1
-
-static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
-{
-	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
-}
-
-static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/*
- * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
- * which overflows on large file systems.
- */
-static inline __u32 reiserfs_bmap_count(struct super_block *sb)
-{
-	return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
-}
-
-static inline int bmap_would_wrap(unsigned bmap_nr)
-{
-	return bmap_nr > ((1LL << 16) - 1);
-}
-
-extern const struct xattr_handler * const reiserfs_xattr_handlers[];
-
-/*
- * this says about version of key of all items (but stat data) the
- * object consists of
- */
-#define get_inode_item_key_version( inode )                                    \
-    ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
-
-#define set_inode_item_key_version( inode, version )                           \
-         ({ if((version)==KEY_FORMAT_3_6)                                      \
-                REISERFS_I(inode)->i_flags |= i_item_key_version_mask;      \
-            else                                                               \
-                REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
-
-#define get_inode_sd_version(inode)                                            \
-    ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
-
-#define set_inode_sd_version(inode, version)                                   \
-         ({ if((version)==STAT_DATA_V2)                                        \
-                REISERFS_I(inode)->i_flags |= i_stat_data_version_mask;     \
-            else                                                               \
-                REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
-
-/*
- * This is an aggressive tail suppression policy, I am hoping it
- * improves our benchmarks. The principle behind it is that percentage
- * space saving is what matters, not absolute space saving.  This is
- * non-intuitive, but it helps to understand it if you consider that the
- * cost to access 4 blocks is not much more than the cost to access 1
- * block, if you have to do a seek and rotate.  A tail risks a
- * non-linear disk access that is significant as a percentage of total
- * time cost for a 4 block file and saves an amount of space that is
- * less significant as a percentage of space, or so goes the hypothesis.
- * -Hans
- */
-#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
-(\
-  (!(n_tail_size)) || \
-  (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
-   ( (n_file_size) >= (n_block_size) * 4 ) || \
-   ( ( (n_file_size) >= (n_block_size) * 3 ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
-   ( ( (n_file_size) >= (n_block_size) * 2 ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
-   ( ( (n_file_size) >= (n_block_size) ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
-)
-
-/*
- * Another strategy for tails, this one means only create a tail if all the
- * file would fit into one DIRECT item.
- * Primary intention for this one is to increase performance by decreasing
- * seeking.
-*/
-#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
-(\
-  (!(n_tail_size)) || \
-  (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
-)
-
-/*
- * values for s_umount_state field
- */
-#define REISERFS_VALID_FS    1
-#define REISERFS_ERROR_FS    2
-
-/*
- * there are 5 item types currently
- */
-#define TYPE_STAT_DATA 0
-#define TYPE_INDIRECT 1
-#define TYPE_DIRECT 2
-#define TYPE_DIRENTRY 3
-#define TYPE_MAXTYPE 3
-#define TYPE_ANY 15		/* FIXME: comment is required */
-
-/***************************************************************************
- *                       KEY & ITEM HEAD                                   *
- ***************************************************************************/
-
-/* * directories use this key as well as old files */
-struct offset_v1 {
-	__le32 k_offset;
-	__le32 k_uniqueness;
-} __attribute__ ((__packed__));
-
-struct offset_v2 {
-	__le64 v;
-} __attribute__ ((__packed__));
-
-static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
-{
-	__u8 type = le64_to_cpu(v2->v) >> 60;
-	return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
-}
-
-static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
-{
-	v2->v =
-	    (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
-}
-
-static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
-{
-	return le64_to_cpu(v2->v) & (~0ULL >> 4);
-}
-
-static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
-{
-	offset &= (~0ULL >> 4);
-	v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
-}
-
-/*
- * Key of an item determines its location in the S+tree, and
- * is composed of 4 components
- */
-struct reiserfs_key {
-	/* packing locality: by default parent directory object id */
-	__le32 k_dir_id;
-
-	__le32 k_objectid;	/* object identifier */
-	union {
-		struct offset_v1 k_offset_v1;
-		struct offset_v2 k_offset_v2;
-	} __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-struct in_core_key {
-	/* packing locality: by default parent directory object id */
-	__u32 k_dir_id;
-	__u32 k_objectid;	/* object identifier */
-	__u64 k_offset;
-	__u8 k_type;
-};
-
-struct cpu_key {
-	struct in_core_key on_disk_key;
-	int version;
-	/* 3 in all cases but direct2indirect and indirect2direct conversion */
-	int key_length;
-};
-
-/*
- * Our function for comparing keys can compare keys of different
- * lengths.  It takes as a parameter the length of the keys it is to
- * compare.  These defines are used in determining what is to be passed
- * to it as that parameter.
- */
-#define REISERFS_FULL_KEY_LEN     4
-#define REISERFS_SHORT_KEY_LEN    2
-
-/* The result of the key compare */
-#define FIRST_GREATER 1
-#define SECOND_GREATER -1
-#define KEYS_IDENTICAL 0
-#define KEY_FOUND 1
-#define KEY_NOT_FOUND 0
-
-#define KEY_SIZE (sizeof(struct reiserfs_key))
-
-/* return values for search_by_key and clones */
-#define ITEM_FOUND 1
-#define ITEM_NOT_FOUND 0
-#define ENTRY_FOUND 1
-#define ENTRY_NOT_FOUND 0
-#define DIRECTORY_NOT_FOUND -1
-#define REGULAR_FILE_FOUND -2
-#define DIRECTORY_FOUND -3
-#define BYTE_FOUND 1
-#define BYTE_NOT_FOUND 0
-#define FILE_NOT_FOUND -1
-
-#define POSITION_FOUND 1
-#define POSITION_NOT_FOUND 0
-
-/* return values for reiserfs_find_entry and search_by_entry_key */
-#define NAME_FOUND 1
-#define NAME_NOT_FOUND 0
-#define GOTO_PREVIOUS_ITEM 2
-#define NAME_FOUND_INVISIBLE 3
-
-/*
- * Everything in the filesystem is stored as a set of items.  The
- * item head contains the key of the item, its free space (for
- * indirect items) and specifies the location of the item itself
- * within the block.
- */
-
-struct item_head {
-	/*
-	 * Everything in the tree is found by searching for it based on
-	 * its key.
-	 */
-	struct reiserfs_key ih_key;
-	union {
-		/*
-		 * The free space in the last unformatted node of an
-		 * indirect item if this is an indirect item.  This
-		 * equals 0xFFFF iff this is a direct item or stat data
-		 * item. Note that the key, not this field, is used to
-		 * determine the item type, and thus which field this
-		 * union contains.
-		 */
-		__le16 ih_free_space_reserved;
-
-		/*
-		 * Iff this is a directory item, this field equals the
-		 * number of directory entries in the directory item.
-		 */
-		__le16 ih_entry_count;
-	} __attribute__ ((__packed__)) u;
-	__le16 ih_item_len;	/* total size of the item body */
-
-	/* an offset to the item body within the block */
-	__le16 ih_item_location;
-
-	/*
-	 * 0 for all old items, 2 for new ones. Highest bit is set by fsck
-	 * temporary, cleaned after all done
-	 */
-	__le16 ih_version;
-} __attribute__ ((__packed__));
-/* size of item header     */
-#define IH_SIZE (sizeof(struct item_head))
-
-#define ih_free_space(ih)            le16_to_cpu((ih)->u.ih_free_space_reserved)
-#define ih_version(ih)               le16_to_cpu((ih)->ih_version)
-#define ih_entry_count(ih)           le16_to_cpu((ih)->u.ih_entry_count)
-#define ih_location(ih)              le16_to_cpu((ih)->ih_item_location)
-#define ih_item_len(ih)              le16_to_cpu((ih)->ih_item_len)
-
-#define put_ih_free_space(ih, val)   do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
-#define put_ih_version(ih, val)      do { (ih)->ih_version = cpu_to_le16(val); } while (0)
-#define put_ih_entry_count(ih, val)  do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
-#define put_ih_location(ih, val)     do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
-#define put_ih_item_len(ih, val)     do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
-
-#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
-
-#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
-#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
-
-/*
- * these operate on indirect items, where you've got an array of ints
- * at a possibly unaligned location.  These are a noop on ia32
- *
- * p is the array of __u32, i is the index into the array, v is the value
- * to store there.
- */
-#define get_block_num(p, i) get_unaligned_le32((p) + (i))
-#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
-
-/* * in old version uniqueness field shows key type */
-#define V1_SD_UNIQUENESS 0
-#define V1_INDIRECT_UNIQUENESS 0xfffffffe
-#define V1_DIRECT_UNIQUENESS 0xffffffff
-#define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555	/* FIXME: comment is required */
-
-/* here are conversion routines */
-static inline int uniqueness2type(__u32 uniqueness) CONSTF;
-static inline int uniqueness2type(__u32 uniqueness)
-{
-	switch ((int)uniqueness) {
-	case V1_SD_UNIQUENESS:
-		return TYPE_STAT_DATA;
-	case V1_INDIRECT_UNIQUENESS:
-		return TYPE_INDIRECT;
-	case V1_DIRECT_UNIQUENESS:
-		return TYPE_DIRECT;
-	case V1_DIRENTRY_UNIQUENESS:
-		return TYPE_DIRENTRY;
-	case V1_ANY_UNIQUENESS:
-	default:
-		return TYPE_ANY;
-	}
-}
-
-static inline __u32 type2uniqueness(int type) CONSTF;
-static inline __u32 type2uniqueness(int type)
-{
-	switch (type) {
-	case TYPE_STAT_DATA:
-		return V1_SD_UNIQUENESS;
-	case TYPE_INDIRECT:
-		return V1_INDIRECT_UNIQUENESS;
-	case TYPE_DIRECT:
-		return V1_DIRECT_UNIQUENESS;
-	case TYPE_DIRENTRY:
-		return V1_DIRENTRY_UNIQUENESS;
-	case TYPE_ANY:
-	default:
-		return V1_ANY_UNIQUENESS;
-	}
-}
-
-/*
- * key is pointer to on disk key which is stored in le, result is cpu,
- * there is no way to get version of object from key, so, provide
- * version to these defines
- */
-static inline loff_t le_key_k_offset(int version,
-				     const struct reiserfs_key *key)
-{
-	return (version == KEY_FORMAT_3_5) ?
-	    le32_to_cpu(key->u.k_offset_v1.k_offset) :
-	    offset_v2_k_offset(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_offset(const struct item_head *ih)
-{
-	return le_key_k_offset(ih_version(ih), &(ih->ih_key));
-}
-
-static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
-{
-	if (version == KEY_FORMAT_3_5) {
-		loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
-		return uniqueness2type(val);
-	} else
-		return offset_v2_k_type(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_type(const struct item_head *ih)
-{
-	return le_key_k_type(ih_version(ih), &(ih->ih_key));
-}
-
-static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
-				       loff_t offset)
-{
-	if (version == KEY_FORMAT_3_5)
-		key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
-	else
-		set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
-}
-
-static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
-				       loff_t offset)
-{
-	set_le_key_k_offset(version, key,
-			    le_key_k_offset(version, key) + offset);
-}
-
-static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
-	add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
-	set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
-				     int type)
-{
-	if (version == KEY_FORMAT_3_5) {
-		type = type2uniqueness(type);
-		key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
-	} else
-	       set_offset_v2_k_type(&key->u.k_offset_v2, type);
-}
-
-static inline void set_le_ih_k_type(struct item_head *ih, int type)
-{
-	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
-}
-
-static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_DIRENTRY;
-}
-
-static inline int is_direct_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_DIRECT;
-}
-
-static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_INDIRECT;
-}
-
-static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_STAT_DATA;
-}
-
-/* item header has version.  */
-static inline int is_direntry_le_ih(struct item_head *ih)
-{
-	return is_direntry_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_direct_le_ih(struct item_head *ih)
-{
-	return is_direct_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_indirect_le_ih(struct item_head *ih)
-{
-	return is_indirect_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_statdata_le_ih(struct item_head *ih)
-{
-	return is_statdata_le_key(ih_version(ih), &ih->ih_key);
-}
-
-/* key is pointer to cpu key, result is cpu */
-static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
-{
-	return key->on_disk_key.k_offset;
-}
-
-static inline loff_t cpu_key_k_type(const struct cpu_key *key)
-{
-	return key->on_disk_key.k_type;
-}
-
-static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
-{
-	key->on_disk_key.k_offset = offset;
-}
-
-static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
-{
-	key->on_disk_key.k_type = type;
-}
-
-static inline void cpu_key_k_offset_dec(struct cpu_key *key)
-{
-	key->on_disk_key.k_offset--;
-}
-
-#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
-#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
-#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
-#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
-
-/* are these used ? */
-#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
-#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
-#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
-#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
-
-#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
-    (!COMP_SHORT_KEYS(ih, key) && \
-	  I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
-
-/* maximal length of item */
-#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
-#define MIN_ITEM_LEN 1
-
-/* object identifier for root dir */
-#define REISERFS_ROOT_OBJECTID 2
-#define REISERFS_ROOT_PARENT_OBJECTID 1
-
-extern struct reiserfs_key root_key;
-
-/*
- * Picture represents a leaf of the S+tree
- *  ______________________________________________________
- * |      |  Array of     |                   |           |
- * |Block |  Object-Item  |      F r e e      |  Objects- |
- * | head |  Headers      |     S p a c e     |   Items   |
- * |______|_______________|___________________|___________|
- */
-
-/*
- * Header of a disk block.  More precisely, header of a formatted leaf
- * or internal node, and not the header of an unformatted node.
- */
-struct block_head {
-	__le16 blk_level;	/* Level of a block in the tree. */
-	__le16 blk_nr_item;	/* Number of keys/items in a block. */
-	__le16 blk_free_space;	/* Block free space in bytes. */
-	__le16 blk_reserved;
-	/* dump this in v4/planA */
-
-	/* kept only for compatibility */
-	struct reiserfs_key blk_right_delim_key;
-};
-
-#define BLKH_SIZE                     (sizeof(struct block_head))
-#define blkh_level(p_blkh)            (le16_to_cpu((p_blkh)->blk_level))
-#define blkh_nr_item(p_blkh)          (le16_to_cpu((p_blkh)->blk_nr_item))
-#define blkh_free_space(p_blkh)       (le16_to_cpu((p_blkh)->blk_free_space))
-#define blkh_reserved(p_blkh)         (le16_to_cpu((p_blkh)->blk_reserved))
-#define set_blkh_level(p_blkh,val)    ((p_blkh)->blk_level = cpu_to_le16(val))
-#define set_blkh_nr_item(p_blkh,val)  ((p_blkh)->blk_nr_item = cpu_to_le16(val))
-#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
-#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
-#define blkh_right_delim_key(p_blkh)  ((p_blkh)->blk_right_delim_key)
-#define set_blkh_right_delim_key(p_blkh,val)  ((p_blkh)->blk_right_delim_key = val)
-
-/* values for blk_level field of the struct block_head */
-
-/*
- * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
- * It is then  used to see whether the node is still in the tree
- */
-#define FREE_LEVEL 0
-
-#define DISK_LEAF_NODE_LEVEL  1	/* Leaf node level. */
-
-/*
- * Given the buffer head of a formatted node, resolve to the
- * block head of that node.
- */
-#define B_BLK_HEAD(bh)			((struct block_head *)((bh)->b_data))
-/* Number of items that are in buffer. */
-#define B_NR_ITEMS(bh)			(blkh_nr_item(B_BLK_HEAD(bh)))
-#define B_LEVEL(bh)			(blkh_level(B_BLK_HEAD(bh)))
-#define B_FREE_SPACE(bh)		(blkh_free_space(B_BLK_HEAD(bh)))
-
-#define PUT_B_NR_ITEMS(bh, val)		do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_LEVEL(bh, val)		do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_FREE_SPACE(bh, val)	do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
-
-/* Get right delimiting key. -- little endian */
-#define B_PRIGHT_DELIM_KEY(bh)		(&(blk_right_delim_key(B_BLK_HEAD(bh))))
-
-/* Does the buffer contain a disk leaf. */
-#define B_IS_ITEMS_LEVEL(bh)		(B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
-
-/* Does the buffer contain a disk internal node */
-#define B_IS_KEYS_LEVEL(bh)      (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
-					    && B_LEVEL(bh) <= MAX_HEIGHT)
-
-/***************************************************************************
- *                             STAT DATA                                   *
- ***************************************************************************/
-
-/*
- * old stat data is 32 bytes long. We are going to distinguish new one by
- * different size
-*/
-struct stat_data_v1 {
-	__le16 sd_mode;		/* file type, permissions */
-	__le16 sd_nlink;	/* number of hard links */
-	__le16 sd_uid;		/* owner */
-	__le16 sd_gid;		/* group */
-	__le32 sd_size;		/* file size */
-	__le32 sd_atime;	/* time of last access */
-	__le32 sd_mtime;	/* time file was last modified  */
-
-	/*
-	 * time inode (stat data) was last changed
-	 * (except changes to sd_atime and sd_mtime)
-	 */
-	__le32 sd_ctime;
-	union {
-		__le32 sd_rdev;
-		__le32 sd_blocks;	/* number of blocks file uses */
-	} __attribute__ ((__packed__)) u;
-
-	/*
-	 * first byte of file which is stored in a direct item: except that if
-	 * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
-	 * direct item.  The existence of this field really grates on me.
-	 * Let's replace it with a macro based on sd_size and our tail
-	 * suppression policy.  Someday.  -Hans
-	 */
-	__le32 sd_first_direct_byte;
-} __attribute__ ((__packed__));
-
-#define SD_V1_SIZE              (sizeof(struct stat_data_v1))
-#define stat_data_v1(ih)        (ih_version (ih) == KEY_FORMAT_3_5)
-#define sd_v1_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v1_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
-#define sd_v1_nlink(sdp)        (le16_to_cpu((sdp)->sd_nlink))
-#define set_sd_v1_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le16(v))
-#define sd_v1_uid(sdp)          (le16_to_cpu((sdp)->sd_uid))
-#define set_sd_v1_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le16(v))
-#define sd_v1_gid(sdp)          (le16_to_cpu((sdp)->sd_gid))
-#define set_sd_v1_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le16(v))
-#define sd_v1_size(sdp)         (le32_to_cpu((sdp)->sd_size))
-#define set_sd_v1_size(sdp,v)   ((sdp)->sd_size = cpu_to_le32(v))
-#define sd_v1_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v1_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v1_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v1_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v1_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v1_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v1_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v1_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v1_blocks(sdp)       (le32_to_cpu((sdp)->u.sd_blocks))
-#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
-#define sd_v1_first_direct_byte(sdp) \
-                                (le32_to_cpu((sdp)->sd_first_direct_byte))
-#define set_sd_v1_first_direct_byte(sdp,v) \
-                                ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
-
-/* inode flags stored in sd_attrs (nee sd_reserved) */
-
-/*
- * we want common flags to have the same values as in ext2,
- * so chattr(1) will work without problems
- */
-#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
-#define REISERFS_APPEND_FL    FS_APPEND_FL
-#define REISERFS_SYNC_FL      FS_SYNC_FL
-#define REISERFS_NOATIME_FL   FS_NOATIME_FL
-#define REISERFS_NODUMP_FL    FS_NODUMP_FL
-#define REISERFS_SECRM_FL     FS_SECRM_FL
-#define REISERFS_UNRM_FL      FS_UNRM_FL
-#define REISERFS_COMPR_FL     FS_COMPR_FL
-#define REISERFS_NOTAIL_FL    FS_NOTAIL_FL
-
-/* persistent flags that file inherits from the parent directory */
-#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL |	\
-				REISERFS_SYNC_FL |	\
-				REISERFS_NOATIME_FL |	\
-				REISERFS_NODUMP_FL |	\
-				REISERFS_SECRM_FL |	\
-				REISERFS_COMPR_FL |	\
-				REISERFS_NOTAIL_FL )
-
-/*
- * Stat Data on disk (reiserfs version of UFS disk inode minus the
- * address blocks)
- */
-struct stat_data {
-	__le16 sd_mode;		/* file type, permissions */
-	__le16 sd_attrs;	/* persistent inode flags */
-	__le32 sd_nlink;	/* number of hard links */
-	__le64 sd_size;		/* file size */
-	__le32 sd_uid;		/* owner */
-	__le32 sd_gid;		/* group */
-	__le32 sd_atime;	/* time of last access */
-	__le32 sd_mtime;	/* time file was last modified  */
-
-	/*
-	 * time inode (stat data) was last changed
-	 * (except changes to sd_atime and sd_mtime)
-	 */
-	__le32 sd_ctime;
-	__le32 sd_blocks;
-	union {
-		__le32 sd_rdev;
-		__le32 sd_generation;
-	} __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-/* this is 44 bytes long */
-#define SD_SIZE (sizeof(struct stat_data))
-#define SD_V2_SIZE              SD_SIZE
-#define stat_data_v2(ih)        (ih_version (ih) == KEY_FORMAT_3_6)
-#define sd_v2_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v2_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
-/* sd_reserved */
-/* set_sd_reserved */
-#define sd_v2_nlink(sdp)        (le32_to_cpu((sdp)->sd_nlink))
-#define set_sd_v2_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le32(v))
-#define sd_v2_size(sdp)         (le64_to_cpu((sdp)->sd_size))
-#define set_sd_v2_size(sdp,v)   ((sdp)->sd_size = cpu_to_le64(v))
-#define sd_v2_uid(sdp)          (le32_to_cpu((sdp)->sd_uid))
-#define set_sd_v2_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le32(v))
-#define sd_v2_gid(sdp)          (le32_to_cpu((sdp)->sd_gid))
-#define set_sd_v2_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le32(v))
-#define sd_v2_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v2_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v2_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v2_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v2_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v2_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v2_blocks(sdp)       (le32_to_cpu((sdp)->sd_blocks))
-#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
-#define sd_v2_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v2_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v2_generation(sdp)   (le32_to_cpu((sdp)->u.sd_generation))
-#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
-#define sd_v2_attrs(sdp)         (le16_to_cpu((sdp)->sd_attrs))
-#define set_sd_v2_attrs(sdp,v)   ((sdp)->sd_attrs = cpu_to_le16(v))
-
-/***************************************************************************
- *                      DIRECTORY STRUCTURE                                *
- ***************************************************************************/
-/*
- * Picture represents the structure of directory items
- * ________________________________________________
- * |  Array of     |   |     |        |       |   |
- * | directory     |N-1| N-2 | ....   |   1st |0th|
- * | entry headers |   |     |        |       |   |
- * |_______________|___|_____|________|_______|___|
- *                  <----   directory entries         ------>
- *
- * First directory item has k_offset component 1. We store "." and ".."
- * in one item, always, we never split "." and ".." into differing
- * items.  This makes, among other things, the code for removing
- * directories simpler.
- */
-#define SD_OFFSET  0
-#define SD_UNIQUENESS 0
-#define DOT_OFFSET 1
-#define DOT_DOT_OFFSET 2
-#define DIRENTRY_UNIQUENESS 500
-
-#define FIRST_ITEM_OFFSET 1
-
-/*
- * Q: How to get key of object pointed to by entry from entry?
- *
- * A: Each directory entry has its header. This header has deh_dir_id
- *    and deh_objectid fields, those are key of object, entry points to
- */
-
-/*
- * NOT IMPLEMENTED:
- * Directory will someday contain stat data of object
- */
-
-struct reiserfs_de_head {
-	__le32 deh_offset;	/* third component of the directory entry key */
-
-	/*
-	 * objectid of the parent directory of the object, that is referenced
-	 * by directory entry
-	 */
-	__le32 deh_dir_id;
-
-	/* objectid of the object, that is referenced by directory entry */
-	__le32 deh_objectid;
-	__le16 deh_location;	/* offset of name in the whole item */
-
-	/*
-	 * whether 1) entry contains stat data (for future), and
-	 * 2) whether entry is hidden (unlinked)
-	 */
-	__le16 deh_state;
-} __attribute__ ((__packed__));
-#define DEH_SIZE                  sizeof(struct reiserfs_de_head)
-#define deh_offset(p_deh)         (le32_to_cpu((p_deh)->deh_offset))
-#define deh_dir_id(p_deh)         (le32_to_cpu((p_deh)->deh_dir_id))
-#define deh_objectid(p_deh)       (le32_to_cpu((p_deh)->deh_objectid))
-#define deh_location(p_deh)       (le16_to_cpu((p_deh)->deh_location))
-#define deh_state(p_deh)          (le16_to_cpu((p_deh)->deh_state))
-
-#define put_deh_offset(p_deh,v)   ((p_deh)->deh_offset = cpu_to_le32((v)))
-#define put_deh_dir_id(p_deh,v)   ((p_deh)->deh_dir_id = cpu_to_le32((v)))
-#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
-#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
-#define put_deh_state(p_deh,v)    ((p_deh)->deh_state = cpu_to_le16((v)))
-
-/* empty directory contains two entries "." and ".." and their headers */
-#define EMPTY_DIR_SIZE \
-(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
-
-/* old format directories have this size when empty */
-#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
-
-#define DEH_Statdata 0		/* not used now */
-#define DEH_Visible 2
-
-/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
-#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
-#   define ADDR_UNALIGNED_BITS  (3)
-#endif
-
-/*
- * These are only used to manipulate deh_state.
- * Because of this, we'll use the ext2_ bit routines,
- * since they are little endian
- */
-#ifdef ADDR_UNALIGNED_BITS
-
-#   define aligned_address(addr)           ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
-#   define unaligned_offset(addr)          (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
-
-#   define set_bit_unaligned(nr, addr)	\
-	__test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-#   define clear_bit_unaligned(nr, addr)	\
-	__test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-#   define test_bit_unaligned(nr, addr)	\
-	test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-
-#else
-
-#   define set_bit_unaligned(nr, addr)	__test_and_set_bit_le(nr, addr)
-#   define clear_bit_unaligned(nr, addr)	__test_and_clear_bit_le(nr, addr)
-#   define test_bit_unaligned(nr, addr)	test_bit_le(nr, addr)
-
-#endif
-
-#define mark_de_with_sd(deh)        set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_without_sd(deh)     clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_visible(deh)	    set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define mark_de_hidden(deh)	    clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-#define de_with_sd(deh)		    test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
-				   __le32 par_dirid, __le32 par_objid);
-extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
-				__le32 par_dirid, __le32 par_objid);
-
-/* two entries per block (at least) */
-#define REISERFS_MAX_NAME(block_size) 255
-
-/*
- * this structure is used for operations on directory entries. It is
- * not a disk structure.
- *
- * When reiserfs_find_entry or search_by_entry_key find directory
- * entry, they return filled reiserfs_dir_entry structure
- */
-struct reiserfs_dir_entry {
-	struct buffer_head *de_bh;
-	int de_item_num;
-	struct item_head *de_ih;
-	int de_entry_num;
-	struct reiserfs_de_head *de_deh;
-	int de_entrylen;
-	int de_namelen;
-	char *de_name;
-	unsigned long *de_gen_number_bit_string;
-
-	__u32 de_dir_id;
-	__u32 de_objectid;
-
-	struct cpu_key de_entry_key;
-};
-
-/*
- * these defines are useful when a particular member of
- * a reiserfs_dir_entry is needed
- */
-
-/* pointer to file name, stored in entry */
-#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
-				(ih_item_body(bh, ih) + deh_location(deh))
-
-/* length of name */
-#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
-(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
-
-/* hash value occupies bits from 7 up to 30 */
-#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
-/* generation number occupies 7 bits starting from 0 up to 6 */
-#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
-#define MAX_GENERATION_NUMBER  127
-
-#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
-
-/*
- * Picture represents an internal node of the reiserfs tree
- *  ______________________________________________________
- * |      |  Array of     |  Array of         |  Free     |
- * |block |    keys       |  pointers         | space     |
- * | head |      N        |      N+1          |           |
- * |______|_______________|___________________|___________|
- */
-
-/***************************************************************************
- *                      DISK CHILD                                         *
- ***************************************************************************/
-/*
- * Disk child pointer:
- * The pointer from an internal node of the tree to a node that is on disk.
- */
-struct disk_child {
-	__le32 dc_block_number;	/* Disk child's block number. */
-	__le16 dc_size;		/* Disk child's used space.   */
-	__le16 dc_reserved;
-};
-
-#define DC_SIZE (sizeof(struct disk_child))
-#define dc_block_number(dc_p)	(le32_to_cpu((dc_p)->dc_block_number))
-#define dc_size(dc_p)		(le16_to_cpu((dc_p)->dc_size))
-#define put_dc_block_number(dc_p, val)   do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
-#define put_dc_size(dc_p, val)   do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
-
-/* Get disk child by buffer header and position in the tree node. */
-#define B_N_CHILD(bh, n_pos)  ((struct disk_child *)\
-((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
-
-/* Get disk child number by buffer header and position in the tree node. */
-#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
-#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
-				(put_dc_block_number(B_N_CHILD(bh, n_pos), val))
-
- /* maximal value of field child_size in structure disk_child */
- /* child size is the combined size of all items and their headers */
-#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
-
-/* amount of used space in buffer (not including block head) */
-#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
-
-/* max and min number of keys in internal node */
-#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
-#define MIN_NR_KEY(bh)    (MAX_NR_KEY(bh)/2)
-
-/***************************************************************************
- *                      PATH STRUCTURES AND DEFINES                        *
- ***************************************************************************/
-
-/*
- * search_by_key fills up the path from the root to the leaf as it descends
- * the tree looking for the key.  It uses reiserfs_bread to try to find
- * buffers in the cache given their block number.  If it does not find
- * them in the cache it reads them from disk.  For each node search_by_key
- * finds using reiserfs_bread it then uses bin_search to look through that
- * node.  bin_search will find the position of the block_number of the next
- * node if it is looking through an internal node.  If it is looking through
- * a leaf node bin_search will find the position of the item which has key
- * either equal to given key, or which is the maximal key less than the
- * given key.
- */
-
-struct path_element {
-	/* Pointer to the buffer at the path in the tree. */
-	struct buffer_head *pe_buffer;
-	/* Position in the tree node which is placed in the buffer above. */
-	int pe_position;
-};
-
-/*
- * maximal height of a tree. don't change this without
- * changing JOURNAL_PER_BALANCE_CNT
- */
-#define MAX_HEIGHT 5
-
-/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define EXTENDED_MAX_HEIGHT         7
-
-/* Must be equal to at least 2. */
-#define FIRST_PATH_ELEMENT_OFFSET   2
-
-/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1
-
-/* this MUST be MAX_HEIGHT + 1. See about FEB below */
-#define MAX_FEB_SIZE 6
-
-/*
- * We need to keep track of who the ancestors of nodes are.  When we
- * perform a search we record which nodes were visited while
- * descending the tree looking for the node we searched for. This list
- * of nodes is called the path.  This information is used while
- * performing balancing.  Note that this path information may become
- * invalid, and this means we must check it when using it to see if it
- * is still valid. You'll need to read search_by_key and the comments
- * in it, especially about decrement_counters_in_path(), to understand
- * this structure.
- *
- * Paths make the code so much harder to work with and debug.... An
- * enormous number of bugs are due to them, and trying to write or modify
- * code that uses them just makes my head hurt.  They are based on an
- * excessive effort to avoid disturbing the precious VFS code.:-( The
- * gods only know how we are going to SMP the code that uses them.
- * znodes are the way!
- */
-
-#define PATH_READA	0x1	/* do read ahead */
-#define PATH_READA_BACK 0x2	/* read backwards */
-
-struct treepath {
-	int path_length;	/* Length of the array above.   */
-	int reada;
-	/* Array of the path elements.  */
-	struct path_element path_elements[EXTENDED_MAX_HEIGHT];
-	int pos_in_item;
-};
-
-#define pos_in_item(path) ((path)->pos_in_item)
-
-#define INITIALIZE_PATH(var) \
-struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
-
-/* Get path element by path and path position. */
-#define PATH_OFFSET_PELEMENT(path, n_offset)  ((path)->path_elements + (n_offset))
-
-/* Get buffer header at the path by path and path position. */
-#define PATH_OFFSET_PBUFFER(path, n_offset)   (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
-
-/* Get position in the element at the path by path and path position. */
-#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
-
-#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
-
-/*
- * you know, to the person who didn't write this the macro name does not
- * at first suggest what it does.  Maybe POSITION_FROM_PATH_END? Or
- * maybe we should just focus on dumping paths... -Hans
- */
-#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
-
-/*
- * in do_balance leaf has h == 0 in contrast with path structure,
- * where root has level == 0. That is why we need these defines
- */
-
-/* tb->S[h] */
-#define PATH_H_PBUFFER(path, h) \
-			PATH_OFFSET_PBUFFER(path, path->path_length - (h))
-
-/* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
-
-#define PATH_H_POSITION(path, h) \
-			PATH_OFFSET_POSITION(path, path->path_length - (h))
-
-/* tb->S[h]->b_item_order */
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
-
-#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
-
-static inline void *reiserfs_node_data(const struct buffer_head *bh)
-{
-	return bh->b_data + sizeof(struct block_head);
-}
-
-/* get key from internal node */
-static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
-						int item_num)
-{
-	struct reiserfs_key *key = reiserfs_node_data(bh);
-
-	return &key[item_num];
-}
-
-/* get the item header from leaf node */
-static inline struct item_head *item_head(const struct buffer_head *bh,
-					  int item_num)
-{
-	struct item_head *ih = reiserfs_node_data(bh);
-
-	return &ih[item_num];
-}
-
-/* get the key from leaf node */
-static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
-					    int item_num)
-{
-	return &item_head(bh, item_num)->ih_key;
-}
-
-static inline void *ih_item_body(const struct buffer_head *bh,
-				 const struct item_head *ih)
-{
-	return bh->b_data + ih_location(ih);
-}
-
-/* get item body from leaf node */
-static inline void *item_body(const struct buffer_head *bh, int item_num)
-{
-	return ih_item_body(bh, item_head(bh, item_num));
-}
-
-static inline struct item_head *tp_item_head(const struct treepath *path)
-{
-	return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-static inline void *tp_item_body(const struct treepath *path)
-{
-	return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-#define get_last_bh(path) PATH_PLAST_BUFFER(path)
-#define get_item_pos(path) PATH_LAST_POSITION(path)
-#define item_moved(ih,path) comp_items(ih, path)
-#define path_changed(ih,path) comp_items (ih, path)
-
-/* array of the entry headers */
- /* get item body */
-#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
-
-/*
- * length of the directory entry in directory item. This define
- * calculates length of i-th directory entry using directory entry
- * locations from dir entry head. When it calculates length of 0-th
- * directory entry, it uses length of whole item in place of entry
- * location of the non-existent following entry in the calculation.
- * See picture above.
- */
-static inline int entry_length(const struct buffer_head *bh,
-			       const struct item_head *ih, int pos_in_item)
-{
-	struct reiserfs_de_head *deh;
-
-	deh = B_I_DEH(bh, ih) + pos_in_item;
-	if (pos_in_item)
-		return deh_location(deh - 1) - deh_location(deh);
-
-	return ih_item_len(ih) - deh_location(deh);
-}
-
-/***************************************************************************
- *                       MISC                                              *
- ***************************************************************************/
-
-/* Size of pointer to the unformatted node. */
-#define UNFM_P_SIZE (sizeof(unp_t))
-#define UNFM_P_SHIFT 2
-
-/* in in-core inode key is stored on le form */
-#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
-
-#define MAX_UL_INT 0xffffffff
-#define MAX_INT    0x7ffffff
-#define MAX_US_INT 0xffff
-
-// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
-static inline loff_t max_reiserfs_offset(struct inode *inode)
-{
-	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
-		return (loff_t) U32_MAX;
-
-	return (loff_t) ((~(__u64) 0) >> 4);
-}
-
-#define MAX_KEY_OBJECTID	MAX_UL_INT
-
-#define MAX_B_NUM  MAX_UL_INT
-#define MAX_FC_NUM MAX_US_INT
-
-/* the purpose is to detect overflow of an unsigned short */
-#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
-
-/*
- * The following defines are used in reiserfs_insert_item
- * and reiserfs_append_item
- */
-#define REISERFS_KERNEL_MEM		0	/* kernel memory mode */
-#define REISERFS_USER_MEM		1	/* user memory mode */
-
-#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
-#define get_generation(s) atomic_read (&fs_generation(s))
-#define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
-#define __fs_changed(gen,s) (gen != get_generation (s))
-#define fs_changed(gen,s)		\
-({					\
-	reiserfs_cond_resched(s);	\
-	__fs_changed(gen, s);		\
-})
-
-/***************************************************************************
- *                  FIXATE NODES                                           *
- ***************************************************************************/
-
-#define VI_TYPE_LEFT_MERGEABLE 1
-#define VI_TYPE_RIGHT_MERGEABLE 2
-
-/*
- * To make any changes in the tree we always first find node, that
- * contains item to be changed/deleted or place to insert a new
- * item. We call this node S. To do balancing we need to decide what
- * we will shift to left/right neighbor, or to a new node, where new
- * item will be etc. To make this analysis simpler we build virtual
- * node. Virtual node is an array of items, that will replace items of
- * node S. (For instance if we are going to delete an item, virtual
- * node does not contain it). Virtual node keeps information about
- * item sizes and types, mergeability of first and last items, sizes
- * of all entries in directory item. We use this array of items when
- * calculating what we can shift to neighbors and how many nodes we
- * have to have if we do not any shiftings, if we shift to left/right
- * neighbor or to both.
- */
-struct virtual_item {
-	int vi_index;		/* index in the array of item operations */
-	unsigned short vi_type;	/* left/right mergeability */
-
-	/* length of item that it will have after balancing */
-	unsigned short vi_item_len;
-
-	struct item_head *vi_ih;
-	const char *vi_item;	/* body of item (old or new) */
-	const void *vi_new_data;	/* 0 always but paste mode */
-	void *vi_uarea;		/* item specific area */
-};
-
-struct virtual_node {
-	/* this is a pointer to the free space in the buffer */
-	char *vn_free_ptr;
-
-	unsigned short vn_nr_item;	/* number of items in virtual node */
-
-	/*
-	 * size of node , that node would have if it has
-	 * unlimited size and no balancing is performed
-	 */
-	short vn_size;
-
-	/* mode of balancing (paste, insert, delete, cut) */
-	short vn_mode;
-
-	short vn_affected_item_num;
-	short vn_pos_in_item;
-
-	/* item header of inserted item, 0 for other modes */
-	struct item_head *vn_ins_ih;
-	const void *vn_data;
-
-	/* array of items (including a new one, excluding item to be deleted) */
-	struct virtual_item *vn_vi;
-};
-
-/* used by directory items when creating virtual nodes */
-struct direntry_uarea {
-	int flags;
-	__u16 entry_count;
-	__u16 entry_sizes[];
-} __attribute__ ((__packed__));
-
-/***************************************************************************
- *                  TREE BALANCE                                           *
- ***************************************************************************/
-
-/*
- * This temporary structure is used in tree balance algorithms, and
- * constructed as we go to the extent that its various parts are
- * needed.  It contains arrays of nodes that can potentially be
- * involved in the balancing of node S, and parameters that define how
- * each of the nodes must be balanced.  Note that in these algorithms
- * for balancing the worst case is to need to balance the current node
- * S and the left and right neighbors and all of their parents plus
- * create a new node.  We implement S1 balancing for the leaf nodes
- * and S0 balancing for the internal nodes (S1 and S0 are defined in
- * our papers.)
- */
-
-/* size of the array of buffers to free at end of do_balance */
-#define MAX_FREE_BLOCK 7
-
-/* maximum number of FEB blocknrs on a single level */
-#define MAX_AMOUNT_NEEDED 2
-
-/* someday somebody will prefix every field in this struct with tb_ */
-struct tree_balance {
-	int tb_mode;
-	int need_balance_dirty;
-	struct super_block *tb_sb;
-	struct reiserfs_transaction_handle *transaction_handle;
-	struct treepath *tb_path;
-
-	/* array of left neighbors of nodes in the path */
-	struct buffer_head *L[MAX_HEIGHT];
-
-	/* array of right neighbors of nodes in the path */
-	struct buffer_head *R[MAX_HEIGHT];
-
-	/* array of fathers of the left neighbors */
-	struct buffer_head *FL[MAX_HEIGHT];
-
-	/* array of fathers of the right neighbors */
-	struct buffer_head *FR[MAX_HEIGHT];
-	/* array of common parents of center node and its left neighbor */
-	struct buffer_head *CFL[MAX_HEIGHT];
-
-	/* array of common parents of center node and its right neighbor */
-	struct buffer_head *CFR[MAX_HEIGHT];
-
-	/*
-	 * array of empty buffers. Number of buffers in array equals
-	 * cur_blknum.
-	 */
-	struct buffer_head *FEB[MAX_FEB_SIZE];
-	struct buffer_head *used[MAX_FEB_SIZE];
-	struct buffer_head *thrown[MAX_FEB_SIZE];
-
-	/*
-	 * array of number of items which must be shifted to the left in
-	 * order to balance the current node; for leaves includes item that
-	 * will be partially shifted; for internal nodes, it is the number
-	 * of child pointers rather than items. It includes the new item
-	 * being created. The code sometimes subtracts one to get the
-	 * number of wholly shifted items for other purposes.
-	 */
-	int lnum[MAX_HEIGHT];
-
-	/* substitute right for left in comment above */
-	int rnum[MAX_HEIGHT];
-
-	/*
-	 * array indexed by height h mapping the key delimiting L[h] and
-	 * S[h] to its item number within the node CFL[h]
-	 */
-	int lkey[MAX_HEIGHT];
-
-	/* substitute r for l in comment above */
-	int rkey[MAX_HEIGHT];
-
-	/*
-	 * the number of bytes by we are trying to add or remove from
-	 * S[h]. A negative value means removing.
-	 */
-	int insert_size[MAX_HEIGHT];
-
-	/*
-	 * number of nodes that will replace node S[h] after balancing
-	 * on the level h of the tree.  If 0 then S is being deleted,
-	 * if 1 then S is remaining and no new nodes are being created,
-	 * if 2 or 3 then 1 or 2 new nodes is being created
-	 */
-	int blknum[MAX_HEIGHT];
-
-	/* fields that are used only for balancing leaves of the tree */
-
-	/* number of empty blocks having been already allocated */
-	int cur_blknum;
-
-	/* number of items that fall into left most node when S[0] splits */
-	int s0num;
-
-	/*
-	 * number of bytes which can flow to the left neighbor from the left
-	 * most liquid item that cannot be shifted from S[0] entirely
-	 * if -1 then nothing will be partially shifted
-	 */
-	int lbytes;
-
-	/*
-	 * number of bytes which will flow to the right neighbor from the right
-	 * most liquid item that cannot be shifted from S[0] entirely
-	 * if -1 then nothing will be partially shifted
-	 */
-	int rbytes;
-
-
-	/*
-	 * index into the array of item headers in
-	 * S[0] of the affected item
-	 */
-	int item_pos;
-
-	/* new nodes allocated to hold what could not fit into S */
-	struct buffer_head *S_new[2];
-
-	/*
-	 * number of items that will be placed into nodes in S_new
-	 * when S[0] splits
-	 */
-	int snum[2];
-
-	/*
-	 * number of bytes which flow to nodes in S_new when S[0] splits
-	 * note: if S[0] splits into 3 nodes, then items do not need to be cut
-	 */
-	int sbytes[2];
-
-	int pos_in_item;
-	int zeroes_num;
-
-	/*
-	 * buffers which are to be freed after do_balance finishes
-	 * by unfix_nodes
-	 */
-	struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
-
-	/*
-	 * kmalloced memory. Used to create virtual node and keep
-	 * map of dirtied bitmap blocks
-	 */
-	char *vn_buf;
-
-	int vn_buf_size;	/* size of the vn_buf */
-
-	/* VN starts after bitmap of bitmap blocks */
-	struct virtual_node *tb_vn;
-
-	/*
-	 * saved value of `reiserfs_generation' counter see
-	 * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
-	 */
-	int fs_gen;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * key pointer, to pass to block allocator or
-	 * another low-level subsystem
-	 */
-	struct in_core_key key;
-#endif
-};
-
-/* These are modes of balancing */
-
-/* When inserting an item. */
-#define M_INSERT	'i'
-/*
- * When inserting into (directories only) or appending onto an already
- * existent item.
- */
-#define M_PASTE		'p'
-/* When deleting an item. */
-#define M_DELETE	'd'
-/* When truncating an item or removing an entry from a (directory) item. */
-#define M_CUT		'c'
-
-/* used when balancing on leaf level skipped (in reiserfsck) */
-#define M_INTERNAL	'n'
-
-/*
- * When further balancing is not needed, then do_balance does not need
- * to be called.
- */
-#define M_SKIP_BALANCING		's'
-#define M_CONVERT	'v'
-
-/* modes of leaf_move_items */
-#define LEAF_FROM_S_TO_L 0
-#define LEAF_FROM_S_TO_R 1
-#define LEAF_FROM_R_TO_L 2
-#define LEAF_FROM_L_TO_R 3
-#define LEAF_FROM_S_TO_SNEW 4
-
-#define FIRST_TO_LAST 0
-#define LAST_TO_FIRST 1
-
-/*
- * used in do_balance for passing parent of node information that has
- * been gotten from tb struct
- */
-struct buffer_info {
-	struct tree_balance *tb;
-	struct buffer_head *bi_bh;
-	struct buffer_head *bi_parent;
-	int bi_position;
-};
-
-static inline struct super_block *sb_from_tb(struct tree_balance *tb)
-{
-	return tb ? tb->tb_sb : NULL;
-}
-
-static inline struct super_block *sb_from_bi(struct buffer_info *bi)
-{
-	return bi ? sb_from_tb(bi->tb) : NULL;
-}
-
-/*
- * there are 4 types of items: stat data, directory item, indirect, direct.
- * +-------------------+------------+--------------+------------+
- * |                   |  k_offset  | k_uniqueness | mergeable? |
- * +-------------------+------------+--------------+------------+
- * |     stat data     |     0      |      0       |   no       |
- * +-------------------+------------+--------------+------------+
- * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. |   no       |
- * | non 1st directory | hash value | UNIQUENESS   |   yes      |
- * |     item          |            |              |            |
- * +-------------------+------------+--------------+------------+
- * | indirect item     | offset + 1 |TYPE_INDIRECT |    [1]	|
- * +-------------------+------------+--------------+------------+
- * | direct item       | offset + 1 |TYPE_DIRECT   |    [2]     |
- * +-------------------+------------+--------------+------------+
- *
- * [1] if this is not the first indirect item of the object
- * [2] if this is not the first direct item of the object
-*/
-
-struct item_operations {
-	int (*bytes_number) (struct item_head * ih, int block_size);
-	void (*decrement_key) (struct cpu_key *);
-	int (*is_left_mergeable) (struct reiserfs_key * ih,
-				  unsigned long bsize);
-	void (*print_item) (struct item_head *, char *item);
-	void (*check_item) (struct item_head *, char *item);
-
-	int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
-			  int is_affected, int insert_size);
-	int (*check_left) (struct virtual_item * vi, int free,
-			   int start_skip, int end_skip);
-	int (*check_right) (struct virtual_item * vi, int free);
-	int (*part_size) (struct virtual_item * vi, int from, int to);
-	int (*unit_num) (struct virtual_item * vi);
-	void (*print_vi) (struct virtual_item * vi);
-};
-
-extern struct item_operations *item_ops[TYPE_ANY + 1];
-
-#define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
-#define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
-#define op_print_item(ih,item)                       item_ops[le_ih_k_type (ih)]->print_item (ih, item)
-#define op_check_item(ih,item)                       item_ops[le_ih_k_type (ih)]->check_item (ih, item)
-#define op_create_vi(vn,vi,is_affected,insert_size)  item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
-#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
-#define op_check_right(vi,free)                      item_ops[(vi)->vi_index]->check_right (vi, free)
-#define op_part_size(vi,from,to)                     item_ops[(vi)->vi_index]->part_size (vi, from, to)
-#define op_unit_num(vi)				     item_ops[(vi)->vi_index]->unit_num (vi)
-#define op_print_vi(vi)                              item_ops[(vi)->vi_index]->print_vi (vi)
-
-#define COMP_SHORT_KEYS comp_short_keys
-
-/* number of blocks pointed to by the indirect item */
-#define I_UNFM_NUM(ih)	(ih_item_len(ih) / UNFM_P_SIZE)
-
-/*
- * the used space within the unformatted node corresponding
- * to pos within the item pointed to by ih
- */
-#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
-
-/*
- * number of bytes contained by the direct item or the
- * unformatted nodes the indirect item points to
- */
-
-/* following defines use reiserfs buffer header and item header */
-
-/* get stat-data */
-#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
-
-/* this is 3976 for size==4096 */
-#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
-
-/*
- * indirect items consist of entries which contain blocknrs, pos
- * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
- * blocknr contained by the entry pos points to
- */
-#define B_I_POS_UNFM_POINTER(bh, ih, pos)				\
-	le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
-#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val)			\
-	(*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
-
-struct reiserfs_iget_args {
-	__u32 objectid;
-	__u32 dirid;
-};
-
-/***************************************************************************
- *                    FUNCTION DECLARATIONS                                *
- ***************************************************************************/
-
-#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
-
-#define journal_trans_half(blocksize) \
-	((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
-
-/* journal.c see journal.c for all the comments here */
-
-/* first block written in a commit.  */
-struct reiserfs_journal_desc {
-	__le32 j_trans_id;	/* id of commit */
-
-	/* length of commit. len +1 is the commit block */
-	__le32 j_len;
-
-	__le32 j_mount_id;	/* mount id of this trans */
-	__le32 j_realblock[];	/* real locations for each block */
-};
-
-#define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id)
-#define get_desc_trans_len(d)  le32_to_cpu((d)->j_len)
-#define get_desc_mount_id(d)   le32_to_cpu((d)->j_mount_id)
-
-#define set_desc_trans_id(d,val)       do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_desc_trans_len(d,val)      do { (d)->j_len = cpu_to_le32 (val); } while (0)
-#define set_desc_mount_id(d,val)       do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
-
-/* last block written in a commit */
-struct reiserfs_journal_commit {
-	__le32 j_trans_id;	/* must match j_trans_id from the desc block */
-	__le32 j_len;		/* ditto */
-	__le32 j_realblock[];	/* real locations for each block */
-};
-
-#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
-#define get_commit_trans_len(c)        le32_to_cpu((c)->j_len)
-#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
-
-#define set_commit_trans_id(c,val)     do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_commit_trans_len(c,val)    do { (c)->j_len = cpu_to_le32 (val); } while (0)
-
-/*
- * this header block gets written whenever a transaction is considered
- * fully flushed, and is more recent than the last fully flushed transaction.
- * fully flushed means all the log blocks and all the real blocks are on
- * disk, and this transaction does not need to be replayed.
- */
-struct reiserfs_journal_header {
-	/* id of last fully flushed transaction */
-	__le32 j_last_flush_trans_id;
-
-	/* offset in the log of where to start replay after a crash */
-	__le32 j_first_unflushed_offset;
-
-	__le32 j_mount_id;
-	/* 12 */ struct journal_params jh_journal;
-};
-
-/* biggest tunable defines are right here */
-#define JOURNAL_BLOCK_COUNT 8192	/* number of blocks in the journal */
-
-/* biggest possible single transaction, don't change for now (8/3/99) */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024
-#define JOURNAL_TRANS_MIN_DEFAULT 256
-
-/*
- * max blocks to batch into one transaction,
- * don't make this any bigger than 900
- */
-#define JOURNAL_MAX_BATCH_DEFAULT   900
-#define JOURNAL_MIN_RATIO 2
-#define JOURNAL_MAX_COMMIT_AGE 30
-#define JOURNAL_MAX_TRANS_AGE 30
-#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
-#define JOURNAL_BLOCKS_PER_OBJECT(sb)  (JOURNAL_PER_BALANCE_CNT * 3 + \
-					 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
-					      REISERFS_QUOTA_TRANS_BLOCKS(sb)))
-
-#ifdef CONFIG_QUOTA
-#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
-/* We need to update data and inode (atime) */
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
-/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
-#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
-/* same as with INIT */
-#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
-#else
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
-#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
-#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
-#endif
-
-/*
- * both of these can be as low as 1, or as high as you want.  The min is the
- * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
- * as needed, and released when transactions are committed.  On release, if
- * the current number of nodes is > max, the node is freed, otherwise,
- * it is put on a free list for faster use later.
-*/
-#define REISERFS_MIN_BITMAP_NODES 10
-#define REISERFS_MAX_BITMAP_NODES 100
-
-/* these are based on journal hash size of 8192 */
-#define JBH_HASH_SHIFT 13
-#define JBH_HASH_MASK 8191
-
-#define _jhashfn(sb,block)	\
-	(((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
-	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
-#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-
-/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(\
-		file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
-		block, s->s_blocksize)
-#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
-		block, s->s_blocksize)
-
-enum reiserfs_bh_state_bits {
-	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */
-	BH_JDirty_wait,
-	/*
-	 * disk block was taken off free list before being in a
-	 * finished transaction, or written to disk. Can be reused immed.
-	 */
-	BH_JNew,
-	BH_JPrepared,
-	BH_JRestore_dirty,
-	BH_JTest,		/* debugging only will go away */
-};
-
-BUFFER_FNS(JDirty, journaled);
-TAS_BUFFER_FNS(JDirty, journaled);
-BUFFER_FNS(JDirty_wait, journal_dirty);
-TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
-BUFFER_FNS(JNew, journal_new);
-TAS_BUFFER_FNS(JNew, journal_new);
-BUFFER_FNS(JPrepared, journal_prepared);
-TAS_BUFFER_FNS(JPrepared, journal_prepared);
-BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-BUFFER_FNS(JTest, journal_test);
-TAS_BUFFER_FNS(JTest, journal_test);
-
-/* transaction handle which is passed around for all journal calls */
-struct reiserfs_transaction_handle {
-	/*
-	 * super for this FS when journal_begin was called. saves calls to
-	 * reiserfs_get_super also used by nested transactions to make
-	 * sure they are nesting on the right FS _must_ be first
-	 * in the handle
-	 */
-	struct super_block *t_super;
-
-	int t_refcount;
-	int t_blocks_logged;	/* number of blocks this writer has logged */
-	int t_blocks_allocated;	/* number of blocks this writer allocated */
-
-	/* sanity check, equals the current trans id */
-	unsigned int t_trans_id;
-
-	void *t_handle_save;	/* save existing current->journal_info */
-
-	/*
-	 * if new block allocation occurres, that block
-	 * should be displaced from others
-	 */
-	unsigned displace_new_blocks:1;
-
-	struct list_head t_list;
-};
-
-/*
- * used to keep track of ordered and tail writes, attached to the buffer
- * head through b_journal_head.
- */
-struct reiserfs_jh {
-	struct reiserfs_journal_list *jl;
-	struct buffer_head *bh;
-	struct list_head list;
-};
-
-void reiserfs_free_jh(struct buffer_head *bh);
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
-int journal_mark_dirty(struct reiserfs_transaction_handle *,
-		       struct buffer_head *bh);
-
-static inline int reiserfs_file_data_log(struct inode *inode)
-{
-	if (reiserfs_data_log(inode->i_sb) ||
-	    (REISERFS_I(inode)->i_flags & i_data_log))
-		return 1;
-	return 0;
-}
-
-static inline int reiserfs_transaction_running(struct super_block *s)
-{
-	struct reiserfs_transaction_handle *th = current->journal_info;
-	if (th && th->t_super == s)
-		return 1;
-	if (th && th->t_super == NULL)
-		BUG();
-	return 0;
-}
-
-static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
-{
-	return th->t_blocks_allocated - th->t_blocks_logged;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
-								    super_block
-								    *,
-								    int count);
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
-void reiserfs_vfs_truncate_file(struct inode *inode);
-int reiserfs_commit_page(struct inode *inode, struct page *page,
-			 unsigned from, unsigned to);
-void reiserfs_flush_old_commits(struct super_block *);
-int reiserfs_commit_for_inode(struct inode *);
-int reiserfs_inode_needs_commit(struct inode *);
-void reiserfs_update_inode_transaction(struct inode *);
-void reiserfs_wait_on_write_block(struct super_block *s);
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
-void reiserfs_allow_writes(struct super_block *s);
-void reiserfs_check_lock_depth(struct super_block *s, char *caller);
-int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
-				 int wait);
-void reiserfs_restore_prepared_buffer(struct super_block *,
-				      struct buffer_head *bh);
-int journal_init(struct super_block *, const char *j_dev_name, int old_format,
-		 unsigned int);
-int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
-int journal_release_error(struct reiserfs_transaction_handle *,
-			  struct super_block *);
-int journal_end(struct reiserfs_transaction_handle *);
-int journal_end_sync(struct reiserfs_transaction_handle *);
-int journal_mark_freed(struct reiserfs_transaction_handle *,
-		       struct super_block *, b_blocknr_t blocknr);
-int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
-int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
-			 int bit_nr, int searchall, b_blocknr_t *next);
-int journal_begin(struct reiserfs_transaction_handle *,
-		  struct super_block *sb, unsigned long);
-int journal_join_abort(struct reiserfs_transaction_handle *,
-		       struct super_block *sb);
-void reiserfs_abort_journal(struct super_block *sb, int errno);
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
-int reiserfs_allocate_list_bitmaps(struct super_block *s,
-				   struct reiserfs_list_bitmap *, unsigned int);
-
-void reiserfs_schedule_old_flush(struct super_block *s);
-void reiserfs_cancel_old_flush(struct super_block *s);
-void add_save_link(struct reiserfs_transaction_handle *th,
-		   struct inode *inode, int truncate);
-int remove_save_link(struct inode *inode, int truncate);
-
-/* objectid.c */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
-			       __u32 objectid_to_release);
-int reiserfs_convert_objectid_map_v1(struct super_block *);
-
-/* stree.c */
-int B_IS_IN_TREE(const struct buffer_head *);
-extern void copy_item_head(struct item_head *to,
-			   const struct item_head *from);
-
-/* first key is in cpu form, second - le */
-extern int comp_short_keys(const struct reiserfs_key *le_key,
-			   const struct cpu_key *cpu_key);
-extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
-
-/* both are in le form */
-extern int comp_le_keys(const struct reiserfs_key *,
-			const struct reiserfs_key *);
-extern int comp_short_le_keys(const struct reiserfs_key *,
-			      const struct reiserfs_key *);
-
-/* * get key version from on disk key - kludge */
-static inline int le_key_version(const struct reiserfs_key *key)
-{
-	int type;
-
-	type = offset_v2_k_type(&(key->u.k_offset_v2));
-	if (type != TYPE_DIRECT && type != TYPE_INDIRECT
-	    && type != TYPE_DIRENTRY)
-		return KEY_FORMAT_3_5;
-
-	return KEY_FORMAT_3_6;
-
-}
-
-static inline void copy_key(struct reiserfs_key *to,
-			    const struct reiserfs_key *from)
-{
-	memcpy(to, from, KEY_SIZE);
-}
-
-int comp_items(const struct item_head *stored_ih, const struct treepath *path);
-const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
-				    const struct super_block *sb);
-int search_by_key(struct super_block *, const struct cpu_key *,
-		  struct treepath *, int);
-#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
-int search_for_position_by_key(struct super_block *sb,
-			       const struct cpu_key *cpu_key,
-			       struct treepath *search_path);
-extern void decrement_bcount(struct buffer_head *bh);
-void decrement_counters_in_path(struct treepath *search_path);
-void pathrelse(struct treepath *search_path);
-int reiserfs_check_path(struct treepath *p);
-void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
-
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path,
-			 const struct cpu_key *key,
-			 struct item_head *ih,
-			 struct inode *inode, const char *body);
-
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
-			     struct treepath *path,
-			     const struct cpu_key *key,
-			     struct inode *inode,
-			     const char *body, int paste_size);
-
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
-			   struct treepath *path,
-			   struct cpu_key *key,
-			   struct inode *inode,
-			   struct page *page, loff_t new_file_size);
-
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path,
-			 const struct cpu_key *key,
-			 struct inode *inode, struct buffer_head *un_bh);
-
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct reiserfs_key *key);
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
-			   struct inode *inode);
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
-			 struct inode *inode, struct page *,
-			 int update_timestamps);
-
-#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
-#define file_size(inode) ((inode)->i_size)
-#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
-
-#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
-!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
-
-void padd_item(char *item, int total_length, int length);
-
-/* inode.c */
-/* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0	 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1	 /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2	 /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4	 /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_IMUX     8	 /* i_mutex is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
-
-void reiserfs_read_locked_inode(struct inode *inode,
-				struct reiserfs_iget_args *args);
-int reiserfs_find_actor(struct inode *inode, void *p);
-int reiserfs_init_locked_inode(struct inode *inode, void *p);
-void reiserfs_evict_inode(struct inode *inode);
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int reiserfs_get_block(struct inode *inode, sector_t block,
-		       struct buffer_head *bh_result, int create);
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-				     int fh_len, int fh_type);
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-				     int fh_len, int fh_type);
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
-		       struct inode *parent);
-
-int reiserfs_truncate_file(struct inode *, int update_timestamps);
-void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
-		  int type, int key_length);
-void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
-		       int version,
-		       loff_t offset, int type, int length, int entry_count);
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
-
-struct reiserfs_security_handle;
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
-		       struct inode *dir, umode_t mode,
-		       const char *symname, loff_t i_size,
-		       struct dentry *dentry, struct inode *inode,
-		       struct reiserfs_security_handle *security);
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
-			     struct inode *inode, loff_t size);
-
-static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
-				      struct inode *inode)
-{
-	reiserfs_update_sd_size(th, inode, inode->i_size);
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct iattr *attr);
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
-
-/* namei.c */
-void reiserfs_init_priv_inode(struct inode *inode);
-void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
-			struct treepath *path, struct reiserfs_dir_entry *de);
-struct dentry *reiserfs_get_parent(struct dentry *);
-
-#ifdef CONFIG_REISERFS_PROC_INFO
-int reiserfs_proc_info_init(struct super_block *sb);
-int reiserfs_proc_info_done(struct super_block *sb);
-int reiserfs_proc_info_global_init(void);
-int reiserfs_proc_info_global_done(void);
-
-#define PROC_EXP( e )   e
-
-#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
-#define PROC_INFO_MAX( sb, field, value )								\
-    __PINFO( sb ).field =												\
-        max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
-#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
-#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
-#define PROC_INFO_BH_STAT( sb, bh, level )							\
-    PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] );						\
-    PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) );	\
-    PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
-#else
-static inline int reiserfs_proc_info_init(struct super_block *sb)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_done(struct super_block *sb)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_global_init(void)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_global_done(void)
-{
-	return 0;
-}
-
-#define PROC_EXP( e )
-#define VOID_V ( ( void ) 0 )
-#define PROC_INFO_MAX( sb, field, value ) VOID_V
-#define PROC_INFO_INC( sb, field ) VOID_V
-#define PROC_INFO_ADD( sb, field, val ) VOID_V
-#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
-#endif
-
-/* dir.c */
-extern const struct inode_operations reiserfs_dir_inode_operations;
-extern const struct inode_operations reiserfs_symlink_inode_operations;
-extern const struct inode_operations reiserfs_special_inode_operations;
-extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_inode(struct inode *, struct dir_context *);
-
-/* tail_conversion.c */
-int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
-		    struct treepath *, struct buffer_head *, loff_t);
-int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
-		    struct page *, struct treepath *, const struct cpu_key *,
-		    loff_t, char *);
-void reiserfs_unmap_buffer(struct buffer_head *);
-
-/* file.c */
-extern const struct inode_operations reiserfs_file_inode_operations;
-extern const struct inode_operations reiserfs_priv_file_inode_operations;
-extern const struct file_operations reiserfs_file_operations;
-extern const struct address_space_operations reiserfs_address_space_operations;
-
-/* fix_nodes.c */
-
-int fix_nodes(int n_op_mode, struct tree_balance *tb,
-	      struct item_head *ins_ih, const void *);
-void unfix_nodes(struct tree_balance *);
-
-/* prints.c */
-void __reiserfs_panic(struct super_block *s, const char *id,
-		      const char *function, const char *fmt, ...)
-    __attribute__ ((noreturn));
-#define reiserfs_panic(s, id, fmt, args...) \
-	__reiserfs_panic(s, id, __func__, fmt, ##args)
-void __reiserfs_error(struct super_block *s, const char *id,
-		      const char *function, const char *fmt, ...);
-#define reiserfs_error(s, id, fmt, args...) \
-	 __reiserfs_error(s, id, __func__, fmt, ##args)
-void reiserfs_info(struct super_block *s, const char *fmt, ...);
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
-void print_indirect_item(struct buffer_head *bh, int item_num);
-void store_print_tb(struct tree_balance *tb);
-void print_cur_tb(char *mes);
-void print_de(struct reiserfs_dir_entry *de);
-void print_bi(struct buffer_info *bi, char *mes);
-#define PRINT_LEAF_ITEMS 1	/* print all items */
-#define PRINT_DIRECTORY_ITEMS 2	/* print directory items */
-#define PRINT_DIRECT_ITEMS 4	/* print contents of direct items */
-void print_block(struct buffer_head *bh, ...);
-void print_bmap(struct super_block *s, int silent);
-void print_bmap_block(int i, char *data, int size, int silent);
-/*void print_super_block (struct super_block * s, char * mes);*/
-void print_objectid_map(struct super_block *s);
-void print_block_head(struct buffer_head *bh, char *mes);
-void check_leaf(struct buffer_head *bh);
-void check_internal(struct buffer_head *bh);
-void print_statistics(struct super_block *s);
-char *reiserfs_hashname(int code);
-
-/* lbalance.c */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
-		    int mov_bytes, struct buffer_head *Snew);
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
-		       int del_num, int del_bytes);
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head * const inserted_item_ih,
-			  const char * const inserted_item_body,
-			  int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
-			  int pos_in_item, int paste_size,
-			  const char * const body, int zeros_number);
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
-			  int pos_in_item, int cut_size);
-void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
-			int new_entry_count, struct reiserfs_de_head *new_dehs,
-			const char *records, int paste_size);
-/* ibalance.c */
-int balance_internal(struct tree_balance *, int, int, struct item_head *,
-		     struct buffer_head **);
-
-/* do_balance.c */
-void do_balance_mark_leaf_dirty(struct tree_balance *tb,
-				struct buffer_head *bh, int flag);
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-void do_balance(struct tree_balance *tb, struct item_head *ih,
-		const char *body, int flag);
-void reiserfs_invalidate_buffer(struct tree_balance *tb,
-				struct buffer_head *bh);
-
-int get_left_neighbor_position(struct tree_balance *tb, int h);
-int get_right_neighbor_position(struct tree_balance *tb, int h);
-void replace_key(struct tree_balance *tb, struct buffer_head *, int,
-		 struct buffer_head *, int);
-void make_empty_node(struct buffer_info *);
-struct buffer_head *get_FEB(struct tree_balance *);
-
-/* bitmap.c */
-
-/*
- * structure contains hints for block allocator, and it is a container for
- * arguments, such as node, search path, transaction_handle, etc.
- */
-struct __reiserfs_blocknr_hint {
-	/* inode passed to allocator, if we allocate unf. nodes */
-	struct inode *inode;
-
-	sector_t block;		/* file offset, in blocks */
-	struct in_core_key key;
-
-	/*
-	 * search path, used by allocator to deternine search_start by
-	 * various ways
-	 */
-	struct treepath *path;
-
-	/*
-	 * transaction handle is needed to log super blocks
-	 * and bitmap blocks changes
-	 */
-	struct reiserfs_transaction_handle *th;
-
-	b_blocknr_t beg, end;
-
-	/*
-	 * a field used to transfer search start value (block number)
-	 * between different block allocator procedures
-	 * (determine_search_start() and others)
-	 */
-	b_blocknr_t search_start;
-
-	/*
-	 * is set in determine_prealloc_size() function,
-	 * used by underlayed function that do actual allocation
-	 */
-	int prealloc_size;
-
-	/*
-	 * the allocator uses different polices for getting disk
-	 * space for formatted/unformatted blocks with/without preallocation
-	 */
-	unsigned formatted_node:1;
-	unsigned preallocate:1;
-};
-
-typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
-
-int reiserfs_parse_alloc_options(struct super_block *, char *);
-void reiserfs_init_alloc_options(struct super_block *s);
-
-/*
- * given a directory, this will tell you what packing locality
- * to use for a new object underneat it.  The locality is returned
- * in disk byte order (le).
- */
-__le32 reiserfs_choose_packing(struct inode *dir);
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s);
-int reiserfs_init_bitmap_cache(struct super_block *sb);
-void reiserfs_free_bitmap_cache(struct super_block *sb);
-void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
-			 b_blocknr_t, int for_unformatted);
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
-			       int);
-static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
-					     b_blocknr_t * new_blocknrs,
-					     int amount_needed)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = tb->transaction_handle,
-		.path = tb->tb_path,
-		.inode = NULL,
-		.key = tb->key,
-		.block = 0,
-		.formatted_node = 1
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
-					  0);
-}
-
-static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
-					    *th, struct inode *inode,
-					    b_blocknr_t * new_blocknrs,
-					    struct treepath *path,
-					    sector_t block)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = th,
-		.path = path,
-		.inode = inode,
-		.block = block,
-		.formatted_node = 0,
-		.preallocate = 0
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-#ifdef REISERFS_PREALLOCATE
-static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
-					     *th, struct inode *inode,
-					     b_blocknr_t * new_blocknrs,
-					     struct treepath *path,
-					     sector_t block)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = th,
-		.path = path,
-		.inode = inode,
-		.block = block,
-		.formatted_node = 0,
-		.preallocate = 1
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct inode *inode);
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
-#endif
-
-/* hashes.c */
-__u32 keyed_hash(const signed char *msg, int len);
-__u32 yura_hash(const signed char *msg, int len);
-__u32 r5_hash(const signed char *msg, int len);
-
-#define reiserfs_set_le_bit		__set_bit_le
-#define reiserfs_test_and_set_le_bit	__test_and_set_bit_le
-#define reiserfs_clear_le_bit		__clear_bit_le
-#define reiserfs_test_and_clear_le_bit	__test_and_clear_bit_le
-#define reiserfs_test_le_bit		test_bit_le
-#define reiserfs_find_next_zero_le_bit	find_next_zero_bit_le
-
-/*
- * sometimes reiserfs_truncate may require to allocate few new blocks
- * to perform indirect2direct conversion. People probably used to
- * think, that truncate should work without problems on a filesystem
- * without free disk space. They may complain that they can not
- * truncate due to lack of free disk space. This spare space allows us
- * to not worry about it. 500 is probably too much, but it should be
- * absolutely safe
- */
-#define SPARE_SPACE 500
-
-/* prototypes from ioctl.c */
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
-			  struct dentry *dentry, struct fileattr *fa);
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-long reiserfs_compat_ioctl(struct file *filp,
-		   unsigned int cmd, unsigned long arg);
-int reiserfs_unpack(struct inode *inode);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
deleted file mode 100644
index 7b498a0d060b..000000000000
--- a/fs/reiserfs/resize.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Written by Alexander Zarochentcev.
- *
- * The kernel part of the (on-line) reiserfs resizer.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
-{
-	int err = 0;
-	struct reiserfs_super_block *sb;
-	struct reiserfs_bitmap_info *bitmap;
-	struct reiserfs_bitmap_info *info;
-	struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
-	struct buffer_head *bh;
-	struct reiserfs_transaction_handle th;
-	unsigned int bmap_nr_new, bmap_nr;
-	unsigned int block_r_new, block_r;
-
-	struct reiserfs_list_bitmap *jb;
-	struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
-
-	unsigned long int block_count, free_blocks;
-	int i;
-	int copy_size;
-	int depth;
-
-	sb = SB_DISK_SUPER_BLOCK(s);
-
-	if (SB_BLOCK_COUNT(s) >= block_count_new) {
-		printk("can\'t shrink filesystem on-line\n");
-		return -EINVAL;
-	}
-
-	/* check the device size */
-	depth = reiserfs_write_unlock_nested(s);
-	bh = sb_bread(s, block_count_new - 1);
-	reiserfs_write_lock_nested(s, depth);
-	if (!bh) {
-		printk("reiserfs_resize: can\'t read last block\n");
-		return -EINVAL;
-	}
-	bforget(bh);
-
-	/*
-	 * old disk layout detection; those partitions can be mounted, but
-	 * cannot be resized
-	 */
-	if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
-	    != REISERFS_DISK_OFFSET_IN_BYTES) {
-		printk
-		    ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
-		return -ENOTSUPP;
-	}
-
-	/* count used bits in last bitmap block */
-	block_r = SB_BLOCK_COUNT(s) -
-			(reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
-
-	/* count bitmap blocks in new fs */
-	bmap_nr_new = block_count_new / (s->s_blocksize * 8);
-	block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
-	if (block_r_new)
-		bmap_nr_new++;
-	else
-		block_r_new = s->s_blocksize * 8;
-
-	/* save old values */
-	block_count = SB_BLOCK_COUNT(s);
-	bmap_nr = reiserfs_bmap_count(s);
-
-	/* resizing of reiserfs bitmaps (journal and real), if needed */
-	if (bmap_nr_new > bmap_nr) {
-		/* reallocate journal bitmaps */
-		if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
-			printk
-			    ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
-			return -ENOMEM;
-		}
-		/*
-		 * the new journal bitmaps are zero filled, now we copy i
-		 * the bitmap node pointers from the old journal bitmap
-		 * structs, and then transfer the new data structures
-		 * into the journal struct.
-		 *
-		 * using the copy_size var below allows this code to work for
-		 * both shrinking and expanding the FS.
-		 */
-		copy_size = min(bmap_nr_new, bmap_nr);
-		copy_size =
-		    copy_size * sizeof(struct reiserfs_list_bitmap_node *);
-		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-			struct reiserfs_bitmap_node **node_tmp;
-			jb = SB_JOURNAL(s)->j_list_bitmap + i;
-			memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
-
-			/*
-			 * just in case vfree schedules on us, copy the new
-			 * pointer into the journal struct before freeing the
-			 * old one
-			 */
-			node_tmp = jb->bitmaps;
-			jb->bitmaps = jbitmap[i].bitmaps;
-			vfree(node_tmp);
-		}
-
-		/*
-		 * allocate additional bitmap blocks, reallocate
-		 * array of bitmap block pointers
-		 */
-		bitmap =
-		    vzalloc(array_size(bmap_nr_new,
-				       sizeof(struct reiserfs_bitmap_info)));
-		if (!bitmap) {
-			/*
-			 * Journal bitmaps are still supersized, but the
-			 * memory isn't leaked, so I guess it's ok
-			 */
-			printk("reiserfs_resize: unable to allocate memory.\n");
-			return -ENOMEM;
-		}
-		for (i = 0; i < bmap_nr; i++)
-			bitmap[i] = old_bitmap[i];
-
-		/*
-		 * This doesn't go through the journal, but it doesn't have to.
-		 * The changes are still atomic: We're synced up when the
-		 * journal transaction begins, and the new bitmaps don't
-		 * matter if the transaction fails.
-		 */
-		for (i = bmap_nr; i < bmap_nr_new; i++) {
-			int depth;
-			/*
-			 * don't use read_bitmap_block since it will cache
-			 * the uninitialized bitmap
-			 */
-			depth = reiserfs_write_unlock_nested(s);
-			bh = sb_bread(s, i * s->s_blocksize * 8);
-			reiserfs_write_lock_nested(s, depth);
-			if (!bh) {
-				vfree(bitmap);
-				return -EIO;
-			}
-			memset(bh->b_data, 0, sb_blocksize(sb));
-			reiserfs_set_le_bit(0, bh->b_data);
-			reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
-
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			depth = reiserfs_write_unlock_nested(s);
-			sync_dirty_buffer(bh);
-			reiserfs_write_lock_nested(s, depth);
-			/* update bitmap_info stuff */
-			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
-			brelse(bh);
-		}
-		/* free old bitmap blocks array */
-		SB_AP_BITMAP(s) = bitmap;
-		vfree(old_bitmap);
-	}
-
-	/*
-	 * begin transaction, if there was an error, it's fine. Yes, we have
-	 * incorrect bitmaps now, but none of it is ever going to touch the
-	 * disk anyway.
-	 */
-	err = journal_begin(&th, s, 10);
-	if (err)
-		return err;
-
-	/* Extend old last bitmap block - new blocks have been made available */
-	info = SB_AP_BITMAP(s) + bmap_nr - 1;
-	bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
-	if (!bh) {
-		int jerr = journal_end(&th);
-		if (jerr)
-			return jerr;
-		return -EIO;
-	}
-
-	reiserfs_prepare_for_journal(s, bh, 1);
-	for (i = block_r; i < s->s_blocksize * 8; i++)
-		reiserfs_clear_le_bit(i, bh->b_data);
-	info->free_count += s->s_blocksize * 8 - block_r;
-
-	journal_mark_dirty(&th, bh);
-	brelse(bh);
-
-	/* Correct new last bitmap block - It may not be full */
-	info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
-	bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
-	if (!bh) {
-		int jerr = journal_end(&th);
-		if (jerr)
-			return jerr;
-		return -EIO;
-	}
-
-	reiserfs_prepare_for_journal(s, bh, 1);
-	for (i = block_r_new; i < s->s_blocksize * 8; i++)
-		reiserfs_set_le_bit(i, bh->b_data);
-	journal_mark_dirty(&th, bh);
-	brelse(bh);
-
-	info->free_count -= s->s_blocksize * 8 - block_r_new;
-	/* update super */
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	free_blocks = SB_FREE_BLOCKS(s);
-	PUT_SB_FREE_BLOCKS(s,
-			   free_blocks + (block_count_new - block_count -
-					  (bmap_nr_new - bmap_nr)));
-	PUT_SB_BLOCK_COUNT(s, block_count_new);
-	PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
-
-	journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-
-	SB_JOURNAL(s)->j_must_wait = 1;
-	return journal_end(&th);
-}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
deleted file mode 100644
index 5faf702f8d15..000000000000
--- a/fs/reiserfs/stree.c
+++ /dev/null
@@ -1,2280 +0,0 @@
-/*
- *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
- *  Programm System Institute
- *  Pereslavl-Zalessky Russia
- */
-
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/pagemap.h>
-#include <linux/bio.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/* Does the buffer contain a disk block which is in the tree. */
-inline int B_IS_IN_TREE(const struct buffer_head *bh)
-{
-
-	RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
-	       "PAP-1010: block (%b) has too big level (%z)", bh, bh);
-
-	return (B_LEVEL(bh) != FREE_LEVEL);
-}
-
-/* to get item head in le form */
-inline void copy_item_head(struct item_head *to,
-			   const struct item_head *from)
-{
-	memcpy(to, from, IH_SIZE);
-}
-
-/*
- * k1 is pointer to on-disk structure which is stored in little-endian
- * form. k2 is pointer to cpu variable. For key of items of the same
- * object this returns 0.
- * Returns: -1 if key1 < key2
- * 0 if key1 == key2
- * 1 if key1 > key2
- */
-inline int comp_short_keys(const struct reiserfs_key *le_key,
-			   const struct cpu_key *cpu_key)
-{
-	__u32 n;
-	n = le32_to_cpu(le_key->k_dir_id);
-	if (n < cpu_key->on_disk_key.k_dir_id)
-		return -1;
-	if (n > cpu_key->on_disk_key.k_dir_id)
-		return 1;
-	n = le32_to_cpu(le_key->k_objectid);
-	if (n < cpu_key->on_disk_key.k_objectid)
-		return -1;
-	if (n > cpu_key->on_disk_key.k_objectid)
-		return 1;
-	return 0;
-}
-
-/*
- * k1 is pointer to on-disk structure which is stored in little-endian
- * form. k2 is pointer to cpu variable.
- * Compare keys using all 4 key fields.
- * Returns: -1 if key1 < key2 0
- * if key1 = key2 1 if key1 > key2
- */
-static inline int comp_keys(const struct reiserfs_key *le_key,
-			    const struct cpu_key *cpu_key)
-{
-	int retval;
-
-	retval = comp_short_keys(le_key, cpu_key);
-	if (retval)
-		return retval;
-	if (le_key_k_offset(le_key_version(le_key), le_key) <
-	    cpu_key_k_offset(cpu_key))
-		return -1;
-	if (le_key_k_offset(le_key_version(le_key), le_key) >
-	    cpu_key_k_offset(cpu_key))
-		return 1;
-
-	if (cpu_key->key_length == 3)
-		return 0;
-
-	/* this part is needed only when tail conversion is in progress */
-	if (le_key_k_type(le_key_version(le_key), le_key) <
-	    cpu_key_k_type(cpu_key))
-		return -1;
-
-	if (le_key_k_type(le_key_version(le_key), le_key) >
-	    cpu_key_k_type(cpu_key))
-		return 1;
-
-	return 0;
-}
-
-inline int comp_short_le_keys(const struct reiserfs_key *key1,
-			      const struct reiserfs_key *key2)
-{
-	__u32 *k1_u32, *k2_u32;
-	int key_length = REISERFS_SHORT_KEY_LEN;
-
-	k1_u32 = (__u32 *) key1;
-	k2_u32 = (__u32 *) key2;
-	for (; key_length--; ++k1_u32, ++k2_u32) {
-		if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
-			return -1;
-		if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
-			return 1;
-	}
-	return 0;
-}
-
-inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
-{
-	int version;
-	to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
-	to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
-
-	/* find out version of the key */
-	version = le_key_version(from);
-	to->version = version;
-	to->on_disk_key.k_offset = le_key_k_offset(version, from);
-	to->on_disk_key.k_type = le_key_k_type(version, from);
-}
-
-/*
- * this does not say which one is bigger, it only returns 1 if keys
- * are not equal, 0 otherwise
- */
-inline int comp_le_keys(const struct reiserfs_key *k1,
-			const struct reiserfs_key *k2)
-{
-	return memcmp(k1, k2, sizeof(struct reiserfs_key));
-}
-
-/**************************************************************************
- *  Binary search toolkit function                                        *
- *  Search for an item in the array by the item key                       *
- *  Returns:    1 if found,  0 if not found;                              *
- *        *pos = number of the searched element if found, else the        *
- *        number of the first element that is larger than key.            *
- **************************************************************************/
-/*
- * For those not familiar with binary search: lbound is the leftmost item
- * that it could be, rbound the rightmost item that it could be.  We examine
- * the item halfway between lbound and rbound, and that tells us either
- * that we can increase lbound, or decrease rbound, or that we have found it,
- * or if lbound <= rbound that there are no possible items, and we have not
- * found it. With each examination we cut the number of possible items it
- * could be by one more than half rounded down, or we find it.
- */
-static inline int bin_search(const void *key,	/* Key to search for. */
-			     const void *base,	/* First item in the array. */
-			     int num,	/* Number of items in the array. */
-			     /*
-			      * Item size in the array.  searched. Lest the
-			      * reader be confused, note that this is crafted
-			      * as a general function, and when it is applied
-			      * specifically to the array of item headers in a
-			      * node, width is actually the item header size
-			      * not the item size.
-			      */
-			     int width,
-			     int *pos /* Number of the searched for element. */
-    )
-{
-	int rbound, lbound, j;
-
-	for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
-	     lbound <= rbound; j = (rbound + lbound) / 2)
-		switch (comp_keys
-			((struct reiserfs_key *)((char *)base + j * width),
-			 (struct cpu_key *)key)) {
-		case -1:
-			lbound = j + 1;
-			continue;
-		case 1:
-			rbound = j - 1;
-			continue;
-		case 0:
-			*pos = j;
-			return ITEM_FOUND;	/* Key found in the array.  */
-		}
-
-	/*
-	 * bin_search did not find given key, it returns position of key,
-	 * that is minimal and greater than the given one.
-	 */
-	*pos = lbound;
-	return ITEM_NOT_FOUND;
-}
-
-
-/* Minimal possible key. It is never in the tree. */
-const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
-
-/* Maximal possible key. It is never in the tree. */
-static const struct reiserfs_key MAX_KEY = {
-	cpu_to_le32(0xffffffff),
-	cpu_to_le32(0xffffffff),
-	{{cpu_to_le32(0xffffffff),
-	  cpu_to_le32(0xffffffff)},}
-};
-
-/*
- * Get delimiting key of the buffer by looking for it in the buffers in the
- * path, starting from the bottom of the path, and going upwards.  We must
- * check the path's validity at each step.  If the key is not in the path,
- * there is no delimiting key in the tree (buffer is first or last buffer
- * in tree), and in this case we return a special key, either MIN_KEY or
- * MAX_KEY.
- */
-static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
-						  const struct super_block *sb)
-{
-	int position, path_offset = chk_path->path_length;
-	struct buffer_head *parent;
-
-	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-5010: invalid offset in the path");
-
-	/* While not higher in path than first element. */
-	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
-		       "PAP-5020: parent is not uptodate");
-
-		/* Parent at the path is not in the tree now. */
-		if (!B_IS_IN_TREE
-		    (parent =
-		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
-			return &MAX_KEY;
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(chk_path,
-					  path_offset)) >
-		    B_NR_ITEMS(parent))
-			return &MAX_KEY;
-		/* Check whether parent at the path really points to the child. */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(chk_path,
-					path_offset + 1)->b_blocknr)
-			return &MAX_KEY;
-		/*
-		 * Return delimiting key if position in the parent
-		 * is not equal to zero.
-		 */
-		if (position)
-			return internal_key(parent, position - 1);
-	}
-	/* Return MIN_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(sb))
-		return &MIN_KEY;
-	return &MAX_KEY;
-}
-
-/* Get delimiting key of the buffer at the path and its right neighbor. */
-inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
-					   const struct super_block *sb)
-{
-	int position, path_offset = chk_path->path_length;
-	struct buffer_head *parent;
-
-	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-5030: invalid offset in the path");
-
-	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
-		       "PAP-5040: parent is not uptodate");
-
-		/* Parent at the path is not in the tree now. */
-		if (!B_IS_IN_TREE
-		    (parent =
-		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
-			return &MIN_KEY;
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(chk_path,
-					  path_offset)) >
-		    B_NR_ITEMS(parent))
-			return &MIN_KEY;
-		/*
-		 * Check whether parent at the path really points
-		 * to the child.
-		 */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(chk_path,
-					path_offset + 1)->b_blocknr)
-			return &MIN_KEY;
-
-		/*
-		 * Return delimiting key if position in the parent
-		 * is not the last one.
-		 */
-		if (position != B_NR_ITEMS(parent))
-			return internal_key(parent, position);
-	}
-
-	/* Return MAX_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(sb))
-		return &MAX_KEY;
-	return &MIN_KEY;
-}
-
-/*
- * Check whether a key is contained in the tree rooted from a buffer at a path.
- * This works by looking at the left and right delimiting keys for the buffer
- * in the last path_element in the path.  These delimiting keys are stored
- * at least one level above that buffer in the tree. If the buffer is the
- * first or last node in the tree order then one of the delimiting keys may
- * be absent, and in this case get_lkey and get_rkey return a special key
- * which is MIN_KEY or MAX_KEY.
- */
-static inline int key_in_buffer(
-				/* Path which should be checked. */
-				struct treepath *chk_path,
-				/* Key which should be checked. */
-				const struct cpu_key *key,
-				struct super_block *sb
-    )
-{
-
-	RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
-	       || chk_path->path_length > MAX_HEIGHT,
-	       "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
-	       key, chk_path->path_length);
-	RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
-	       "PAP-5060: device must not be NODEV");
-
-	if (comp_keys(get_lkey(chk_path, sb), key) == 1)
-		/* left delimiting key is bigger, that the key we look for */
-		return 0;
-	/*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
-	if (comp_keys(get_rkey(chk_path, sb), key) != 1)
-		/* key must be less than right delimitiing key */
-		return 0;
-	return 1;
-}
-
-int reiserfs_check_path(struct treepath *p)
-{
-	RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "path not properly relsed");
-	return 0;
-}
-
-/*
- * Drop the reference to each buffer in a path and restore
- * dirty bits clean when preparing the buffer for the log.
- * This version should only be called from fix_nodes()
- */
-void pathrelse_and_restore(struct super_block *sb,
-			   struct treepath *search_path)
-{
-	int path_offset = search_path->path_length;
-
-	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "clm-4000: invalid path offset");
-
-	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
-		struct buffer_head *bh;
-		bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
-		reiserfs_restore_prepared_buffer(sb, bh);
-		brelse(bh);
-	}
-	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-/* Drop the reference to each buffer in a path */
-void pathrelse(struct treepath *search_path)
-{
-	int path_offset = search_path->path_length;
-
-	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "PAP-5090: invalid path offset");
-
-	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
-		brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
-
-	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
-{
-	struct reiserfs_de_head *deh;
-	int i;
-
-	deh = B_I_DEH(bh, ih);
-	for (i = 0; i < ih_entry_count(ih); i++) {
-		if (deh_location(&deh[i]) > ih_item_len(ih)) {
-			reiserfs_warning(NULL, "reiserfs-5094",
-					 "directory entry location seems wrong %h",
-					 &deh[i]);
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
-static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	struct item_head *ih;
-	int used_space;
-	int prev_location;
-	int i;
-	int nr;
-
-	blkh = (struct block_head *)buf;
-	if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
-		reiserfs_warning(NULL, "reiserfs-5080",
-				 "this should be caught earlier");
-		return 0;
-	}
-
-	nr = blkh_nr_item(blkh);
-	if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
-		/* item number is too big or too small */
-		reiserfs_warning(NULL, "reiserfs-5081",
-				 "nr_item seems wrong: %z", bh);
-		return 0;
-	}
-	ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
-	used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
-
-	/* free space does not match to calculated amount of use space */
-	if (used_space != blocksize - blkh_free_space(blkh)) {
-		reiserfs_warning(NULL, "reiserfs-5082",
-				 "free space seems wrong: %z", bh);
-		return 0;
-	}
-	/*
-	 * FIXME: it is_leaf will hit performance too much - we may have
-	 * return 1 here
-	 */
-
-	/* check tables of item heads */
-	ih = (struct item_head *)(buf + BLKH_SIZE);
-	prev_location = blocksize;
-	for (i = 0; i < nr; i++, ih++) {
-		if (le_ih_k_type(ih) == TYPE_ANY) {
-			reiserfs_warning(NULL, "reiserfs-5083",
-					 "wrong item type for item %h",
-					 ih);
-			return 0;
-		}
-		if (ih_location(ih) >= blocksize
-		    || ih_location(ih) < IH_SIZE * nr) {
-			reiserfs_warning(NULL, "reiserfs-5084",
-					 "item location seems wrong: %h",
-					 ih);
-			return 0;
-		}
-		if (ih_item_len(ih) < 1
-		    || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
-			reiserfs_warning(NULL, "reiserfs-5085",
-					 "item length seems wrong: %h",
-					 ih);
-			return 0;
-		}
-		if (prev_location - ih_location(ih) != ih_item_len(ih)) {
-			reiserfs_warning(NULL, "reiserfs-5086",
-					 "item location seems wrong "
-					 "(second one): %h", ih);
-			return 0;
-		}
-		if (is_direntry_le_ih(ih)) {
-			if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
-				reiserfs_warning(NULL, "reiserfs-5093",
-						 "item entry count seems wrong %h",
-						 ih);
-				return 0;
-			}
-			return has_valid_deh_location(bh, ih);
-		}
-		prev_location = ih_location(ih);
-	}
-
-	/* one may imagine many more checks */
-	return 1;
-}
-
-/* returns 1 if buf looks like an internal node, 0 otherwise */
-static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	int nr;
-	int used_space;
-
-	blkh = (struct block_head *)buf;
-	nr = blkh_level(blkh);
-	if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
-		/* this level is not possible for internal nodes */
-		reiserfs_warning(NULL, "reiserfs-5087",
-				 "this should be caught earlier");
-		return 0;
-	}
-
-	nr = blkh_nr_item(blkh);
-	/* for internal which is not root we might check min number of keys */
-	if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
-		reiserfs_warning(NULL, "reiserfs-5088",
-				 "number of key seems wrong: %z", bh);
-		return 0;
-	}
-
-	used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
-	if (used_space != blocksize - blkh_free_space(blkh)) {
-		reiserfs_warning(NULL, "reiserfs-5089",
-				 "free space seems wrong: %z", bh);
-		return 0;
-	}
-
-	/* one may imagine many more checks */
-	return 1;
-}
-
-/*
- * make sure that bh contains formatted node of reiserfs tree of
- * 'level'-th level
- */
-static int is_tree_node(struct buffer_head *bh, int level)
-{
-	if (B_LEVEL(bh) != level) {
-		reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
-				 "not match to the expected one %d",
-				 B_LEVEL(bh), level);
-		return 0;
-	}
-	if (level == DISK_LEAF_NODE_LEVEL)
-		return is_leaf(bh->b_data, bh->b_size, bh);
-
-	return is_internal(bh->b_data, bh->b_size, bh);
-}
-
-#define SEARCH_BY_KEY_READA 16
-
-/*
- * The function is NOT SCHEDULE-SAFE!
- * It might unlock the write lock if we needed to wait for a block
- * to be read. Note that in this case it won't recover the lock to avoid
- * high contention resulting from too much lock requests, especially
- * the caller (search_by_key) will perform other schedule-unsafe
- * operations just after calling this function.
- *
- * @return depth of lock to be restored after read completes
- */
-static int search_by_key_reada(struct super_block *s,
-				struct buffer_head **bh,
-				b_blocknr_t *b, int num)
-{
-	int i, j;
-	int depth = -1;
-
-	for (i = 0; i < num; i++) {
-		bh[i] = sb_getblk(s, b[i]);
-	}
-	/*
-	 * We are going to read some blocks on which we
-	 * have a reference. It's safe, though we might be
-	 * reading blocks concurrently changed if we release
-	 * the lock. But it's still fine because we check later
-	 * if the tree changed
-	 */
-	for (j = 0; j < i; j++) {
-		/*
-		 * note, this needs attention if we are getting rid of the BKL
-		 * you have to make sure the prepared bit isn't set on this
-		 * buffer
-		 */
-		if (!buffer_uptodate(bh[j])) {
-			if (depth == -1)
-				depth = reiserfs_write_unlock_nested(s);
-			bh_readahead(bh[j], REQ_RAHEAD);
-		}
-		brelse(bh[j]);
-	}
-	return depth;
-}
-
-/*
- * This function fills up the path from the root to the leaf as it
- * descends the tree looking for the key.  It uses reiserfs_bread to
- * try to find buffers in the cache given their block number.  If it
- * does not find them in the cache it reads them from disk.  For each
- * node search_by_key finds using reiserfs_bread it then uses
- * bin_search to look through that node.  bin_search will find the
- * position of the block_number of the next node if it is looking
- * through an internal node.  If it is looking through a leaf node
- * bin_search will find the position of the item which has key either
- * equal to given key, or which is the maximal key less than the given
- * key.  search_by_key returns a path that must be checked for the
- * correctness of the top of the path but need not be checked for the
- * correctness of the bottom of the path
- */
-/*
- * search_by_key - search for key (and item) in stree
- * @sb: superblock
- * @key: pointer to key to search for
- * @search_path: Allocated and initialized struct treepath; Returned filled
- *		 on success.
- * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
- *		stop at leaf level.
- *
- * The function is NOT SCHEDULE-SAFE!
- */
-int search_by_key(struct super_block *sb, const struct cpu_key *key,
-		  struct treepath *search_path, int stop_level)
-{
-	b_blocknr_t block_number;
-	int expected_level;
-	struct buffer_head *bh;
-	struct path_element *last_element;
-	int node_level, retval;
-	int fs_gen;
-	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
-	b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
-	int reada_count = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
-	int repeat_counter = 0;
-#endif
-
-	PROC_INFO_INC(sb, search_by_key);
-
-	/*
-	 * As we add each node to a path we increase its count.  This means
-	 * that we must be careful to release all nodes in a path before we
-	 * either discard the path struct or re-use the path struct, as we
-	 * do here.
-	 */
-
-	pathrelse(search_path);
-
-	/*
-	 * With each iteration of this loop we search through the items in the
-	 * current node, and calculate the next current node(next path element)
-	 * for the next iteration of this loop..
-	 */
-	block_number = SB_ROOT_BLOCK(sb);
-	expected_level = -1;
-	while (1) {
-
-#ifdef CONFIG_REISERFS_CHECK
-		if (!(++repeat_counter % 50000))
-			reiserfs_warning(sb, "PAP-5100",
-					 "%s: there were %d iterations of "
-					 "while loop looking for key %K",
-					 current->comm, repeat_counter,
-					 key);
-#endif
-
-		/* prep path to have another element added to it. */
-		last_element =
-		    PATH_OFFSET_PELEMENT(search_path,
-					 ++search_path->path_length);
-		fs_gen = get_generation(sb);
-
-		/*
-		 * Read the next tree node, and set the last element
-		 * in the path to have a pointer to it.
-		 */
-		if ((bh = last_element->pe_buffer =
-		     sb_getblk(sb, block_number))) {
-
-			/*
-			 * We'll need to drop the lock if we encounter any
-			 * buffers that need to be read. If all of them are
-			 * already up to date, we don't need to drop the lock.
-			 */
-			int depth = -1;
-
-			if (!buffer_uptodate(bh) && reada_count > 1)
-				depth = search_by_key_reada(sb, reada_bh,
-						    reada_blocks, reada_count);
-
-			if (!buffer_uptodate(bh) && depth == -1)
-				depth = reiserfs_write_unlock_nested(sb);
-
-			bh_read_nowait(bh, 0);
-			wait_on_buffer(bh);
-
-			if (depth != -1)
-				reiserfs_write_lock_nested(sb, depth);
-			if (!buffer_uptodate(bh))
-				goto io_error;
-		} else {
-io_error:
-			search_path->path_length--;
-			pathrelse(search_path);
-			return IO_ERROR;
-		}
-		reada_count = 0;
-		if (expected_level == -1)
-			expected_level = SB_TREE_HEIGHT(sb);
-		expected_level--;
-
-		/*
-		 * It is possible that schedule occurred. We must check
-		 * whether the key to search is still in the tree rooted
-		 * from the current buffer. If not then repeat search
-		 * from the root.
-		 */
-		if (fs_changed(fs_gen, sb) &&
-		    (!B_IS_IN_TREE(bh) ||
-		     B_LEVEL(bh) != expected_level ||
-		     !key_in_buffer(search_path, key, sb))) {
-			PROC_INFO_INC(sb, search_by_key_fs_changed);
-			PROC_INFO_INC(sb, search_by_key_restarted);
-			PROC_INFO_INC(sb,
-				      sbk_restarted[expected_level - 1]);
-			pathrelse(search_path);
-
-			/*
-			 * Get the root block number so that we can
-			 * repeat the search starting from the root.
-			 */
-			block_number = SB_ROOT_BLOCK(sb);
-			expected_level = -1;
-
-			/* repeat search from the root */
-			continue;
-		}
-
-		/*
-		 * only check that the key is in the buffer if key is not
-		 * equal to the MAX_KEY. Latter case is only possible in
-		 * "finish_unfinished()" processing during mount.
-		 */
-		RFALSE(comp_keys(&MAX_KEY, key) &&
-		       !key_in_buffer(search_path, key, sb),
-		       "PAP-5130: key is not in the buffer");
-#ifdef CONFIG_REISERFS_CHECK
-		if (REISERFS_SB(sb)->cur_tb) {
-			print_cur_tb("5140");
-			reiserfs_panic(sb, "PAP-5140",
-				       "schedule occurred in do_balance!");
-		}
-#endif
-
-		/*
-		 * make sure, that the node contents look like a node of
-		 * certain level
-		 */
-		if (!is_tree_node(bh, expected_level)) {
-			reiserfs_error(sb, "vs-5150",
-				       "invalid format found in block %ld. "
-				       "Fsck?", bh->b_blocknr);
-			pathrelse(search_path);
-			return IO_ERROR;
-		}
-
-		/* ok, we have acquired next formatted node in the tree */
-		node_level = B_LEVEL(bh);
-
-		PROC_INFO_BH_STAT(sb, bh, node_level - 1);
-
-		RFALSE(node_level < stop_level,
-		       "vs-5152: tree level (%d) is less than stop level (%d)",
-		       node_level, stop_level);
-
-		retval = bin_search(key, item_head(bh, 0),
-				      B_NR_ITEMS(bh),
-				      (node_level ==
-				       DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
-				      KEY_SIZE,
-				      &last_element->pe_position);
-		if (node_level == stop_level) {
-			return retval;
-		}
-
-		/* we are not in the stop level */
-		/*
-		 * item has been found, so we choose the pointer which
-		 * is to the right of the found one
-		 */
-		if (retval == ITEM_FOUND)
-			last_element->pe_position++;
-
-		/*
-		 * if item was not found we choose the position which is to
-		 * the left of the found item. This requires no code,
-		 * bin_search did it already.
-		 */
-
-		/*
-		 * So we have chosen a position in the current node which is
-		 * an internal node.  Now we calculate child block number by
-		 * position in the node.
-		 */
-		block_number =
-		    B_N_CHILD_NUM(bh, last_element->pe_position);
-
-		/*
-		 * if we are going to read leaf nodes, try for read
-		 * ahead as well
-		 */
-		if ((search_path->reada & PATH_READA) &&
-		    node_level == DISK_LEAF_NODE_LEVEL + 1) {
-			int pos = last_element->pe_position;
-			int limit = B_NR_ITEMS(bh);
-			struct reiserfs_key *le_key;
-
-			if (search_path->reada & PATH_READA_BACK)
-				limit = 0;
-			while (reada_count < SEARCH_BY_KEY_READA) {
-				if (pos == limit)
-					break;
-				reada_blocks[reada_count++] =
-				    B_N_CHILD_NUM(bh, pos);
-				if (search_path->reada & PATH_READA_BACK)
-					pos--;
-				else
-					pos++;
-
-				/*
-				 * check to make sure we're in the same object
-				 */
-				le_key = internal_key(bh, pos);
-				if (le32_to_cpu(le_key->k_objectid) !=
-				    key->on_disk_key.k_objectid) {
-					break;
-				}
-			}
-		}
-	}
-}
-
-/*
- * Form the path to an item and position in this item which contains
- * file byte defined by key. If there is no such item
- * corresponding to the key, we point the path to the item with
- * maximal key less than key, and *pos_in_item is set to one
- * past the last entry/byte in the item.  If searching for entry in a
- * directory item, and it is not found, *pos_in_item is set to one
- * entry more than the entry with maximal key which is less than the
- * sought key.
- *
- * Note that if there is no entry in this same node which is one more,
- * then we point to an imaginary entry.  for direct items, the
- * position is in units of bytes, for indirect items the position is
- * in units of blocknr entries, for directory items the position is in
- * units of directory entries.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *sb,
-			       /* Key to search (cpu variable) */
-			       const struct cpu_key *p_cpu_key,
-			       /* Filled up by this function. */
-			       struct treepath *search_path)
-{
-	struct item_head *p_le_ih;	/* pointer to on-disk structure */
-	int blk_size;
-	loff_t item_offset, offset;
-	struct reiserfs_dir_entry de;
-	int retval;
-
-	/* If searching for directory entry. */
-	if (is_direntry_cpu_key(p_cpu_key))
-		return search_by_entry_key(sb, p_cpu_key, search_path,
-					   &de);
-
-	/* If not searching for directory entry. */
-
-	/* If item is found. */
-	retval = search_item(sb, p_cpu_key, search_path);
-	if (retval == IO_ERROR)
-		return retval;
-	if (retval == ITEM_FOUND) {
-
-		RFALSE(!ih_item_len
-		       (item_head
-			(PATH_PLAST_BUFFER(search_path),
-			 PATH_LAST_POSITION(search_path))),
-		       "PAP-5165: item length equals zero");
-
-		pos_in_item(search_path) = 0;
-		return POSITION_FOUND;
-	}
-
-	RFALSE(!PATH_LAST_POSITION(search_path),
-	       "PAP-5170: position equals zero");
-
-	/* Item is not found. Set path to the previous item. */
-	p_le_ih =
-	    item_head(PATH_PLAST_BUFFER(search_path),
-			   --PATH_LAST_POSITION(search_path));
-	blk_size = sb->s_blocksize;
-
-	if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
-		return FILE_NOT_FOUND;
-
-	/* FIXME: quite ugly this far */
-
-	item_offset = le_ih_k_offset(p_le_ih);
-	offset = cpu_key_k_offset(p_cpu_key);
-
-	/* Needed byte is contained in the item pointed to by the path. */
-	if (item_offset <= offset &&
-	    item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
-		pos_in_item(search_path) = offset - item_offset;
-		if (is_indirect_le_ih(p_le_ih)) {
-			pos_in_item(search_path) /= blk_size;
-		}
-		return POSITION_FOUND;
-	}
-
-	/*
-	 * Needed byte is not contained in the item pointed to by the
-	 * path. Set pos_in_item out of the item.
-	 */
-	if (is_indirect_le_ih(p_le_ih))
-		pos_in_item(search_path) =
-		    ih_item_len(p_le_ih) / UNFM_P_SIZE;
-	else
-		pos_in_item(search_path) = ih_item_len(p_le_ih);
-
-	return POSITION_NOT_FOUND;
-}
-
-/* Compare given item and item pointed to by the path. */
-int comp_items(const struct item_head *stored_ih, const struct treepath *path)
-{
-	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
-	struct item_head *ih;
-
-	/* Last buffer at the path is not in the tree. */
-	if (!B_IS_IN_TREE(bh))
-		return 1;
-
-	/* Last path position is invalid. */
-	if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
-		return 1;
-
-	/* we need only to know, whether it is the same item */
-	ih = tp_item_head(path);
-	return memcmp(stored_ih, ih, IH_SIZE);
-}
-
-/* prepare for delete or cut of direct item */
-static inline int prepare_for_direct_item(struct treepath *path,
-					  struct item_head *le_ih,
-					  struct inode *inode,
-					  loff_t new_file_length, int *cut_size)
-{
-	loff_t round_len;
-
-	if (new_file_length == max_reiserfs_offset(inode)) {
-		/* item has to be deleted */
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;
-	}
-	/* new file gets truncated */
-	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
-		round_len = ROUND_UP(new_file_length);
-		/* this was new_file_length < le_ih ... */
-		if (round_len < le_ih_k_offset(le_ih)) {
-			*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-			return M_DELETE;	/* Delete this item. */
-		}
-		/* Calculate first position and size for cutting from item. */
-		pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
-		*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
-
-		return M_CUT;	/* Cut from this item. */
-	}
-
-	/* old file: items may have any length */
-
-	if (new_file_length < le_ih_k_offset(le_ih)) {
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;	/* Delete this item. */
-	}
-
-	/* Calculate first position and size for cutting from item. */
-	*cut_size = -(ih_item_len(le_ih) -
-		      (pos_in_item(path) =
-		       new_file_length + 1 - le_ih_k_offset(le_ih)));
-	return M_CUT;		/* Cut from this item. */
-}
-
-static inline int prepare_for_direntry_item(struct treepath *path,
-					    struct item_head *le_ih,
-					    struct inode *inode,
-					    loff_t new_file_length,
-					    int *cut_size)
-{
-	if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
-	    new_file_length == max_reiserfs_offset(inode)) {
-		RFALSE(ih_entry_count(le_ih) != 2,
-		       "PAP-5220: incorrect empty directory item (%h)", le_ih);
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		/* Delete the directory item containing "." and ".." entry. */
-		return M_DELETE;
-	}
-
-	if (ih_entry_count(le_ih) == 1) {
-		/*
-		 * Delete the directory item such as there is one record only
-		 * in this item
-		 */
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;
-	}
-
-	/* Cut one record from the directory item. */
-	*cut_size =
-	    -(DEH_SIZE +
-	      entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
-	return M_CUT;
-}
-
-#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
-
-/*
- * If the path points to a directory or direct item, calculate mode
- * and the size cut, for balance.
- * If the path points to an indirect item, remove some number of its
- * unformatted nodes.
- * In case of file truncate calculate whether this item must be
- * deleted/truncated or last unformatted node of this item will be
- * converted to a direct item.
- * This function returns a determination of what balance mode the
- * calling function should employ.
- */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
-				      struct inode *inode,
-				      struct treepath *path,
-				      const struct cpu_key *item_key,
-				      /*
-				       * Number of unformatted nodes
-				       * which were removed from end
-				       * of the file.
-				       */
-				      int *removed,
-				      int *cut_size,
-				      /* MAX_KEY_OFFSET in case of delete. */
-				      unsigned long long new_file_length
-    )
-{
-	struct super_block *sb = inode->i_sb;
-	struct item_head *p_le_ih = tp_item_head(path);
-	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
-
-	BUG_ON(!th->t_trans_id);
-
-	/* Stat_data item. */
-	if (is_statdata_le_ih(p_le_ih)) {
-
-		RFALSE(new_file_length != max_reiserfs_offset(inode),
-		       "PAP-5210: mode must be M_DELETE");
-
-		*cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
-		return M_DELETE;
-	}
-
-	/* Directory item. */
-	if (is_direntry_le_ih(p_le_ih))
-		return prepare_for_direntry_item(path, p_le_ih, inode,
-						 new_file_length,
-						 cut_size);
-
-	/* Direct item. */
-	if (is_direct_le_ih(p_le_ih))
-		return prepare_for_direct_item(path, p_le_ih, inode,
-					       new_file_length, cut_size);
-
-	/* Case of an indirect item. */
-	{
-	    int blk_size = sb->s_blocksize;
-	    struct item_head s_ih;
-	    int need_re_search;
-	    int delete = 0;
-	    int result = M_CUT;
-	    int pos = 0;
-
-	    if ( new_file_length == max_reiserfs_offset (inode) ) {
-		/*
-		 * prepare_for_delete_or_cut() is called by
-		 * reiserfs_delete_item()
-		 */
-		new_file_length = 0;
-		delete = 1;
-	    }
-
-	    do {
-		need_re_search = 0;
-		*cut_size = 0;
-		bh = PATH_PLAST_BUFFER(path);
-		copy_item_head(&s_ih, tp_item_head(path));
-		pos = I_UNFM_NUM(&s_ih);
-
-		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
-		    __le32 *unfm;
-		    __u32 block;
-
-		    /*
-		     * Each unformatted block deletion may involve
-		     * one additional bitmap block into the transaction,
-		     * thereby the initial journal space reservation
-		     * might not be enough.
-		     */
-		    if (!delete && (*cut_size) != 0 &&
-			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
-			break;
-
-		    unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
-		    block = get_block_num(unfm, 0);
-
-		    if (block != 0) {
-			reiserfs_prepare_for_journal(sb, bh, 1);
-			put_block_num(unfm, 0, 0);
-			journal_mark_dirty(th, bh);
-			reiserfs_free_block(th, inode, block, 1);
-		    }
-
-		    reiserfs_cond_resched(sb);
-
-		    if (item_moved (&s_ih, path))  {
-			need_re_search = 1;
-			break;
-		    }
-
-		    pos --;
-		    (*removed)++;
-		    (*cut_size) -= UNFM_P_SIZE;
-
-		    if (pos == 0) {
-			(*cut_size) -= IH_SIZE;
-			result = M_DELETE;
-			break;
-		    }
-		}
-		/*
-		 * a trick.  If the buffer has been logged, this will
-		 * do nothing.  If we've broken the loop without logging
-		 * it, it will restore the buffer
-		 */
-		reiserfs_restore_prepared_buffer(sb, bh);
-	    } while (need_re_search &&
-		     search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
-	    pos_in_item(path) = pos * UNFM_P_SIZE;
-
-	    if (*cut_size == 0) {
-		/*
-		 * Nothing was cut. maybe convert last unformatted node to the
-		 * direct item?
-		 */
-		result = M_CONVERT;
-	    }
-	    return result;
-	}
-}
-
-/* Calculate number of bytes which will be deleted or cut during balance */
-static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
-{
-	int del_size;
-	struct item_head *p_le_ih = tp_item_head(tb->tb_path);
-
-	if (is_statdata_le_ih(p_le_ih))
-		return 0;
-
-	del_size =
-	    (mode ==
-	     M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
-	if (is_direntry_le_ih(p_le_ih)) {
-		/*
-		 * return EMPTY_DIR_SIZE; We delete emty directories only.
-		 * we can't use EMPTY_DIR_SIZE, as old format dirs have a
-		 * different empty size.  ick. FIXME, is this right?
-		 */
-		return del_size;
-	}
-
-	if (is_indirect_le_ih(p_le_ih))
-		del_size = (del_size / UNFM_P_SIZE) *
-				(PATH_PLAST_BUFFER(tb->tb_path)->b_size);
-	return del_size;
-}
-
-static void init_tb_struct(struct reiserfs_transaction_handle *th,
-			   struct tree_balance *tb,
-			   struct super_block *sb,
-			   struct treepath *path, int size)
-{
-
-	BUG_ON(!th->t_trans_id);
-
-	memset(tb, '\0', sizeof(struct tree_balance));
-	tb->transaction_handle = th;
-	tb->tb_sb = sb;
-	tb->tb_path = path;
-	PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
-	PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
-	tb->insert_size[0] = size;
-}
-
-void padd_item(char *item, int total_length, int length)
-{
-	int i;
-
-	for (i = total_length; i > length;)
-		item[--i] = 0;
-}
-
-#ifdef REISERQUOTA_DEBUG
-char key2type(struct reiserfs_key *ih)
-{
-	if (is_direntry_le_key(2, ih))
-		return 'd';
-	if (is_direct_le_key(2, ih))
-		return 'D';
-	if (is_indirect_le_key(2, ih))
-		return 'i';
-	if (is_statdata_le_key(2, ih))
-		return 's';
-	return 'u';
-}
-
-char head2type(struct item_head *ih)
-{
-	if (is_direntry_le_ih(ih))
-		return 'd';
-	if (is_direct_le_ih(ih))
-		return 'D';
-	if (is_indirect_le_ih(ih))
-		return 'i';
-	if (is_statdata_le_ih(ih))
-		return 's';
-	return 'u';
-}
-#endif
-
-/*
- * Delete object item.
- * th       - active transaction handle
- * path     - path to the deleted item
- * item_key - key to search for the deleted item
- * indode   - used for updating i_blocks and quotas
- * un_bh    - NULL or unformatted node pointer
- */
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path, const struct cpu_key *item_key,
-			 struct inode *inode, struct buffer_head *un_bh)
-{
-	struct super_block *sb = inode->i_sb;
-	struct tree_balance s_del_balance;
-	struct item_head s_ih;
-	struct item_head *q_ih;
-	int quota_cut_bytes;
-	int ret_value, del_size, removed;
-	int depth;
-
-#ifdef CONFIG_REISERFS_CHECK
-	char mode;
-#endif
-
-	BUG_ON(!th->t_trans_id);
-
-	init_tb_struct(th, &s_del_balance, sb, path,
-		       0 /*size is unknown */ );
-
-	while (1) {
-		removed = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
-		mode =
-#endif
-		    prepare_for_delete_or_cut(th, inode, path,
-					      item_key, &removed,
-					      &del_size,
-					      max_reiserfs_offset(inode));
-
-		RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
-
-		copy_item_head(&s_ih, tp_item_head(path));
-		s_del_balance.insert_size[0] = del_size;
-
-		ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
-		if (ret_value != REPEAT_SEARCH)
-			break;
-
-		PROC_INFO_INC(sb, delete_item_restarted);
-
-		/* file system changed, repeat search */
-		ret_value =
-		    search_for_position_by_key(sb, item_key, path);
-		if (ret_value == IO_ERROR)
-			break;
-		if (ret_value == FILE_NOT_FOUND) {
-			reiserfs_warning(sb, "vs-5340",
-					 "no items of the file %K found",
-					 item_key);
-			break;
-		}
-	}			/* while (1) */
-
-	if (ret_value != CARRY_ON) {
-		unfix_nodes(&s_del_balance);
-		return 0;
-	}
-
-	/* reiserfs_delete_item returns item length when success */
-	ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
-	q_ih = tp_item_head(path);
-	quota_cut_bytes = ih_item_len(q_ih);
-
-	/*
-	 * hack so the quota code doesn't have to guess if the file has a
-	 * tail.  On tail insert, we allocate quota for 1 unformatted node.
-	 * We test the offset because the tail might have been
-	 * split into multiple items, and we only want to decrement for
-	 * the unfm node once
-	 */
-	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
-		if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
-			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
-		} else {
-			quota_cut_bytes = 0;
-		}
-	}
-
-	if (un_bh) {
-		int off;
-		char *data;
-
-		/*
-		 * We are in direct2indirect conversion, so move tail contents
-		 * to the unformatted node
-		 */
-		/*
-		 * note, we do the copy before preparing the buffer because we
-		 * don't care about the contents of the unformatted node yet.
-		 * the only thing we really care about is the direct item's
-		 * data is in the unformatted node.
-		 *
-		 * Otherwise, we would have to call
-		 * reiserfs_prepare_for_journal on the unformatted node,
-		 * which might schedule, meaning we'd have to loop all the
-		 * way back up to the start of the while loop.
-		 *
-		 * The unformatted node must be dirtied later on.  We can't be
-		 * sure here if the entire tail has been deleted yet.
-		 *
-		 * un_bh is from the page cache (all unformatted nodes are
-		 * from the page cache) and might be a highmem page.  So, we
-		 * can't use un_bh->b_data.
-		 * -clm
-		 */
-
-		data = kmap_atomic(un_bh->b_page);
-		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
-		memcpy(data + off,
-		       ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
-		       ret_value);
-		kunmap_atomic(data);
-	}
-
-	/* Perform balancing after all resources have been collected at once. */
-	do_balance(&s_del_balance, NULL, NULL, M_DELETE);
-
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
-		       quota_cut_bytes, inode->i_uid, head2type(&s_ih));
-#endif
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_free_space_nodirty(inode, quota_cut_bytes);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	/* Return deleted body length */
-	return ret_value;
-}
-
-/*
- * Summary Of Mechanisms For Handling Collisions Between Processes:
- *
- *  deletion of the body of the object is performed by iput(), with the
- *  result that if multiple processes are operating on a file, the
- *  deletion of the body of the file is deferred until the last process
- *  that has an open inode performs its iput().
- *
- *  writes and truncates are protected from collisions by use of
- *  semaphores.
- *
- *  creates, linking, and mknod are protected from collisions with other
- *  processes by making the reiserfs_add_entry() the last step in the
- *  creation, and then rolling back all changes if there was a collision.
- *  - Hans
-*/
-
-/* this deletes item which never gets split */
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct reiserfs_key *key)
-{
-	struct super_block *sb = th->t_super;
-	struct tree_balance tb;
-	INITIALIZE_PATH(path);
-	int item_len = 0;
-	int tb_init = 0;
-	struct cpu_key cpu_key = {};
-	int retval;
-	int quota_cut_bytes = 0;
-
-	BUG_ON(!th->t_trans_id);
-
-	le_key2cpu_key(&cpu_key, key);
-
-	while (1) {
-		retval = search_item(th->t_super, &cpu_key, &path);
-		if (retval == IO_ERROR) {
-			reiserfs_error(th->t_super, "vs-5350",
-				       "i/o failure occurred trying "
-				       "to delete %K", &cpu_key);
-			break;
-		}
-		if (retval != ITEM_FOUND) {
-			pathrelse(&path);
-			/*
-			 * No need for a warning, if there is just no free
-			 * space to insert '..' item into the
-			 * newly-created subdir
-			 */
-			if (!
-			    ((unsigned long long)
-			     GET_HASH_VALUE(le_key_k_offset
-					    (le_key_version(key), key)) == 0
-			     && (unsigned long long)
-			     GET_GENERATION_NUMBER(le_key_k_offset
-						   (le_key_version(key),
-						    key)) == 1))
-				reiserfs_warning(th->t_super, "vs-5355",
-						 "%k not found", key);
-			break;
-		}
-		if (!tb_init) {
-			tb_init = 1;
-			item_len = ih_item_len(tp_item_head(&path));
-			init_tb_struct(th, &tb, th->t_super, &path,
-				       -(IH_SIZE + item_len));
-		}
-		quota_cut_bytes = ih_item_len(tp_item_head(&path));
-
-		retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
-		if (retval == REPEAT_SEARCH) {
-			PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
-			continue;
-		}
-
-		if (retval == CARRY_ON) {
-			do_balance(&tb, NULL, NULL, M_DELETE);
-			/*
-			 * Should we count quota for item? (we don't
-			 * count quotas for save-links)
-			 */
-			if (inode) {
-				int depth;
-#ifdef REISERQUOTA_DEBUG
-				reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
-					       "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
-					       quota_cut_bytes, inode->i_uid,
-					       key2type(key));
-#endif
-				depth = reiserfs_write_unlock_nested(sb);
-				dquot_free_space_nodirty(inode,
-							 quota_cut_bytes);
-				reiserfs_write_lock_nested(sb, depth);
-			}
-			break;
-		}
-
-		/* IO_ERROR, NO_DISK_SPACE, etc */
-		reiserfs_warning(th->t_super, "vs-5360",
-				 "could not delete %K due to fix_nodes failure",
-				 &cpu_key);
-		unfix_nodes(&tb);
-		break;
-	}
-
-	reiserfs_check_path(&path);
-}
-
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
-			   struct inode *inode)
-{
-	int err;
-	inode->i_size = 0;
-	BUG_ON(!th->t_trans_id);
-
-	/* for directory this deletes item containing "." and ".." */
-	err =
-	    reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
-	if (err)
-		return err;
-
-#if defined( USE_INODE_GENERATION_COUNTER )
-	if (!old_format_only(th->t_super)) {
-		__le32 *inode_generation;
-
-		inode_generation =
-		    &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
-		le32_add_cpu(inode_generation, 1);
-	}
-/* USE_INODE_GENERATION_COUNTER */
-#endif
-	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
-
-	return err;
-}
-
-static void unmap_buffers(struct page *page, loff_t pos)
-{
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct buffer_head *next;
-	unsigned long tail_index;
-	unsigned long cur_index;
-
-	if (page) {
-		if (page_has_buffers(page)) {
-			tail_index = pos & (PAGE_SIZE - 1);
-			cur_index = 0;
-			head = page_buffers(page);
-			bh = head;
-			do {
-				next = bh->b_this_page;
-
-				/*
-				 * we want to unmap the buffers that contain
-				 * the tail, and all the buffers after it
-				 * (since the tail must be at the end of the
-				 * file).  We don't want to unmap file data
-				 * before the tail, since it might be dirty
-				 * and waiting to reach disk
-				 */
-				cur_index += bh->b_size;
-				if (cur_index > tail_index) {
-					reiserfs_unmap_buffer(bh);
-				}
-				bh = next;
-			} while (bh != head);
-		}
-	}
-}
-
-static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
-				    struct inode *inode,
-				    struct page *page,
-				    struct treepath *path,
-				    const struct cpu_key *item_key,
-				    loff_t new_file_size, char *mode)
-{
-	struct super_block *sb = inode->i_sb;
-	int block_size = sb->s_blocksize;
-	int cut_bytes;
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(new_file_size != inode->i_size);
-
-	/*
-	 * the page being sent in could be NULL if there was an i/o error
-	 * reading in the last block.  The user will hit problems trying to
-	 * read the file, but for now we just skip the indirect2direct
-	 */
-	if (atomic_read(&inode->i_count) > 1 ||
-	    !tail_has_to_be_packed(inode) ||
-	    !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
-		/* leave tail in an unformatted node */
-		*mode = M_SKIP_BALANCING;
-		cut_bytes =
-		    block_size - (new_file_size & (block_size - 1));
-		pathrelse(path);
-		return cut_bytes;
-	}
-
-	/* Perform the conversion to a direct_item. */
-	return indirect2direct(th, inode, page, path, item_key,
-			       new_file_size, mode);
-}
-
-/*
- * we did indirect_to_direct conversion. And we have inserted direct
- * item successesfully, but there were no disk space to cut unfm
- * pointer being converted. Therefore we have to delete inserted
- * direct item(s)
- */
-static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
-					 struct inode *inode, struct treepath *path)
-{
-	struct cpu_key tail_key;
-	int tail_len;
-	int removed;
-	BUG_ON(!th->t_trans_id);
-
-	make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
-	tail_key.key_length = 4;
-
-	tail_len =
-	    (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
-	while (tail_len) {
-		/* look for the last byte of the tail */
-		if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
-		    POSITION_NOT_FOUND)
-			reiserfs_panic(inode->i_sb, "vs-5615",
-				       "found invalid item");
-		RFALSE(path->pos_in_item !=
-		       ih_item_len(tp_item_head(path)) - 1,
-		       "vs-5616: appended bytes found");
-		PATH_LAST_POSITION(path)--;
-
-		removed =
-		    reiserfs_delete_item(th, path, &tail_key, inode,
-					 NULL /*unbh not needed */ );
-		RFALSE(removed <= 0
-		       || removed > tail_len,
-		       "vs-5617: there was tail %d bytes, removed item length %d bytes",
-		       tail_len, removed);
-		tail_len -= removed;
-		set_cpu_key_k_offset(&tail_key,
-				     cpu_key_k_offset(&tail_key) - removed);
-	}
-	reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
-			 "conversion has been rolled back due to "
-			 "lack of disk space");
-	mark_inode_dirty(inode);
-}
-
-/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
-			   struct treepath *path,
-			   struct cpu_key *item_key,
-			   struct inode *inode,
-			   struct page *page, loff_t new_file_size)
-{
-	struct super_block *sb = inode->i_sb;
-	/*
-	 * Every function which is going to call do_balance must first
-	 * create a tree_balance structure.  Then it must fill up this
-	 * structure by using the init_tb_struct and fix_nodes functions.
-	 * After that we can make tree balancing.
-	 */
-	struct tree_balance s_cut_balance;
-	struct item_head *p_le_ih;
-	int cut_size = 0;	/* Amount to be cut. */
-	int ret_value = CARRY_ON;
-	int removed = 0;	/* Number of the removed unformatted nodes. */
-	int is_inode_locked = 0;
-	char mode;		/* Mode of the balance. */
-	int retval2 = -1;
-	int quota_cut_bytes;
-	loff_t tail_pos = 0;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
-		       cut_size);
-
-	/*
-	 * Repeat this loop until we either cut the item without needing
-	 * to balance, or we fix_nodes without schedule occurring
-	 */
-	while (1) {
-		/*
-		 * Determine the balance mode, position of the first byte to
-		 * be cut, and size to be cut.  In case of the indirect item
-		 * free unformatted nodes which are pointed to by the cut
-		 * pointers.
-		 */
-
-		mode =
-		    prepare_for_delete_or_cut(th, inode, path,
-					      item_key, &removed,
-					      &cut_size, new_file_size);
-		if (mode == M_CONVERT) {
-			/*
-			 * convert last unformatted node to direct item or
-			 * leave tail in the unformatted node
-			 */
-			RFALSE(ret_value != CARRY_ON,
-			       "PAP-5570: can not convert twice");
-
-			ret_value =
-			    maybe_indirect_to_direct(th, inode, page,
-						     path, item_key,
-						     new_file_size, &mode);
-			if (mode == M_SKIP_BALANCING)
-				/* tail has been left in the unformatted node */
-				return ret_value;
-
-			is_inode_locked = 1;
-
-			/*
-			 * removing of last unformatted node will
-			 * change value we have to return to truncate.
-			 * Save it
-			 */
-			retval2 = ret_value;
-
-			/*
-			 * So, we have performed the first part of the
-			 * conversion:
-			 * inserting the new direct item.  Now we are
-			 * removing the last unformatted node pointer.
-			 * Set key to search for it.
-			 */
-			set_cpu_key_k_type(item_key, TYPE_INDIRECT);
-			item_key->key_length = 4;
-			new_file_size -=
-			    (new_file_size & (sb->s_blocksize - 1));
-			tail_pos = new_file_size;
-			set_cpu_key_k_offset(item_key, new_file_size + 1);
-			if (search_for_position_by_key
-			    (sb, item_key,
-			     path) == POSITION_NOT_FOUND) {
-				print_block(PATH_PLAST_BUFFER(path), 3,
-					    PATH_LAST_POSITION(path) - 1,
-					    PATH_LAST_POSITION(path) + 1);
-				reiserfs_panic(sb, "PAP-5580", "item to "
-					       "convert does not exist (%K)",
-					       item_key);
-			}
-			continue;
-		}
-		if (cut_size == 0) {
-			pathrelse(path);
-			return 0;
-		}
-
-		s_cut_balance.insert_size[0] = cut_size;
-
-		ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
-		if (ret_value != REPEAT_SEARCH)
-			break;
-
-		PROC_INFO_INC(sb, cut_from_item_restarted);
-
-		ret_value =
-		    search_for_position_by_key(sb, item_key, path);
-		if (ret_value == POSITION_FOUND)
-			continue;
-
-		reiserfs_warning(sb, "PAP-5610", "item %K not found",
-				 item_key);
-		unfix_nodes(&s_cut_balance);
-		return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
-	}			/* while */
-
-	/* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
-	if (ret_value != CARRY_ON) {
-		if (is_inode_locked) {
-			/*
-			 * FIXME: this seems to be not needed: we are always
-			 * able to cut item
-			 */
-			indirect_to_direct_roll_back(th, inode, path);
-		}
-		if (ret_value == NO_DISK_SPACE)
-			reiserfs_warning(sb, "reiserfs-5092",
-					 "NO_DISK_SPACE");
-		unfix_nodes(&s_cut_balance);
-		return -EIO;
-	}
-
-	/* go ahead and perform balancing */
-
-	RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
-
-	/* Calculate number of bytes that need to be cut from the item. */
-	quota_cut_bytes =
-	    (mode ==
-	     M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
-	    insert_size[0];
-	if (retval2 == -1)
-		ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
-	else
-		ret_value = retval2;
-
-	/*
-	 * For direct items, we only change the quota when deleting the last
-	 * item.
-	 */
-	p_le_ih = tp_item_head(s_cut_balance.tb_path);
-	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
-		if (mode == M_DELETE &&
-		    (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
-		    1) {
-			/* FIXME: this is to keep 3.5 happy */
-			REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
-		} else {
-			quota_cut_bytes = 0;
-		}
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	if (is_inode_locked) {
-		struct item_head *le_ih =
-		    tp_item_head(s_cut_balance.tb_path);
-		/*
-		 * we are going to complete indirect2direct conversion. Make
-		 * sure, that we exactly remove last unformatted node pointer
-		 * of the item
-		 */
-		if (!is_indirect_le_ih(le_ih))
-			reiserfs_panic(sb, "vs-5652",
-				       "item must be indirect %h", le_ih);
-
-		if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
-			reiserfs_panic(sb, "vs-5653", "completing "
-				       "indirect2direct conversion indirect "
-				       "item %h being deleted must be of "
-				       "4 byte long", le_ih);
-
-		if (mode == M_CUT
-		    && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
-			reiserfs_panic(sb, "vs-5654", "can not complete "
-				       "indirect2direct conversion of %h "
-				       "(CUT, insert_size==%d)",
-				       le_ih, s_cut_balance.insert_size[0]);
-		}
-		/*
-		 * it would be useful to make sure, that right neighboring
-		 * item is direct item of this file
-		 */
-	}
-#endif
-
-	do_balance(&s_cut_balance, NULL, NULL, mode);
-	if (is_inode_locked) {
-		/*
-		 * we've done an indirect->direct conversion.  when the
-		 * data block was freed, it was removed from the list of
-		 * blocks that must be flushed before the transaction
-		 * commits, make sure to unmap and invalidate it
-		 */
-		unmap_buffers(page, tail_pos);
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-	}
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
-		       quota_cut_bytes, inode->i_uid, '?');
-#endif
-	depth = reiserfs_write_unlock_nested(sb);
-	dquot_free_space_nodirty(inode, quota_cut_bytes);
-	reiserfs_write_lock_nested(sb, depth);
-	return ret_value;
-}
-
-static void truncate_directory(struct reiserfs_transaction_handle *th,
-			       struct inode *inode)
-{
-	BUG_ON(!th->t_trans_id);
-	if (inode->i_nlink)
-		reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
-
-	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
-	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
-	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
-	reiserfs_update_sd(th, inode);
-	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
-	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
-}
-
-/*
- * Truncate file to the new size. Note, this must be called with a
- * transaction already started
- */
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
-			 struct inode *inode,	/* ->i_size contains new size */
-			 struct page *page,	/* up to date for last block */
-			 /*
-			  * when it is called by file_release to convert
-			  * the tail - no timestamps should be updated
-			  */
-			 int update_timestamps
-    )
-{
-	INITIALIZE_PATH(s_search_path);	/* Path to the current object item. */
-	struct item_head *p_le_ih;	/* Pointer to an item header. */
-
-	/* Key to search for a previous file item. */
-	struct cpu_key s_item_key;
-	loff_t file_size,	/* Old file size. */
-	 new_file_size;	/* New file size. */
-	int deleted;		/* Number of deleted or truncated bytes. */
-	int retval;
-	int err = 0;
-
-	BUG_ON(!th->t_trans_id);
-	if (!
-	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
-	     || S_ISLNK(inode->i_mode)))
-		return 0;
-
-	/* deletion of directory - no need to update timestamps */
-	if (S_ISDIR(inode->i_mode)) {
-		truncate_directory(th, inode);
-		return 0;
-	}
-
-	/* Get new file size. */
-	new_file_size = inode->i_size;
-
-	/* FIXME: note, that key type is unimportant here */
-	make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
-		     TYPE_DIRECT, 3);
-
-	retval =
-	    search_for_position_by_key(inode->i_sb, &s_item_key,
-				       &s_search_path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(inode->i_sb, "vs-5657",
-			       "i/o failure occurred trying to truncate %K",
-			       &s_item_key);
-		err = -EIO;
-		goto out;
-	}
-	if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
-		reiserfs_error(inode->i_sb, "PAP-5660",
-			       "wrong result %d of search for %K", retval,
-			       &s_item_key);
-
-		err = -EIO;
-		goto out;
-	}
-
-	s_search_path.pos_in_item--;
-
-	/* Get real file size (total length of all file items) */
-	p_le_ih = tp_item_head(&s_search_path);
-	if (is_statdata_le_ih(p_le_ih))
-		file_size = 0;
-	else {
-		loff_t offset = le_ih_k_offset(p_le_ih);
-		int bytes =
-		    op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
-
-		/*
-		 * this may mismatch with real file size: if last direct item
-		 * had no padding zeros and last unformatted node had no free
-		 * space, this file would have this file size
-		 */
-		file_size = offset + bytes - 1;
-	}
-	/*
-	 * are we doing a full truncate or delete, if so
-	 * kick in the reada code
-	 */
-	if (new_file_size == 0)
-		s_search_path.reada = PATH_READA | PATH_READA_BACK;
-
-	if (file_size == 0 || file_size < new_file_size) {
-		goto update_and_out;
-	}
-
-	/* Update key to search for the last file item. */
-	set_cpu_key_k_offset(&s_item_key, file_size);
-
-	do {
-		/* Cut or delete file item. */
-		deleted =
-		    reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
-					   inode, page, new_file_size);
-		if (deleted < 0) {
-			reiserfs_warning(inode->i_sb, "vs-5665",
-					 "reiserfs_cut_from_item failed");
-			reiserfs_check_path(&s_search_path);
-			return 0;
-		}
-
-		RFALSE(deleted > file_size,
-		       "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
-		       deleted, file_size, &s_item_key);
-
-		/* Change key to search the last file item. */
-		file_size -= deleted;
-
-		set_cpu_key_k_offset(&s_item_key, file_size);
-
-		/*
-		 * While there are bytes to truncate and previous
-		 * file item is presented in the tree.
-		 */
-
-		/*
-		 * This loop could take a really long time, and could log
-		 * many more blocks than a transaction can hold.  So, we do
-		 * a polite journal end here, and if the transaction needs
-		 * ending, we make sure the file is consistent before ending
-		 * the current trans and starting a new one
-		 */
-		if (journal_transaction_should_end(th, 0) ||
-		    reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
-			pathrelse(&s_search_path);
-
-			if (update_timestamps) {
-				inode_set_mtime_to_ts(inode,
-						      current_time(inode));
-				inode_set_ctime_current(inode);
-			}
-			reiserfs_update_sd(th, inode);
-
-			err = journal_end(th);
-			if (err)
-				goto out;
-			err = journal_begin(th, inode->i_sb,
-					    JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
-			if (err)
-				goto out;
-			reiserfs_update_inode_transaction(inode);
-		}
-	} while (file_size > ROUND_UP(new_file_size) &&
-		 search_for_position_by_key(inode->i_sb, &s_item_key,
-					    &s_search_path) == POSITION_FOUND);
-
-	RFALSE(file_size > ROUND_UP(new_file_size),
-	       "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
-	       new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
-
-update_and_out:
-	if (update_timestamps) {
-		/* this is truncate, not file closing */
-		inode_set_mtime_to_ts(inode, current_time(inode));
-		inode_set_ctime_current(inode);
-	}
-	reiserfs_update_sd(th, inode);
-
-out:
-	pathrelse(&s_search_path);
-	return err;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-/* this makes sure, that we __append__, not overwrite or add holes */
-static void check_research_for_paste(struct treepath *path,
-				     const struct cpu_key *key)
-{
-	struct item_head *found_ih = tp_item_head(path);
-
-	if (is_direct_le_ih(found_ih)) {
-		if (le_ih_k_offset(found_ih) +
-		    op_bytes_number(found_ih,
-				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(key)
-		    || op_bytes_number(found_ih,
-				       get_last_bh(path)->b_size) !=
-		    pos_in_item(path))
-			reiserfs_panic(NULL, "PAP-5720", "found direct item "
-				       "%h or position (%d) does not match "
-				       "to key %K", found_ih,
-				       pos_in_item(path), key);
-	}
-	if (is_indirect_le_ih(found_ih)) {
-		if (le_ih_k_offset(found_ih) +
-		    op_bytes_number(found_ih,
-				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(key)
-		    || I_UNFM_NUM(found_ih) != pos_in_item(path)
-		    || get_ih_free_space(found_ih) != 0)
-			reiserfs_panic(NULL, "PAP-5730", "found indirect "
-				       "item (%h) or position (%d) does not "
-				       "match to key (%K)",
-				       found_ih, pos_in_item(path), key);
-	}
-}
-#endif				/* config reiserfs check */
-
-/*
- * Paste bytes to the existing item.
- * Returns bytes number pasted into the item.
- */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
-			     /* Path to the pasted item. */
-			     struct treepath *search_path,
-			     /* Key to search for the needed item. */
-			     const struct cpu_key *key,
-			     /* Inode item belongs to */
-			     struct inode *inode,
-			     /* Pointer to the bytes to paste. */
-			     const char *body,
-			     /* Size of pasted bytes. */
-			     int pasted_size)
-{
-	struct super_block *sb = inode->i_sb;
-	struct tree_balance s_paste_balance;
-	int retval;
-	int fs_gen;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	fs_gen = get_generation(inode->i_sb);
-
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota paste_into_item(): allocating %u id=%u type=%c",
-		       pasted_size, inode->i_uid,
-		       key2type(&key->on_disk_key));
-#endif
-
-	depth = reiserfs_write_unlock_nested(sb);
-	retval = dquot_alloc_space_nodirty(inode, pasted_size);
-	reiserfs_write_lock_nested(sb, depth);
-	if (retval) {
-		pathrelse(search_path);
-		return retval;
-	}
-	init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
-		       pasted_size);
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	s_paste_balance.key = key->on_disk_key;
-#endif
-
-	/* DQUOT_* can schedule, must check before the fix_nodes */
-	if (fs_changed(fs_gen, inode->i_sb)) {
-		goto search_again;
-	}
-
-	while ((retval =
-		fix_nodes(M_PASTE, &s_paste_balance, NULL,
-			  body)) == REPEAT_SEARCH) {
-search_again:
-		/* file system changed while we were in the fix_nodes */
-		PROC_INFO_INC(th->t_super, paste_into_item_restarted);
-		retval =
-		    search_for_position_by_key(th->t_super, key,
-					       search_path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto error_out;
-		}
-		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb, "PAP-5710",
-					 "entry or pasted byte (%K) exists",
-					 key);
-			retval = -EEXIST;
-			goto error_out;
-		}
-#ifdef CONFIG_REISERFS_CHECK
-		check_research_for_paste(search_path, key);
-#endif
-	}
-
-	/*
-	 * Perform balancing after all resources are collected by fix_nodes,
-	 * and accessing them will not risk triggering schedule.
-	 */
-	if (retval == CARRY_ON) {
-		do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
-		return 0;
-	}
-	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-	/* this also releases the path */
-	unfix_nodes(&s_paste_balance);
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota paste_into_item(): freeing %u id=%u type=%c",
-		       pasted_size, inode->i_uid,
-		       key2type(&key->on_disk_key));
-#endif
-	depth = reiserfs_write_unlock_nested(sb);
-	dquot_free_space_nodirty(inode, pasted_size);
-	reiserfs_write_lock_nested(sb, depth);
-	return retval;
-}
-
-/*
- * Insert new item into the buffer at the path.
- * th   - active transaction handle
- * path - path to the inserted item
- * ih   - pointer to the item header to insert
- * body - pointer to the bytes to insert
- */
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path, const struct cpu_key *key,
-			 struct item_head *ih, struct inode *inode,
-			 const char *body)
-{
-	struct tree_balance s_ins_balance;
-	int retval;
-	int fs_gen = 0;
-	int quota_bytes = 0;
-
-	BUG_ON(!th->t_trans_id);
-
-	if (inode) {		/* Do we count quotas for item? */
-		int depth;
-		fs_gen = get_generation(inode->i_sb);
-		quota_bytes = ih_item_len(ih);
-
-		/*
-		 * hack so the quota code doesn't have to guess
-		 * if the file has a tail, links are always tails,
-		 * so there's no guessing needed
-		 */
-		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
-			quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-			       "reiserquota insert_item(): allocating %u id=%u type=%c",
-			       quota_bytes, inode->i_uid, head2type(ih));
-#endif
-		/*
-		 * We can't dirty inode here. It would be immediately
-		 * written but appropriate stat item isn't inserted yet...
-		 */
-		depth = reiserfs_write_unlock_nested(inode->i_sb);
-		retval = dquot_alloc_space_nodirty(inode, quota_bytes);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-		if (retval) {
-			pathrelse(path);
-			return retval;
-		}
-	}
-	init_tb_struct(th, &s_ins_balance, th->t_super, path,
-		       IH_SIZE + ih_item_len(ih));
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	s_ins_balance.key = key->on_disk_key;
-#endif
-	/*
-	 * DQUOT_* can schedule, must check to be sure calling
-	 * fix_nodes is safe
-	 */
-	if (inode && fs_changed(fs_gen, inode->i_sb)) {
-		goto search_again;
-	}
-
-	while ((retval =
-		fix_nodes(M_INSERT, &s_ins_balance, ih,
-			  body)) == REPEAT_SEARCH) {
-search_again:
-		/* file system changed while we were in the fix_nodes */
-		PROC_INFO_INC(th->t_super, insert_item_restarted);
-		retval = search_item(th->t_super, key, path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto error_out;
-		}
-		if (retval == ITEM_FOUND) {
-			reiserfs_warning(th->t_super, "PAP-5760",
-					 "key %K already exists in the tree",
-					 key);
-			retval = -EEXIST;
-			goto error_out;
-		}
-	}
-
-	/* make balancing after all resources will be collected at a time */
-	if (retval == CARRY_ON) {
-		do_balance(&s_ins_balance, ih, body, M_INSERT);
-		return 0;
-	}
-
-	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-	/* also releases the path */
-	unfix_nodes(&s_ins_balance);
-#ifdef REISERQUOTA_DEBUG
-	if (inode)
-		reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
-		       "reiserquota insert_item(): freeing %u id=%u type=%c",
-		       quota_bytes, inode->i_uid, head2type(ih));
-#endif
-	if (inode) {
-		int depth = reiserfs_write_unlock_nested(inode->i_sb);
-		dquot_free_space_nodirty(inode, quota_bytes);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-	}
-	return retval;
-}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
deleted file mode 100644
index ab76468da02d..000000000000
--- a/fs/reiserfs/super.c
+++ /dev/null
@@ -1,2646 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to add the LFS fixes
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/init.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/exportfs.h>
-#include <linux/quotaops.h>
-#include <linux/vfs.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/crc32.h>
-#include <linux/seq_file.h>
-
-struct file_system_type reiserfs_fs_type;
-
-static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
-static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
-static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
-			strlen(reiserfs_3_5_magic_string));
-}
-
-int is_reiserfs_3_6(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
-			strlen(reiserfs_3_6_magic_string));
-}
-
-int is_reiserfs_jr(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
-			strlen(reiserfs_jr_magic_string));
-}
-
-static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
-{
-	return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
-		is_reiserfs_jr(rs));
-}
-
-static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
-
-static int reiserfs_sync_fs(struct super_block *s, int wait)
-{
-	struct reiserfs_transaction_handle th;
-
-	/*
-	 * Writeback quota in non-journalled quota case - journalled quota has
-	 * no dirty dquots
-	 */
-	dquot_writeback_dquots(s, -1);
-	reiserfs_write_lock(s);
-	if (!journal_begin(&th, s, 1))
-		if (!journal_end_sync(&th))
-			reiserfs_flush_old_commits(s);
-	reiserfs_write_unlock(s);
-	return 0;
-}
-
-static void flush_old_commits(struct work_struct *work)
-{
-	struct reiserfs_sb_info *sbi;
-	struct super_block *s;
-
-	sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
-	s = sbi->s_journal->j_work_sb;
-
-	/*
-	 * We need s_umount for protecting quota writeback. We have to use
-	 * trylock as reiserfs_cancel_old_flush() may be waiting for this work
-	 * to complete with s_umount held.
-	 */
-	if (!down_read_trylock(&s->s_umount)) {
-		/* Requeue work if we are not cancelling it */
-		spin_lock(&sbi->old_work_lock);
-		if (sbi->work_queued == 1)
-			queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
-		spin_unlock(&sbi->old_work_lock);
-		return;
-	}
-	spin_lock(&sbi->old_work_lock);
-	/* Avoid clobbering the cancel state... */
-	if (sbi->work_queued == 1)
-		sbi->work_queued = 0;
-	spin_unlock(&sbi->old_work_lock);
-
-	reiserfs_sync_fs(s, 1);
-	up_read(&s->s_umount);
-}
-
-void reiserfs_schedule_old_flush(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-	unsigned long delay;
-
-	/*
-	 * Avoid scheduling flush when sb is being shut down. It can race
-	 * with journal shutdown and free still queued delayed work.
-	 */
-	if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
-		return;
-
-	spin_lock(&sbi->old_work_lock);
-	if (!sbi->work_queued) {
-		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
-		queue_delayed_work(system_long_wq, &sbi->old_work, delay);
-		sbi->work_queued = 1;
-	}
-	spin_unlock(&sbi->old_work_lock);
-}
-
-void reiserfs_cancel_old_flush(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
-	spin_lock(&sbi->old_work_lock);
-	/* Make sure no new flushes will be queued */
-	sbi->work_queued = 2;
-	spin_unlock(&sbi->old_work_lock);
-	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
-}
-
-static int reiserfs_freeze(struct super_block *s)
-{
-	struct reiserfs_transaction_handle th;
-
-	reiserfs_cancel_old_flush(s);
-
-	reiserfs_write_lock(s);
-	if (!sb_rdonly(s)) {
-		int err = journal_begin(&th, s, 1);
-		if (err) {
-			reiserfs_block_writes(&th);
-		} else {
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-			reiserfs_block_writes(&th);
-			journal_end_sync(&th);
-		}
-	}
-	reiserfs_write_unlock(s);
-	return 0;
-}
-
-static int reiserfs_unfreeze(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
-	reiserfs_allow_writes(s);
-	spin_lock(&sbi->old_work_lock);
-	/* Allow old_work to run again */
-	sbi->work_queued = 0;
-	spin_unlock(&sbi->old_work_lock);
-	return 0;
-}
-
-extern const struct in_core_key MAX_IN_CORE_KEY;
-
-/*
- * this is used to delete "save link" when there are no items of a
- * file it points to. It can either happen if unlink is completed but
- * "save unlink" removal, or if file has both unlink and truncate
- * pending and as unlink completes first (because key of "save link"
- * protecting unlink is bigger that a key lf "save link" which
- * protects truncate), so there left no items to make truncate
- * completion on
- */
-static int remove_save_link_only(struct super_block *s,
-				 struct reiserfs_key *key, int oid_free)
-{
-	struct reiserfs_transaction_handle th;
-	int err;
-
-	/* we are going to do one balancing */
-	err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
-	if (err)
-		return err;
-
-	reiserfs_delete_solid_item(&th, NULL, key);
-	if (oid_free)
-		/* removals are protected by direct items */
-		reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
-
-	return journal_end(&th);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_on_mount(struct super_block *, int);
-#endif
-
-/*
- * Look for uncompleted unlinks and truncates and complete them
- *
- * Called with superblock write locked.  If quotas are enabled, we have to
- * release/retake lest we call dquot_quota_on_mount(), proceed to
- * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
- * cpu worklets to complete flush_async_commits() that in turn wait for the
- * superblock write lock.
- */
-static int finish_unfinished(struct super_block *s)
-{
-	INITIALIZE_PATH(path);
-	struct cpu_key max_cpu_key, obj_key;
-	struct reiserfs_key save_link_key, last_inode_key;
-	int retval = 0;
-	struct item_head *ih;
-	struct buffer_head *bh;
-	int item_pos;
-	char *item;
-	int done;
-	struct inode *inode;
-	int truncate;
-#ifdef CONFIG_QUOTA
-	int i;
-	int ms_active_set;
-	int quota_enabled[REISERFS_MAXQUOTAS];
-#endif
-
-	/* compose key to look for "save" links */
-	max_cpu_key.version = KEY_FORMAT_3_5;
-	max_cpu_key.on_disk_key.k_dir_id = ~0U;
-	max_cpu_key.on_disk_key.k_objectid = ~0U;
-	set_cpu_key_k_offset(&max_cpu_key, ~0U);
-	max_cpu_key.key_length = 3;
-
-	memset(&last_inode_key, 0, sizeof(last_inode_key));
-
-#ifdef CONFIG_QUOTA
-	/* Needed for iput() to work correctly and not trash data */
-	if (s->s_flags & SB_ACTIVE) {
-		ms_active_set = 0;
-	} else {
-		ms_active_set = 1;
-		s->s_flags |= SB_ACTIVE;
-	}
-	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		quota_enabled[i] = 1;
-		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret;
-
-			if (sb_has_quota_active(s, i)) {
-				quota_enabled[i] = 0;
-				continue;
-			}
-			reiserfs_write_unlock(s);
-			ret = reiserfs_quota_on_mount(s, i);
-			reiserfs_write_lock(s);
-			if (ret < 0)
-				reiserfs_warning(s, "reiserfs-2500",
-						 "cannot turn on journaled "
-						 "quota: error %d", ret);
-		}
-	}
-#endif
-
-	done = 0;
-	REISERFS_SB(s)->s_is_unlinked_ok = 1;
-	while (!retval) {
-		int depth;
-		retval = search_item(s, &max_cpu_key, &path);
-		if (retval != ITEM_NOT_FOUND) {
-			reiserfs_error(s, "vs-2140",
-				       "search_by_key returned %d", retval);
-			break;
-		}
-
-		bh = get_last_bh(&path);
-		item_pos = get_item_pos(&path);
-		if (item_pos != B_NR_ITEMS(bh)) {
-			reiserfs_warning(s, "vs-2060",
-					 "wrong position found");
-			break;
-		}
-		item_pos--;
-		ih = item_head(bh, item_pos);
-
-		if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
-			/* there are no "save" links anymore */
-			break;
-
-		save_link_key = ih->ih_key;
-		if (is_indirect_le_ih(ih))
-			truncate = 1;
-		else
-			truncate = 0;
-
-		/* reiserfs_iget needs k_dirid and k_objectid only */
-		item = ih_item_body(bh, ih);
-		obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
-		obj_key.on_disk_key.k_objectid =
-		    le32_to_cpu(ih->ih_key.k_objectid);
-		obj_key.on_disk_key.k_offset = 0;
-		obj_key.on_disk_key.k_type = 0;
-
-		pathrelse(&path);
-
-		inode = reiserfs_iget(s, &obj_key);
-		if (IS_ERR_OR_NULL(inode)) {
-			/*
-			 * the unlink almost completed, it just did not
-			 * manage to remove "save" link and release objectid
-			 */
-			reiserfs_warning(s, "vs-2180", "iget failed for %K",
-					 &obj_key);
-			retval = remove_save_link_only(s, &save_link_key, 1);
-			continue;
-		}
-
-		if (!truncate && inode->i_nlink) {
-			/* file is not unlinked */
-			reiserfs_warning(s, "vs-2185",
-					 "file %K is not unlinked",
-					 &obj_key);
-			retval = remove_save_link_only(s, &save_link_key, 0);
-			continue;
-		}
-		depth = reiserfs_write_unlock_nested(inode->i_sb);
-		dquot_initialize(inode);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-
-		if (truncate && S_ISDIR(inode->i_mode)) {
-			/*
-			 * We got a truncate request for a dir which
-			 * is impossible.  The only imaginable way is to
-			 * execute unfinished truncate request then boot
-			 * into old kernel, remove the file and create dir
-			 * with the same key.
-			 */
-			reiserfs_warning(s, "green-2101",
-					 "impossible truncate on a "
-					 "directory %k. Please report",
-					 INODE_PKEY(inode));
-			retval = remove_save_link_only(s, &save_link_key, 0);
-			truncate = 0;
-			iput(inode);
-			continue;
-		}
-
-		if (truncate) {
-			REISERFS_I(inode)->i_flags |=
-			    i_link_saved_truncate_mask;
-			/*
-			 * not completed truncate found. New size was
-			 * committed together with "save" link
-			 */
-			reiserfs_info(s, "Truncating %k to %lld ..",
-				      INODE_PKEY(inode), inode->i_size);
-
-			/* don't update modification time */
-			reiserfs_truncate_file(inode, 0);
-
-			retval = remove_save_link(inode, truncate);
-		} else {
-			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
-			/* not completed unlink (rmdir) found */
-			reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
-			if (memcmp(&last_inode_key, INODE_PKEY(inode),
-					sizeof(last_inode_key))){
-				last_inode_key = *INODE_PKEY(inode);
-				/* removal gets completed in iput */
-				retval = 0;
-			} else {
-				reiserfs_warning(s, "super-2189", "Dead loop "
-						 "in finish_unfinished "
-						 "detected, just remove "
-						 "save link\n");
-				retval = remove_save_link_only(s,
-							&save_link_key, 0);
-			}
-		}
-
-		iput(inode);
-		printk("done\n");
-		done++;
-	}
-	REISERFS_SB(s)->s_is_unlinked_ok = 0;
-
-#ifdef CONFIG_QUOTA
-	/* Turn quotas off */
-	reiserfs_write_unlock(s);
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i] && quota_enabled[i])
-			dquot_quota_off(s, i);
-	}
-	reiserfs_write_lock(s);
-	if (ms_active_set)
-		/* Restore the flag back */
-		s->s_flags &= ~SB_ACTIVE;
-#endif
-	pathrelse(&path);
-	if (done)
-		reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
-			      "Completed\n", done);
-	return retval;
-}
-
-/*
- * to protect file being unlinked from getting lost we "safe" link files
- * being unlinked. This link will be deleted in the same transaction with last
- * item of file. mounting the filesystem we scan all these links and remove
- * files which almost got lost
- */
-void add_save_link(struct reiserfs_transaction_handle *th,
-		   struct inode *inode, int truncate)
-{
-	INITIALIZE_PATH(path);
-	int retval;
-	struct cpu_key key;
-	struct item_head ih;
-	__le32 link;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* file can only get one "save link" of each kind */
-	RFALSE(truncate &&
-	       (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
-	       "saved link already exists for truncated inode %lx",
-	       (long)inode->i_ino);
-	RFALSE(!truncate &&
-	       (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
-	       "saved link already exists for unlinked inode %lx",
-	       (long)inode->i_ino);
-
-	/* setup key of "save" link */
-	key.version = KEY_FORMAT_3_5;
-	key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
-	key.on_disk_key.k_objectid = inode->i_ino;
-	if (!truncate) {
-		/* unlink, rmdir, rename */
-		set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
-		set_cpu_key_k_type(&key, TYPE_DIRECT);
-
-		/* item head of "safe" link */
-		make_le_item_head(&ih, &key, key.version,
-				  1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
-				  4 /*length */ , 0xffff /*free space */ );
-	} else {
-		/* truncate */
-		if (S_ISDIR(inode->i_mode))
-			reiserfs_warning(inode->i_sb, "green-2102",
-					 "Adding a truncate savelink for "
-					 "a directory %k! Please report",
-					 INODE_PKEY(inode));
-		set_cpu_key_k_offset(&key, 1);
-		set_cpu_key_k_type(&key, TYPE_INDIRECT);
-
-		/* item head of "safe" link */
-		make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
-				  4 /*length */ , 0 /*free space */ );
-	}
-	key.key_length = 3;
-
-	/* look for its place in the tree */
-	retval = search_item(inode->i_sb, &key, &path);
-	if (retval != ITEM_NOT_FOUND) {
-		if (retval != -ENOSPC)
-			reiserfs_error(inode->i_sb, "vs-2100",
-				       "search_by_key (%K) returned %d", &key,
-				       retval);
-		pathrelse(&path);
-		return;
-	}
-
-	/* body of "save" link */
-	link = INODE_PKEY(inode)->k_dir_id;
-
-	/* put "save" link into tree, don't charge quota to anyone */
-	retval =
-	    reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
-	if (retval) {
-		if (retval != -ENOSPC)
-			reiserfs_error(inode->i_sb, "vs-2120",
-				       "insert_item returned %d", retval);
-	} else {
-		if (truncate)
-			REISERFS_I(inode)->i_flags |=
-			    i_link_saved_truncate_mask;
-		else
-			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
-	}
-}
-
-/* this opens transaction unlike add_save_link */
-int remove_save_link(struct inode *inode, int truncate)
-{
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_key key;
-	int err;
-
-	/* we are going to do one balancing only */
-	err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
-	if (err)
-		return err;
-
-	/* setup key of "save" link */
-	key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
-	key.k_objectid = INODE_PKEY(inode)->k_objectid;
-	if (!truncate) {
-		/* unlink, rmdir, rename */
-		set_le_key_k_offset(KEY_FORMAT_3_5, &key,
-				    1 + inode->i_sb->s_blocksize);
-		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
-	} else {
-		/* truncate */
-		set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
-		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
-	}
-
-	if ((truncate &&
-	     (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
-	    (!truncate &&
-	     (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
-		/* don't take quota bytes from anywhere */
-		reiserfs_delete_solid_item(&th, NULL, &key);
-	if (!truncate) {
-		reiserfs_release_objectid(&th, inode->i_ino);
-		REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
-	} else
-		REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
-
-	return journal_end(&th);
-}
-
-static void reiserfs_kill_sb(struct super_block *s)
-{
-	if (REISERFS_SB(s)) {
-		reiserfs_proc_info_done(s);
-		/*
-		 * Force any pending inode evictions to occur now. Any
-		 * inodes to be removed that have extended attributes
-		 * associated with them need to clean them up before
-		 * we can release the extended attribute root dentries.
-		 * shrink_dcache_for_umount will BUG if we don't release
-		 * those before it's called so ->put_super is too late.
-		 */
-		shrink_dcache_sb(s);
-
-		dput(REISERFS_SB(s)->xattr_root);
-		REISERFS_SB(s)->xattr_root = NULL;
-		dput(REISERFS_SB(s)->priv_root);
-		REISERFS_SB(s)->priv_root = NULL;
-	}
-
-	kill_block_super(s);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_off(struct super_block *sb, int type);
-
-static void reiserfs_quota_off_umount(struct super_block *s)
-{
-	int type;
-
-	for (type = 0; type < REISERFS_MAXQUOTAS; type++)
-		reiserfs_quota_off(s, type);
-}
-#else
-static inline void reiserfs_quota_off_umount(struct super_block *s)
-{
-}
-#endif
-
-static void reiserfs_put_super(struct super_block *s)
-{
-	struct reiserfs_transaction_handle th;
-	th.t_trans_id = 0;
-
-	reiserfs_quota_off_umount(s);
-
-	reiserfs_write_lock(s);
-
-	/*
-	 * change file system state to current state if it was mounted
-	 * with read-write permissions
-	 */
-	if (!sb_rdonly(s)) {
-		if (!journal_begin(&th, s, 10)) {
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
-					    REISERFS_SB(s)->s_mount_state);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		}
-	}
-
-	/*
-	 * note, journal_release checks for readonly mount, and can
-	 * decide not to do a journal_end
-	 */
-	journal_release(&th, s);
-
-	reiserfs_free_bitmap_cache(s);
-
-	brelse(SB_BUFFER_WITH_SB(s));
-
-	print_statistics(s);
-
-	if (REISERFS_SB(s)->reserved_blocks != 0) {
-		reiserfs_warning(s, "green-2005", "reserved blocks left %d",
-				 REISERFS_SB(s)->reserved_blocks);
-	}
-
-	reiserfs_write_unlock(s);
-	mutex_destroy(&REISERFS_SB(s)->lock);
-	destroy_workqueue(REISERFS_SB(s)->commit_wq);
-	kfree(REISERFS_SB(s)->s_jdev);
-	kfree(s->s_fs_info);
-	s->s_fs_info = NULL;
-}
-
-static struct kmem_cache *reiserfs_inode_cachep;
-
-static struct inode *reiserfs_alloc_inode(struct super_block *sb)
-{
-	struct reiserfs_inode_info *ei;
-	ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	atomic_set(&ei->openers, 0);
-	mutex_init(&ei->tailpack);
-#ifdef CONFIG_QUOTA
-	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
-#endif
-
-	return &ei->vfs_inode;
-}
-
-static void reiserfs_free_inode(struct inode *inode)
-{
-	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
-
-	INIT_LIST_HEAD(&ei->i_prealloc_list);
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int __init init_inodecache(void)
-{
-	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
-						  sizeof(struct
-							 reiserfs_inode_info),
-						  0, (SLAB_RECLAIM_ACCOUNT|
-						      SLAB_ACCOUNT),
-						  init_once);
-	if (reiserfs_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(reiserfs_inode_cachep);
-}
-
-/* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode, int flags)
-{
-	struct reiserfs_transaction_handle th;
-
-	int err = 0;
-
-	if (sb_rdonly(inode->i_sb)) {
-		reiserfs_warning(inode->i_sb, "clm-6006",
-				 "writing inode %lu on readonly FS",
-				 inode->i_ino);
-		return;
-	}
-	reiserfs_write_lock(inode->i_sb);
-
-	/*
-	 * this is really only used for atime updates, so they don't have
-	 * to be included in O_SYNC or fsync
-	 */
-	err = journal_begin(&th, inode->i_sb, 1);
-	if (err)
-		goto out;
-
-	reiserfs_update_sd(&th, inode);
-	journal_end(&th);
-
-out:
-	reiserfs_write_unlock(inode->i_sb);
-}
-
-static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct super_block *s = root->d_sb;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	long opts = REISERFS_SB(s)->s_mount_opt;
-
-	if (opts & (1 << REISERFS_LARGETAIL))
-		seq_puts(seq, ",tails=on");
-	else if (!(opts & (1 << REISERFS_SMALLTAIL)))
-		seq_puts(seq, ",notail");
-	/* tails=small is default so we don't show it */
-
-	if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
-		seq_puts(seq, ",barrier=none");
-	/* barrier=flush is default so we don't show it */
-
-	if (opts & (1 << REISERFS_ERROR_CONTINUE))
-		seq_puts(seq, ",errors=continue");
-	else if (opts & (1 << REISERFS_ERROR_PANIC))
-		seq_puts(seq, ",errors=panic");
-	/* errors=ro is default so we don't show it */
-
-	if (opts & (1 << REISERFS_DATA_LOG))
-		seq_puts(seq, ",data=journal");
-	else if (opts & (1 << REISERFS_DATA_WRITEBACK))
-		seq_puts(seq, ",data=writeback");
-	/* data=ordered is default so we don't show it */
-
-	if (opts & (1 << REISERFS_ATTRS))
-		seq_puts(seq, ",attrs");
-
-	if (opts & (1 << REISERFS_XATTRS_USER))
-		seq_puts(seq, ",user_xattr");
-
-	if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
-		seq_puts(seq, ",expose_privroot");
-
-	if (opts & (1 << REISERFS_POSIXACL))
-		seq_puts(seq, ",acl");
-
-	if (REISERFS_SB(s)->s_jdev)
-		seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
-
-	if (journal->j_max_commit_age != journal->j_default_max_commit_age)
-		seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
-
-#ifdef CONFIG_QUOTA
-	if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
-		seq_show_option(seq, "usrjquota",
-				REISERFS_SB(s)->s_qf_names[USRQUOTA]);
-	else if (opts & (1 << REISERFS_USRQUOTA))
-		seq_puts(seq, ",usrquota");
-	if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
-		seq_show_option(seq, "grpjquota",
-				REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
-	else if (opts & (1 << REISERFS_GRPQUOTA))
-		seq_puts(seq, ",grpquota");
-	if (REISERFS_SB(s)->s_jquota_fmt) {
-		if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
-			seq_puts(seq, ",jqfmt=vfsold");
-		else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
-			seq_puts(seq, ",jqfmt=vfsv0");
-	}
-#endif
-
-	/* Block allocator options */
-	if (opts & (1 << REISERFS_NO_BORDER))
-		seq_puts(seq, ",block-allocator=noborder");
-	if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
-		seq_puts(seq, ",block-allocator=no_unhashed_relocation");
-	if (opts & (1 << REISERFS_HASHED_RELOCATION))
-		seq_puts(seq, ",block-allocator=hashed_relocation");
-	if (opts & (1 << REISERFS_TEST4))
-		seq_puts(seq, ",block-allocator=test4");
-	show_alloc_options(seq, s);
-	return 0;
-}
-
-#ifdef CONFIG_QUOTA
-static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
-				    size_t, loff_t);
-static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
-				   loff_t);
-
-static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
-{
-	return REISERFS_I(inode)->i_dquot;
-}
-#endif
-
-static const struct super_operations reiserfs_sops = {
-	.alloc_inode = reiserfs_alloc_inode,
-	.free_inode = reiserfs_free_inode,
-	.write_inode = reiserfs_write_inode,
-	.dirty_inode = reiserfs_dirty_inode,
-	.evict_inode = reiserfs_evict_inode,
-	.put_super = reiserfs_put_super,
-	.sync_fs = reiserfs_sync_fs,
-	.freeze_fs = reiserfs_freeze,
-	.unfreeze_fs = reiserfs_unfreeze,
-	.statfs = reiserfs_statfs,
-	.remount_fs = reiserfs_remount,
-	.show_options = reiserfs_show_options,
-#ifdef CONFIG_QUOTA
-	.quota_read = reiserfs_quota_read,
-	.quota_write = reiserfs_quota_write,
-	.get_dquots = reiserfs_get_dquots,
-#endif
-};
-
-#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-
-static int reiserfs_write_dquot(struct dquot *);
-static int reiserfs_acquire_dquot(struct dquot *);
-static int reiserfs_release_dquot(struct dquot *);
-static int reiserfs_mark_dquot_dirty(struct dquot *);
-static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
-
-static const struct dquot_operations reiserfs_quota_operations = {
-	.write_dquot = reiserfs_write_dquot,
-	.acquire_dquot = reiserfs_acquire_dquot,
-	.release_dquot = reiserfs_release_dquot,
-	.mark_dirty = reiserfs_mark_dquot_dirty,
-	.write_info = reiserfs_write_info,
-	.alloc_dquot	= dquot_alloc,
-	.destroy_dquot	= dquot_destroy,
-	.get_next_id	= dquot_get_next_id,
-};
-
-static const struct quotactl_ops reiserfs_qctl_operations = {
-	.quota_on = reiserfs_quota_on,
-	.quota_off = reiserfs_quota_off,
-	.quota_sync = dquot_quota_sync,
-	.get_state = dquot_get_state,
-	.set_info = dquot_set_dqinfo,
-	.get_dqblk = dquot_get_dqblk,
-	.set_dqblk = dquot_set_dqblk,
-};
-#endif
-
-static const struct export_operations reiserfs_export_ops = {
-	.encode_fh = reiserfs_encode_fh,
-	.fh_to_dentry = reiserfs_fh_to_dentry,
-	.fh_to_parent = reiserfs_fh_to_parent,
-	.get_parent = reiserfs_get_parent,
-};
-
-/*
- * this struct is used in reiserfs_getopt () for containing the value for
- * those mount options that have values rather than being toggles.
- */
-typedef struct {
-	char *value;
-	/*
-	 * bitmask which is to set on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 */
-	int setmask;
-	/*
-	 * bitmask which is to clear on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 * This is applied BEFORE setmask
-	 */
-	int clrmask;
-} arg_desc_t;
-
-/* Set this bit in arg_required to allow empty arguments */
-#define REISERFS_OPT_ALLOWEMPTY 31
-
-/*
- * this struct is used in reiserfs_getopt() for describing the
- * set of reiserfs mount options
- */
-typedef struct {
-	char *option_name;
-
-	/* 0 if argument is not required, not 0 otherwise */
-	int arg_required;
-
-	/* list of values accepted by an option */
-	const arg_desc_t *values;
-
-	/*
-	 * bitmask which is to set on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 */
-	int setmask;
-
-	/*
-	 * bitmask which is to clear on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 * This is applied BEFORE setmask
-	 */
-	int clrmask;
-} opt_desc_t;
-
-/* possible values for -o data= */
-static const arg_desc_t logging_mode[] = {
-	{"ordered", 1 << REISERFS_DATA_ORDERED,
-	 (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
-	{"journal", 1 << REISERFS_DATA_LOG,
-	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
-	{"writeback", 1 << REISERFS_DATA_WRITEBACK,
-	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
-	{.value = NULL}
-};
-
-/* possible values for -o barrier= */
-static const arg_desc_t barrier_mode[] = {
-	{"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
-	{"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
-	{.value = NULL}
-};
-
-/*
- * possible values for "-o block-allocator=" and bits which are to be set in
- * s_mount_opt of reiserfs specific part of in-core super block
- */
-static const arg_desc_t balloc[] = {
-	{"noborder", 1 << REISERFS_NO_BORDER, 0},
-	{"border", 0, 1 << REISERFS_NO_BORDER},
-	{"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
-	{"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
-	{"test4", 1 << REISERFS_TEST4, 0},
-	{"notest4", 0, 1 << REISERFS_TEST4},
-	{NULL, 0, 0}
-};
-
-static const arg_desc_t tails[] = {
-	{"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
-	{"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
-	{"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
-	{NULL, 0, 0}
-};
-
-static const arg_desc_t error_actions[] = {
-	{"panic", 1 << REISERFS_ERROR_PANIC,
-	 (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
-	{"ro-remount", 1 << REISERFS_ERROR_RO,
-	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
-#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
-	{"continue", 1 << REISERFS_ERROR_CONTINUE,
-	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
-#endif
-	{NULL, 0, 0},
-};
-
-/*
- * proceed only one option from a list *cur - string containing of mount
- * options
- * opts - array of options which are accepted
- * opt_arg - if option is found and requires an argument and if it is specifed
- * in the input - pointer to the argument is stored here
- * bit_flags - if option requires to set a certain bit - it is set here
- * return -1 if unknown option is found, opt->arg_required otherwise
- */
-static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
-			   char **opt_arg, unsigned long *bit_flags)
-{
-	char *p;
-	/*
-	 * foo=bar,
-	 * ^   ^  ^
-	 * |   |  +-- option_end
-	 * |   +-- arg_start
-	 * +-- option_start
-	 */
-	const opt_desc_t *opt;
-	const arg_desc_t *arg;
-
-	p = *cur;
-
-	/* assume argument cannot contain commas */
-	*cur = strchr(p, ',');
-	if (*cur) {
-		*(*cur) = '\0';
-		(*cur)++;
-	}
-
-	if (!strncmp(p, "alloc=", 6)) {
-		/*
-		 * Ugly special case, probably we should redo options
-		 * parser so that it can understand several arguments for
-		 * some options, also so that it can fill several bitfields
-		 * with option values.
-		 */
-		if (reiserfs_parse_alloc_options(s, p + 6)) {
-			return -1;
-		} else {
-			return 0;
-		}
-	}
-
-	/* for every option in the list */
-	for (opt = opts; opt->option_name; opt++) {
-		if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
-			if (bit_flags) {
-				if (opt->clrmask ==
-				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "super-6500",
-							 "%s not supported.\n",
-							 p);
-				else
-					*bit_flags &= ~opt->clrmask;
-				if (opt->setmask ==
-				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "super-6501",
-							 "%s not supported.\n",
-							 p);
-				else
-					*bit_flags |= opt->setmask;
-			}
-			break;
-		}
-	}
-	if (!opt->option_name) {
-		reiserfs_warning(s, "super-6502",
-				 "unknown mount option \"%s\"", p);
-		return -1;
-	}
-
-	p += strlen(opt->option_name);
-	switch (*p) {
-	case '=':
-		if (!opt->arg_required) {
-			reiserfs_warning(s, "super-6503",
-					 "the option \"%s\" does not "
-					 "require an argument\n",
-					 opt->option_name);
-			return -1;
-		}
-		break;
-
-	case 0:
-		if (opt->arg_required) {
-			reiserfs_warning(s, "super-6504",
-					 "the option \"%s\" requires an "
-					 "argument\n", opt->option_name);
-			return -1;
-		}
-		break;
-	default:
-		reiserfs_warning(s, "super-6505",
-				 "head of option \"%s\" is only correct\n",
-				 opt->option_name);
-		return -1;
-	}
-
-	/*
-	 * move to the argument, or to next option if argument is not
-	 * required
-	 */
-	p++;
-
-	if (opt->arg_required
-	    && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
-	    && !strlen(p)) {
-		/* this catches "option=," if not allowed */
-		reiserfs_warning(s, "super-6506",
-				 "empty argument for \"%s\"\n",
-				 opt->option_name);
-		return -1;
-	}
-
-	if (!opt->values) {
-		/* *=NULLopt_arg contains pointer to argument */
-		*opt_arg = p;
-		return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
-	}
-
-	/* values possible for this option are listed in opt->values */
-	for (arg = opt->values; arg->value; arg++) {
-		if (!strcmp(p, arg->value)) {
-			if (bit_flags) {
-				*bit_flags &= ~arg->clrmask;
-				*bit_flags |= arg->setmask;
-			}
-			return opt->arg_required;
-		}
-	}
-
-	reiserfs_warning(s, "super-6506",
-			 "bad value \"%s\" for option \"%s\"\n", p,
-			 opt->option_name);
-	return -1;
-}
-
-/* returns 0 if something is wrong in option string, 1 - otherwise */
-static int reiserfs_parse_options(struct super_block *s,
-
-				  /* string given via mount's -o */
-				  char *options,
-
-				  /*
-				   * after the parsing phase, contains the
-				   * collection of bitflags defining what
-				   * mount options were selected.
-				   */
-				  unsigned long *mount_options,
-
-				  /* strtol-ed from NNN of resize=NNN */
-				  unsigned long *blocks,
-				  char **jdev_name,
-				  unsigned int *commit_max_age,
-				  char **qf_names,
-				  unsigned int *qfmt)
-{
-	int c;
-	char *arg = NULL;
-	char *pos;
-	opt_desc_t opts[] = {
-		/*
-		 * Compatibility stuff, so that -o notail for old
-		 * setups still work
-		 */
-		{"tails",.arg_required = 't',.values = tails},
-		{"notail",.clrmask =
-		 (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
-		{"conv",.setmask = 1 << REISERFS_CONVERT},
-		{"attrs",.setmask = 1 << REISERFS_ATTRS},
-		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
-		{"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
-#ifdef CONFIG_REISERFS_FS_XATTR
-		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
-		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
-#else
-		{"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
-		{"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-		{"acl",.setmask = 1 << REISERFS_POSIXACL},
-		{"noacl",.clrmask = 1 << REISERFS_POSIXACL},
-#else
-		{"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
-		{"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
-		{.option_name = "nolog"},
-		{"replayonly",.setmask = 1 << REPLAYONLY},
-		{"block-allocator",.arg_required = 'a',.values = balloc},
-		{"data",.arg_required = 'd',.values = logging_mode},
-		{"barrier",.arg_required = 'b',.values = barrier_mode},
-		{"resize",.arg_required = 'r',.values = NULL},
-		{"jdev",.arg_required = 'j',.values = NULL},
-		{"nolargeio",.arg_required = 'w',.values = NULL},
-		{"commit",.arg_required = 'c',.values = NULL},
-		{"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
-		{"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
-		{"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
-		{"errors",.arg_required = 'e',.values = error_actions},
-		{"usrjquota",.arg_required =
-		 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
-		{"grpjquota",.arg_required =
-		 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
-		{"jqfmt",.arg_required = 'f',.values = NULL},
-		{.option_name = NULL}
-	};
-
-	*blocks = 0;
-	if (!options || !*options)
-		/*
-		 * use default configuration: create tails, journaling on, no
-		 * conversion to newest format
-		 */
-		return 1;
-
-	for (pos = options; pos;) {
-		c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
-		if (c == -1)
-			/* wrong option is given */
-			return 0;
-
-		if (c == 'r') {
-			char *p;
-
-			p = NULL;
-			/* "resize=NNN" or "resize=auto" */
-
-			if (!strcmp(arg, "auto")) {
-				/* From JFS code, to auto-get the size. */
-				*blocks = sb_bdev_nr_blocks(s);
-			} else {
-				*blocks = simple_strtoul(arg, &p, 0);
-				if (*p != '\0') {
-					/* NNN does not look like a number */
-					reiserfs_warning(s, "super-6507",
-							 "bad value %s for "
-							 "-oresize\n", arg);
-					return 0;
-				}
-			}
-		}
-
-		if (c == 'c') {
-			char *p = NULL;
-			unsigned long val = simple_strtoul(arg, &p, 0);
-			/* commit=NNN (time in seconds) */
-			if (*p != '\0' || val >= (unsigned int)-1) {
-				reiserfs_warning(s, "super-6508",
-						 "bad value %s for -ocommit\n",
-						 arg);
-				return 0;
-			}
-			*commit_max_age = (unsigned int)val;
-		}
-
-		if (c == 'w') {
-			reiserfs_warning(s, "super-6509", "nolargeio option "
-					 "is no longer supported");
-			return 0;
-		}
-
-		if (c == 'j') {
-			if (arg && *arg && jdev_name) {
-				/* Hm, already assigned? */
-				if (*jdev_name) {
-					reiserfs_warning(s, "super-6510",
-							 "journal device was "
-							 "already specified to "
-							 "be %s", *jdev_name);
-					return 0;
-				}
-				*jdev_name = arg;
-			}
-		}
-#ifdef CONFIG_QUOTA
-		if (c == 'u' || c == 'g') {
-			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
-
-			if (sb_any_quota_loaded(s) &&
-			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
-				reiserfs_warning(s, "super-6511",
-						 "cannot change journaled "
-						 "quota options when quota "
-						 "turned on.");
-				return 0;
-			}
-			if (qf_names[qtype] !=
-			    REISERFS_SB(s)->s_qf_names[qtype])
-				kfree(qf_names[qtype]);
-			qf_names[qtype] = NULL;
-			if (*arg) {	/* Some filename specified? */
-				if (REISERFS_SB(s)->s_qf_names[qtype]
-				    && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
-					      arg)) {
-					reiserfs_warning(s, "super-6512",
-							 "%s quota file "
-							 "already specified.",
-							 QTYPE2NAME(qtype));
-					return 0;
-				}
-				if (strchr(arg, '/')) {
-					reiserfs_warning(s, "super-6513",
-							 "quotafile must be "
-							 "on filesystem root.");
-					return 0;
-				}
-				qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
-				if (!qf_names[qtype]) {
-					reiserfs_warning(s, "reiserfs-2502",
-							 "not enough memory "
-							 "for storing "
-							 "quotafile name.");
-					return 0;
-				}
-				if (qtype == USRQUOTA)
-					*mount_options |= 1 << REISERFS_USRQUOTA;
-				else
-					*mount_options |= 1 << REISERFS_GRPQUOTA;
-			} else {
-				if (qtype == USRQUOTA)
-					*mount_options &= ~(1 << REISERFS_USRQUOTA);
-				else
-					*mount_options &= ~(1 << REISERFS_GRPQUOTA);
-			}
-		}
-		if (c == 'f') {
-			if (!strcmp(arg, "vfsold"))
-				*qfmt = QFMT_VFS_OLD;
-			else if (!strcmp(arg, "vfsv0"))
-				*qfmt = QFMT_VFS_V0;
-			else {
-				reiserfs_warning(s, "super-6514",
-						 "unknown quota format "
-						 "specified.");
-				return 0;
-			}
-			if (sb_any_quota_loaded(s) &&
-			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
-				reiserfs_warning(s, "super-6515",
-						 "cannot change journaled "
-						 "quota options when quota "
-						 "turned on.");
-				return 0;
-			}
-		}
-#else
-		if (c == 'u' || c == 'g' || c == 'f') {
-			reiserfs_warning(s, "reiserfs-2503", "journaled "
-					 "quota options not supported.");
-			return 0;
-		}
-#endif
-	}
-
-#ifdef CONFIG_QUOTA
-	if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
-	    && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
-		reiserfs_warning(s, "super-6515",
-				 "journaled quota format not specified.");
-		return 0;
-	}
-	if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
-	       sb_has_quota_loaded(s, USRQUOTA)) ||
-	    (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
-	       sb_has_quota_loaded(s, GRPQUOTA))) {
-		reiserfs_warning(s, "super-6516", "quota options must "
-				 "be present when quota is turned on.");
-		return 0;
-	}
-#endif
-
-	return 1;
-}
-
-static void switch_data_mode(struct super_block *s, unsigned long mode)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
-					 (1 << REISERFS_DATA_ORDERED) |
-					 (1 << REISERFS_DATA_WRITEBACK));
-	REISERFS_SB(s)->s_mount_opt |= (1 << mode);
-}
-
-static void handle_data_mode(struct super_block *s, unsigned long mount_options)
-{
-	if (mount_options & (1 << REISERFS_DATA_LOG)) {
-		if (!reiserfs_data_log(s)) {
-			switch_data_mode(s, REISERFS_DATA_LOG);
-			reiserfs_info(s, "switching to journaled data mode\n");
-		}
-	} else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
-		if (!reiserfs_data_ordered(s)) {
-			switch_data_mode(s, REISERFS_DATA_ORDERED);
-			reiserfs_info(s, "switching to ordered data mode\n");
-		}
-	} else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
-		if (!reiserfs_data_writeback(s)) {
-			switch_data_mode(s, REISERFS_DATA_WRITEBACK);
-			reiserfs_info(s, "switching to writeback data mode\n");
-		}
-	}
-}
-
-static void handle_barrier_mode(struct super_block *s, unsigned long bits)
-{
-	int flush = (1 << REISERFS_BARRIER_FLUSH);
-	int none = (1 << REISERFS_BARRIER_NONE);
-	int all_barrier = flush | none;
-
-	if (bits & all_barrier) {
-		REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
-		if (bits & flush) {
-			REISERFS_SB(s)->s_mount_opt |= flush;
-			printk("reiserfs: enabling write barrier flush mode\n");
-		} else if (bits & none) {
-			REISERFS_SB(s)->s_mount_opt |= none;
-			printk("reiserfs: write barriers turned off\n");
-		}
-	}
-}
-
-static void handle_attrs(struct super_block *s)
-{
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-
-	if (reiserfs_attrs(s)) {
-		if (old_format_only(s)) {
-			reiserfs_warning(s, "super-6517", "cannot support "
-					 "attributes on 3.5.x disk format");
-			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
-			return;
-		}
-		if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
-			reiserfs_warning(s, "super-6518", "cannot support "
-					 "attributes until flag is set in "
-					 "super-block");
-			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
-		}
-	}
-}
-
-#ifdef CONFIG_QUOTA
-static void handle_quota_files(struct super_block *s, char **qf_names,
-			       unsigned int *qfmt)
-{
-	int i;
-
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
-			kfree(REISERFS_SB(s)->s_qf_names[i]);
-		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
-	}
-	if (*qfmt)
-		REISERFS_SB(s)->s_jquota_fmt = *qfmt;
-}
-#endif
-
-static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
-{
-	struct reiserfs_super_block *rs;
-	struct reiserfs_transaction_handle th;
-	unsigned long blocks;
-	unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
-	unsigned long safe_mask = 0;
-	unsigned int commit_max_age = (unsigned int)-1;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int err;
-	char *qf_names[REISERFS_MAXQUOTAS];
-	unsigned int qfmt = 0;
-#ifdef CONFIG_QUOTA
-	int i;
-#endif
-
-	sync_filesystem(s);
-	reiserfs_write_lock(s);
-
-#ifdef CONFIG_QUOTA
-	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
-#endif
-
-	rs = SB_DISK_SUPER_BLOCK(s);
-
-	if (!reiserfs_parse_options
-	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
-	    qf_names, &qfmt)) {
-#ifdef CONFIG_QUOTA
-		for (i = 0; i < REISERFS_MAXQUOTAS; i++)
-			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
-				kfree(qf_names[i]);
-#endif
-		err = -EINVAL;
-		goto out_err_unlock;
-	}
-#ifdef CONFIG_QUOTA
-	handle_quota_files(s, qf_names, &qfmt);
-#endif
-
-	handle_attrs(s);
-
-	/* Add options that are safe here */
-	safe_mask |= 1 << REISERFS_SMALLTAIL;
-	safe_mask |= 1 << REISERFS_LARGETAIL;
-	safe_mask |= 1 << REISERFS_NO_BORDER;
-	safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
-	safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
-	safe_mask |= 1 << REISERFS_TEST4;
-	safe_mask |= 1 << REISERFS_ATTRS;
-	safe_mask |= 1 << REISERFS_XATTRS_USER;
-	safe_mask |= 1 << REISERFS_POSIXACL;
-	safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
-	safe_mask |= 1 << REISERFS_BARRIER_NONE;
-	safe_mask |= 1 << REISERFS_ERROR_RO;
-	safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
-	safe_mask |= 1 << REISERFS_ERROR_PANIC;
-	safe_mask |= 1 << REISERFS_USRQUOTA;
-	safe_mask |= 1 << REISERFS_GRPQUOTA;
-
-	/*
-	 * Update the bitmask, taking care to keep
-	 * the bits we're not allowed to change here
-	 */
-	REISERFS_SB(s)->s_mount_opt =
-	    (REISERFS_SB(s)->
-	     s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
-
-	if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
-		journal->j_max_commit_age = commit_max_age;
-		journal->j_max_trans_age = commit_max_age;
-	} else if (commit_max_age == 0) {
-		/* 0 means restore defaults. */
-		journal->j_max_commit_age = journal->j_default_max_commit_age;
-		journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-	}
-
-	if (blocks) {
-		err = reiserfs_resize(s, blocks);
-		if (err != 0)
-			goto out_err_unlock;
-	}
-
-	if (*mount_flags & SB_RDONLY) {
-		reiserfs_write_unlock(s);
-		reiserfs_xattr_init(s, *mount_flags);
-		/* remount read-only */
-		if (sb_rdonly(s))
-			/* it is read-only already */
-			goto out_ok_unlocked;
-
-		err = dquot_suspend(s, -1);
-		if (err < 0)
-			goto out_err;
-
-		/* try to remount file system with read-only permissions */
-		if (sb_umount_state(rs) == REISERFS_VALID_FS
-		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
-			goto out_ok_unlocked;
-		}
-
-		reiserfs_write_lock(s);
-
-		err = journal_begin(&th, s, 10);
-		if (err)
-			goto out_err_unlock;
-
-		/* Mounting a rw partition read-only. */
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-		set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-	} else {
-		/* remount read-write */
-		if (!sb_rdonly(s)) {
-			reiserfs_write_unlock(s);
-			reiserfs_xattr_init(s, *mount_flags);
-			goto out_ok_unlocked;	/* We are read-write already */
-		}
-
-		if (reiserfs_is_journal_aborted(journal)) {
-			err = journal->j_errno;
-			goto out_err_unlock;
-		}
-
-		handle_data_mode(s, mount_options);
-		handle_barrier_mode(s, mount_options);
-		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-
-		/* now it is safe to call journal_begin */
-		s->s_flags &= ~SB_RDONLY;
-		err = journal_begin(&th, s, 10);
-		if (err)
-			goto out_err_unlock;
-
-		/* Mount a partition which is read-only, read-write */
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-		s->s_flags &= ~SB_RDONLY;
-		set_sb_umount_state(rs, REISERFS_ERROR_FS);
-		if (!old_format_only(s))
-			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
-		/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
-	}
-	/* this will force a full flush of all journal lists */
-	SB_JOURNAL(s)->j_must_wait = 1;
-	err = journal_end(&th);
-	if (err)
-		goto out_err_unlock;
-
-	reiserfs_write_unlock(s);
-	if (!(*mount_flags & SB_RDONLY)) {
-		dquot_resume(s, -1);
-		reiserfs_write_lock(s);
-		finish_unfinished(s);
-		reiserfs_write_unlock(s);
-		reiserfs_xattr_init(s, *mount_flags);
-	}
-
-out_ok_unlocked:
-	return 0;
-
-out_err_unlock:
-	reiserfs_write_unlock(s);
-out_err:
-	return err;
-}
-
-static int read_super_block(struct super_block *s, int offset)
-{
-	struct buffer_head *bh;
-	struct reiserfs_super_block *rs;
-	int fs_blocksize;
-
-	bh = sb_bread(s, offset / s->s_blocksize);
-	if (!bh) {
-		reiserfs_warning(s, "sh-2006",
-				 "bread failed (dev %s, block %lu, size %lu)",
-				 s->s_id, offset / s->s_blocksize,
-				 s->s_blocksize);
-		return 1;
-	}
-
-	rs = (struct reiserfs_super_block *)bh->b_data;
-	if (!is_any_reiserfs_magic_string(rs)) {
-		brelse(bh);
-		return 1;
-	}
-	/*
-	 * ok, reiserfs signature (old or new) found in at the given offset
-	 */
-	fs_blocksize = sb_blocksize(rs);
-	brelse(bh);
-	sb_set_blocksize(s, fs_blocksize);
-
-	bh = sb_bread(s, offset / s->s_blocksize);
-	if (!bh) {
-		reiserfs_warning(s, "sh-2007",
-				 "bread failed (dev %s, block %lu, size %lu)",
-				 s->s_id, offset / s->s_blocksize,
-				 s->s_blocksize);
-		return 1;
-	}
-
-	rs = (struct reiserfs_super_block *)bh->b_data;
-	if (sb_blocksize(rs) != s->s_blocksize) {
-		reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
-				 "filesystem on (dev %s, block %llu, size %lu)",
-				 s->s_id,
-				 (unsigned long long)bh->b_blocknr,
-				 s->s_blocksize);
-		brelse(bh);
-		return 1;
-	}
-
-	if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
-		brelse(bh);
-		reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
-				 "--rebuild-tree run detected. Please run\n"
-				 "reiserfsck --rebuild-tree and wait for a "
-				 "completion. If that fails\n"
-				 "get newer reiserfsprogs package");
-		return 1;
-	}
-
-	reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
-		"scheduled to be removed from the kernel in 2025");
-	SB_BUFFER_WITH_SB(s) = bh;
-	SB_DISK_SUPER_BLOCK(s) = rs;
-
-	/*
-	 * magic is of non-standard journal filesystem, look at s_version to
-	 * find which format is in use
-	 */
-	if (is_reiserfs_jr(rs)) {
-		if (sb_version(rs) == REISERFS_VERSION_2)
-			reiserfs_info(s, "found reiserfs format \"3.6\""
-				      " with non-standard journal\n");
-		else if (sb_version(rs) == REISERFS_VERSION_1)
-			reiserfs_info(s, "found reiserfs format \"3.5\""
-				      " with non-standard journal\n");
-		else {
-			reiserfs_warning(s, "sh-2012", "found unknown "
-					 "format \"%u\" of reiserfs with "
-					 "non-standard magic", sb_version(rs));
-			return 1;
-		}
-	} else
-		/*
-		 * s_version of standard format may contain incorrect
-		 * information, so we just look at the magic string
-		 */
-		reiserfs_info(s,
-			      "found reiserfs format \"%s\" with standard journal\n",
-			      is_reiserfs_3_5(rs) ? "3.5" : "3.6");
-
-	s->s_op = &reiserfs_sops;
-	s->s_export_op = &reiserfs_export_ops;
-#ifdef CONFIG_QUOTA
-	s->s_qcop = &reiserfs_qctl_operations;
-	s->dq_op = &reiserfs_quota_operations;
-	s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
-#endif
-
-	/*
-	 * new format is limited by the 32 bit wide i_blocks field, want to
-	 * be one full block below that.
-	 */
-	s->s_maxbytes = (512LL << 32) - s->s_blocksize;
-	return 0;
-}
-
-/* after journal replay, reread all bitmap and super blocks */
-static int reread_meta_blocks(struct super_block *s)
-{
-	if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
-		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
-		return 1;
-	}
-
-	return 0;
-}
-
-/* hash detection stuff */
-
-/*
- * if root directory is empty - we set default - Yura's - hash and
- * warn about it
- * FIXME: we look for only one name in a directory. If tea and yura
- * both have the same value - we ask user to send report to the
- * mailing list
- */
-static __u32 find_hash_out(struct super_block *s)
-{
-	int retval;
-	struct inode *inode;
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-	struct reiserfs_de_head *deh;
-	__u32 hash = DEFAULT_HASH;
-	__u32 deh_hashval, teahash, r5hash, yurahash;
-
-	inode = d_inode(s->s_root);
-
-	make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
-	retval = search_by_entry_key(s, &key, &path, &de);
-	if (retval == IO_ERROR) {
-		pathrelse(&path);
-		return UNSET_HASH;
-	}
-	if (retval == NAME_NOT_FOUND)
-		de.de_entry_num--;
-
-	set_de_name_and_namelen(&de);
-	deh = de.de_deh + de.de_entry_num;
-
-	if (deh_offset(deh) == DOT_DOT_OFFSET) {
-		/* allow override in this case */
-		if (reiserfs_rupasov_hash(s))
-			hash = YURA_HASH;
-		reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
-		goto out;
-	}
-
-	deh_hashval = GET_HASH_VALUE(deh_offset(deh));
-	r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
-	teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
-	yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
-
-	if ((teahash == r5hash && deh_hashval == r5hash) ||
-	    (teahash == yurahash && deh_hashval == yurahash) ||
-	    (r5hash == yurahash && deh_hashval == yurahash)) {
-		reiserfs_warning(s, "reiserfs-2506",
-				 "Unable to automatically detect hash "
-				 "function. Please mount with -o "
-				 "hash={tea,rupasov,r5}");
-		hash = UNSET_HASH;
-		goto out;
-	}
-
-	if (deh_hashval == yurahash)
-		hash = YURA_HASH;
-	else if (deh_hashval == teahash)
-		hash = TEA_HASH;
-	else if (deh_hashval == r5hash)
-		hash = R5_HASH;
-	else {
-		reiserfs_warning(s, "reiserfs-2506",
-				 "Unrecognised hash function");
-		hash = UNSET_HASH;
-	}
-out:
-	pathrelse(&path);
-	return hash;
-}
-
-/* finds out which hash names are sorted with */
-static int what_hash(struct super_block *s)
-{
-	__u32 code;
-
-	code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
-
-	/*
-	 * reiserfs_hash_detect() == true if any of the hash mount options
-	 * were used.  We must check them to make sure the user isn't
-	 * using a bad hash value
-	 */
-	if (code == UNSET_HASH || reiserfs_hash_detect(s))
-		code = find_hash_out(s);
-
-	if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
-		/*
-		 * detection has found the hash, and we must check against the
-		 * mount options
-		 */
-		if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
-			reiserfs_warning(s, "reiserfs-2507",
-					 "Error, %s hash detected, "
-					 "unable to force rupasov hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
-			reiserfs_warning(s, "reiserfs-2508",
-					 "Error, %s hash detected, "
-					 "unable to force tea hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
-			reiserfs_warning(s, "reiserfs-2509",
-					 "Error, %s hash detected, "
-					 "unable to force r5 hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		}
-	} else {
-		/*
-		 * find_hash_out was not called or
-		 * could not determine the hash
-		 */
-		if (reiserfs_rupasov_hash(s)) {
-			code = YURA_HASH;
-		} else if (reiserfs_tea_hash(s)) {
-			code = TEA_HASH;
-		} else if (reiserfs_r5_hash(s)) {
-			code = R5_HASH;
-		}
-	}
-
-	/*
-	 * if we are mounted RW, and we have a new valid hash code, update
-	 * the super
-	 */
-	if (code != UNSET_HASH &&
-	    !sb_rdonly(s) &&
-	    code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
-		set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
-	}
-	return code;
-}
-
-/* return pointer to appropriate function */
-static hashf_t hash_function(struct super_block *s)
-{
-	switch (what_hash(s)) {
-	case TEA_HASH:
-		reiserfs_info(s, "Using tea hash to sort names\n");
-		return keyed_hash;
-	case YURA_HASH:
-		reiserfs_info(s, "Using rupasov hash to sort names\n");
-		return yura_hash;
-	case R5_HASH:
-		reiserfs_info(s, "Using r5 hash to sort names\n");
-		return r5_hash;
-	}
-	return NULL;
-}
-
-/* this is used to set up correct value for old partitions */
-static int function2code(hashf_t func)
-{
-	if (func == keyed_hash)
-		return TEA_HASH;
-	if (func == yura_hash)
-		return YURA_HASH;
-	if (func == r5_hash)
-		return R5_HASH;
-
-	BUG();			/* should never happen */
-
-	return 0;
-}
-
-#define SWARN(silent, s, id, ...)			\
-	if (!(silent))				\
-		reiserfs_warning(s, id, __VA_ARGS__)
-
-static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct inode *root_inode;
-	struct reiserfs_transaction_handle th;
-	int old_format = 0;
-	unsigned long blocks;
-	unsigned int commit_max_age = 0;
-	int jinit_done = 0;
-	struct reiserfs_iget_args args;
-	struct reiserfs_super_block *rs;
-	char *jdev_name;
-	struct reiserfs_sb_info *sbi;
-	int errval = -EINVAL;
-	char *qf_names[REISERFS_MAXQUOTAS] = {};
-	unsigned int qfmt = 0;
-
-	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-	s->s_fs_info = sbi;
-	/* Set default values for options: non-aggressive tails, RO on errors */
-	sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
-	sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-	sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
-	/* no preallocation minimum, be smart in reiserfs_file_write instead */
-	sbi->s_alloc_options.preallocmin = 0;
-	/* Preallocate by 16 blocks (17-1) at once */
-	sbi->s_alloc_options.preallocsize = 17;
-	/* setup default block allocator options */
-	reiserfs_init_alloc_options(s);
-
-	spin_lock_init(&sbi->old_work_lock);
-	INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
-	mutex_init(&sbi->lock);
-	sbi->lock_depth = -1;
-
-	sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
-					 s->s_id);
-	if (!sbi->commit_wq) {
-		SWARN(silent, s, "", "Cannot allocate commit workqueue");
-		errval = -ENOMEM;
-		goto error_unlocked;
-	}
-
-	jdev_name = NULL;
-	if (reiserfs_parse_options
-	    (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
-	     &commit_max_age, qf_names, &qfmt) == 0) {
-		goto error_unlocked;
-	}
-	if (jdev_name && jdev_name[0]) {
-		sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
-		if (!sbi->s_jdev) {
-			SWARN(silent, s, "", "Cannot allocate memory for "
-				"journal device name");
-			goto error_unlocked;
-		}
-	}
-#ifdef CONFIG_QUOTA
-	handle_quota_files(s, qf_names, &qfmt);
-#endif
-
-	if (blocks) {
-		SWARN(silent, s, "jmacd-7", "resize option for remount only");
-		goto error_unlocked;
-	}
-
-	/*
-	 * try old format (undistributed bitmap, super block in 8-th 1k
-	 * block of a device)
-	 */
-	if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
-		old_format = 1;
-
-	/*
-	 * try new format (64-th 1k block), which can contain reiserfs
-	 * super block
-	 */
-	else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
-		SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
-		      s->s_id);
-		goto error_unlocked;
-	}
-
-	s->s_time_min = 0;
-	s->s_time_max = U32_MAX;
-
-	rs = SB_DISK_SUPER_BLOCK(s);
-	/*
-	 * Let's do basic sanity check to verify that underlying device is not
-	 * smaller than the filesystem. If the check fails then abort and
-	 * scream, because bad stuff will happen otherwise.
-	 */
-	if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
-		SWARN(silent, s, "", "Filesystem cannot be "
-		      "mounted because it is bigger than the device");
-		SWARN(silent, s, "", "You may need to run fsck "
-		      "or increase size of your LVM partition");
-		SWARN(silent, s, "", "Or may be you forgot to "
-		      "reboot after fdisk when it told you to");
-		goto error_unlocked;
-	}
-
-	sbi->s_mount_state = SB_REISERFS_STATE(s);
-	sbi->s_mount_state = REISERFS_VALID_FS;
-
-	if ((errval = reiserfs_init_bitmap_cache(s))) {
-		SWARN(silent, s, "jmacd-8", "unable to read bitmap");
-		goto error_unlocked;
-	}
-
-	errval = -EINVAL;
-#ifdef CONFIG_REISERFS_CHECK
-	SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
-	SWARN(silent, s, "", "- it is slow mode for debugging.");
-#endif
-
-	/* make data=ordered the default */
-	if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
-	    !reiserfs_data_writeback(s)) {
-		sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
-	}
-
-	if (reiserfs_data_log(s)) {
-		reiserfs_info(s, "using journaled data mode\n");
-	} else if (reiserfs_data_ordered(s)) {
-		reiserfs_info(s, "using ordered data mode\n");
-	} else {
-		reiserfs_info(s, "using writeback data mode\n");
-	}
-	if (reiserfs_barrier_flush(s)) {
-		printk("reiserfs: using flush barriers\n");
-	}
-
-	if (journal_init(s, jdev_name, old_format, commit_max_age)) {
-		SWARN(silent, s, "sh-2022",
-		      "unable to initialize journal space");
-		goto error_unlocked;
-	} else {
-		/*
-		 * once this is set, journal_release must be called
-		 * if we error out of the mount
-		 */
-		jinit_done = 1;
-	}
-
-	if (reread_meta_blocks(s)) {
-		SWARN(silent, s, "jmacd-9",
-		      "unable to reread meta blocks after journal init");
-		goto error_unlocked;
-	}
-
-	if (replay_only(s))
-		goto error_unlocked;
-
-	s->s_xattr = reiserfs_xattr_handlers;
-
-	if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
-		SWARN(silent, s, "clm-7000",
-		      "Detected readonly device, marking FS readonly");
-		s->s_flags |= SB_RDONLY;
-	}
-	args.objectid = REISERFS_ROOT_OBJECTID;
-	args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
-	root_inode =
-	    iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
-			 reiserfs_init_locked_inode, (void *)&args);
-	if (!root_inode) {
-		SWARN(silent, s, "jmacd-10", "get root inode failed");
-		goto error_unlocked;
-	}
-
-	/*
-	 * This path assumed to be called with the BKL in the old times.
-	 * Now we have inherited the big reiserfs lock from it and many
-	 * reiserfs helpers called in the mount path and elsewhere require
-	 * this lock to be held even if it's not always necessary. Let's be
-	 * conservative and hold it early. The window can be reduced after
-	 * careful review of the code.
-	 */
-	reiserfs_write_lock(s);
-
-	if (root_inode->i_state & I_NEW) {
-		reiserfs_read_locked_inode(root_inode, &args);
-		unlock_new_inode(root_inode);
-	}
-
-	if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
-	    !root_inode->i_size) {
-		SWARN(silent, s, "", "corrupt root inode, run fsck");
-		iput(root_inode);
-		errval = -EUCLEAN;
-		goto error;
-	}
-
-	s->s_root = d_make_root(root_inode);
-	if (!s->s_root)
-		goto error;
-	/* define and initialize hash function */
-	sbi->s_hash_function = hash_function(s);
-	if (sbi->s_hash_function == NULL) {
-		dput(s->s_root);
-		s->s_root = NULL;
-		goto error;
-	}
-
-	if (is_reiserfs_3_5(rs)
-	    || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
-		set_bit(REISERFS_3_5, &sbi->s_properties);
-	else if (old_format)
-		set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
-	else
-		set_bit(REISERFS_3_6, &sbi->s_properties);
-
-	if (!sb_rdonly(s)) {
-
-		errval = journal_begin(&th, s, 1);
-		if (errval) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error;
-		}
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-
-		set_sb_umount_state(rs, REISERFS_ERROR_FS);
-		set_sb_fs_state(rs, 0);
-
-		/*
-		 * Clear out s_bmap_nr if it would wrap. We can handle this
-		 * case, but older revisions can't. This will cause the
-		 * file system to fail mount on those older implementations,
-		 * avoiding corruption. -jeffm
-		 */
-		if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
-		    sb_bmap_nr(rs) != 0) {
-			reiserfs_warning(s, "super-2030", "This file system "
-					"claims to use %u bitmap blocks in "
-					"its super block, but requires %u. "
-					"Clearing to zero.", sb_bmap_nr(rs),
-					reiserfs_bmap_count(s));
-
-			set_sb_bmap_nr(rs, 0);
-		}
-
-		if (old_format_only(s)) {
-			/*
-			 * filesystem of format 3.5 either with standard
-			 * or non-standard journal
-			 */
-			if (convert_reiserfs(s)) {
-				/* and -o conv is given */
-				if (!silent)
-					reiserfs_info(s,
-						      "converting 3.5 filesystem to the 3.6 format");
-
-				if (is_reiserfs_3_5(rs))
-					/*
-					 * put magic string of 3.6 format.
-					 * 2.2 will not be able to
-					 * mount this filesystem anymore
-					 */
-					memcpy(rs->s_v1.s_magic,
-					       reiserfs_3_6_magic_string,
-					       sizeof
-					       (reiserfs_3_6_magic_string));
-
-				set_sb_version(rs, REISERFS_VERSION_2);
-				reiserfs_convert_objectid_map_v1(s);
-				set_bit(REISERFS_3_6, &sbi->s_properties);
-				clear_bit(REISERFS_3_5, &sbi->s_properties);
-			} else if (!silent) {
-				reiserfs_info(s, "using 3.5.x disk format\n");
-			}
-		} else
-			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
-
-
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		errval = journal_end(&th);
-		if (errval) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error;
-		}
-
-		reiserfs_write_unlock(s);
-		if ((errval = reiserfs_lookup_privroot(s)) ||
-		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error_unlocked;
-		}
-		reiserfs_write_lock(s);
-
-		/*
-		 * look for files which were to be removed in previous session
-		 */
-		finish_unfinished(s);
-	} else {
-		if (old_format_only(s) && !silent) {
-			reiserfs_info(s, "using 3.5.x disk format\n");
-		}
-
-		reiserfs_write_unlock(s);
-		if ((errval = reiserfs_lookup_privroot(s)) ||
-		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error_unlocked;
-		}
-		reiserfs_write_lock(s);
-	}
-	/*
-	 * mark hash in super block: it could be unset. overwrite should be ok
-	 */
-	set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
-
-	handle_attrs(s);
-
-	reiserfs_proc_info_init(s);
-
-	init_waitqueue_head(&(sbi->s_wait));
-	spin_lock_init(&sbi->bitmap_lock);
-
-	reiserfs_write_unlock(s);
-
-	return (0);
-
-error:
-	reiserfs_write_unlock(s);
-
-error_unlocked:
-	/* kill the commit thread, free journal ram */
-	if (jinit_done) {
-		reiserfs_write_lock(s);
-		journal_release_error(NULL, s);
-		reiserfs_write_unlock(s);
-	}
-
-	if (sbi->commit_wq)
-		destroy_workqueue(sbi->commit_wq);
-
-	reiserfs_cancel_old_flush(s);
-
-	reiserfs_free_bitmap_cache(s);
-	if (SB_BUFFER_WITH_SB(s))
-		brelse(SB_BUFFER_WITH_SB(s));
-#ifdef CONFIG_QUOTA
-	{
-		int j;
-		for (j = 0; j < REISERFS_MAXQUOTAS; j++)
-			kfree(qf_names[j]);
-	}
-#endif
-	kfree(sbi->s_jdev);
-	kfree(sbi);
-
-	s->s_fs_info = NULL;
-	return errval;
-}
-
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
-
-	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
-	buf->f_bfree = sb_free_blocks(rs);
-	buf->f_bavail = buf->f_bfree;
-	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = dentry->d_sb->s_blocksize;
-	/* changed to accommodate gcc folks. */
-	buf->f_type = REISERFS_SUPER_MAGIC;
-	buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
-	buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
-				sizeof(rs->s_uuid)/2);
-
-	return 0;
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_write_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
-	ret = dquot_commit(dquot);
-	reiserfs_write_lock_nested(dquot->dq_sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(dquot->dq_sb);
-	return ret;
-}
-
-static int reiserfs_acquire_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
-	ret = dquot_acquire(dquot);
-	reiserfs_write_lock_nested(dquot->dq_sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(dquot->dq_sb);
-	return ret;
-}
-
-static int reiserfs_release_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	reiserfs_write_unlock(dquot->dq_sb);
-	if (ret) {
-		/* Release dquot anyway to avoid endless cycle in dqput() */
-		dquot_release(dquot);
-		goto out;
-	}
-	ret = dquot_release(dquot);
-	reiserfs_write_lock(dquot->dq_sb);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-	reiserfs_write_unlock(dquot->dq_sb);
-out:
-	return ret;
-}
-
-static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
-{
-	/* Are we journaling quotas? */
-	if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-	    REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
-		dquot_mark_dquot_dirty(dquot);
-		return reiserfs_write_dquot(dquot);
-	} else
-		return dquot_mark_dquot_dirty(dquot);
-}
-
-static int reiserfs_write_info(struct super_block *sb, int type)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	/* Data block + inode block */
-	reiserfs_write_lock(sb);
-	ret = journal_begin(&th, sb, 2);
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(sb);
-	ret = dquot_commit_info(sb, type);
-	reiserfs_write_lock_nested(sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(sb);
-	return ret;
-}
-
-/*
- * Turn on quotas during mount time - we need to find the quota file and such...
- */
-static int reiserfs_quota_on_mount(struct super_block *sb, int type)
-{
-	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-					REISERFS_SB(sb)->s_jquota_fmt, type);
-}
-
-/*
- * Standard function to be called on quota_on
- */
-static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     const struct path *path)
-{
-	int err;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
-
-	reiserfs_write_lock(sb);
-	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/* Quotafile not on the same filesystem? */
-	if (path->dentry->d_sb != sb) {
-		err = -EXDEV;
-		goto out;
-	}
-	inode = d_inode(path->dentry);
-	/*
-	 * We must not pack tails for quota files on reiserfs for quota
-	 * IO to work
-	 */
-	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
-		err = reiserfs_unpack(inode);
-		if (err) {
-			reiserfs_warning(sb, "super-6520",
-				"Unpacking tail of quota file failed"
-				" (%d). Cannot turn on quotas.", err);
-			err = -EINVAL;
-			goto out;
-		}
-		mark_inode_dirty(inode);
-	}
-	/* Journaling quota? */
-	if (REISERFS_SB(sb)->s_qf_names[type]) {
-		/* Quotafile not of fs root? */
-		if (path->dentry->d_parent != sb->s_root)
-			reiserfs_warning(sb, "super-6521",
-				 "Quota file not on filesystem root. "
-				 "Journalled quota will not work.");
-	}
-
-	/*
-	 * When we journal data on quota file, we have to flush journal to see
-	 * all updates to the file when we bypass pagecache...
-	 */
-	if (reiserfs_file_data_log(inode)) {
-		/* Just start temporary transaction and finish it */
-		err = journal_begin(&th, sb, 1);
-		if (err)
-			goto out;
-		err = journal_end_sync(&th);
-		if (err)
-			goto out;
-	}
-	reiserfs_write_unlock(sb);
-	err = dquot_quota_on(sb, type, format_id, path);
-	if (!err) {
-		inode_lock(inode);
-		REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
-					      REISERFS_NOATIME_FL;
-		inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
-				S_IMMUTABLE | S_NOATIME);
-		inode_unlock(inode);
-		mark_inode_dirty(inode);
-	}
-	return err;
-out:
-	reiserfs_write_unlock(sb);
-	return err;
-}
-
-static int reiserfs_quota_off(struct super_block *sb, int type)
-{
-	int err;
-	struct inode *inode = sb_dqopt(sb)->files[type];
-
-	if (!inode || !igrab(inode))
-		goto out;
-
-	err = dquot_quota_off(sb, type);
-	if (err)
-		goto out_put;
-
-	inode_lock(inode);
-	REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
-					REISERFS_NOATIME_FL);
-	inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
-	inode_unlock(inode);
-	mark_inode_dirty(inode);
-out_put:
-	iput(inode);
-	return err;
-out:
-	return dquot_quota_off(sb, type);
-}
-
-/*
- * Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races
- */
-static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
-				   size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	unsigned long blk = off >> sb->s_blocksize_bits;
-	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-	size_t toread;
-	struct buffer_head tmp_bh, *bh;
-	loff_t i_size = i_size_read(inode);
-
-	if (off > i_size)
-		return 0;
-	if (off + len > i_size)
-		len = i_size - off;
-	toread = len;
-	while (toread > 0) {
-		tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
-		tmp_bh.b_state = 0;
-		/*
-		 * Quota files are without tails so we can safely
-		 * use this function
-		 */
-		reiserfs_write_lock(sb);
-		err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
-		reiserfs_write_unlock(sb);
-		if (err)
-			return err;
-		if (!buffer_mapped(&tmp_bh))	/* A hole? */
-			memset(data, 0, tocopy);
-		else {
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
-			if (!bh)
-				return -EIO;
-			memcpy(data, bh->b_data + offset, tocopy);
-			brelse(bh);
-		}
-		offset = 0;
-		toread -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-	return len;
-}
-
-/*
- * Write to quotafile (we know the transaction is already started and has
- * enough credits)
- */
-static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
-				    const char *data, size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	unsigned long blk = off >> sb->s_blocksize_bits;
-	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-	int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
-	struct buffer_head tmp_bh, *bh;
-
-	if (!current->journal_info) {
-		printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
-			(unsigned long long)off, (unsigned long long)len);
-		return -EIO;
-	}
-	while (towrite > 0) {
-		tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
-		tmp_bh.b_state = 0;
-		reiserfs_write_lock(sb);
-		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
-		reiserfs_write_unlock(sb);
-		if (err)
-			goto out;
-		if (offset || tocopy != sb->s_blocksize)
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
-		else
-			bh = sb_getblk(sb, tmp_bh.b_blocknr);
-		if (!bh) {
-			err = -EIO;
-			goto out;
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data + offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		reiserfs_write_lock(sb);
-		reiserfs_prepare_for_journal(sb, bh, 1);
-		journal_mark_dirty(current->journal_info, bh);
-		if (!journal_quota)
-			reiserfs_add_ordered_list(inode, bh);
-		reiserfs_write_unlock(sb);
-		brelse(bh);
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-out:
-	if (len == towrite)
-		return err;
-	if (inode->i_size < off + len - towrite)
-		i_size_write(inode, off + len - towrite);
-	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-	mark_inode_dirty(inode);
-	return len - towrite;
-}
-
-#endif
-
-static struct dentry *get_super_block(struct file_system_type *fs_type,
-			   int flags, const char *dev_name,
-			   void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
-}
-
-static int __init init_reiserfs_fs(void)
-{
-	int ret;
-
-	ret = init_inodecache();
-	if (ret)
-		return ret;
-
-	reiserfs_proc_info_global_init();
-
-	ret = register_filesystem(&reiserfs_fs_type);
-	if (ret)
-		goto out;
-
-	return 0;
-out:
-	reiserfs_proc_info_global_done();
-	destroy_inodecache();
-
-	return ret;
-}
-
-static void __exit exit_reiserfs_fs(void)
-{
-	reiserfs_proc_info_global_done();
-	unregister_filesystem(&reiserfs_fs_type);
-	destroy_inodecache();
-}
-
-struct file_system_type reiserfs_fs_type = {
-	.owner = THIS_MODULE,
-	.name = "reiserfs",
-	.mount = get_super_block,
-	.kill_sb = reiserfs_kill_sb,
-	.fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("reiserfs");
-
-MODULE_DESCRIPTION("ReiserFS journaled filesystem");
-MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
-MODULE_LICENSE("GPL");
-
-module_init(init_reiserfs_fs);
-module_exit(exit_reiserfs_fs);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
deleted file mode 100644
index 2cec61af2a9e..000000000000
--- a/fs/reiserfs/tail_conversion.c
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
- * details
- */
-
-#include <linux/time.h>
-#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
-#include "reiserfs.h"
-
-/*
- * access to tail : when one is going to read tail it must make sure, that is
- * not running.  direct2indirect and indirect2direct can not run concurrently
- */
-
-/*
- * Converts direct items to an unformatted node. Panics if file has no
- * tail. -ENOSPC if no disk space for conversion
- */
-/*
- * path points to first direct item of the file regardless of how many of
- * them are there
- */
-int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
-		    struct treepath *path, struct buffer_head *unbh,
-		    loff_t tail_offset)
-{
-	struct super_block *sb = inode->i_sb;
-	struct buffer_head *up_to_date_bh;
-	struct item_head *p_le_ih = tp_item_head(path);
-	unsigned long total_tail = 0;
-
-	/* Key to search for the last byte of the converted item. */
-	struct cpu_key end_key;
-
-	/*
-	 * new indirect item to be inserted or key
-	 * of unfm pointer to be pasted
-	 */
-	struct item_head ind_ih;
-	int blk_size;
-	/* returned value for reiserfs_insert_item and clones */
-	int  retval;
-	/* Handle on an unformatted node that will be inserted in the tree. */
-	unp_t unfm_ptr;
-
-	BUG_ON(!th->t_trans_id);
-
-	REISERFS_SB(sb)->s_direct2indirect++;
-
-	blk_size = sb->s_blocksize;
-
-	/*
-	 * and key to search for append or insert pointer to the new
-	 * unformatted node.
-	 */
-	copy_item_head(&ind_ih, p_le_ih);
-	set_le_ih_k_offset(&ind_ih, tail_offset);
-	set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
-
-	/* Set the key to search for the place for new unfm pointer */
-	make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
-
-	/* FIXME: we could avoid this */
-	if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
-		reiserfs_error(sb, "PAP-14030",
-			       "pasted or inserted byte exists in "
-			       "the tree %K. Use fsck to repair.", &end_key);
-		pathrelse(path);
-		return -EIO;
-	}
-
-	p_le_ih = tp_item_head(path);
-
-	unfm_ptr = cpu_to_le32(unbh->b_blocknr);
-
-	if (is_statdata_le_ih(p_le_ih)) {
-		/* Insert new indirect item. */
-		set_ih_free_space(&ind_ih, 0);	/* delete at nearest future */
-		put_ih_item_len(&ind_ih, UNFM_P_SIZE);
-		PATH_LAST_POSITION(path)++;
-		retval =
-		    reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
-					 (char *)&unfm_ptr);
-	} else {
-		/* Paste into last indirect item of an object. */
-		retval = reiserfs_paste_into_item(th, path, &end_key, inode,
-						    (char *)&unfm_ptr,
-						    UNFM_P_SIZE);
-	}
-	if (retval) {
-		return retval;
-	}
-	/*
-	 * note: from here there are two keys which have matching first
-	 *  three key components. They only differ by the fourth one.
-	 */
-
-	/* Set the key to search for the direct items of the file */
-	make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
-		     4);
-
-	/*
-	 * Move bytes from the direct items to the new unformatted node
-	 * and delete them.
-	 */
-	while (1) {
-		int tail_size;
-
-		/*
-		 * end_key.k_offset is set so, that we will always have found
-		 * last item of the file
-		 */
-		if (search_for_position_by_key(sb, &end_key, path) ==
-		    POSITION_FOUND)
-			reiserfs_panic(sb, "PAP-14050",
-				       "direct item (%K) not found", &end_key);
-		p_le_ih = tp_item_head(path);
-		RFALSE(!is_direct_le_ih(p_le_ih),
-		       "vs-14055: direct item expected(%K), found %h",
-		       &end_key, p_le_ih);
-		tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
-		    + ih_item_len(p_le_ih) - 1;
-
-		/*
-		 * we only send the unbh pointer if the buffer is not
-		 * up to date.  this avoids overwriting good data from
-		 * writepage() with old data from the disk or buffer cache
-		 * Special case: unbh->b_page will be NULL if we are coming
-		 * through DIRECT_IO handler here.
-		 */
-		if (!unbh->b_page || buffer_uptodate(unbh)
-		    || PageUptodate(unbh->b_page)) {
-			up_to_date_bh = NULL;
-		} else {
-			up_to_date_bh = unbh;
-		}
-		retval = reiserfs_delete_item(th, path, &end_key, inode,
-						up_to_date_bh);
-
-		total_tail += retval;
-
-		/* done: file does not have direct items anymore */
-		if (tail_size == retval)
-			break;
-
-	}
-	/*
-	 * if we've copied bytes from disk into the page, we need to zero
-	 * out the unused part of the block (it was not up to date before)
-	 */
-	if (up_to_date_bh) {
-		unsigned pgoff =
-		    (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
-		char *kaddr = kmap_atomic(up_to_date_bh->b_page);
-		memset(kaddr + pgoff, 0, blk_size - total_tail);
-		kunmap_atomic(kaddr);
-	}
-
-	REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-
-	return 0;
-}
-
-/* stolen from fs/buffer.c */
-void reiserfs_unmap_buffer(struct buffer_head *bh)
-{
-	lock_buffer(bh);
-	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-		BUG();
-	}
-	clear_buffer_dirty(bh);
-	/*
-	 * Remove the buffer from whatever list it belongs to. We are mostly
-	 * interested in removing it from per-sb j_dirty_buffers list, to avoid
-	 * BUG() on attempt to write not mapped buffer
-	 */
-	if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
-		struct inode *inode = bh->b_folio->mapping->host;
-		struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-		spin_lock(&j->j_dirty_buffers_lock);
-		list_del_init(&bh->b_assoc_buffers);
-		reiserfs_free_jh(bh);
-		spin_unlock(&j->j_dirty_buffers_lock);
-	}
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	bh->b_bdev = NULL;
-	unlock_buffer(bh);
-}
-
-/*
- * this first locks inode (neither reads nor sync are permitted),
- * reads tail through page cache, insert direct item. When direct item
- * inserted successfully inode is left locked. Return value is always
- * what we expect from it (number of cut bytes). But when tail remains
- * in the unformatted node, we set mode to SKIP_BALANCING and unlock
- * inode
- */
-int indirect2direct(struct reiserfs_transaction_handle *th,
-		    struct inode *inode, struct page *page,
-		    struct treepath *path,	/* path to the indirect item. */
-		    const struct cpu_key *item_key,	/* Key to look for
-							 * unformatted node
-							 * pointer to be cut. */
-		    loff_t n_new_file_size,	/* New file size. */
-		    char *mode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct item_head s_ih;
-	unsigned long block_size = sb->s_blocksize;
-	char *tail;
-	int tail_len, round_tail_len;
-	loff_t pos, pos1;	/* position of first byte of the tail */
-	struct cpu_key key;
-
-	BUG_ON(!th->t_trans_id);
-
-	REISERFS_SB(sb)->s_indirect2direct++;
-
-	*mode = M_SKIP_BALANCING;
-
-	/* store item head path points to. */
-	copy_item_head(&s_ih, tp_item_head(path));
-
-	tail_len = (n_new_file_size & (block_size - 1));
-	if (get_inode_sd_version(inode) == STAT_DATA_V2)
-		round_tail_len = ROUND_UP(tail_len);
-	else
-		round_tail_len = tail_len;
-
-	pos =
-	    le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
-					 1) * sb->s_blocksize;
-	pos1 = pos;
-
-	/*
-	 * we are protected by i_mutex. The tail can not disapper, not
-	 * append can be done either
-	 * we are in truncate or packing tail in file_release
-	 */
-
-	tail = (char *)kmap(page);	/* this can schedule */
-
-	if (path_changed(&s_ih, path)) {
-		/* re-search indirect item */
-		if (search_for_position_by_key(sb, item_key, path)
-		    == POSITION_NOT_FOUND)
-			reiserfs_panic(sb, "PAP-5520",
-				       "item to be converted %K does not exist",
-				       item_key);
-		copy_item_head(&s_ih, tp_item_head(path));
-#ifdef CONFIG_REISERFS_CHECK
-		pos = le_ih_k_offset(&s_ih) - 1 +
-		    (ih_item_len(&s_ih) / UNFM_P_SIZE -
-		     1) * sb->s_blocksize;
-		if (pos != pos1)
-			reiserfs_panic(sb, "vs-5530", "tail position "
-				       "changed while we were reading it");
-#endif
-	}
-
-	/* Set direct item header to insert. */
-	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
-			  pos1 + 1, TYPE_DIRECT, round_tail_len,
-			  0xffff /*ih_free_space */ );
-
-	/*
-	 * we want a pointer to the first byte of the tail in the page.
-	 * the page was locked and this part of the page was up to date when
-	 * indirect2direct was called, so we know the bytes are still valid
-	 */
-	tail = tail + (pos & (PAGE_SIZE - 1));
-
-	PATH_LAST_POSITION(path)++;
-
-	key = *item_key;
-	set_cpu_key_k_type(&key, TYPE_DIRECT);
-	key.key_length = 4;
-	/* Insert tail as new direct item in the tree */
-	if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
-				 tail ? tail : NULL) < 0) {
-		/*
-		 * No disk memory. So we can not convert last unformatted node
-		 * to the direct item.  In this case we used to adjust
-		 * indirect items's ih_free_space. Now ih_free_space is not
-		 * used, it would be ideal to write zeros to corresponding
-		 * unformatted node. For now i_size is considered as guard for
-		 * going out of file size
-		 */
-		kunmap(page);
-		return block_size - round_tail_len;
-	}
-	kunmap(page);
-
-	/* make sure to get the i_blocks changes from reiserfs_insert_item */
-	reiserfs_update_sd(th, inode);
-
-	/*
-	 * note: we have now the same as in above direct2indirect
-	 * conversion: there are two keys which have matching first three
-	 * key components. They only differ by the fourth one.
-	 */
-
-	/*
-	 * We have inserted new direct item and must remove last
-	 * unformatted node.
-	 */
-	*mode = M_CUT;
-
-	/* we store position of first direct item in the in-core inode */
-	/* mark_file_with_tail (inode, pos1 + 1); */
-	REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
-
-	return block_size - round_tail_len;
-}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
deleted file mode 100644
index 998035a6388e..000000000000
--- a/fs/reiserfs/xattr.c
+++ /dev/null
@@ -1,1039 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/reiserfs/xattr.c
- *
- * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
- *
- */
-
-/*
- * In order to implement EA/ACLs in a clean, backwards compatible manner,
- * they are implemented as files in a "private" directory.
- * Each EA is in it's own file, with the directory layout like so (/ is assumed
- * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
- * directories named using the capital-hex form of the objectid and
- * generation number are used. Inside each directory are individual files
- * named with the name of the extended attribute.
- *
- * So, for objectid 12648430, we could have:
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
- * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
- * .. or similar.
- *
- * The file contents are the text of the EA. The size is known based on the
- * stat data describing the file.
- *
- * In the case of system.posix_acl_access and system.posix_acl_default, since
- * these are special cases for filesystem ACLs, they are interpreted by the
- * kernel, in addition, they are negatively and positively cached and attached
- * to the inode so that unnecessary lookups are avoided.
- *
- * Locking works like so:
- * Directory components (xattr root, xattr dir) are protectd by their i_mutex.
- * The xattrs themselves are protected by the xattr_sem.
- */
-
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-#include <net/checksum.h>
-#include <linux/stat.h>
-#include <linux/quotaops.h>
-#include <linux/security.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-#define PRIVROOT_NAME ".reiserfs_priv"
-#define XAROOT_NAME   "xattrs"
-
-
-/*
- * Helpers for inode ops. We do this so that we don't have all the VFS
- * overhead and also for proper i_mutex annotation.
- * dir->i_mutex must be held for all of them.
- */
-#ifdef CONFIG_REISERFS_FS_XATTR
-static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
-{
-	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true);
-}
-#endif
-
-static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode);
-}
-
-/*
- * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
- * mutation ops aren't called during rename or splace, which are the
- * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
- * better than allocating another subclass just for this code.
- */
-static int xattr_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	BUG_ON(!inode_is_locked(dir));
-
-	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
-	error = dir->i_op->unlink(dir, dentry);
-	inode_unlock(d_inode(dentry));
-
-	if (!error)
-		d_delete(dentry);
-	return error;
-}
-
-static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	BUG_ON(!inode_is_locked(dir));
-
-	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
-	error = dir->i_op->rmdir(dir, dentry);
-	if (!error)
-		d_inode(dentry)->i_flags |= S_DEAD;
-	inode_unlock(d_inode(dentry));
-	if (!error)
-		d_delete(dentry);
-
-	return error;
-}
-
-#define xattr_may_create(flags)	(!flags || flags & XATTR_CREATE)
-
-static struct dentry *open_xa_root(struct super_block *sb, int flags)
-{
-	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
-	struct dentry *xaroot;
-
-	if (d_really_is_negative(privroot))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
-
-	xaroot = dget(REISERFS_SB(sb)->xattr_root);
-	if (!xaroot)
-		xaroot = ERR_PTR(-EOPNOTSUPP);
-	else if (d_really_is_negative(xaroot)) {
-		int err = -ENODATA;
-
-		if (xattr_may_create(flags))
-			err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
-		if (err) {
-			dput(xaroot);
-			xaroot = ERR_PTR(err);
-		}
-	}
-
-	inode_unlock(d_inode(privroot));
-	return xaroot;
-}
-
-static struct dentry *open_xa_dir(const struct inode *inode, int flags)
-{
-	struct dentry *xaroot, *xadir;
-	char namebuf[17];
-
-	xaroot = open_xa_root(inode->i_sb, flags);
-	if (IS_ERR(xaroot))
-		return xaroot;
-
-	snprintf(namebuf, sizeof(namebuf), "%X.%X",
-		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
-		 inode->i_generation);
-
-	inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
-
-	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
-	if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
-		int err = -ENODATA;
-
-		if (xattr_may_create(flags))
-			err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
-		if (err) {
-			dput(xadir);
-			xadir = ERR_PTR(err);
-		}
-	}
-
-	inode_unlock(d_inode(xaroot));
-	dput(xaroot);
-	return xadir;
-}
-
-/*
- * The following are side effects of other operations that aren't explicitly
- * modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc.
- */
-struct reiserfs_dentry_buf {
-	struct dir_context ctx;
-	struct dentry *xadir;
-	int count;
-	int err;
-	struct dentry *dentries[8];
-};
-
-static bool
-fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
-		   loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct reiserfs_dentry_buf *dbuf =
-		container_of(ctx, struct reiserfs_dentry_buf, ctx);
-	struct dentry *dentry;
-
-	WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
-
-	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
-		return false;
-
-	if (name[0] == '.' && (namelen < 2 ||
-			       (namelen == 2 && name[1] == '.')))
-		return true;
-
-	dentry = lookup_one_len(name, dbuf->xadir, namelen);
-	if (IS_ERR(dentry)) {
-		dbuf->err = PTR_ERR(dentry);
-		return false;
-	} else if (d_really_is_negative(dentry)) {
-		/* A directory entry exists, but no file? */
-		reiserfs_error(dentry->d_sb, "xattr-20003",
-			       "Corrupted directory: xattr %pd listed but "
-			       "not found for file %pd.\n",
-			       dentry, dbuf->xadir);
-		dput(dentry);
-		dbuf->err = -EIO;
-		return false;
-	}
-
-	dbuf->dentries[dbuf->count++] = dentry;
-	return true;
-}
-
-static void
-cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
-{
-	int i;
-
-	for (i = 0; i < buf->count; i++)
-		if (buf->dentries[i])
-			dput(buf->dentries[i]);
-}
-
-static int reiserfs_for_each_xattr(struct inode *inode,
-				   int (*action)(struct dentry *, void *),
-				   void *data)
-{
-	struct dentry *dir;
-	int i, err = 0;
-	struct reiserfs_dentry_buf buf = {
-		.ctx.actor = fill_with_dentries,
-	};
-
-	/* Skip out, an xattr has no xattrs associated with it */
-	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
-		return 0;
-
-	dir = open_xa_dir(inode, XATTR_REPLACE);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		goto out;
-	} else if (d_really_is_negative(dir)) {
-		err = 0;
-		goto out_dir;
-	}
-
-	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
-
-	buf.xadir = dir;
-	while (1) {
-		err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-		if (err)
-			break;
-		if (buf.err) {
-			err = buf.err;
-			break;
-		}
-		if (!buf.count)
-			break;
-		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
-			struct dentry *dentry = buf.dentries[i];
-
-			if (!d_is_dir(dentry))
-				err = action(dentry, data);
-
-			dput(dentry);
-			buf.dentries[i] = NULL;
-		}
-		if (err)
-			break;
-		buf.count = 0;
-	}
-	inode_unlock(d_inode(dir));
-
-	cleanup_dentry_buf(&buf);
-
-	if (!err) {
-		/*
-		 * We start a transaction here to avoid a ABBA situation
-		 * between the xattr root's i_mutex and the journal lock.
-		 * This doesn't incur much additional overhead since the
-		 * new transaction will just nest inside the
-		 * outer transaction.
-		 */
-		int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-			     4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
-		struct reiserfs_transaction_handle th;
-
-		reiserfs_write_lock(inode->i_sb);
-		err = journal_begin(&th, inode->i_sb, blocks);
-		reiserfs_write_unlock(inode->i_sb);
-		if (!err) {
-			int jerror;
-
-			inode_lock_nested(d_inode(dir->d_parent),
-					  I_MUTEX_XATTR);
-			err = action(dir, data);
-			reiserfs_write_lock(inode->i_sb);
-			jerror = journal_end(&th);
-			reiserfs_write_unlock(inode->i_sb);
-			inode_unlock(d_inode(dir->d_parent));
-			err = jerror ?: err;
-		}
-	}
-out_dir:
-	dput(dir);
-out:
-	/*
-	 * -ENODATA: this object doesn't have any xattrs
-	 * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
-	 * Neither are errors
-	 */
-	if (err == -ENODATA || err == -EOPNOTSUPP)
-		err = 0;
-	return err;
-}
-
-static int delete_one_xattr(struct dentry *dentry, void *data)
-{
-	struct inode *dir = d_inode(dentry->d_parent);
-
-	/* This is the xattr dir, handle specially. */
-	if (d_is_dir(dentry))
-		return xattr_rmdir(dir, dentry);
-
-	return xattr_unlink(dir, dentry);
-}
-
-static int chown_one_xattr(struct dentry *dentry, void *data)
-{
-	struct iattr *attrs = data;
-	int ia_valid = attrs->ia_valid;
-	int err;
-
-	/*
-	 * We only want the ownership bits. Otherwise, we'll do
-	 * things like change a directory to a regular file if
-	 * ATTR_MODE is set.
-	 */
-	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
-	err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs);
-	attrs->ia_valid = ia_valid;
-
-	return err;
-}
-
-/* No i_mutex, but the inode is unconnected. */
-int reiserfs_delete_xattrs(struct inode *inode)
-{
-	int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
-
-	if (err)
-		reiserfs_warning(inode->i_sb, "jdm-20004",
-				 "Couldn't delete all xattrs (%d)\n", err);
-	return err;
-}
-
-/* inode->i_mutex: down */
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
-{
-	int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
-
-	if (err)
-		reiserfs_warning(inode->i_sb, "jdm-20007",
-				 "Couldn't chown all xattrs (%d)\n", err);
-	return err;
-}
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-/*
- * Returns a dentry corresponding to a specific extended attribute file
- * for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned.
- */
-static struct dentry *xattr_lookup(struct inode *inode, const char *name,
-				    int flags)
-{
-	struct dentry *xadir, *xafile;
-	int err = 0;
-
-	xadir = open_xa_dir(inode, flags);
-	if (IS_ERR(xadir))
-		return ERR_CAST(xadir);
-
-	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
-	xafile = lookup_one_len(name, xadir, strlen(name));
-	if (IS_ERR(xafile)) {
-		err = PTR_ERR(xafile);
-		goto out;
-	}
-
-	if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
-		err = -EEXIST;
-
-	if (d_really_is_negative(xafile)) {
-		err = -ENODATA;
-		if (xattr_may_create(flags))
-			err = xattr_create(d_inode(xadir), xafile,
-					      0700|S_IFREG);
-	}
-
-	if (err)
-		dput(xafile);
-out:
-	inode_unlock(d_inode(xadir));
-	dput(xadir);
-	if (err)
-		return ERR_PTR(err);
-	return xafile;
-}
-
-/* Internal operations on file data */
-static inline void reiserfs_put_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-static struct page *reiserfs_get_page(struct inode *dir, size_t n)
-{
-	struct address_space *mapping = dir->i_mapping;
-	struct page *page;
-	/*
-	 * We can deadlock if we try to free dentries,
-	 * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
-	 */
-	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
-	if (!IS_ERR(page))
-		kmap(page);
-	return page;
-}
-
-static inline __u32 xattr_hash(const char *msg, int len)
-{
-	/*
-	 * csum_partial() gives different results for little-endian and
-	 * big endian hosts. Images created on little-endian hosts and
-	 * mounted on big-endian hosts(and vice versa) will see csum mismatches
-	 * when trying to fetch xattrs. Treating the hash as __wsum_t would
-	 * lower the frequency of mismatch.  This is an endianness bug in
-	 * reiserfs.  The return statement would result in a sparse warning. Do
-	 * not fix the sparse warning so as to not hide a reminder of the bug.
-	 */
-	return csum_partial(msg, len, 0);
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-
-static void update_ctime(struct inode *inode)
-{
-	struct timespec64 now = current_time(inode);
-	struct timespec64 ctime = inode_get_ctime(inode);
-
-	if (inode_unhashed(inode) || !inode->i_nlink ||
-	    timespec64_equal(&ctime, &now))
-		return;
-
-	inode_set_ctime_to_ts(inode, now);
-	mark_inode_dirty(inode);
-}
-
-static int lookup_and_delete_xattr(struct inode *inode, const char *name)
-{
-	int err = 0;
-	struct dentry *dentry, *xadir;
-
-	xadir = open_xa_dir(inode, XATTR_REPLACE);
-	if (IS_ERR(xadir))
-		return PTR_ERR(xadir);
-
-	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
-	dentry = lookup_one_len(name, xadir, strlen(name));
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out_dput;
-	}
-
-	if (d_really_is_positive(dentry)) {
-		err = xattr_unlink(d_inode(xadir), dentry);
-		update_ctime(inode);
-	}
-
-	dput(dentry);
-out_dput:
-	inode_unlock(d_inode(xadir));
-	dput(xadir);
-	return err;
-}
-
-
-/* Generic extended attribute operations that can be used by xa plugins */
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
-			  struct inode *inode, const char *name,
-			  const void *buffer, size_t buffer_size, int flags)
-{
-	int err = 0;
-	struct dentry *dentry;
-	struct page *page;
-	char *data;
-	size_t file_pos = 0;
-	size_t buffer_pos = 0;
-	size_t new_size;
-	__u32 xahash = 0;
-
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	if (!buffer) {
-		err = lookup_and_delete_xattr(inode, name);
-		return err;
-	}
-
-	dentry = xattr_lookup(inode, name, flags);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-
-	down_write(&REISERFS_I(inode)->i_xattr_sem);
-
-	xahash = xattr_hash(buffer, buffer_size);
-	while (buffer_pos < buffer_size || buffer_pos == 0) {
-		size_t chunk;
-		size_t skip = 0;
-		size_t page_offset = (file_pos & (PAGE_SIZE - 1));
-
-		if (buffer_size - buffer_pos > PAGE_SIZE)
-			chunk = PAGE_SIZE;
-		else
-			chunk = buffer_size - buffer_pos;
-
-		page = reiserfs_get_page(d_inode(dentry), file_pos);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto out_unlock;
-		}
-
-		lock_page(page);
-		data = page_address(page);
-
-		if (file_pos == 0) {
-			struct reiserfs_xattr_header *rxh;
-
-			skip = file_pos = sizeof(struct reiserfs_xattr_header);
-			if (chunk + skip > PAGE_SIZE)
-				chunk = PAGE_SIZE - skip;
-			rxh = (struct reiserfs_xattr_header *)data;
-			rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
-			rxh->h_hash = cpu_to_le32(xahash);
-		}
-
-		reiserfs_write_lock(inode->i_sb);
-		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
-		if (!err) {
-			if (buffer)
-				memcpy(data + skip, buffer + buffer_pos, chunk);
-			err = reiserfs_commit_write(NULL, page, page_offset,
-						    page_offset + chunk +
-						    skip);
-		}
-		reiserfs_write_unlock(inode->i_sb);
-		unlock_page(page);
-		reiserfs_put_page(page);
-		buffer_pos += chunk;
-		file_pos += chunk;
-		skip = 0;
-		if (err || buffer_size == 0 || !buffer)
-			break;
-	}
-
-	new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
-	if (!err && new_size < i_size_read(d_inode(dentry))) {
-		struct iattr newattrs = {
-			.ia_ctime = current_time(inode),
-			.ia_size = new_size,
-			.ia_valid = ATTR_SIZE | ATTR_CTIME,
-		};
-
-		inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
-		inode_dio_wait(d_inode(dentry));
-
-		err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs);
-		inode_unlock(d_inode(dentry));
-	} else
-		update_ctime(inode);
-out_unlock:
-	up_write(&REISERFS_I(inode)->i_xattr_sem);
-	dput(dentry);
-	return err;
-}
-
-/* We need to start a transaction to maintain lock ordering */
-int reiserfs_xattr_set(struct inode *inode, const char *name,
-		       const void *buffer, size_t buffer_size, int flags)
-{
-
-	struct reiserfs_transaction_handle th;
-	int error, error2;
-	size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
-
-	/* Check before we start a transaction and then do nothing. */
-	if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
-		return -EOPNOTSUPP;
-
-	if (!(flags & XATTR_REPLACE))
-		jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
-
-	reiserfs_write_lock(inode->i_sb);
-	error = journal_begin(&th, inode->i_sb, jbegin_count);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error) {
-		return error;
-	}
-
-	error = reiserfs_xattr_set_handle(&th, inode, name,
-					  buffer, buffer_size, flags);
-
-	reiserfs_write_lock(inode->i_sb);
-	error2 = journal_end(&th);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error == 0)
-		error = error2;
-
-	return error;
-}
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
-		   size_t buffer_size)
-{
-	ssize_t err = 0;
-	struct dentry *dentry;
-	size_t isize;
-	size_t file_pos = 0;
-	size_t buffer_pos = 0;
-	struct page *page;
-	__u32 hash = 0;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	/*
-	 * We can't have xattrs attached to v1 items since they don't have
-	 * generation numbers
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	/*
-	 * priv_root needn't be initialized during mount so allow initial
-	 * lookups to succeed.
-	 */
-	if (!REISERFS_SB(inode->i_sb)->priv_root)
-		return 0;
-
-	dentry = xattr_lookup(inode, name, XATTR_REPLACE);
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out;
-	}
-
-	down_read(&REISERFS_I(inode)->i_xattr_sem);
-
-	isize = i_size_read(d_inode(dentry));
-
-	/* Just return the size needed */
-	if (buffer == NULL) {
-		err = isize - sizeof(struct reiserfs_xattr_header);
-		goto out_unlock;
-	}
-
-	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
-		err = -ERANGE;
-		goto out_unlock;
-	}
-
-	while (file_pos < isize) {
-		size_t chunk;
-		char *data;
-		size_t skip = 0;
-
-		if (isize - file_pos > PAGE_SIZE)
-			chunk = PAGE_SIZE;
-		else
-			chunk = isize - file_pos;
-
-		page = reiserfs_get_page(d_inode(dentry), file_pos);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto out_unlock;
-		}
-
-		lock_page(page);
-		data = page_address(page);
-		if (file_pos == 0) {
-			struct reiserfs_xattr_header *rxh =
-			    (struct reiserfs_xattr_header *)data;
-			skip = file_pos = sizeof(struct reiserfs_xattr_header);
-			chunk -= skip;
-			/* Magic doesn't match up.. */
-			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
-				unlock_page(page);
-				reiserfs_put_page(page);
-				reiserfs_warning(inode->i_sb, "jdm-20001",
-						 "Invalid magic for xattr (%s) "
-						 "associated with %k", name,
-						 INODE_PKEY(inode));
-				err = -EIO;
-				goto out_unlock;
-			}
-			hash = le32_to_cpu(rxh->h_hash);
-		}
-		memcpy(buffer + buffer_pos, data + skip, chunk);
-		unlock_page(page);
-		reiserfs_put_page(page);
-		file_pos += chunk;
-		buffer_pos += chunk;
-		skip = 0;
-	}
-	err = isize - sizeof(struct reiserfs_xattr_header);
-
-	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
-	    hash) {
-		reiserfs_warning(inode->i_sb, "jdm-20002",
-				 "Invalid hash for xattr (%s) associated "
-				 "with %k", name, INODE_PKEY(inode));
-		err = -EIO;
-	}
-
-out_unlock:
-	up_read(&REISERFS_I(inode)->i_xattr_sem);
-	dput(dentry);
-
-out:
-	return err;
-}
-
-/*
- * In order to implement different sets of xattr operations for each xattr
- * prefix with the generic xattr API, a filesystem should create a
- * null-terminated array of struct xattr_handler (one for each prefix) and
- * hang a pointer to it off of the s_xattr field of the superblock.
- *
- * The generic_fooxattr() functions will use this list to dispatch xattr
- * operations to the correct xattr_handler.
- */
-#define for_each_xattr_handler(handlers, handler)		\
-		for ((handler) = *(handlers)++;			\
-			(handler) != NULL;			\
-			(handler) = *(handlers)++)
-
-static inline bool reiserfs_posix_acl_list(const char *name,
-					   struct dentry *dentry)
-{
-	return (posix_acl_type(name) >= 0) &&
-	       IS_POSIXACL(d_backing_inode(dentry));
-}
-
-/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
-				       const char *name, struct dentry *dentry)
-{
-	if (handlers) {
-		const struct xattr_handler *xah = NULL;
-
-		for_each_xattr_handler(handlers, xah) {
-			const char *prefix = xattr_prefix(xah);
-
-			if (strncmp(prefix, name, strlen(prefix)))
-				continue;
-
-			if (!xattr_handler_can_list(xah, dentry))
-				return false;
-
-			return true;
-		}
-	}
-
-	return reiserfs_posix_acl_list(name, dentry);
-}
-
-struct listxattr_buf {
-	struct dir_context ctx;
-	size_t size;
-	size_t pos;
-	char *buf;
-	struct dentry *dentry;
-};
-
-static bool listxattr_filler(struct dir_context *ctx, const char *name,
-			    int namelen, loff_t offset, u64 ino,
-			    unsigned int d_type)
-{
-	struct listxattr_buf *b =
-		container_of(ctx, struct listxattr_buf, ctx);
-	size_t size;
-
-	if (name[0] != '.' ||
-	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-		if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name,
-					 b->dentry))
-			return true;
-		size = namelen + 1;
-		if (b->buf) {
-			if (b->pos + size > b->size) {
-				b->pos = -ERANGE;
-				return false;
-			}
-			memcpy(b->buf + b->pos, name, namelen);
-			b->buf[b->pos + namelen] = 0;
-		}
-		b->pos += size;
-	}
-	return true;
-}
-
-/*
- * Inode operation listxattr()
- *
- * We totally ignore the generic listxattr here because it would be stupid
- * not to. Since the xattrs are organized in a directory, we can just
- * readdir to find them.
- */
-ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
-{
-	struct dentry *dir;
-	int err = 0;
-	struct listxattr_buf buf = {
-		.ctx.actor = listxattr_filler,
-		.dentry = dentry,
-		.buf = buffer,
-		.size = buffer ? size : 0,
-	};
-
-	if (d_really_is_negative(dentry))
-		return -EINVAL;
-
-	if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		if (err == -ENODATA)
-			err = 0;  /* Not an error if there aren't any xattrs */
-		goto out;
-	}
-
-	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
-	err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-	inode_unlock(d_inode(dir));
-
-	if (!err)
-		err = buf.pos;
-
-	dput(dir);
-out:
-	return err;
-}
-
-static int create_privroot(struct dentry *dentry)
-{
-	int err;
-	struct inode *inode = d_inode(dentry->d_parent);
-
-	WARN_ON_ONCE(!inode_is_locked(inode));
-
-	err = xattr_mkdir(inode, dentry, 0700);
-	if (err || d_really_is_negative(dentry)) {
-		reiserfs_warning(dentry->d_sb, "jdm-20006",
-				 "xattrs/ACLs enabled and couldn't "
-				 "find/create .reiserfs_priv. "
-				 "Failing mount.");
-		return -EOPNOTSUPP;
-	}
-
-	reiserfs_init_priv_inode(d_inode(dentry));
-	reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
-		      "storage.\n", PRIVROOT_NAME);
-
-	return 0;
-}
-
-#else
-int __init reiserfs_xattr_register_handlers(void) { return 0; }
-void reiserfs_xattr_unregister_handlers(void) {}
-static int create_privroot(struct dentry *dentry) { return 0; }
-#endif
-
-/* Actual operations that are exported to VFS-land */
-const struct xattr_handler * const reiserfs_xattr_handlers[] = {
-#ifdef CONFIG_REISERFS_FS_XATTR
-	&reiserfs_xattr_user_handler,
-	&reiserfs_xattr_trusted_handler,
-#endif
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	&reiserfs_xattr_security_handler,
-#endif
-	NULL
-};
-
-static int xattr_mount_check(struct super_block *s)
-{
-	/*
-	 * We need generation numbers to ensure that the oid mapping is correct
-	 * v3.5 filesystems don't have them.
-	 */
-	if (old_format_only(s)) {
-		if (reiserfs_xattrs_optional(s)) {
-			/*
-			 * Old format filesystem, but optional xattrs have
-			 * been enabled. Error out.
-			 */
-			reiserfs_warning(s, "jdm-2005",
-					 "xattrs/ACLs not supported "
-					 "on pre-v3.6 format filesystems. "
-					 "Failing mount.");
-			return -EOPNOTSUPP;
-		}
-	}
-
-	return 0;
-}
-
-int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode,
-			int mask)
-{
-	/*
-	 * We don't do permission checks on the internal objects.
-	 * Permissions are determined by the "owning" object.
-	 */
-	if (IS_PRIVATE(inode))
-		return 0;
-
-	return generic_permission(&nop_mnt_idmap, inode, mask);
-}
-
-static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	return -EPERM;
-}
-
-static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_revalidate = xattr_hide_revalidate,
-};
-
-int reiserfs_lookup_privroot(struct super_block *s)
-{
-	struct dentry *dentry;
-	int err = 0;
-
-	/* If we don't have the privroot located yet - go find it */
-	inode_lock(d_inode(s->s_root));
-	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
-				strlen(PRIVROOT_NAME));
-	if (!IS_ERR(dentry)) {
-		REISERFS_SB(s)->priv_root = dentry;
-		d_set_d_op(dentry, &xattr_lookup_poison_ops);
-		if (d_really_is_positive(dentry))
-			reiserfs_init_priv_inode(d_inode(dentry));
-	} else
-		err = PTR_ERR(dentry);
-	inode_unlock(d_inode(s->s_root));
-
-	return err;
-}
-
-/*
- * We need to take a copy of the mount flags since things like
- * SB_RDONLY don't get set until *after* we're called.
- * mount_flags != mount_options
- */
-int reiserfs_xattr_init(struct super_block *s, int mount_flags)
-{
-	int err = 0;
-	struct dentry *privroot = REISERFS_SB(s)->priv_root;
-
-	err = xattr_mount_check(s);
-	if (err)
-		goto error;
-
-	if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
-		inode_lock(d_inode(s->s_root));
-		err = create_privroot(REISERFS_SB(s)->priv_root);
-		inode_unlock(d_inode(s->s_root));
-	}
-
-	if (d_really_is_positive(privroot)) {
-		inode_lock(d_inode(privroot));
-		if (!REISERFS_SB(s)->xattr_root) {
-			struct dentry *dentry;
-
-			dentry = lookup_one_len(XAROOT_NAME, privroot,
-						strlen(XAROOT_NAME));
-			if (!IS_ERR(dentry))
-				REISERFS_SB(s)->xattr_root = dentry;
-			else
-				err = PTR_ERR(dentry);
-		}
-		inode_unlock(d_inode(privroot));
-	}
-
-error:
-	if (err) {
-		clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
-		clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
-	}
-
-	/* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
-	if (reiserfs_posixacl(s))
-		s->s_flags |= SB_POSIXACL;
-	else
-		s->s_flags &= ~SB_POSIXACL;
-
-	return err;
-}
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
deleted file mode 100644
index 5868a4e990e3..000000000000
--- a/fs/reiserfs/xattr.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/reiserfs_xattr.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/rwsem.h>
-#include <linux/xattr.h>
-
-struct inode;
-struct dentry;
-struct iattr;
-struct super_block;
-
-int reiserfs_xattr_register_handlers(void) __init;
-void reiserfs_xattr_unregister_handlers(void);
-int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
-int reiserfs_lookup_privroot(struct super_block *sb);
-int reiserfs_delete_xattrs(struct inode *inode);
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct mnt_idmap *idmap,
-			struct inode *inode, int mask);
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
-int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
-int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
-int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
-			      struct inode *, const char *, const void *,
-			      size_t, int);
-
-extern const struct xattr_handler reiserfs_xattr_user_handler;
-extern const struct xattr_handler reiserfs_xattr_trusted_handler;
-extern const struct xattr_handler reiserfs_xattr_security_handler;
-#ifdef CONFIG_REISERFS_FS_SECURITY
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
-			   const struct qstr *qstr,
-			   struct reiserfs_security_handle *sec);
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			    struct inode *inode,
-			    struct reiserfs_security_handle *sec);
-void reiserfs_security_free(struct reiserfs_security_handle *sec);
-#endif
-
-static inline int reiserfs_xattrs_initialized(struct super_block *sb)
-{
-	return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
-}
-
-#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
-static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
-{
-	loff_t ret = 0;
-	if (reiserfs_file_data_log(inode)) {
-		ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
-		ret >>= inode->i_sb->s_blocksize_bits;
-	}
-	return ret;
-}
-
-/*
- * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
- * Let's try to be smart about it.
- * xattr root: We cache it. If it's not cached, we may need to create it.
- * xattr dir: If anything has been loaded for this inode, we can set a flag
- *            saying so.
- * xattr file: Since we don't cache xattrs, we can't tell. We always include
- *             blocks for it.
- *
- * However, since root and dir can be created between calls - YOU MUST SAVE
- * THIS VALUE.
- */
-static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
-{
-	size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
-	if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
-		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-		if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
-			nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-	}
-
-	return nblocks;
-}
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
-	init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
-}
-
-#else
-
-#define reiserfs_listxattr NULL
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
-}
-#endif  /*  CONFIG_REISERFS_FS_XATTR  */
-
-#ifndef CONFIG_REISERFS_FS_SECURITY
-static inline int reiserfs_security_init(struct inode *dir,
-					 struct inode *inode,
-					 const struct qstr *qstr,
-					 struct reiserfs_security_handle *sec)
-{
-	return 0;
-}
-static inline int
-reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			struct inode *inode,
-			struct reiserfs_security_handle *sec)
-{
-	return 0;
-}
-static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{}
-#endif
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
deleted file mode 100644
index 064264992b49..000000000000
--- a/fs/reiserfs/xattr_acl.c
+++ /dev/null
@@ -1,411 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/posix_acl.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include <linux/posix_acl_xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-
-static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
-			    struct inode *inode, int type,
-			    struct posix_acl *acl);
-
-
-int
-reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
-		 struct posix_acl *acl, int type)
-{
-	int error, error2;
-	struct reiserfs_transaction_handle th;
-	size_t jcreate_blocks;
-	int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
-	int update_mode = 0;
-	struct inode *inode = d_inode(dentry);
-	umode_t mode = inode->i_mode;
-
-	/*
-	 * Pessimism: We can't assume that anything from the xattr root up
-	 * has been created.
-	 */
-
-	jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
-			 reiserfs_xattr_nblocks(inode, size) * 2;
-
-	reiserfs_write_lock(inode->i_sb);
-	error = journal_begin(&th, inode->i_sb, jcreate_blocks);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error == 0) {
-		if (type == ACL_TYPE_ACCESS && acl) {
-			error = posix_acl_update_mode(&nop_mnt_idmap, inode,
-						      &mode, &acl);
-			if (error)
-				goto unlock;
-			update_mode = 1;
-		}
-		error = __reiserfs_set_acl(&th, inode, type, acl);
-		if (!error && update_mode)
-			inode->i_mode = mode;
-unlock:
-		reiserfs_write_lock(inode->i_sb);
-		error2 = journal_end(&th);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error2)
-			error = error2;
-	}
-
-	return error;
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
-{
-	const char *end = (char *)value + size;
-	int n, count;
-	struct posix_acl *acl;
-
-	if (!value)
-		return NULL;
-	if (size < sizeof(reiserfs_acl_header))
-		return ERR_PTR(-EINVAL);
-	if (((reiserfs_acl_header *) value)->a_version !=
-	    cpu_to_le32(REISERFS_ACL_VERSION))
-		return ERR_PTR(-EINVAL);
-	value = (char *)value + sizeof(reiserfs_acl_header);
-	count = reiserfs_acl_count(size);
-	if (count < 0)
-		return ERR_PTR(-EINVAL);
-	if (count == 0)
-		return NULL;
-	acl = posix_acl_alloc(count, GFP_NOFS);
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-	for (n = 0; n < count; n++) {
-		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
-		if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
-			goto fail;
-		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
-		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch (acl->a_entries[n].e_tag) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			value = (char *)value +
-			    sizeof(reiserfs_acl_entry_short);
-			break;
-
-		case ACL_USER:
-			value = (char *)value + sizeof(reiserfs_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_uid = 
-				make_kuid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-		case ACL_GROUP:
-			value = (char *)value + sizeof(reiserfs_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_gid =
-				make_kgid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-
-		default:
-			goto fail;
-		}
-	}
-	if (value != end)
-		goto fail;
-	return acl;
-
-fail:
-	posix_acl_release(acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
-{
-	reiserfs_acl_header *ext_acl;
-	char *e;
-	int n;
-
-	*size = reiserfs_acl_size(acl->a_count);
-	ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
-						  acl->a_count *
-						  sizeof(reiserfs_acl_entry),
-						  GFP_NOFS);
-	if (!ext_acl)
-		return ERR_PTR(-ENOMEM);
-	ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
-	e = (char *)ext_acl + sizeof(reiserfs_acl_header);
-	for (n = 0; n < acl->a_count; n++) {
-		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
-		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
-		entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
-		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
-		switch (acl->a_entries[n].e_tag) {
-		case ACL_USER:
-			entry->e_id = cpu_to_le32(
-				from_kuid(&init_user_ns, acl_e->e_uid));
-			e += sizeof(reiserfs_acl_entry);
-			break;
-		case ACL_GROUP:
-			entry->e_id = cpu_to_le32(
-				from_kgid(&init_user_ns, acl_e->e_gid));
-			e += sizeof(reiserfs_acl_entry);
-			break;
-
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			e += sizeof(reiserfs_acl_entry_short);
-			break;
-
-		default:
-			goto fail;
-		}
-	}
-	return (char *)ext_acl;
-
-fail:
-	kfree(ext_acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Inode operation get_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
-{
-	char *name, *value;
-	struct posix_acl *acl;
-	int size;
-	int retval;
-
-	if (rcu)
-		return ERR_PTR(-ECHILD);
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-
-	size = reiserfs_xattr_get(inode, name, NULL, 0);
-	if (size < 0) {
-		if (size == -ENODATA || size == -ENOSYS)
-			return NULL;
-		return ERR_PTR(size);
-	}
-
-	value = kmalloc(size, GFP_NOFS);
-	if (!value)
-		return ERR_PTR(-ENOMEM);
-
-	retval = reiserfs_xattr_get(inode, name, value, size);
-	if (retval == -ENODATA || retval == -ENOSYS) {
-		/*
-		 * This shouldn't actually happen as it should have
-		 * been caught above.. but just in case
-		 */
-		acl = NULL;
-	} else if (retval < 0) {
-		acl = ERR_PTR(retval);
-	} else {
-		acl = reiserfs_posix_acl_from_disk(value, retval);
-	}
-
-	kfree(value);
-	return acl;
-}
-
-/*
- * Inode operation set_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-static int
-__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
-		 int type, struct posix_acl *acl)
-{
-	char *name;
-	void *value = NULL;
-	size_t size = 0;
-	int error;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		if (!S_ISDIR(inode->i_mode))
-			return acl ? -EACCES : 0;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (acl) {
-		value = reiserfs_posix_acl_to_disk(acl, &size);
-		if (IS_ERR(value))
-			return (int)PTR_ERR(value);
-	}
-
-	error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
-
-	/*
-	 * Ensure that the inode gets dirtied if we're only using
-	 * the mode bits and an old ACL didn't exist. We don't need
-	 * to check if the inode is hashed here since we won't get
-	 * called by reiserfs_inherit_default_acl().
-	 */
-	if (error == -ENODATA) {
-		error = 0;
-		if (type == ACL_TYPE_ACCESS) {
-			inode_set_ctime_current(inode);
-			mark_inode_dirty(inode);
-		}
-	}
-
-	kfree(value);
-
-	if (!error)
-		set_cached_acl(inode, type, acl);
-
-	return error;
-}
-
-/*
- * dir->i_mutex: locked,
- * inode is new and not released into the wild yet
- */
-int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-			     struct inode *dir, struct dentry *dentry,
-			     struct inode *inode)
-{
-	struct posix_acl *default_acl, *acl;
-	int err = 0;
-
-	/* ACLs only get applied to files and directories */
-	if (S_ISLNK(inode->i_mode))
-		return 0;
-
-	/*
-	 * ACLs can only be used on "new" objects, so if it's an old object
-	 * there is nothing to inherit from
-	 */
-	if (get_inode_sd_version(dir) == STAT_DATA_V1)
-		goto apply_umask;
-
-	/*
-	 * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
-	 * would be useless since permissions are ignored, and a pain because
-	 * it introduces locking cycles
-	 */
-	if (IS_PRIVATE(inode))
-		goto apply_umask;
-
-	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
-	if (err)
-		return err;
-
-	if (default_acl) {
-		err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
-					 default_acl);
-		posix_acl_release(default_acl);
-	}
-	if (acl) {
-		if (!err)
-			err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
-						 acl);
-		posix_acl_release(acl);
-	}
-
-	return err;
-
-apply_umask:
-	/* no ACL, apply umask */
-	inode->i_mode &= ~current_umask();
-	return err;
-}
-
-/* This is used to cache the default acl before a new object is created.
- * The biggest reason for this is to get an idea of how many blocks will
- * actually be required for the create operation if we must inherit an ACL.
- * An ACL write can add up to 3 object creations and an additional file write
- * so we'd prefer not to reserve that many blocks in the journal if we can.
- * It also has the advantage of not loading the ACL with a transaction open,
- * this may seem silly, but if the owner of the directory is doing the
- * creation, the ACL may not be loaded since the permissions wouldn't require
- * it.
- * We return the number of blocks required for the transaction.
- */
-int reiserfs_cache_default_acl(struct inode *inode)
-{
-	struct posix_acl *acl;
-	int nblocks = 0;
-
-	if (IS_PRIVATE(inode))
-		return 0;
-
-	acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
-
-	if (acl && !IS_ERR(acl)) {
-		int size = reiserfs_acl_size(acl->a_count);
-
-		/* Other xattrs can be created during inode creation. We don't
-		 * want to claim too many blocks, so we check to see if we
-		 * need to create the tree to the xattrs, and then we
-		 * just want two files. */
-		nblocks = reiserfs_xattr_jcreate_nblocks(inode);
-		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
-		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-
-		/* We need to account for writes + bitmaps for two files */
-		nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
-		posix_acl_release(acl);
-	}
-
-	return nblocks;
-}
-
-/*
- * Called under i_mutex
- */
-int reiserfs_acl_chmod(struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-
-	if (IS_PRIVATE(inode))
-		return 0;
-	if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
-	    !reiserfs_posixacl(inode->i_sb))
-		return 0;
-
-	return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
-}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
deleted file mode 100644
index 078dd8cc312f..000000000000
--- a/fs/reiserfs/xattr_security.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include "xattr.h"
-#include <linux/security.h>
-#include <linux/uaccess.h>
-
-static int
-security_get(const struct xattr_handler *handler, struct dentry *unused,
-	     struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-security_set(const struct xattr_handler *handler,
-	     struct mnt_idmap *idmap, struct dentry *unused,
-	     struct inode *inode, const char *name, const void *buffer,
-	     size_t size, int flags)
-{
-	if (IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool security_list(struct dentry *dentry)
-{
-	return !IS_PRIVATE(d_inode(dentry));
-}
-
-static int
-reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		    void *fs_info)
-{
-	struct reiserfs_security_handle *sec = fs_info;
-
-	sec->value = kmemdup(xattr_array->value, xattr_array->value_len,
-			     GFP_KERNEL);
-	if (!sec->value)
-		return -ENOMEM;
-
-	sec->name = xattr_array->name;
-	sec->length = xattr_array->value_len;
-	return 0;
-}
-
-/* Initializes the security context for a new inode and returns the number
- * of blocks needed for the transaction. If successful, reiserfs_security
- * must be released using reiserfs_security_free when the caller is done. */
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
-			   const struct qstr *qstr,
-			   struct reiserfs_security_handle *sec)
-{
-	int blocks = 0;
-	int error;
-
-	sec->name = NULL;
-	sec->value = NULL;
-	sec->length = 0;
-
-	/* Don't add selinux attributes on xattrs - they'll never get used */
-	if (IS_PRIVATE(dir))
-		return 0;
-
-	error = security_inode_init_security(inode, dir, qstr,
-					     &reiserfs_initxattrs, sec);
-	if (error) {
-		sec->name = NULL;
-		sec->value = NULL;
-		sec->length = 0;
-		return error;
-	}
-
-	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
-		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
-			 reiserfs_xattr_nblocks(inode, sec->length);
-		/* We don't want to count the directories twice if we have
-		 * a default ACL. */
-		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-	}
-	return blocks;
-}
-
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			    struct inode *inode,
-			    struct reiserfs_security_handle *sec)
-{
-	char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
-	int error;
-
-	if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
-		return -EINVAL;
-
-	strlcat(xattr_name, sec->name, sizeof(xattr_name));
-
-	error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
-					  sec->length, XATTR_CREATE);
-	if (error == -ENODATA || error == -EOPNOTSUPP)
-		error = 0;
-
-	return error;
-}
-
-void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{
-	kfree(sec->value);
-	sec->name = NULL;
-	sec->value = NULL;
-}
-
-const struct xattr_handler reiserfs_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get = security_get,
-	.set = security_set,
-	.list = security_list,
-};
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
deleted file mode 100644
index 0c0c74d8db0e..000000000000
--- a/fs/reiserfs/xattr_trusted.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-trusted_get(const struct xattr_handler *handler, struct dentry *unused,
-	    struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-trusted_set(const struct xattr_handler *handler,
-	    struct mnt_idmap *idmap, struct dentry *unused,
-	    struct inode *inode, const char *name, const void *buffer,
-	    size_t size, int flags)
-{
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool trusted_list(struct dentry *dentry)
-{
-	return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
-}
-
-const struct xattr_handler reiserfs_xattr_trusted_handler = {
-	.prefix = XATTR_TRUSTED_PREFIX,
-	.get = trusted_get,
-	.set = trusted_set,
-	.list = trusted_list,
-};
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
deleted file mode 100644
index 88195181e1d7..000000000000
--- a/fs/reiserfs/xattr_user.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-user_get(const struct xattr_handler *handler, struct dentry *unused,
-	 struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (!reiserfs_xattrs_user(inode->i_sb))
-		return -EOPNOTSUPP;
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap,
-	 struct dentry *unused,
-	 struct inode *inode, const char *name, const void *buffer,
-	 size_t size, int flags)
-{
-	if (!reiserfs_xattrs_user(inode->i_sb))
-		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool user_list(struct dentry *dentry)
-{
-	return reiserfs_xattrs_user(dentry->d_sb);
-}
-
-const struct xattr_handler reiserfs_xattr_user_handler = {
-	.prefix = XATTR_USER_PREFIX,
-	.get = user_get,
-	.set = user_set,
-	.list = user_list,
-};
diff --git a/include/uapi/linux/reiserfs_fs.h b/include/uapi/linux/reiserfs_fs.h
deleted file mode 100644
index 5bb921409f2b..000000000000
--- a/include/uapi/linux/reiserfs_fs.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
- */
-#ifndef _LINUX_REISER_FS_H
-#define _LINUX_REISER_FS_H
-
-#include <linux/types.h>
-#include <linux/magic.h>
-
-/*
- *  include/linux/reiser_fs.h
- *
- *  Reiser File System constants and structures
- *
- */
-
-/* ioctl's command */
-#define REISERFS_IOC_UNPACK		_IOW(0xCD,1,long)
-/* define following flags to be the same as in ext2, so that chattr(1),
-   lsattr(1) will work with us. */
-#define REISERFS_IOC_GETFLAGS		FS_IOC_GETFLAGS
-#define REISERFS_IOC_SETFLAGS		FS_IOC_SETFLAGS
-#define REISERFS_IOC_GETVERSION		FS_IOC_GETVERSION
-#define REISERFS_IOC_SETVERSION		FS_IOC_SETVERSION
-
-#endif				/* _LINUX_REISER_FS_H */
diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h
deleted file mode 100644
index 503ad018ce5b..000000000000
--- a/include/uapi/linux/reiserfs_xattr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
-  File: linux/reiserfs_xattr.h
-*/
-
-#ifndef _LINUX_REISERFS_XATTR_H
-#define _LINUX_REISERFS_XATTR_H
-
-#include <linux/types.h>
-
-/* Magic value in header */
-#define REISERFS_XATTR_MAGIC 0x52465841	/* "RFXA" */
-
-struct reiserfs_xattr_header {
-	__le32 h_magic;		/* magic number for identification */
-	__le32 h_hash;		/* hash of the value */
-};
-
-struct reiserfs_security_handle {
-	const char *name;
-	void *value;
-	__kernel_size_t length;
-};
-
-#endif  /*  _LINUX_REISERFS_XATTR_H  */
diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
index 1415604c3d24..a413c157904d 100644
--- a/scripts/selinux/mdp/mdp.c
+++ b/scripts/selinux/mdp/mdp.c
@@ -171,9 +171,6 @@ int main(int argc, char *argv[])
 #ifdef CONFIG_JFS_SECURITY
 	FS_USE("xattr", "jfs");
 #endif
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	FS_USE("xattr", "reiserfs");
-#endif
 #ifdef CONFIG_JFFS2_FS_SECURITY
 	FS_USE("xattr", "jffs2");
 #endif
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index e7da92489167..f37614cc2c1b 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -11,7 +11,6 @@ NORETURN(__ia32_sys_exit)
 NORETURN(__ia32_sys_exit_group)
 NORETURN(__kunit_abort)
 NORETURN(__module_put_and_kthread_exit)
-NORETURN(__reiserfs_panic)
 NORETURN(__stack_chk_fail)
 NORETURN(__tdx_hypercall_failed)
 NORETURN(__ubsan_handle_builtin_unreachable)
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index c773334bbcc9..8eb6aa606a0d 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -27,7 +27,7 @@ static const char *const known_fs[] = {
 	"ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos",
 	"nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2",
 	"ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs",
-	"proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "reiserfs",
+	"proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs",
 	"resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem",
 	"securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs",
 	"squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf",
-- 
cgit v1.2.3


From 6414b3e5d5d44cd214161abf2ce2221d9e9de7bf Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:53 +0200
Subject: selftests/bpf: factorize conn and syncookies tests in a single runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

btf_skc_cls_ingress currently describes two tests, both running a simple
TCP server and then initiating a connection to it. The only differences
between the tests are the tcp_syncookies configuration and some of the
checks that depend on this feature being enabled or disabled.

Share the common code between those two tests by moving the code into a
single runner, parameterized by a "gen_cookies" argument. Split the
performed checks accordingly.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-1-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 106 +++++++--------------
 1 file changed, 37 insertions(+), 69 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index ef4d6a3ae423..5d8d7736edc0 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -71,13 +71,14 @@ static void print_err_line(void)
 		printf("bpf prog error at line %u\n", skel->bss->linum);
 }
 
-static void test_conn(void)
+static void run_test(bool gen_cookies)
 {
+	const char *tcp_syncookies = gen_cookies ? "2" : "1";
 	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
-	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1"))
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", tcp_syncookies))
 		return;
 
 	listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
@@ -99,19 +100,36 @@ static void test_conn(void)
 	if (CHECK_FAIL(srv_fd == -1))
 		goto done;
 
-	if (CHECK(skel->bss->listen_tp_sport != srv_port ||
-		  skel->bss->req_sk_sport != srv_port,
-		  "Unexpected sk src port",
-		  "listen_tp_sport:%u req_sk_sport:%u expected:%u\n",
-		  skel->bss->listen_tp_sport, skel->bss->req_sk_sport,
-		  srv_port))
+	if (CHECK(skel->bss->listen_tp_sport != srv_port,
+		  "Unexpected listen tp src port",
+		  "listen_tp_sport:%u expected:%u\n",
+		  skel->bss->listen_tp_sport, srv_port))
 		goto done;
 
-	if (CHECK(skel->bss->gen_cookie || skel->bss->recv_cookie,
-		  "Unexpected syncookie states",
-		  "gen_cookie:%u recv_cookie:%u\n",
-		  skel->bss->gen_cookie, skel->bss->recv_cookie))
-		goto done;
+	if (!gen_cookies) {
+		if (CHECK(skel->bss->req_sk_sport != srv_port,
+			  "Unexpected req_sk src port",
+			  "req_sk_sport:%u expected:%u\n",
+			  skel->bss->req_sk_sport, srv_port))
+			goto done;
+		if (CHECK(skel->bss->gen_cookie || skel->bss->recv_cookie,
+			  "Unexpected syncookie states",
+			  "gen_cookie:%u recv_cookie:%u\n",
+			  skel->bss->gen_cookie, skel->bss->recv_cookie))
+			goto done;
+	} else {
+		if (CHECK(skel->bss->req_sk_sport,
+			  "Unexpected req_sk src port",
+			  "req_sk_sport:%u expected:0\n",
+			  skel->bss->req_sk_sport))
+			goto done;
+		if (CHECK(!skel->bss->gen_cookie ||
+			  skel->bss->gen_cookie != skel->bss->recv_cookie,
+			  "Unexpected syncookie states",
+			  "gen_cookie:%u recv_cookie:%u\n",
+			  skel->bss->gen_cookie, skel->bss->recv_cookie))
+			goto done;
+	}
 
 	CHECK(skel->bss->linum, "bpf prog detected error", "at line %u\n",
 	      skel->bss->linum);
@@ -125,64 +143,14 @@ done:
 		close(srv_fd);
 }
 
-static void test_syncookie(void)
+static void test_conn(void)
 {
-	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
-	socklen_t addrlen = sizeof(srv_sa6);
-	int srv_port;
-
-	/* Enforce syncookie mode */
-	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "2"))
-		return;
-
-	listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
-	if (CHECK_FAIL(listen_fd == -1))
-		return;
-
-	err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
-	if (CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d\n", err,
-		  errno))
-		goto done;
-	memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
-	srv_port = ntohs(srv_sa6.sin6_port);
-
-	cli_fd = connect_to_fd(listen_fd, 0);
-	if (CHECK_FAIL(cli_fd == -1))
-		goto done;
-
-	srv_fd = accept(listen_fd, NULL, NULL);
-	if (CHECK_FAIL(srv_fd == -1))
-		goto done;
-
-	if (CHECK(skel->bss->listen_tp_sport != srv_port,
-		  "Unexpected tp src port",
-		  "listen_tp_sport:%u expected:%u\n",
-		  skel->bss->listen_tp_sport, srv_port))
-		goto done;
-
-	if (CHECK(skel->bss->req_sk_sport,
-		  "Unexpected req_sk src port",
-		  "req_sk_sport:%u expected:0\n",
-		   skel->bss->req_sk_sport))
-		goto done;
-
-	if (CHECK(!skel->bss->gen_cookie ||
-		  skel->bss->gen_cookie != skel->bss->recv_cookie,
-		  "Unexpected syncookie states",
-		  "gen_cookie:%u recv_cookie:%u\n",
-		  skel->bss->gen_cookie, skel->bss->recv_cookie))
-		goto done;
-
-	CHECK(skel->bss->linum, "bpf prog detected error", "at line %u\n",
-	      skel->bss->linum);
+	run_test(false);
+}
 
-done:
-	if (listen_fd != -1)
-		close(listen_fd);
-	if (cli_fd != -1)
-		close(cli_fd);
-	if (srv_fd != -1)
-		close(srv_fd);
+static void test_syncookie(void)
+{
+	run_test(true);
 }
 
 struct test {
-- 
cgit v1.2.3


From 0335dd6b5a4c178d9ae34694a0be7862873378bd Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:54 +0200
Subject: selftests/bpf: add missing ns cleanups in btf_skc_cls_ingress
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

btf_skc_cls_ingress.c currently runs two subtests, and creates a
dedicated network namespace for each, but never cleans up the created
namespace once the test has ended.

Add missing namespace cleanup after each subtest to avoid accumulating
namespaces for each new subtest. While at it, switch namespace
management to netns_{new,free}.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-2-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 33 +++++++++++++---------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 5d8d7736edc0..c88fb0e3048c 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -17,32 +17,30 @@
 #include "test_progs.h"
 #include "test_btf_skc_cls_ingress.skel.h"
 
+#define TEST_NS "skc_cls_ingress"
+
 static struct test_btf_skc_cls_ingress *skel;
 static struct sockaddr_in6 srv_sa6;
 static __u32 duration;
 
-static int prepare_netns(void)
+static struct netns_obj *prepare_netns(void)
 {
 	LIBBPF_OPTS(bpf_tc_hook, qdisc_lo, .attach_point = BPF_TC_INGRESS);
 	LIBBPF_OPTS(bpf_tc_opts, tc_attach,
 		    .prog_fd = bpf_program__fd(skel->progs.cls_ingress));
+	struct netns_obj *ns = NULL;
 
-	if (CHECK(unshare(CLONE_NEWNET), "create netns",
-		  "unshare(CLONE_NEWNET): %s (%d)",
-		  strerror(errno), errno))
-		return -1;
-
-	if (CHECK(system("ip link set dev lo up"),
-		  "ip link set dev lo up", "failed\n"))
-		return -1;
+	ns = netns_new(TEST_NS, true);
+	if (!ASSERT_OK_PTR(ns, "create and join netns"))
+		return ns;
 
 	qdisc_lo.ifindex = if_nametoindex("lo");
 	if (!ASSERT_OK(bpf_tc_hook_create(&qdisc_lo), "qdisc add dev lo clsact"))
-		return -1;
+		goto free_ns;
 
 	if (!ASSERT_OK(bpf_tc_attach(&qdisc_lo, &tc_attach),
 		       "filter add dev lo ingress"))
-		return -1;
+		goto free_ns;
 
 	/* Ensure 20 bytes options (i.e. in total 40 bytes tcp header) for the
 	 * bpf_tcp_gen_syncookie() helper.
@@ -50,9 +48,13 @@ static int prepare_netns(void)
 	if (write_sysctl("/proc/sys/net/ipv4/tcp_window_scaling", "1") ||
 	    write_sysctl("/proc/sys/net/ipv4/tcp_timestamps", "1") ||
 	    write_sysctl("/proc/sys/net/ipv4/tcp_sack", "1"))
-		return -1;
+		goto free_ns;
+
+	return ns;
 
-	return 0;
+free_ns:
+	netns_free(ns);
+	return NULL;
 }
 
 static void reset_test(void)
@@ -169,6 +171,7 @@ void test_btf_skc_cls_ingress(void)
 	int i;
 
 	skel = test_btf_skc_cls_ingress__open_and_load();
+	struct netns_obj *ns;
 	if (CHECK(!skel, "test_btf_skc_cls_ingress__open_and_load", "failed\n"))
 		return;
 
@@ -176,13 +179,15 @@ void test_btf_skc_cls_ingress(void)
 		if (!test__start_subtest(tests[i].desc))
 			continue;
 
-		if (prepare_netns())
+		ns = prepare_netns();
+		if (!ns)
 			break;
 
 		tests[i].run();
 
 		print_err_line();
 		reset_test();
+		netns_free(ns);
 	}
 
 	test_btf_skc_cls_ingress__destroy(skel);
-- 
cgit v1.2.3


From 0da0a75cf649e1e5a688af1763653206260f17a9 Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:55 +0200
Subject: selftests/bpf: get rid of global vars in btf_skc_cls_ingress
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are a few global variables in btf_skc_cls_ingress.c, which are not
really used by different tests. Get rid of those global variables, by
performing the following updates:
- make srv_sa6 local to the main runner function
- make skel local to the main function, and propagate it through
  function arguments
- get rid of duration by replacing CHECK macros with the ASSERT_XXX
  macros. While updating those assert macros:
  - do not return early on asserts performing some actual tests, let the
    other tests run as well (keep the early return for parts handling
    test setup)
  - instead of converting the CHECK on skel->bss->linum, just remove it,
    since there is already a call to print_err_line after the test to
    print the failing line in the bpf program

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-3-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 87 +++++++++-------------
 1 file changed, 34 insertions(+), 53 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index c88fb0e3048c..426c9d5402fa 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -19,11 +19,7 @@
 
 #define TEST_NS "skc_cls_ingress"
 
-static struct test_btf_skc_cls_ingress *skel;
-static struct sockaddr_in6 srv_sa6;
-static __u32 duration;
-
-static struct netns_obj *prepare_netns(void)
+static struct netns_obj *prepare_netns(struct test_btf_skc_cls_ingress *skel)
 {
 	LIBBPF_OPTS(bpf_tc_hook, qdisc_lo, .attach_point = BPF_TC_INGRESS);
 	LIBBPF_OPTS(bpf_tc_opts, tc_attach,
@@ -57,7 +53,7 @@ free_ns:
 	return NULL;
 }
 
-static void reset_test(void)
+static void reset_test(struct test_btf_skc_cls_ingress *skel)
 {
 	memset(&skel->bss->srv_sa6, 0, sizeof(skel->bss->srv_sa6));
 	skel->bss->listen_tp_sport = 0;
@@ -67,16 +63,17 @@ static void reset_test(void)
 	skel->bss->linum = 0;
 }
 
-static void print_err_line(void)
+static void print_err_line(struct test_btf_skc_cls_ingress *skel)
 {
 	if (skel->bss->linum)
 		printf("bpf prog error at line %u\n", skel->bss->linum);
 }
 
-static void run_test(bool gen_cookies)
+static void run_test(struct test_btf_skc_cls_ingress *skel, bool gen_cookies)
 {
 	const char *tcp_syncookies = gen_cookies ? "2" : "1";
 	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
+	struct sockaddr_in6 srv_sa6;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
@@ -84,58 +81,41 @@ static void run_test(bool gen_cookies)
 		return;
 
 	listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
-	if (CHECK_FAIL(listen_fd == -1))
+	if (!ASSERT_OK_FD(listen_fd, "start server"))
 		return;
 
 	err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
-	if (CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d\n", err,
-		  errno))
+	if (!ASSERT_OK(err, "getsockname(listen_fd)"))
 		goto done;
 	memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
 	srv_port = ntohs(srv_sa6.sin6_port);
 
 	cli_fd = connect_to_fd(listen_fd, 0);
-	if (CHECK_FAIL(cli_fd == -1))
+	if (!ASSERT_OK_FD(cli_fd, "connect client"))
 		goto done;
 
 	srv_fd = accept(listen_fd, NULL, NULL);
-	if (CHECK_FAIL(srv_fd == -1))
+	if (!ASSERT_OK_FD(srv_fd, "accept connection"))
 		goto done;
 
-	if (CHECK(skel->bss->listen_tp_sport != srv_port,
-		  "Unexpected listen tp src port",
-		  "listen_tp_sport:%u expected:%u\n",
-		  skel->bss->listen_tp_sport, srv_port))
-		goto done;
+	ASSERT_EQ(skel->bss->listen_tp_sport, srv_port, "listen tp src port");
 
 	if (!gen_cookies) {
-		if (CHECK(skel->bss->req_sk_sport != srv_port,
-			  "Unexpected req_sk src port",
-			  "req_sk_sport:%u expected:%u\n",
-			  skel->bss->req_sk_sport, srv_port))
-			goto done;
-		if (CHECK(skel->bss->gen_cookie || skel->bss->recv_cookie,
-			  "Unexpected syncookie states",
-			  "gen_cookie:%u recv_cookie:%u\n",
-			  skel->bss->gen_cookie, skel->bss->recv_cookie))
-			goto done;
+		ASSERT_EQ(skel->bss->req_sk_sport, srv_port,
+			  "request socket source port with syncookies disabled");
+		ASSERT_EQ(skel->bss->gen_cookie, 0,
+			  "generated syncookie with syncookies disabled");
+		ASSERT_EQ(skel->bss->recv_cookie, 0,
+			  "received syncookie with syncookies disabled");
 	} else {
-		if (CHECK(skel->bss->req_sk_sport,
-			  "Unexpected req_sk src port",
-			  "req_sk_sport:%u expected:0\n",
-			  skel->bss->req_sk_sport))
-			goto done;
-		if (CHECK(!skel->bss->gen_cookie ||
-			  skel->bss->gen_cookie != skel->bss->recv_cookie,
-			  "Unexpected syncookie states",
-			  "gen_cookie:%u recv_cookie:%u\n",
-			  skel->bss->gen_cookie, skel->bss->recv_cookie))
-			goto done;
+		ASSERT_EQ(skel->bss->req_sk_sport, 0,
+			  "request socket source port with syncookies enabled");
+		ASSERT_NEQ(skel->bss->gen_cookie, 0,
+			   "syncookie properly generated");
+		ASSERT_EQ(skel->bss->gen_cookie, skel->bss->recv_cookie,
+			  "matching syncookies on client and server");
 	}
 
-	CHECK(skel->bss->linum, "bpf prog detected error", "at line %u\n",
-	      skel->bss->linum);
-
 done:
 	if (listen_fd != -1)
 		close(listen_fd);
@@ -145,19 +125,19 @@ done:
 		close(srv_fd);
 }
 
-static void test_conn(void)
+static void test_conn(struct test_btf_skc_cls_ingress *skel)
 {
-	run_test(false);
+	run_test(skel, false);
 }
 
-static void test_syncookie(void)
+static void test_syncookie(struct test_btf_skc_cls_ingress *skel)
 {
-	run_test(true);
+	run_test(skel, true);
 }
 
 struct test {
 	const char *desc;
-	void (*run)(void);
+	void (*run)(struct test_btf_skc_cls_ingress *skel);
 };
 
 #define DEF_TEST(name) { #name, test_##name }
@@ -168,25 +148,26 @@ static struct test tests[] = {
 
 void test_btf_skc_cls_ingress(void)
 {
+	struct test_btf_skc_cls_ingress *skel;
+	struct netns_obj *ns;
 	int i;
 
 	skel = test_btf_skc_cls_ingress__open_and_load();
-	struct netns_obj *ns;
-	if (CHECK(!skel, "test_btf_skc_cls_ingress__open_and_load", "failed\n"))
+	if (!ASSERT_OK_PTR(skel, "test_btf_skc_cls_ingress__open_and_load"))
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		if (!test__start_subtest(tests[i].desc))
 			continue;
 
-		ns = prepare_netns();
+		ns = prepare_netns(skel);
 		if (!ns)
 			break;
 
-		tests[i].run();
+		tests[i].run(skel);
 
-		print_err_line();
-		reset_test();
+		print_err_line(skel);
+		reset_test(skel);
 		netns_free(ns);
 	}
 
-- 
cgit v1.2.3


From 8a5cd986023547b3499072e17ff1ddae2c7c66a4 Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:56 +0200
Subject: selftests/bpf: add ipv4 and dual ipv4/ipv6 support in
 btf_skc_cls_ingress
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The btf_skc_cls_ingress test currently checks that the syncookie and
bpf_sk_assign/release helpers behave correctly in multiple scenarios,
but only with IPv6 sockets.

Increase those helpers' coverage by adding testing support for IPv4
sockets and IPv4/IPv6 sockets. The rework is mostly based on features
brought earlier in test_tcp_check_syncookie.sh to cover some fixes
performed on those helpers, but test_tcp_check_syncookie.sh is not
integrated in test_progs. The most notable changes linked to this are:
- some rework in the corresponding eBPF program to support both types of
  traffic
- the switch from start_server to start_server_str to allow checking
  some socket options
- the introduction of new subtests for ipv4 and ipv4/ipv6

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-4-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 117 ++++++++++++++++++---
 .../selftests/bpf/progs/test_btf_skc_cls_ingress.c |  80 +++++++++-----
 2 files changed, 158 insertions(+), 39 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 426c9d5402fa..29b946d84816 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -19,6 +19,15 @@
 
 #define TEST_NS "skc_cls_ingress"
 
+#define BIT(n)		(1 << (n))
+#define TEST_MODE_IPV4	BIT(0)
+#define TEST_MODE_IPV6	BIT(1)
+#define TEST_MODE_DUAL	(TEST_MODE_IPV4 | TEST_MODE_IPV6)
+
+#define SERVER_ADDR_IPV4	"127.0.0.1"
+#define SERVER_ADDR_IPV6	"::1"
+#define SERVER_ADDR_DUAL	"::0"
+
 static struct netns_obj *prepare_netns(struct test_btf_skc_cls_ingress *skel)
 {
 	LIBBPF_OPTS(bpf_tc_hook, qdisc_lo, .attach_point = BPF_TC_INGRESS);
@@ -55,6 +64,7 @@ free_ns:
 
 static void reset_test(struct test_btf_skc_cls_ingress *skel)
 {
+	memset(&skel->bss->srv_sa4, 0, sizeof(skel->bss->srv_sa4));
 	memset(&skel->bss->srv_sa6, 0, sizeof(skel->bss->srv_sa6));
 	skel->bss->listen_tp_sport = 0;
 	skel->bss->req_sk_sport = 0;
@@ -69,26 +79,85 @@ static void print_err_line(struct test_btf_skc_cls_ingress *skel)
 		printf("bpf prog error at line %u\n", skel->bss->linum);
 }
 
-static void run_test(struct test_btf_skc_cls_ingress *skel, bool gen_cookies)
+static int v6only_true(int fd, void *opts)
+{
+	int mode = true;
+
+	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
+}
+
+static int v6only_false(int fd, void *opts)
+{
+	int mode = false;
+
+	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
+}
+
+static void run_test(struct test_btf_skc_cls_ingress *skel, bool gen_cookies,
+		     int ip_mode)
 {
 	const char *tcp_syncookies = gen_cookies ? "2" : "1";
 	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
+	struct network_helper_opts opts = { 0 };
+	struct sockaddr_storage *addr;
 	struct sockaddr_in6 srv_sa6;
-	socklen_t addrlen = sizeof(srv_sa6);
+	struct sockaddr_in srv_sa4;
+	socklen_t addr_len;
+	int sock_family;
+	char *srv_addr;
 	int srv_port;
 
+	switch (ip_mode) {
+	case TEST_MODE_IPV4:
+		sock_family = AF_INET;
+		srv_addr = SERVER_ADDR_IPV4;
+		addr = (struct sockaddr_storage *)&srv_sa4;
+		addr_len = sizeof(srv_sa4);
+		break;
+	case TEST_MODE_IPV6:
+		opts.post_socket_cb = v6only_true;
+		sock_family = AF_INET6;
+		srv_addr = SERVER_ADDR_IPV6;
+		addr = (struct sockaddr_storage *)&srv_sa6;
+		addr_len = sizeof(srv_sa6);
+		break;
+	case TEST_MODE_DUAL:
+		opts.post_socket_cb = v6only_false;
+		sock_family = AF_INET6;
+		srv_addr = SERVER_ADDR_DUAL;
+		addr = (struct sockaddr_storage *)&srv_sa6;
+		addr_len = sizeof(srv_sa6);
+		break;
+	default:
+		PRINT_FAIL("Unknown IP mode %d", ip_mode);
+		return;
+	}
+
 	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", tcp_syncookies))
 		return;
 
-	listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+	listen_fd = start_server_str(sock_family, SOCK_STREAM, srv_addr,  0,
+				     &opts);
 	if (!ASSERT_OK_FD(listen_fd, "start server"))
 		return;
 
-	err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
+	err = getsockname(listen_fd, (struct sockaddr *)addr, &addr_len);
 	if (!ASSERT_OK(err, "getsockname(listen_fd)"))
 		goto done;
-	memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
-	srv_port = ntohs(srv_sa6.sin6_port);
+
+	switch (ip_mode) {
+	case TEST_MODE_IPV4:
+		memcpy(&skel->bss->srv_sa4, &srv_sa4, sizeof(srv_sa4));
+		srv_port = ntohs(srv_sa4.sin_port);
+		break;
+	case TEST_MODE_IPV6:
+	case TEST_MODE_DUAL:
+		memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
+		srv_port = ntohs(srv_sa6.sin6_port);
+		break;
+	default:
+		goto done;
+	}
 
 	cli_fd = connect_to_fd(listen_fd, 0);
 	if (!ASSERT_OK_FD(cli_fd, "connect client"))
@@ -125,14 +194,34 @@ done:
 		close(srv_fd);
 }
 
-static void test_conn(struct test_btf_skc_cls_ingress *skel)
+static void test_conn_ipv4(struct test_btf_skc_cls_ingress *skel)
+{
+	run_test(skel, false, TEST_MODE_IPV4);
+}
+
+static void test_conn_ipv6(struct test_btf_skc_cls_ingress *skel)
+{
+	run_test(skel, false, TEST_MODE_IPV6);
+}
+
+static void test_conn_dual(struct test_btf_skc_cls_ingress *skel)
+{
+	run_test(skel, false, TEST_MODE_DUAL);
+}
+
+static void test_syncookie_ipv4(struct test_btf_skc_cls_ingress *skel)
+{
+	run_test(skel, true, TEST_MODE_IPV4);
+}
+
+static void test_syncookie_ipv6(struct test_btf_skc_cls_ingress *skel)
 {
-	run_test(skel, false);
+	run_test(skel, true, TEST_MODE_IPV6);
 }
 
-static void test_syncookie(struct test_btf_skc_cls_ingress *skel)
+static void test_syncookie_dual(struct test_btf_skc_cls_ingress *skel)
 {
-	run_test(skel, true);
+	run_test(skel, true, TEST_MODE_DUAL);
 }
 
 struct test {
@@ -142,8 +231,12 @@ struct test {
 
 #define DEF_TEST(name) { #name, test_##name }
 static struct test tests[] = {
-	DEF_TEST(conn),
-	DEF_TEST(syncookie),
+	DEF_TEST(conn_ipv4),
+	DEF_TEST(conn_ipv6),
+	DEF_TEST(conn_dual),
+	DEF_TEST(syncookie_ipv4),
+	DEF_TEST(syncookie_ipv6),
+	DEF_TEST(syncookie_dual),
 };
 
 void test_btf_skc_cls_ingress(void)
diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
index f0759efff6ef..b38ca3c35994 100644
--- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
@@ -10,6 +10,7 @@
 #endif
 
 struct sockaddr_in6 srv_sa6 = {};
+struct sockaddr_in srv_sa4 = {};
 __u16 listen_tp_sport = 0;
 __u16 req_sk_sport = 0;
 __u32 recv_cookie = 0;
@@ -18,8 +19,8 @@ __u32 linum = 0;
 
 #define LOG() ({ if (!linum) linum = __LINE__; })
 
-static void test_syncookie_helper(struct ipv6hdr *ip6h, struct tcphdr *th,
-				  struct tcp_sock *tp,
+static void test_syncookie_helper(void *iphdr, int iphdr_size,
+				  struct tcphdr *th, struct tcp_sock *tp,
 				  struct __sk_buff *skb)
 {
 	if (th->syn) {
@@ -38,7 +39,7 @@ static void test_syncookie_helper(struct ipv6hdr *ip6h, struct tcphdr *th,
 			return;
 		}
 
-		mss_cookie = bpf_tcp_gen_syncookie(tp, ip6h, sizeof(*ip6h),
+		mss_cookie = bpf_tcp_gen_syncookie(tp, iphdr, iphdr_size,
 						   th, 40);
 		if (mss_cookie < 0) {
 			if (mss_cookie != -ENOENT)
@@ -48,7 +49,7 @@ static void test_syncookie_helper(struct ipv6hdr *ip6h, struct tcphdr *th,
 		}
 	} else if (gen_cookie) {
 		/* It was in cookie mode */
-		int ret = bpf_tcp_check_syncookie(tp, ip6h, sizeof(*ip6h),
+		int ret = bpf_tcp_check_syncookie(tp, iphdr, iphdr_size,
 						  th, sizeof(*th));
 
 		if (ret < 0) {
@@ -60,26 +61,58 @@ static void test_syncookie_helper(struct ipv6hdr *ip6h, struct tcphdr *th,
 	}
 }
 
-static int handle_ip6_tcp(struct ipv6hdr *ip6h, struct __sk_buff *skb)
+static int handle_ip_tcp(struct ethhdr *eth, struct __sk_buff *skb)
 {
-	struct bpf_sock_tuple *tuple;
+	struct bpf_sock_tuple *tuple = NULL;
+	unsigned int tuple_len = 0;
 	struct bpf_sock *bpf_skc;
-	unsigned int tuple_len;
+	void *data_end, *iphdr;
+	struct ipv6hdr *ip6h;
+	struct iphdr *ip4h;
 	struct tcphdr *th;
-	void *data_end;
+	int iphdr_size;
 
 	data_end = (void *)(long)(skb->data_end);
 
-	th = (struct tcphdr *)(ip6h + 1);
-	if (th + 1 > data_end)
-		return TC_ACT_OK;
-
-	/* Is it the testing traffic? */
-	if (th->dest != srv_sa6.sin6_port)
+	switch (eth->h_proto) {
+	case bpf_htons(ETH_P_IP):
+		ip4h = (struct iphdr *)(eth + 1);
+		if (ip4h + 1 > data_end)
+			return TC_ACT_OK;
+		if (ip4h->protocol != IPPROTO_TCP)
+			return TC_ACT_OK;
+		th = (struct tcphdr *)(ip4h + 1);
+		if (th + 1 > data_end)
+			return TC_ACT_OK;
+		/* Is it the testing traffic? */
+		if (th->dest != srv_sa4.sin_port)
+			return TC_ACT_OK;
+		tuple_len = sizeof(tuple->ipv4);
+		tuple = (struct bpf_sock_tuple *)&ip4h->saddr;
+		iphdr = ip4h;
+		iphdr_size = sizeof(*ip4h);
+		break;
+	case bpf_htons(ETH_P_IPV6):
+		ip6h = (struct ipv6hdr *)(eth + 1);
+		if (ip6h + 1 > data_end)
+			return TC_ACT_OK;
+		if (ip6h->nexthdr != IPPROTO_TCP)
+			return TC_ACT_OK;
+		th = (struct tcphdr *)(ip6h + 1);
+		if (th + 1 > data_end)
+			return TC_ACT_OK;
+		/* Is it the testing traffic? */
+		if (th->dest != srv_sa6.sin6_port)
+			return TC_ACT_OK;
+		tuple_len = sizeof(tuple->ipv6);
+		tuple = (struct bpf_sock_tuple *)&ip6h->saddr;
+		iphdr = ip6h;
+		iphdr_size = sizeof(*ip6h);
+		break;
+	default:
 		return TC_ACT_OK;
+	}
 
-	tuple_len = sizeof(tuple->ipv6);
-	tuple = (struct bpf_sock_tuple *)&ip6h->saddr;
 	if ((void *)tuple + tuple_len > data_end) {
 		LOG();
 		return TC_ACT_OK;
@@ -126,7 +159,7 @@ static int handle_ip6_tcp(struct ipv6hdr *ip6h, struct __sk_buff *skb)
 
 		listen_tp_sport = tp->inet_conn.icsk_inet.sk.__sk_common.skc_num;
 
-		test_syncookie_helper(ip6h, th, tp, skb);
+		test_syncookie_helper(iphdr, iphdr_size, th, tp, skb);
 		bpf_sk_release(tp);
 		return TC_ACT_OK;
 	}
@@ -142,7 +175,6 @@ release:
 SEC("tc")
 int cls_ingress(struct __sk_buff *skb)
 {
-	struct ipv6hdr *ip6h;
 	struct ethhdr *eth;
 	void *data_end;
 
@@ -152,17 +184,11 @@ int cls_ingress(struct __sk_buff *skb)
 	if (eth + 1 > data_end)
 		return TC_ACT_OK;
 
-	if (eth->h_proto != bpf_htons(ETH_P_IPV6))
-		return TC_ACT_OK;
-
-	ip6h = (struct ipv6hdr *)(eth + 1);
-	if (ip6h + 1 > data_end)
+	if (eth->h_proto != bpf_htons(ETH_P_IP) &&
+	    eth->h_proto != bpf_htons(ETH_P_IPV6))
 		return TC_ACT_OK;
 
-	if (ip6h->nexthdr == IPPROTO_TCP)
-		return handle_ip6_tcp(ip6h, skb);
-
-	return TC_ACT_OK;
+	return handle_ip_tcp(eth, skb);
 }
 
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 3845ce74777eeb94892cdeedaf4b76e2341f3f42 Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:57 +0200
Subject: selftests/bpf: test MSS value returned with bpf_tcp_gen_syncookie
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One remaining difference between test_tcp_check_syncookie.sh and
btf_skc_cls_ingress is a small test on the mss value embedded in the
cookie generated with the eBPF helper.

Bring the corresponding test into btf_skc_cls_ingress.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-5-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 7 +++++++
 tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 29b946d84816..cf15cc3be491 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -27,6 +27,8 @@
 #define SERVER_ADDR_IPV4	"127.0.0.1"
 #define SERVER_ADDR_IPV6	"::1"
 #define SERVER_ADDR_DUAL	"::0"
+/* RFC791, 576 for minimal IPv4 datagram, minus 40 bytes of IP/TCP headers */
+#define MIN_IPV4_MSS		536
 
 static struct netns_obj *prepare_netns(struct test_btf_skc_cls_ingress *skel)
 {
@@ -71,6 +73,7 @@ static void reset_test(struct test_btf_skc_cls_ingress *skel)
 	skel->bss->recv_cookie = 0;
 	skel->bss->gen_cookie = 0;
 	skel->bss->linum = 0;
+	skel->bss->mss = 0;
 }
 
 static void print_err_line(struct test_btf_skc_cls_ingress *skel)
@@ -183,6 +186,10 @@ static void run_test(struct test_btf_skc_cls_ingress *skel, bool gen_cookies,
 			   "syncookie properly generated");
 		ASSERT_EQ(skel->bss->gen_cookie, skel->bss->recv_cookie,
 			  "matching syncookies on client and server");
+		ASSERT_GT(skel->bss->mss, MIN_IPV4_MSS,
+			  "MSS in cookie min value");
+		ASSERT_LT(skel->bss->mss, USHRT_MAX,
+			  "MSS in cookie max value");
 	}
 
 done:
diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
index b38ca3c35994..1cd1a1b72cb5 100644
--- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
@@ -15,6 +15,7 @@ __u16 listen_tp_sport = 0;
 __u16 req_sk_sport = 0;
 __u32 recv_cookie = 0;
 __u32 gen_cookie = 0;
+__u32 mss = 0;
 __u32 linum = 0;
 
 #define LOG() ({ if (!linum) linum = __LINE__; })
@@ -46,6 +47,7 @@ static void test_syncookie_helper(void *iphdr, int iphdr_size,
 				LOG();
 		} else {
 			gen_cookie = (__u32)mss_cookie;
+			mss = mss_cookie >> 32;
 		}
 	} else if (gen_cookie) {
 		/* It was in cookie mode */
-- 
cgit v1.2.3


From c3566ee6c66c3d6113739ec00cda7e23f39a3744 Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Sun, 20 Oct 2024 21:22:58 +0200
Subject: selftests/bpf: remove test_tcp_check_syncookie
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that btf_skc_cls_ingress has the same coverage as
test_tcp_check_syncookie, remove the latter and keep the former, as it
is integrated in test_progs.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20241020-syncookie-v2-6-2db240225fed@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore             |   1 -
 tools/testing/selftests/bpf/Makefile               |   9 +-
 .../bpf/progs/test_tcp_check_syncookie_kern.c      | 167 ----------------
 .../selftests/bpf/test_tcp_check_syncookie.sh      |  85 --------
 .../selftests/bpf/test_tcp_check_syncookie_user.c  | 213 ---------------------
 5 files changed, 3 insertions(+), 472 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
 delete mode 100755 tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
 delete mode 100644 tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index e6533b3400de..7e88551f2d38 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -24,7 +24,6 @@ test_flow_dissector
 flow_dissector_load
 test_tcpnotify_user
 test_libbpf
-test_tcp_check_syncookie_user
 test_sysctl
 xdping
 test_cpp
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f04af11df8eb..46924f406d06 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -137,7 +137,6 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_vlan_mode_generic.sh \
 	test_xdp_vlan_mode_native.sh \
 	test_lwt_ip_encap.sh \
-	test_tcp_check_syncookie.sh \
 	test_tc_tunnel.sh \
 	test_tc_edt.sh \
 	test_xdping.sh \
@@ -154,10 +153,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = \
-	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
-	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
-	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
-	xdp_features bpf_test_no_cfi.ko
+	flow_dissector_load test_flow_dissector	test_lirc_mode2_user xdping \
+	test_cpp runqslower bench bpf_testmod.ko xskxceiver xdp_redirect_multi \
+	xdp_synproxy veristat xdp_hw_metadata xdp_features bpf_test_no_cfi.ko
 
 TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
 
@@ -347,7 +345,6 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS)
 $(OUTPUT)/test_maps: $(TESTING_HELPERS)
 $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS)
 $(OUTPUT)/xsk.o: $(BPFOBJ)
-$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS)
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
deleted file mode 100644
index 6edebce563b5..000000000000
--- a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
+++ /dev/null
@@ -1,167 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2018 Facebook
-// Copyright (c) 2019 Cloudflare
-
-#include <string.h>
-
-#include <linux/bpf.h>
-#include <linux/pkt_cls.h>
-#include <linux/if_ether.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <sys/socket.h>
-#include <linux/tcp.h>
-
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
-	__type(key, __u32);
-	__type(value, __u32);
-	__uint(max_entries, 3);
-} results SEC(".maps");
-
-static __always_inline __s64 gen_syncookie(void *data_end, struct bpf_sock *sk,
-					   void *iph, __u32 ip_size,
-					   struct tcphdr *tcph)
-{
-	__u32 thlen = tcph->doff * 4;
-
-	if (tcph->syn && !tcph->ack) {
-		// packet should only have an MSS option
-		if (thlen != 24)
-			return 0;
-
-		if ((void *)tcph + thlen > data_end)
-			return 0;
-
-		return bpf_tcp_gen_syncookie(sk, iph, ip_size, tcph, thlen);
-	}
-	return 0;
-}
-
-static __always_inline void check_syncookie(void *ctx, void *data,
-					    void *data_end)
-{
-	struct bpf_sock_tuple tup;
-	struct bpf_sock *sk;
-	struct ethhdr *ethh;
-	struct iphdr *ipv4h;
-	struct ipv6hdr *ipv6h;
-	struct tcphdr *tcph;
-	int ret;
-	__u32 key_mss = 2;
-	__u32 key_gen = 1;
-	__u32 key = 0;
-	__s64 seq_mss;
-
-	ethh = data;
-	if (ethh + 1 > data_end)
-		return;
-
-	switch (bpf_ntohs(ethh->h_proto)) {
-	case ETH_P_IP:
-		ipv4h = data + sizeof(struct ethhdr);
-		if (ipv4h + 1 > data_end)
-			return;
-
-		if (ipv4h->ihl != 5)
-			return;
-
-		tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
-		if (tcph + 1 > data_end)
-			return;
-
-		tup.ipv4.saddr = ipv4h->saddr;
-		tup.ipv4.daddr = ipv4h->daddr;
-		tup.ipv4.sport = tcph->source;
-		tup.ipv4.dport = tcph->dest;
-
-		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv4),
-					BPF_F_CURRENT_NETNS, 0);
-		if (!sk)
-			return;
-
-		if (sk->state != BPF_TCP_LISTEN)
-			goto release;
-
-		seq_mss = gen_syncookie(data_end, sk, ipv4h, sizeof(*ipv4h),
-					tcph);
-
-		ret = bpf_tcp_check_syncookie(sk, ipv4h, sizeof(*ipv4h),
-					      tcph, sizeof(*tcph));
-		break;
-
-	case ETH_P_IPV6:
-		ipv6h = data + sizeof(struct ethhdr);
-		if (ipv6h + 1 > data_end)
-			return;
-
-		if (ipv6h->nexthdr != IPPROTO_TCP)
-			return;
-
-		tcph = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
-		if (tcph + 1 > data_end)
-			return;
-
-		memcpy(tup.ipv6.saddr, &ipv6h->saddr, sizeof(tup.ipv6.saddr));
-		memcpy(tup.ipv6.daddr, &ipv6h->daddr, sizeof(tup.ipv6.daddr));
-		tup.ipv6.sport = tcph->source;
-		tup.ipv6.dport = tcph->dest;
-
-		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv6),
-					BPF_F_CURRENT_NETNS, 0);
-		if (!sk)
-			return;
-
-		if (sk->state != BPF_TCP_LISTEN)
-			goto release;
-
-		seq_mss = gen_syncookie(data_end, sk, ipv6h, sizeof(*ipv6h),
-					tcph);
-
-		ret = bpf_tcp_check_syncookie(sk, ipv6h, sizeof(*ipv6h),
-					      tcph, sizeof(*tcph));
-		break;
-
-	default:
-		return;
-	}
-
-	if (seq_mss > 0) {
-		__u32 cookie = (__u32)seq_mss;
-		__u32 mss = seq_mss >> 32;
-
-		bpf_map_update_elem(&results, &key_gen, &cookie, 0);
-		bpf_map_update_elem(&results, &key_mss, &mss, 0);
-	}
-
-	if (ret == 0) {
-		__u32 cookie = bpf_ntohl(tcph->ack_seq) - 1;
-
-		bpf_map_update_elem(&results, &key, &cookie, 0);
-	}
-
-release:
-	bpf_sk_release(sk);
-}
-
-SEC("tc")
-int check_syncookie_clsact(struct __sk_buff *skb)
-{
-	check_syncookie(skb, (void *)(long)skb->data,
-			(void *)(long)skb->data_end);
-	return TC_ACT_OK;
-}
-
-SEC("xdp")
-int check_syncookie_xdp(struct xdp_md *ctx)
-{
-	check_syncookie(ctx, (void *)(long)ctx->data,
-			(void *)(long)ctx->data_end);
-	return XDP_PASS;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
deleted file mode 100755
index b42c24282c25..000000000000
--- a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-# Copyright (c) 2018 Facebook
-# Copyright (c) 2019 Cloudflare
-
-set -eu
-readonly NS1="ns1-$(mktemp -u XXXXXX)"
-
-wait_for_ip()
-{
-	local _i
-	printf "Wait for IP %s to become available " "$1"
-	for _i in $(seq ${MAX_PING_TRIES}); do
-		printf "."
-		if ns1_exec ping -c 1 -W 1 "$1" >/dev/null 2>&1; then
-			echo " OK"
-			return
-		fi
-		sleep 1
-	done
-	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
-	exit 1
-}
-
-get_prog_id()
-{
-	awk '/ id / {sub(/.* id /, "", $0); print($1)}'
-}
-
-ns1_exec()
-{
-	ip netns exec ${NS1} "$@"
-}
-
-setup()
-{
-	ip netns add ${NS1}
-	ns1_exec ip link set lo up
-
-	ns1_exec sysctl -w net.ipv4.tcp_syncookies=2
-	ns1_exec sysctl -w net.ipv4.tcp_window_scaling=0
-	ns1_exec sysctl -w net.ipv4.tcp_timestamps=0
-	ns1_exec sysctl -w net.ipv4.tcp_sack=0
-
-	wait_for_ip 127.0.0.1
-	wait_for_ip ::1
-}
-
-cleanup()
-{
-	ip netns del ns1 2>/dev/null || :
-}
-
-main()
-{
-	trap cleanup EXIT 2 3 6 15
-	setup
-
-	printf "Testing clsact..."
-	ns1_exec tc qdisc add dev "${TEST_IF}" clsact
-	ns1_exec tc filter add dev "${TEST_IF}" ingress \
-		bpf obj "${BPF_PROG_OBJ}" sec "${CLSACT_SECTION}" da
-
-	BPF_PROG_ID=$(ns1_exec tc filter show dev "${TEST_IF}" ingress | \
-		      get_prog_id)
-	ns1_exec "${PROG}" "${BPF_PROG_ID}"
-	ns1_exec tc qdisc del dev "${TEST_IF}" clsact
-
-	printf "Testing XDP..."
-	ns1_exec ip link set "${TEST_IF}" xdp \
-		object "${BPF_PROG_OBJ}" section "${XDP_SECTION}"
-	BPF_PROG_ID=$(ns1_exec ip link show "${TEST_IF}" | get_prog_id)
-	ns1_exec "${PROG}" "${BPF_PROG_ID}"
-}
-
-DIR=$(dirname $0)
-TEST_IF=lo
-MAX_PING_TRIES=5
-BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.bpf.o"
-CLSACT_SECTION="tc"
-XDP_SECTION="xdp"
-BPF_PROG_ID=0
-PROG="${DIR}/test_tcp_check_syncookie_user"
-
-main
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
deleted file mode 100644
index 3844f9b8232a..000000000000
--- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
+++ /dev/null
@@ -1,213 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2018 Facebook
-// Copyright (c) 2019 Cloudflare
-
-#include <limits.h>
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "cgroup_helpers.h"
-#include "network_helpers.h"
-
-static int get_map_fd_by_prog_id(int prog_id, bool *xdp)
-{
-	struct bpf_prog_info info = {};
-	__u32 info_len = sizeof(info);
-	__u32 map_ids[1];
-	int prog_fd = -1;
-	int map_fd = -1;
-
-	prog_fd = bpf_prog_get_fd_by_id(prog_id);
-	if (prog_fd < 0) {
-		log_err("Failed to get fd by prog id %d", prog_id);
-		goto err;
-	}
-
-	info.nr_map_ids = 1;
-	info.map_ids = (__u64)(unsigned long)map_ids;
-
-	if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) {
-		log_err("Failed to get info by prog fd %d", prog_fd);
-		goto err;
-	}
-
-	if (!info.nr_map_ids) {
-		log_err("No maps found for prog fd %d", prog_fd);
-		goto err;
-	}
-
-	*xdp = info.type == BPF_PROG_TYPE_XDP;
-
-	map_fd = bpf_map_get_fd_by_id(map_ids[0]);
-	if (map_fd < 0)
-		log_err("Failed to get fd by map id %d", map_ids[0]);
-err:
-	if (prog_fd >= 0)
-		close(prog_fd);
-	return map_fd;
-}
-
-static int run_test(int server_fd, int results_fd, bool xdp)
-{
-	int client = -1, srv_client = -1;
-	int ret = 0;
-	__u32 key = 0;
-	__u32 key_gen = 1;
-	__u32 key_mss = 2;
-	__u32 value = 0;
-	__u32 value_gen = 0;
-	__u32 value_mss = 0;
-
-	if (bpf_map_update_elem(results_fd, &key, &value, 0) < 0) {
-		log_err("Can't clear results");
-		goto err;
-	}
-
-	if (bpf_map_update_elem(results_fd, &key_gen, &value_gen, 0) < 0) {
-		log_err("Can't clear results");
-		goto err;
-	}
-
-	if (bpf_map_update_elem(results_fd, &key_mss, &value_mss, 0) < 0) {
-		log_err("Can't clear results");
-		goto err;
-	}
-
-	client = connect_to_fd(server_fd, 0);
-	if (client == -1)
-		goto err;
-
-	srv_client = accept(server_fd, NULL, 0);
-	if (srv_client == -1) {
-		log_err("Can't accept connection");
-		goto err;
-	}
-
-	if (bpf_map_lookup_elem(results_fd, &key, &value) < 0) {
-		log_err("Can't lookup result");
-		goto err;
-	}
-
-	if (value == 0) {
-		log_err("Didn't match syncookie: %u", value);
-		goto err;
-	}
-
-	if (bpf_map_lookup_elem(results_fd, &key_gen, &value_gen) < 0) {
-		log_err("Can't lookup result");
-		goto err;
-	}
-
-	if (xdp && value_gen == 0) {
-		// SYN packets do not get passed through generic XDP, skip the
-		// rest of the test.
-		printf("Skipping XDP cookie check\n");
-		goto out;
-	}
-
-	if (bpf_map_lookup_elem(results_fd, &key_mss, &value_mss) < 0) {
-		log_err("Can't lookup result");
-		goto err;
-	}
-
-	if (value != value_gen) {
-		log_err("BPF generated cookie does not match kernel one");
-		goto err;
-	}
-
-	if (value_mss < 536 || value_mss > USHRT_MAX) {
-		log_err("Unexpected MSS retrieved");
-		goto err;
-	}
-
-	goto out;
-
-err:
-	ret = 1;
-out:
-	close(client);
-	close(srv_client);
-	return ret;
-}
-
-static int v6only_true(int fd, void *opts)
-{
-	int mode = true;
-
-	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
-}
-
-static int v6only_false(int fd, void *opts)
-{
-	int mode = false;
-
-	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
-}
-
-int main(int argc, char **argv)
-{
-	struct network_helper_opts opts = { 0 };
-	int server = -1;
-	int server_v6 = -1;
-	int server_dual = -1;
-	int results = -1;
-	int err = 0;
-	bool xdp;
-
-	if (argc < 2) {
-		fprintf(stderr, "Usage: %s prog_id\n", argv[0]);
-		exit(1);
-	}
-
-	/* Use libbpf 1.0 API mode */
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	results = get_map_fd_by_prog_id(atoi(argv[1]), &xdp);
-	if (results < 0) {
-		log_err("Can't get map");
-		goto err;
-	}
-
-	server = start_server_str(AF_INET, SOCK_STREAM, "127.0.0.1", 0, NULL);
-	if (server == -1)
-		goto err;
-
-	opts.post_socket_cb = v6only_true;
-	server_v6 = start_server_str(AF_INET6, SOCK_STREAM, "::1", 0, &opts);
-	if (server_v6 == -1)
-		goto err;
-
-	opts.post_socket_cb = v6only_false;
-	server_dual = start_server_str(AF_INET6, SOCK_STREAM, "::0", 0, &opts);
-	if (server_dual == -1)
-		goto err;
-
-	if (run_test(server, results, xdp))
-		goto err;
-
-	if (run_test(server_v6, results, xdp))
-		goto err;
-
-	if (run_test(server_dual, results, xdp))
-		goto err;
-
-	printf("ok\n");
-	goto out;
-err:
-	err = 1;
-out:
-	close(server);
-	close(server_v6);
-	close(server_dual);
-	close(results);
-	return err;
-}
-- 
cgit v1.2.3


From 0e14189459f6c424a95a146d288d59f0ed27fd3f Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Wed, 16 Oct 2024 08:41:36 +0000
Subject: selftests/bpf: Augment send_signal test with remote signaling

Add testcases to test bpf_send_signal_task(). In these new test cases,
the main process triggers the BPF program and the forked process
receives the signals. The target process's signal handler receives a
cookie from the bpf program.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241016084136.10305-3-puranjay@kernel.org
---
 .../testing/selftests/bpf/prog_tests/send_signal.c | 133 ++++++++++++++++-----
 .../selftests/bpf/progs/test_send_signal_kern.c    |  35 +++++-
 2 files changed, 130 insertions(+), 38 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 6cc69900b310..1aed94ec14ef 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -8,17 +8,25 @@ static int sigusr1_received;
 
 static void sigusr1_handler(int signum)
 {
-	sigusr1_received = 1;
+	sigusr1_received = 8;
+}
+
+static void sigusr1_siginfo_handler(int s, siginfo_t *i, void *v)
+{
+	sigusr1_received = (int)(long long)i->si_value.sival_ptr;
 }
 
 static void test_send_signal_common(struct perf_event_attr *attr,
-				    bool signal_thread)
+				    bool signal_thread, bool remote)
 {
 	struct test_send_signal_kern *skel;
+	struct sigaction sa = {};
 	int pipe_c2p[2], pipe_p2c[2];
 	int err = -1, pmu_fd = -1;
+	volatile int j = 0;
 	char buf[256];
 	pid_t pid;
+	int old_prio;
 
 	if (!ASSERT_OK(pipe(pipe_c2p), "pipe_c2p"))
 		return;
@@ -39,11 +47,14 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	}
 
 	if (pid == 0) {
-		int old_prio;
-		volatile int j = 0;
-
 		/* install signal handler and notify parent */
-		ASSERT_NEQ(signal(SIGUSR1, sigusr1_handler), SIG_ERR, "signal");
+		if (remote) {
+			sa.sa_sigaction = sigusr1_siginfo_handler;
+			sa.sa_flags = SA_RESTART | SA_SIGINFO;
+			ASSERT_NEQ(sigaction(SIGUSR1, &sa, NULL), -1, "sigaction");
+		} else {
+			ASSERT_NEQ(signal(SIGUSR1, sigusr1_handler), SIG_ERR, "signal");
+		}
 
 		close(pipe_c2p[0]); /* close read */
 		close(pipe_p2c[1]); /* close write */
@@ -52,10 +63,12 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		 * that if an interrupt happens, the underlying task
 		 * is this process.
 		 */
-		errno = 0;
-		old_prio = getpriority(PRIO_PROCESS, 0);
-		ASSERT_OK(errno, "getpriority");
-		ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+		if (!remote) {
+			errno = 0;
+			old_prio = getpriority(PRIO_PROCESS, 0);
+			ASSERT_OK(errno, "getpriority");
+			ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+		}
 
 		/* notify parent signal handler is installed */
 		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
@@ -66,20 +79,25 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		/* wait a little for signal handler */
 		for (int i = 0; i < 1000000000 && !sigusr1_received; i++) {
 			j /= i + j + 1;
-			if (!attr)
-				/* trigger the nanosleep tracepoint program. */
-				usleep(1);
+			if (remote)
+				sleep(1);
+			else
+				if (!attr)
+					/* trigger the nanosleep tracepoint program. */
+					usleep(1);
 		}
 
-		buf[0] = sigusr1_received ? '2' : '0';
-		ASSERT_EQ(sigusr1_received, 1, "sigusr1_received");
+		buf[0] = sigusr1_received;
+
+		ASSERT_EQ(sigusr1_received, 8, "sigusr1_received");
 		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
 		/* wait for parent notification and exit */
 		ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
 		/* restore the old priority */
-		ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
+		if (!remote)
+			ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
 
 		close(pipe_c2p[1]);
 		close(pipe_p2c[0]);
@@ -93,6 +111,17 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
 		goto skel_open_load_failure;
 
+	/* boost with a high priority so we got a higher chance
+	 * that if an interrupt happens, the underlying task
+	 * is this process.
+	 */
+	if (remote) {
+		errno = 0;
+		old_prio = getpriority(PRIO_PROCESS, 0);
+		ASSERT_OK(errno, "getpriority");
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+	}
+
 	if (!attr) {
 		err = test_send_signal_kern__attach(skel);
 		if (!ASSERT_OK(err, "skel_attach")) {
@@ -100,8 +129,12 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 			goto destroy_skel;
 		}
 	} else {
-		pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1 /* cpu */,
-				 -1 /* group id */, 0 /* flags */);
+		if (!remote)
+			pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1 /* cpu */,
+					 -1 /* group id */, 0 /* flags */);
+		else
+			pmu_fd = syscall(__NR_perf_event_open, attr, getpid(), -1 /* cpu */,
+					 -1 /* group id */, 0 /* flags */);
 		if (!ASSERT_GE(pmu_fd, 0, "perf_event_open")) {
 			err = -1;
 			goto destroy_skel;
@@ -119,11 +152,30 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	/* trigger the bpf send_signal */
 	skel->bss->signal_thread = signal_thread;
 	skel->bss->sig = SIGUSR1;
-	skel->bss->pid = pid;
+	if (!remote) {
+		skel->bss->target_pid = 0;
+		skel->bss->pid = pid;
+	} else {
+		skel->bss->target_pid = pid;
+		skel->bss->pid = getpid();
+	}
 
 	/* notify child that bpf program can send_signal now */
 	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
+	/* For the remote test, the BPF program is triggered from this
+	 * process but the other process/thread is signaled.
+	 */
+	if (remote) {
+		if (!attr) {
+			for (int i = 0; i < 10; i++)
+				usleep(1);
+		} else {
+			for (int i = 0; i < 100000000; i++)
+				j /= i + 1;
+		}
+	}
+
 	/* wait for result */
 	err = read(pipe_c2p[0], buf, 1);
 	if (!ASSERT_GE(err, 0, "reading pipe"))
@@ -133,7 +185,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		goto disable_pmu;
 	}
 
-	ASSERT_EQ(buf[0], '2', "incorrect result");
+	ASSERT_EQ(buf[0], 8, "incorrect result");
 
 	/* notify child safe to exit */
 	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
@@ -142,18 +194,21 @@ disable_pmu:
 	close(pmu_fd);
 destroy_skel:
 	test_send_signal_kern__destroy(skel);
+	/* restore the old priority */
+	if (remote)
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
 skel_open_load_failure:
 	close(pipe_c2p[0]);
 	close(pipe_p2c[1]);
 	wait(NULL);
 }
 
-static void test_send_signal_tracepoint(bool signal_thread)
+static void test_send_signal_tracepoint(bool signal_thread, bool remote)
 {
-	test_send_signal_common(NULL, signal_thread);
+	test_send_signal_common(NULL, signal_thread, remote);
 }
 
-static void test_send_signal_perf(bool signal_thread)
+static void test_send_signal_perf(bool signal_thread, bool remote)
 {
 	struct perf_event_attr attr = {
 		.freq = 1,
@@ -162,10 +217,10 @@ static void test_send_signal_perf(bool signal_thread)
 		.config = PERF_COUNT_SW_CPU_CLOCK,
 	};
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, remote);
 }
 
-static void test_send_signal_nmi(bool signal_thread)
+static void test_send_signal_nmi(bool signal_thread, bool remote)
 {
 	struct perf_event_attr attr = {
 		.sample_period = 1,
@@ -191,21 +246,35 @@ static void test_send_signal_nmi(bool signal_thread)
 		close(pmu_fd);
 	}
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, remote);
 }
 
 void test_send_signal(void)
 {
 	if (test__start_subtest("send_signal_tracepoint"))
-		test_send_signal_tracepoint(false);
+		test_send_signal_tracepoint(false, false);
 	if (test__start_subtest("send_signal_perf"))
-		test_send_signal_perf(false);
+		test_send_signal_perf(false, false);
 	if (test__start_subtest("send_signal_nmi"))
-		test_send_signal_nmi(false);
+		test_send_signal_nmi(false, false);
 	if (test__start_subtest("send_signal_tracepoint_thread"))
-		test_send_signal_tracepoint(true);
+		test_send_signal_tracepoint(true, false);
 	if (test__start_subtest("send_signal_perf_thread"))
-		test_send_signal_perf(true);
+		test_send_signal_perf(true, false);
 	if (test__start_subtest("send_signal_nmi_thread"))
-		test_send_signal_nmi(true);
+		test_send_signal_nmi(true, false);
+
+	/* Signal remote thread and thread group */
+	if (test__start_subtest("send_signal_tracepoint_remote"))
+		test_send_signal_tracepoint(false, true);
+	if (test__start_subtest("send_signal_perf_remote"))
+		test_send_signal_perf(false, true);
+	if (test__start_subtest("send_signal_nmi_remote"))
+		test_send_signal_nmi(false, true);
+	if (test__start_subtest("send_signal_tracepoint_thread_remote"))
+		test_send_signal_tracepoint(true, true);
+	if (test__start_subtest("send_signal_perf_thread_remote"))
+		test_send_signal_perf(true, true);
+	if (test__start_subtest("send_signal_nmi_thread_remote"))
+		test_send_signal_nmi(true, true);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
index 92354cd72044..176a355e3062 100644
--- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -1,27 +1,50 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2019 Facebook
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
 
-__u32 sig = 0, pid = 0, status = 0, signal_thread = 0;
+struct task_struct *bpf_task_from_pid(int pid) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, u64 value) __ksym;
+
+__u32 sig = 0, pid = 0, status = 0, signal_thread = 0, target_pid = 0;
 
 static __always_inline int bpf_send_signal_test(void *ctx)
 {
+	struct task_struct *target_task = NULL;
 	int ret;
+	u64 value;
 
 	if (status != 0 || pid == 0)
 		return 0;
 
 	if ((bpf_get_current_pid_tgid() >> 32) == pid) {
-		if (signal_thread)
-			ret = bpf_send_signal_thread(sig);
-		else
-			ret = bpf_send_signal(sig);
+		if (target_pid) {
+			target_task = bpf_task_from_pid(target_pid);
+			if (!target_task)
+				return 0;
+			value = 8;
+		}
+
+		if (signal_thread) {
+			if (target_pid)
+				ret = bpf_send_signal_task(target_task, sig, PIDTYPE_PID, value);
+			else
+				ret = bpf_send_signal_thread(sig);
+		} else {
+			if (target_pid)
+				ret = bpf_send_signal_task(target_task, sig, PIDTYPE_TGID, value);
+			else
+				ret = bpf_send_signal(sig);
+		}
 		if (ret == 0)
 			status = 1;
 	}
 
+	if (target_task)
+		bpf_task_release(target_task);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7a08cb9b4bb92fb86f5fe8a3aa0ac08a9b3d783b Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 17 Oct 2024 18:43:31 +0100
Subject: kselftest/arm64: Fail the overall fp-stress test if any test fails

Currently fp-stress does not report a top-level test result if it runs to
completion; it always exits with return code 0. Use the ksft_finished()
helper to ensure that the exit code for the top level program reports a
failure if any of the individual tests has failed.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241017-arm64-fp-stress-exit-code-v1-1-f528e53a2321@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index faac24bdefeb..e62c9dbad501 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -651,7 +651,5 @@ int main(int argc, char **argv)
 
 	drain_output(true);
 
-	ksft_print_cnts();
-
-	return 0;
+	ksft_finished();
 }
-- 
cgit v1.2.3


From a6e263f125cd7b10614a83159c453c061dbf6877 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:43 +0200
Subject: selftests: net: lib: Introduce deferred commands

In commit 8510801a9dbd ("selftests: drv-net: add ability to schedule
cleanup with defer()"), a defer helper was added to Python selftests.
The idea is to keep cleanup commands close to their dirtying counterparts,
thereby making it more transparent what is cleaning up what, making it
harder to miss a cleanup, and making the whole cleanup business exception
safe. All these benefits are applicable to bash as well; exception safety
can be interpreted in terms of safety vs. a SIGINT.

This patch therefore introduces a framework of several helpers that serve
to schedule cleanups in bash selftests:

- defer_scope_push(), defer_scope_pop(): Deferred statements can be batched
  together in scopes. When a scope is popped, the deferred commands
  scheduled in that scope are executed in the reverse order of their
  scheduling.

- defer(): Schedules a defer to the most recently pushed scope (or the
  default scope if none was pushed.)

- defer_prio(): Schedules a defer on the priority track. The priority defer
  queue is run before the default defer queue when a scope is popped.

  The issue that this is addressing is specifically the one of restoring
  devlink shared buffer threshold type. When setting up static thresholds,
  one has to first change the threshold type to static, then override the
  individual thresholds. When cleaning up, it would be natural to reset the
  threshold values first, then change the threshold type. But the values
  that are valid for dynamic thresholds are generally invalid for static
  thresholds and vice versa. Attempts to restore the values first would be
  bounced. Thus one has to first reset the threshold type, then adjust the
  thresholds.

  (You could argue that the shared buffer threshold type API is broken and
  you would be right, but here we are.)

  This cannot be solved by pure defers easily. I considered making it
  possible to disable an existing defer, so that one could then schedule a
  new defer and disable the original. But this forward-shifting of the
  defer job would have to take place after every threshold-adjusting
  command, which would make it very awkward to schedule these jobs.

- defer_scopes_cleanup(): Pops any unpopped scopes, including the default
  one. The selftests that use defer should run this in their exit trap.
  This is important to get cleanups of interrupted scripts.

- in_defer_scope(): Sometimes a function would like to introduce a new
  defer scope, then run whatever it is that it wants to run, and then pop
  the scope to run the deferred cleanups. The helper in_defer_scope() can
  be used to run another command within such environment, such that any
  scheduled defers run after the command finishes.

The framework is added as a separate file lib/sh/defer.sh so that it can be
used by all bash selftests, including those that do not currently use
lib.sh. lib.sh however includes the file by default, because ideally all
tests would use these helpers instead of hand-rolling their cleanups.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/lib.sh |   3 +-
 tools/testing/selftests/net/lib.sh            |   3 +
 tools/testing/selftests/net/lib/Makefile      |   2 +-
 tools/testing/selftests/net/lib/sh/defer.sh   | 115 ++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/lib/sh/defer.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index c992e385159c..d24b6af7ebfa 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1403,7 +1403,8 @@ tests_run()
 	local current_test
 
 	for current_test in ${TESTS:-$ALL_TESTS}; do
-		$current_test
+		in_defer_scope \
+			$current_test
 	done
 }
 
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index be8707bfb46e..c8991cc6bf28 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -1,6 +1,9 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+net_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+source "$net_dir/lib/sh/defer.sh"
+
 ##############################################################################
 # Defines
 
diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile
index 82c3264b115e..18b9443454a9 100644
--- a/tools/testing/selftests/net/lib/Makefile
+++ b/tools/testing/selftests/net/lib/Makefile
@@ -10,6 +10,6 @@ TEST_FILES += ../../../../net/ynl
 
 TEST_GEN_FILES += csum
 
-TEST_INCLUDES := $(wildcard py/*.py)
+TEST_INCLUDES := $(wildcard py/*.py sh/*.sh)
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/lib/sh/defer.sh b/tools/testing/selftests/net/lib/sh/defer.sh
new file mode 100644
index 000000000000..082f5d38321b
--- /dev/null
+++ b/tools/testing/selftests/net/lib/sh/defer.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# map[(scope_id,track,cleanup_id) -> cleanup_command]
+# track={d=default | p=priority}
+declare -A __DEFER__JOBS
+
+# map[(scope_id,track) -> # cleanup_commands]
+declare -A __DEFER__NJOBS
+
+# scope_id of the topmost scope.
+__DEFER__SCOPE_ID=0
+
+__defer__ndefer_key()
+{
+	local track=$1; shift
+
+	echo $__DEFER__SCOPE_ID,$track
+}
+
+__defer__defer_key()
+{
+	local track=$1; shift
+	local defer_ix=$1; shift
+
+	echo $__DEFER__SCOPE_ID,$track,$defer_ix
+}
+
+__defer__ndefers()
+{
+	local track=$1; shift
+
+	echo ${__DEFER__NJOBS[$(__defer__ndefer_key $track)]}
+}
+
+__defer__run()
+{
+	local track=$1; shift
+	local defer_ix=$1; shift
+	local defer_key=$(__defer__defer_key $track $defer_ix)
+
+	${__DEFER__JOBS[$defer_key]}
+	unset __DEFER__JOBS[$defer_key]
+}
+
+__defer__schedule()
+{
+	local track=$1; shift
+	local ndefers=$(__defer__ndefers $track)
+	local ndefers_key=$(__defer__ndefer_key $track)
+	local defer_key=$(__defer__defer_key $track $ndefers)
+	local defer="$@"
+
+	__DEFER__JOBS[$defer_key]="$defer"
+	__DEFER__NJOBS[$ndefers_key]=$((ndefers + 1))
+}
+
+__defer__scope_wipe()
+{
+	__DEFER__NJOBS[$(__defer__ndefer_key d)]=0
+	__DEFER__NJOBS[$(__defer__ndefer_key p)]=0
+}
+
+defer_scope_push()
+{
+	((__DEFER__SCOPE_ID++))
+	__defer__scope_wipe
+}
+
+defer_scope_pop()
+{
+	local defer_ix
+
+	for ((defer_ix=$(__defer__ndefers p); defer_ix-->0; )); do
+		__defer__run p $defer_ix
+	done
+
+	for ((defer_ix=$(__defer__ndefers d); defer_ix-->0; )); do
+		__defer__run d $defer_ix
+	done
+
+	__defer__scope_wipe
+	((__DEFER__SCOPE_ID--))
+}
+
+defer()
+{
+	__defer__schedule d "$@"
+}
+
+defer_prio()
+{
+	__defer__schedule p "$@"
+}
+
+defer_scopes_cleanup()
+{
+	while ((__DEFER__SCOPE_ID >= 0)); do
+		defer_scope_pop
+	done
+}
+
+in_defer_scope()
+{
+	local ret
+
+	defer_scope_push
+	"$@"
+	ret=$?
+	defer_scope_pop
+
+	return $ret
+}
+
+__defer__scope_wipe
-- 
cgit v1.2.3


From b4b0549a4e59747b49619b2edabfb0d04e37c0b9 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:44 +0200
Subject: selftests: forwarding: Add a fallback cleanup()

Consistent use of defers obviates the need for a separate test-specific
cleanup function -- everything is just taken care of in defers. So in this
patch, introduce a cleanup() helper in the forwarding lib.sh, which calls
just pre_cleanup() and defer_scopes_cleanup(). Selftests are obviously
still free to override the function.

Since pre_cleanup() is too entangled with forwarding-specific minutiae, the
function cannot currently be in net/lib.sh.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index d24b6af7ebfa..76e6d7698caf 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1408,6 +1408,12 @@ tests_run()
 	done
 }
 
+cleanup()
+{
+	pre_cleanup
+	defer_scopes_cleanup
+}
+
 multipath_eval()
 {
 	local desc="$1"
-- 
cgit v1.2.3


From 0e07d5dbfbd9b0441ae4ec07a2a72738121356e2 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:45 +0200
Subject: selftests: forwarding: lib: Allow passing PID to stop_traffic()

Now that it is possible to schedule a deferral of stop_traffic() right
after the traffic is started, we do not have to rely on the %% magic to
kill the background process that was started last. Instead we can just give
the PID explicitly. This makes it possible to start other background
processes after the traffic is started without confusing the cleanup.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 76e6d7698caf..89c25f72b10c 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1768,8 +1768,10 @@ start_tcp_traffic()
 
 stop_traffic()
 {
+	local pid=${1-%%}; shift
+
 	# Suppress noise from killing mausezahn.
-	{ kill %% && wait %%; } 2>/dev/null
+	{ kill $pid && wait $pid; } 2>/dev/null
 }
 
 declare -A cappid
-- 
cgit v1.2.3


From 7f46615d59373b65dcd0fea7784bf20f93c169f0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:46 +0200
Subject: selftests: RED: Use defer for test cleanup

Instead of having a suite of dedicated cleanup functions, use the defer
framework to schedule cleanups right as their setup functions are run.

The sleep after stop_traffic() in mlxsw selftests is necessary, but
scheduling it as "defer sleep; defer stop_traffic" is silly. Instead, add a
local helper to stop traffic and sleep afterwards.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/mlxsw/sch_red_core.sh    | 185 ++++++++++-----------
 .../selftests/drivers/net/mlxsw/sch_red_ets.sh     |  24 +--
 .../selftests/drivers/net/mlxsw/sch_red_root.sh    |  18 +-
 tools/testing/selftests/net/forwarding/sch_red.sh  | 103 +++++-------
 4 files changed, 149 insertions(+), 181 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
index f4c324957dcc..537d6baa77b7 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -75,6 +75,18 @@ source $lib_dir/lib.sh
 source $lib_dir/devlink_lib.sh
 source mlxsw_lib.sh
 
+stop_traffic_sleep()
+{
+	local pid=$1; shift
+
+	# Issuing a kill still leaves a bunch of packets lingering in the
+	# buffers. This traffic then arrives at the point where a follow-up test
+	# is already running, and can confuse the test. Therefore sleep after
+	# stopping traffic to flush any leftover packets.
+	stop_traffic "$pid"
+	sleep 1
+}
+
 ipaddr()
 {
 	local host=$1; shift
@@ -89,39 +101,31 @@ host_create()
 	local host=$1; shift
 
 	simple_if_init $dev
+	defer simple_if_fini $dev
+
 	mtu_set $dev 10000
+	defer mtu_restore $dev
 
 	vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+	defer vlan_destroy $dev 10
 	ip link set dev $dev.10 type vlan egress 0:0
 
 	vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+	defer vlan_destroy $dev 11
 	ip link set dev $dev.11 type vlan egress 0:1
 }
 
-host_destroy()
-{
-	local dev=$1; shift
-
-	vlan_destroy $dev 11
-	vlan_destroy $dev 10
-	mtu_restore $dev
-	simple_if_fini $dev
-}
-
 h1_create()
 {
 	host_create $h1 1
 }
 
-h1_destroy()
-{
-	host_destroy $h1
-}
-
 h2_create()
 {
 	host_create $h2 2
+
 	tc qdisc add dev $h2 clsact
+	defer tc qdisc del dev $h2 clsact
 
 	# Some of the tests in this suite use multicast traffic. As this traffic
 	# enters BR2_10 resp. BR2_11, it is flooded to all other ports. Thus
@@ -139,13 +143,7 @@ h2_create()
 
 	tc qdisc replace dev $h2 root handle 10: tbf rate 200mbit \
 		burst 128K limit 1G
-}
-
-h2_destroy()
-{
-	tc qdisc del dev $h2 root handle 10:
-	tc qdisc del dev $h2 clsact
-	host_destroy $h2
+	defer tc qdisc del dev $h2 root handle 10:
 }
 
 h3_create()
@@ -153,40 +151,54 @@ h3_create()
 	host_create $h3 3
 }
 
-h3_destroy()
-{
-	host_destroy $h3
-}
-
 switch_create()
 {
 	local intf
 	local vlan
 
 	ip link add dev br1_10 type bridge
+	defer ip link del dev br1_10
+
 	ip link add dev br1_11 type bridge
+	defer ip link del dev br1_11
 
 	ip link add dev br2_10 type bridge
+	defer ip link del dev br2_10
+
 	ip link add dev br2_11 type bridge
+	defer ip link del dev br2_11
 
 	for intf in $swp1 $swp2 $swp3 $swp4 $swp5; do
 		ip link set dev $intf up
+		defer ip link set dev $intf down
+
 		mtu_set $intf 10000
+		defer mtu_restore $intf
 	done
 
 	for intf in $swp1 $swp4; do
 		for vlan in 10 11; do
 			vlan_create $intf $vlan
+			defer vlan_destroy $intf $vlan
+
 			ip link set dev $intf.$vlan master br1_$vlan
+			defer ip link set dev $intf.$vlan nomaster
+
 			ip link set dev $intf.$vlan up
+			defer ip link set dev $intf.$vlan down
 		done
 	done
 
 	for intf in $swp2 $swp3 $swp5; do
 		for vlan in 10 11; do
 			vlan_create $intf $vlan
+			defer vlan_destroy $intf $vlan
+
 			ip link set dev $intf.$vlan master br2_$vlan
+			defer ip link set dev $intf.$vlan nomaster
+
 			ip link set dev $intf.$vlan up
+			defer ip link set dev $intf.$vlan down
 		done
 	done
 
@@ -201,49 +213,25 @@ switch_create()
 	for intf in $swp3 $swp4; do
 		tc qdisc replace dev $intf root handle 1: tbf rate 200mbit \
 			burst 128K limit 1G
+		defer tc qdisc del dev $intf root handle 1:
 	done
 
 	ip link set dev br1_10 up
+	defer ip link set dev br1_10 down
+
 	ip link set dev br1_11 up
+	defer ip link set dev br1_11 down
+
 	ip link set dev br2_10 up
+	defer ip link set dev br2_10 down
+
 	ip link set dev br2_11 up
+	defer ip link set dev br2_11 down
 
 	local size=$(devlink_pool_size_thtype 0 | cut -d' ' -f 1)
 	devlink_port_pool_th_save $swp3 8
 	devlink_port_pool_th_set $swp3 8 $size
-}
-
-switch_destroy()
-{
-	local intf
-	local vlan
-
-	devlink_port_pool_th_restore $swp3 8
-
-	ip link set dev br2_11 down
-	ip link set dev br2_10 down
-	ip link set dev br1_11 down
-	ip link set dev br1_10 down
-
-	for intf in $swp4 $swp3; do
-		tc qdisc del dev $intf root handle 1:
-	done
-
-	for intf in $swp5 $swp3 $swp2 $swp4 $swp1; do
-		for vlan in 11 10; do
-			ip link set dev $intf.$vlan down
-			ip link set dev $intf.$vlan nomaster
-			vlan_destroy $intf $vlan
-		done
-
-		mtu_restore $intf
-		ip link set dev $intf down
-	done
-
-	ip link del dev br2_11
-	ip link del dev br2_10
-	ip link del dev br1_11
-	ip link del dev br1_10
+	defer devlink_port_pool_th_restore $swp3 8
 }
 
 setup_prepare()
@@ -263,6 +251,7 @@ setup_prepare()
 	h3_mac=$(mac_get $h3)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -270,18 +259,6 @@ setup_prepare()
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h3_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1.10 $(ipaddr 3 10) " from host 1, vlan 10"
@@ -450,6 +427,7 @@ __do_ecn_test()
 
 	start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
 			  $h3_mac tos=0x01
+	defer stop_traffic_sleep $!
 	sleep 1
 
 	ecn_test_common "$name" "$get_nmarked" $vlan $limit
@@ -461,9 +439,6 @@ __do_ecn_test()
 	build_backlog $vlan $((2 * limit)) udp >/dev/null
 	check_fail $? "UDP traffic went into backlog instead of being early-dropped"
 	log_test "TC $((vlan - 10)): $name backlog > limit: UDP early-dropped"
-
-	stop_traffic
-	sleep 1
 }
 
 do_ecn_test()
@@ -471,7 +446,8 @@ do_ecn_test()
 	local vlan=$1; shift
 	local limit=$1; shift
 
-	__do_ecn_test get_nmarked "$vlan" "$limit"
+	in_defer_scope \
+		__do_ecn_test get_nmarked "$vlan" "$limit"
 }
 
 do_ecn_test_perband()
@@ -480,10 +456,11 @@ do_ecn_test_perband()
 	local limit=$1; shift
 
 	mlxsw_only_on_spectrum 3+ || return
-	__do_ecn_test get_qdisc_nmarked "$vlan" "$limit" "per-band ECN"
+	in_defer_scope \
+		__do_ecn_test get_qdisc_nmarked "$vlan" "$limit" "per-band ECN"
 }
 
-do_ecn_nodrop_test()
+__do_ecn_nodrop_test()
 {
 	local vlan=$1; shift
 	local limit=$1; shift
@@ -491,6 +468,7 @@ do_ecn_nodrop_test()
 
 	start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
 			  $h3_mac tos=0x01
+	defer stop_traffic_sleep $!
 	sleep 1
 
 	ecn_test_common "$name" get_nmarked $vlan $limit
@@ -502,12 +480,15 @@ do_ecn_nodrop_test()
 	build_backlog $vlan $((2 * limit)) udp >/dev/null
 	check_err $? "UDP traffic was early-dropped instead of getting into backlog"
 	log_test "TC $((vlan - 10)): $name backlog > limit: UDP not dropped"
+}
 
-	stop_traffic
-	sleep 1
+do_ecn_nodrop_test()
+{
+	in_defer_scope \
+		__do_ecn_nodrop_test "$@"
 }
 
-do_red_test()
+__do_red_test()
 {
 	local vlan=$1; shift
 	local limit=$1; shift
@@ -518,6 +499,7 @@ do_red_test()
 	# is above limit.
 	start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
 			  $h3_mac tos=0x01
+	defer stop_traffic_sleep $!
 
 	# Pushing below the queue limit should work.
 	RET=0
@@ -539,12 +521,15 @@ do_red_test()
 	((-15 <= pct && pct <= 15))
 	check_err $? "backlog $backlog / $limit expected <= 15% distance"
 	log_test "TC $((vlan - 10)): RED backlog > limit"
+}
 
-	stop_traffic
-	sleep 1
+do_red_test()
+{
+	in_defer_scope \
+		__do_red_test "$@"
 }
 
-do_mc_backlog_test()
+__do_mc_backlog_test()
 {
 	local vlan=$1; shift
 	local limit=$1; shift
@@ -554,7 +539,10 @@ do_mc_backlog_test()
 	RET=0
 
 	start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) bc
+	defer stop_traffic_sleep $!
+
 	start_tcp_traffic $h2.$vlan $(ipaddr 2 $vlan) $(ipaddr 3 $vlan) bc
+	defer stop_traffic_sleep $!
 
 	qbl=$(busywait 5000 until_counter_is ">= 500000" \
 		       get_qdisc_backlog $vlan)
@@ -567,13 +555,16 @@ do_mc_backlog_test()
 		       get_mc_transmit_queue $vlan)
 	check_err $? "MC backlog reported by qdisc not visible in ethtool"
 
-	stop_traffic
-	stop_traffic
-
 	log_test "TC $((vlan - 10)): Qdisc reports MC backlog"
 }
 
-do_mark_test()
+do_mc_backlog_test()
+{
+	in_defer_scope \
+		__do_mc_backlog_test "$@"
+}
+
+__do_mark_test()
 {
 	local vlan=$1; shift
 	local limit=$1; shift
@@ -588,6 +579,7 @@ do_mark_test()
 
 	start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
 			  $h3_mac tos=0x01
+	defer stop_traffic_sleep $!
 
 	# Create a bit of a backlog and observe no mirroring due to marks.
 	qevent_rule_install_$subtest
@@ -617,12 +609,15 @@ do_mark_test()
 	else
 		log_test "TC $((vlan - 10)): marked packets $subtest'd"
 	fi
+}
 
-	stop_traffic
-	sleep 1
+do_mark_test()
+{
+	in_defer_scope \
+		__do_mark_test "$@"
 }
 
-do_drop_test()
+__do_drop_test()
 {
 	local vlan=$1; shift
 	local limit=$1; shift
@@ -637,6 +632,7 @@ do_drop_test()
 	RET=0
 
 	start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) $h3_mac
+	defer stop_traffic_sleep $!
 
 	# Create a bit of a backlog and observe no mirroring due to drops.
 	qevent_rule_install_$subtest
@@ -671,9 +667,12 @@ do_drop_test()
 	check_fail $? "$((now - base)) spurious packets observed after uninstall"
 
 	log_test "TC $((vlan - 10)): ${trigger}ped packets $subtest'd"
+}
 
-	stop_traffic
-	sleep 1
+do_drop_test()
+{
+	in_defer_scope \
+		__do_drop_test "$@"
 }
 
 qevent_rule_install_mirror()
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
index 576067b207a8..8902a115d9cd 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
@@ -80,36 +80,34 @@ uninstall_qdisc()
 ecn_test()
 {
 	install_qdisc ecn
+	defer uninstall_qdisc
 
 	do_ecn_test 10 $BACKLOG1
 	do_ecn_test 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 ecn_test_perband()
 {
 	install_qdisc ecn
+	defer uninstall_qdisc
 
 	do_ecn_test_perband 10 $BACKLOG1
 	do_ecn_test_perband 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 ecn_nodrop_test()
 {
 	install_qdisc ecn nodrop
+	defer uninstall_qdisc
 
 	do_ecn_nodrop_test 10 $BACKLOG1
 	do_ecn_nodrop_test 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 red_test()
 {
 	install_qdisc
+	defer uninstall_qdisc
 
 	# Make sure that we get the non-zero value if there is any.
 	local cur=$(busywait 1100 until_counter_is "> 0" \
@@ -120,50 +118,44 @@ red_test()
 
 	do_red_test 10 $BACKLOG1
 	do_red_test 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 mc_backlog_test()
 {
 	install_qdisc
+	defer uninstall_qdisc
 
 	# Note that the backlog numbers here do not correspond to RED
 	# configuration, but are arbitrary.
 	do_mc_backlog_test 10 $BACKLOG1
 	do_mc_backlog_test 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 red_mirror_test()
 {
 	install_qdisc qevent early_drop block 10
+	defer uninstall_qdisc
 
 	do_drop_mirror_test 10 $BACKLOG1 early_drop
 	do_drop_mirror_test 11 $BACKLOG2 early_drop
-
-	uninstall_qdisc
 }
 
 red_trap_test()
 {
 	install_qdisc qevent early_drop block 10
+	defer uninstall_qdisc
 
 	do_drop_trap_test 10 $BACKLOG1 early_drop
 	do_drop_trap_test 11 $BACKLOG2 early_drop
-
-	uninstall_qdisc
 }
 
 ecn_mirror_test()
 {
 	install_qdisc ecn qevent mark block 10
+	defer uninstall_qdisc
 
 	do_mark_mirror_test 10 $BACKLOG1
 	do_mark_mirror_test 11 $BACKLOG2
-
-	uninstall_qdisc
 }
 
 bail_on_lldpad "configure DCB" "configure Qdiscs"
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh
index 159108d02895..e9043771787b 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh
@@ -32,45 +32,51 @@ uninstall_qdisc()
 ecn_test()
 {
 	install_qdisc ecn
+	defer uninstall_qdisc
+
 	do_ecn_test 10 $BACKLOG
-	uninstall_qdisc
 }
 
 ecn_test_perband()
 {
 	install_qdisc ecn
+	defer uninstall_qdisc
+
 	do_ecn_test_perband 10 $BACKLOG
-	uninstall_qdisc
 }
 
 ecn_nodrop_test()
 {
 	install_qdisc ecn nodrop
+	defer uninstall_qdisc
+
 	do_ecn_nodrop_test 10 $BACKLOG
-	uninstall_qdisc
 }
 
 red_test()
 {
 	install_qdisc
+	defer uninstall_qdisc
+
 	do_red_test 10 $BACKLOG
-	uninstall_qdisc
 }
 
 mc_backlog_test()
 {
 	install_qdisc
+	defer uninstall_qdisc
+
 	# Note that the backlog value here does not correspond to RED
 	# configuration, but is arbitrary.
 	do_mc_backlog_test 10 $BACKLOG
-	uninstall_qdisc
 }
 
 red_mirror_test()
 {
 	install_qdisc qevent early_drop block 10
+	defer uninstall_qdisc
+
 	do_drop_mirror_test 10 $BACKLOG
-	uninstall_qdisc
 }
 
 bail_on_lldpad "configure DCB" "configure Qdiscs"
diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
index 17f28644568e..af166662b78a 100755
--- a/tools/testing/selftests/net/forwarding/sch_red.sh
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -53,71 +53,63 @@ PKTSZ=1400
 h1_create()
 {
 	simple_if_init $h1 192.0.2.1/28
+	defer simple_if_fini $h1 192.0.2.1/28
+
 	mtu_set $h1 10000
+	defer mtu_restore $h1
+
 	tc qdisc replace dev $h1 root handle 1: tbf \
 	   rate 10Mbit burst 10K limit 1M
-}
-
-h1_destroy()
-{
-	tc qdisc del dev $h1 root
-	mtu_restore $h1
-	simple_if_fini $h1 192.0.2.1/28
+	defer tc qdisc del dev $h1 root
 }
 
 h2_create()
 {
 	simple_if_init $h2 192.0.2.2/28
-	mtu_set $h2 10000
-}
+	defer simple_if_fini $h2 192.0.2.2/28
 
-h2_destroy()
-{
-	mtu_restore $h2
-	simple_if_fini $h2 192.0.2.2/28
+	mtu_set $h2 10000
+	defer mtu_restore $h2
 }
 
 h3_create()
 {
 	simple_if_init $h3 192.0.2.3/28
-	mtu_set $h3 10000
-}
+	defer simple_if_fini $h3 192.0.2.3/28
 
-h3_destroy()
-{
-	mtu_restore $h3
-	simple_if_fini $h3 192.0.2.3/28
+	mtu_set $h3 10000
+	defer mtu_restore $h3
 }
 
 switch_create()
 {
 	ip link add dev br up type bridge
+	defer ip link del dev br
+
 	ip link set dev $swp1 up master br
+	defer ip link set dev $swp1 down nomaster
+
 	ip link set dev $swp2 up master br
+	defer ip link set dev $swp2 down nomaster
+
 	ip link set dev $swp3 up master br
+	defer ip link set dev $swp3 down nomaster
 
 	mtu_set $swp1 10000
+	defer mtu_restore $swp1
+
 	mtu_set $swp2 10000
+	defer mtu_restore $swp2
+
 	mtu_set $swp3 10000
+	defer mtu_restore $swp3
 
 	tc qdisc replace dev $swp3 root handle 1: tbf \
 	   rate 10Mbit burst 10K limit 1M
-	ip link add name _drop_test up type dummy
-}
+	defer tc qdisc del dev $swp3 root
 
-switch_destroy()
-{
-	ip link del dev _drop_test
-	tc qdisc del dev $swp3 root
-
-	mtu_restore $h3
-	mtu_restore $h2
-	mtu_restore $h1
-
-	ip link set dev $swp3 down nomaster
-	ip link set dev $swp2 down nomaster
-	ip link set dev $swp1 down nomaster
-	ip link del dev br
+	ip link add name _drop_test up type dummy
+	defer ip link del dev _drop_test
 }
 
 setup_prepare()
@@ -134,6 +126,7 @@ setup_prepare()
 	h3_mac=$(mac_get $h3)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -141,18 +134,6 @@ setup_prepare()
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h3_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1 192.0.2.3 " from host 1"
@@ -287,6 +268,7 @@ do_ecn_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	ecn_test_common "$name" $limit
@@ -298,9 +280,6 @@ do_ecn_test()
 	build_backlog $((2 * limit)) udp >/dev/null
 	check_fail $? "UDP traffic went into backlog instead of being early-dropped"
 	log_test "$name backlog > limit: UDP early-dropped"
-
-	stop_traffic
-	sleep 1
 }
 
 do_ecn_nodrop_test()
@@ -310,6 +289,7 @@ do_ecn_nodrop_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	ecn_test_common "$name" $limit
@@ -321,9 +301,6 @@ do_ecn_nodrop_test()
 	build_backlog $((2 * limit)) udp >/dev/null
 	check_err $? "UDP traffic was early-dropped instead of getting into backlog"
 	log_test "$name backlog > limit: UDP not dropped"
-
-	stop_traffic
-	sleep 1
 }
 
 do_red_test()
@@ -336,6 +313,7 @@ do_red_test()
 	# is above limit.
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 
 	# Pushing below the queue limit should work.
 	RET=0
@@ -352,9 +330,6 @@ do_red_test()
 	pct=$(check_marking "== 0")
 	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
 	log_test "RED backlog > limit"
-
-	stop_traffic
-	sleep 1
 }
 
 do_red_qevent_test()
@@ -369,6 +344,7 @@ do_red_qevent_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t udp -q &
+	defer stop_traffic $!
 	sleep 1
 
 	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
@@ -396,9 +372,6 @@ do_red_qevent_test()
 	check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen"
 
 	log_test "RED early_dropped packets mirrored"
-
-	stop_traffic
-	sleep 1
 }
 
 do_ecn_qevent_test()
@@ -410,6 +383,7 @@ do_ecn_qevent_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
@@ -428,9 +402,6 @@ do_ecn_qevent_test()
 	tc filter del block 10 pref 1234 handle 102 matchall
 
 	log_test "ECN marked packets mirrored"
-
-	stop_traffic
-	sleep 1
 }
 
 install_qdisc()
@@ -451,36 +422,36 @@ uninstall_qdisc()
 ecn_test()
 {
 	install_qdisc ecn
+	defer uninstall_qdisc
 	xfail_on_slow do_ecn_test $BACKLOG
-	uninstall_qdisc
 }
 
 ecn_nodrop_test()
 {
 	install_qdisc ecn nodrop
+	defer uninstall_qdisc
 	xfail_on_slow do_ecn_nodrop_test $BACKLOG
-	uninstall_qdisc
 }
 
 red_test()
 {
 	install_qdisc
+	defer uninstall_qdisc
 	xfail_on_slow do_red_test $BACKLOG
-	uninstall_qdisc
 }
 
 red_qevent_test()
 {
 	install_qdisc qevent early_drop block 10
+	defer uninstall_qdisc
 	xfail_on_slow do_red_qevent_test $BACKLOG
-	uninstall_qdisc
 }
 
 ecn_qevent_test()
 {
 	install_qdisc ecn qevent mark block 10
+	defer uninstall_qdisc
 	xfail_on_slow do_ecn_qevent_test $BACKLOG
-	uninstall_qdisc
 }
 
 trap cleanup EXIT
-- 
cgit v1.2.3


From a1b3741dcfd16bf1e337c89b9fca5fbb9110fbed Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:47 +0200
Subject: selftests: TBF: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/net/forwarding/sch_tbf_core.sh       | 91 +++++++---------------
 .../selftests/net/forwarding/sch_tbf_etsprio.sh    |  7 +-
 .../selftests/net/forwarding/sch_tbf_root.sh       |  3 +-
 3 files changed, 36 insertions(+), 65 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
index 9cd884d4a5de..ec309a5086bc 100644
--- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
@@ -60,68 +60,65 @@ host_create()
 	local host=$1; shift
 
 	simple_if_init $dev
+	defer simple_if_fini $dev
+
 	mtu_set $dev 10000
+	defer mtu_restore $dev
 
 	vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+	defer vlan_destroy $dev 10
 	ip link set dev $dev.10 type vlan egress 0:0
 
 	vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+	defer vlan_destroy $dev 11
 	ip link set dev $dev.11 type vlan egress 0:1
 }
 
-host_destroy()
-{
-	local dev=$1; shift
-
-	vlan_destroy $dev 11
-	vlan_destroy $dev 10
-	mtu_restore $dev
-	simple_if_fini $dev
-}
-
 h1_create()
 {
 	host_create $h1 1
 }
 
-h1_destroy()
-{
-	host_destroy $h1
-}
-
 h2_create()
 {
 	host_create $h2 2
 
 	tc qdisc add dev $h2 clsact
+	defer tc qdisc del dev $h2 clsact
+
 	tc filter add dev $h2 ingress pref 1010 prot 802.1q \
 	   flower $TCFLAGS vlan_id 10 action pass
 	tc filter add dev $h2 ingress pref 1011 prot 802.1q \
 	   flower $TCFLAGS vlan_id 11 action pass
 }
 
-h2_destroy()
-{
-	tc qdisc del dev $h2 clsact
-	host_destroy $h2
-}
-
 switch_create()
 {
 	local intf
 	local vlan
 
 	ip link add dev br10 type bridge
+	defer ip link del dev br10
+
 	ip link add dev br11 type bridge
+	defer ip link del dev br11
 
 	for intf in $swp1 $swp2; do
 		ip link set dev $intf up
+		defer ip link set dev $intf down
+
 		mtu_set $intf 10000
+		defer mtu_restore $intf
 
 		for vlan in 10 11; do
 			vlan_create $intf $vlan
+			defer vlan_destroy $intf $vlan
+
 			ip link set dev $intf.$vlan master br$vlan
+			defer ip link set dev $intf.$vlan nomaster
+
 			ip link set dev $intf.$vlan up
+			defer ip link set dev $intf.$vlan down
 		done
 	done
 
@@ -130,34 +127,10 @@ switch_create()
 	done
 
 	ip link set dev br10 up
-	ip link set dev br11 up
-}
-
-switch_destroy()
-{
-	local intf
-	local vlan
-
-	# A test may have been interrupted mid-run, with Qdisc installed. Delete
-	# it here.
-	tc qdisc del dev $swp2 root 2>/dev/null
-
-	ip link set dev br11 down
-	ip link set dev br10 down
+	defer ip link set dev br10 down
 
-	for intf in $swp2 $swp1; do
-		for vlan in 11 10; do
-			ip link set dev $intf.$vlan down
-			ip link set dev $intf.$vlan nomaster
-			vlan_destroy $intf $vlan
-		done
-
-		mtu_restore $intf
-		ip link set dev $intf down
-	done
-
-	ip link del dev br11
-	ip link del dev br10
+	ip link set dev br11 up
+	defer ip link set dev br11 down
 }
 
 setup_prepare()
@@ -177,23 +150,13 @@ setup_prepare()
 	h2_mac=$(mac_get $h2)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1.10 $(ipaddr 2 10) " vlan 10"
@@ -207,18 +170,18 @@ tbf_get_counter()
 	tc_rule_stats_get $h2 10$vlan ingress .bytes
 }
 
-do_tbf_test()
+__tbf_test()
 {
 	local vlan=$1; shift
 	local mbit=$1; shift
 
 	start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac
+	defer stop_traffic $!
 	sleep 5 # Wait for the burst to dwindle
 
 	local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan)
 	sleep 10
 	local t3=$(tbf_get_counter $vlan)
-	stop_traffic
 
 	RET=0
 
@@ -231,3 +194,9 @@ do_tbf_test()
 
 	log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit"
 }
+
+do_tbf_test()
+{
+	in_defer_scope \
+		__tbf_test "$@"
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
index df9bcd6a811a..c182a04282bc 100644
--- a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
@@ -30,8 +30,9 @@ tbf_test()
 	# This test is used for both ETS and PRIO. Even though we only need two
 	# bands, PRIO demands a minimum of three.
 	tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0
+	defer tc qdisc del dev $swp2 root
+
 	tbf_test_one 128K
-	tc qdisc del dev $swp2 root
 }
 
 tbf_root_test()
@@ -42,6 +43,8 @@ tbf_root_test()
 
 	tc qdisc replace dev $swp2 root handle 1: \
 		tbf rate 400Mbit burst $bs limit 1M
+	defer tc qdisc del dev $swp2 root
+
 	tc qdisc replace dev $swp2 parent 1:1 handle 10: \
 		$QDISC 3 priomap 2 1 0
 	tc qdisc replace dev $swp2 parent 10:3 handle 103: \
@@ -53,8 +56,6 @@ tbf_root_test()
 
 	do_tbf_test 10 400 $bs
 	do_tbf_test 11 400 $bs
-
-	tc qdisc del dev $swp2 root
 }
 
 if type -t sch_tbf_pre_hook >/dev/null; then
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
index 96c997be0d03..9f20320f8d84 100755
--- a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
@@ -14,13 +14,14 @@ tbf_test_one()
 
 	tc qdisc replace dev $swp2 root handle 108: tbf \
 	   rate 400Mbit burst $bs limit 1M
+	defer tc qdisc del dev $swp2 root
+
 	do_tbf_test 10 400 $bs
 }
 
 tbf_test()
 {
 	tbf_test_one 128K
-	tc qdisc del dev $swp2 root
 }
 
 if type -t sch_tbf_pre_hook >/dev/null; then
-- 
cgit v1.2.3


From cc3e7ee15ddd0d37251127b7802d4483d7f6cad3 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:48 +0200
Subject: selftests: ETS: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../testing/selftests/drivers/net/mlxsw/sch_ets.sh | 26 ++++---
 tools/testing/selftests/net/forwarding/sch_ets.sh  |  7 +-
 .../selftests/net/forwarding/sch_ets_core.sh       | 81 ++++++++--------------
 .../selftests/net/forwarding/sch_ets_tests.sh      | 14 ++--
 4 files changed, 50 insertions(+), 78 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh
index 139175fd03e7..4aaceb6b2b60 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh
@@ -21,6 +21,7 @@ switch_create()
 	# Create a bottleneck so that the DWRR process can kick in.
 	tc qdisc replace dev $swp2 root handle 3: tbf rate 1gbit \
 		burst 128K limit 1G
+	defer tc qdisc del dev $swp2 root handle 3:
 
 	ets_switch_create
 
@@ -30,16 +31,27 @@ switch_create()
 	# for the DWRR process.
 	devlink_port_pool_th_save $swp1 0
 	devlink_port_pool_th_set $swp1 0 12
+	defer devlink_port_pool_th_restore $swp1 0
+
 	devlink_tc_bind_pool_th_save $swp1 0 ingress
 	devlink_tc_bind_pool_th_set $swp1 0 ingress 0 12
+	defer devlink_tc_bind_pool_th_restore $swp1 0 ingress
+
 	devlink_port_pool_th_save $swp2 4
 	devlink_port_pool_th_set $swp2 4 12
+	defer devlink_port_pool_th_restore $swp2 4
+
 	devlink_tc_bind_pool_th_save $swp2 7 egress
 	devlink_tc_bind_pool_th_set $swp2 7 egress 4 5
+	defer devlink_tc_bind_pool_th_restore $swp2 7 egress
+
 	devlink_tc_bind_pool_th_save $swp2 6 egress
 	devlink_tc_bind_pool_th_set $swp2 6 egress 4 5
+	defer devlink_tc_bind_pool_th_restore $swp2 6 egress
+
 	devlink_tc_bind_pool_th_save $swp2 5 egress
 	devlink_tc_bind_pool_th_set $swp2 5 egress 4 5
+	defer devlink_tc_bind_pool_th_restore $swp2 5 egress
 
 	# Note: sch_ets_core.sh uses VLAN ingress-qos-map to assign packet
 	# priorities at $swp1 based on their 802.1p headers. ingress-qos-map is
@@ -47,20 +59,6 @@ switch_create()
 	# 1:1, which is the mapping currently hard-coded by the driver.
 }
 
-switch_destroy()
-{
-	devlink_tc_bind_pool_th_restore $swp2 5 egress
-	devlink_tc_bind_pool_th_restore $swp2 6 egress
-	devlink_tc_bind_pool_th_restore $swp2 7 egress
-	devlink_port_pool_th_restore $swp2 4
-	devlink_tc_bind_pool_th_restore $swp1 0 ingress
-	devlink_port_pool_th_restore $swp1 0
-
-	ets_switch_destroy
-
-	tc qdisc del dev $swp2 root handle 3:
-}
-
 # Callback from sch_ets_tests.sh
 collect_stats()
 {
diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh
index e60c8b4818cc..1f6f53e284b5 100755
--- a/tools/testing/selftests/net/forwarding/sch_ets.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets.sh
@@ -24,15 +24,10 @@ switch_create()
 	# Create a bottleneck so that the DWRR process can kick in.
 	tc qdisc add dev $swp2 root handle 1: tbf \
 	   rate 1Gbit burst 1Mbit latency 100ms
+	defer tc qdisc del dev $swp2 root
 	PARENT="parent 1:"
 }
 
-switch_destroy()
-{
-	ets_switch_destroy
-	tc qdisc del dev $swp2 root
-}
-
 # Callback from sch_ets_tests.sh
 collect_stats()
 {
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_core.sh b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
index f906fcc66572..8f9922c695b0 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_core.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
@@ -166,44 +166,32 @@ h1_create()
 	local i;
 
 	simple_if_init $h1
+	defer simple_if_fini $h1
+
 	mtu_set $h1 9900
+	defer mtu_restore $h1
+
 	for i in {0..2}; do
 		vlan_create $h1 1$i v$h1 $(sip $i)/28
+		defer vlan_destroy $h1 1$i
 		ip link set dev $h1.1$i type vlan egress 0:$i
 	done
 }
 
-h1_destroy()
-{
-	local i
-
-	for i in {0..2}; do
-		vlan_destroy $h1 1$i
-	done
-	mtu_restore $h1
-	simple_if_fini $h1
-}
-
 h2_create()
 {
 	local i
 
 	simple_if_init $h2
-	mtu_set $h2 9900
-	for i in {0..2}; do
-		vlan_create $h2 1$i v$h2 $(dip $i)/28
-	done
-}
+	defer simple_if_fini $h2
 
-h2_destroy()
-{
-	local i
+	mtu_set $h2 9900
+	defer mtu_restore $h2
 
 	for i in {0..2}; do
-		vlan_destroy $h2 1$i
+		vlan_create $h2 1$i v$h2 $(dip $i)/28
+		defer vlan_destroy $h2 1$i
 	done
-	mtu_restore $h2
-	simple_if_fini $h2
 }
 
 ets_switch_create()
@@ -211,44 +199,45 @@ ets_switch_create()
 	local i
 
 	ip link set dev $swp1 up
+	defer ip link set dev $swp1 down
+
 	mtu_set $swp1 9900
+	defer mtu_restore $swp1
 
 	ip link set dev $swp2 up
+	defer ip link set dev $swp2 down
+
 	mtu_set $swp2 9900
+	defer mtu_restore $swp2
 
 	for i in {0..2}; do
 		vlan_create $swp1 1$i
+		defer vlan_destroy $swp1 1$i
 		ip link set dev $swp1.1$i type vlan ingress 0:0 1:1 2:2
 
 		vlan_create $swp2 1$i
+		defer vlan_destroy $swp2 1$i
 
 		ip link add dev br1$i type bridge
+		defer ip link del dev br1$i
+
 		ip link set dev $swp1.1$i master br1$i
+		defer ip link set dev $swp1.1$i nomaster
+
 		ip link set dev $swp2.1$i master br1$i
+		defer ip link set dev $swp2.1$i nomaster
 
 		ip link set dev br1$i up
-		ip link set dev $swp1.1$i up
-		ip link set dev $swp2.1$i up
-	done
-}
+		defer ip link set dev br1$i down
 
-ets_switch_destroy()
-{
-	local i
-
-	ets_delete_qdisc
+		ip link set dev $swp1.1$i up
+		defer ip link set dev $swp1.1$i down
 
-	for i in {0..2}; do
-		ip link del dev br1$i
-		vlan_destroy $swp2 1$i
-		vlan_destroy $swp1 1$i
+		ip link set dev $swp2.1$i up
+		defer ip link set dev $swp2.1$i down
 	done
 
-	mtu_restore $swp2
-	ip link set dev $swp2 down
-
-	mtu_restore $swp1
-	ip link set dev $swp1 down
+	defer ets_delete_qdisc
 }
 
 setup_prepare()
@@ -263,23 +252,13 @@ setup_prepare()
 	hut=$h2
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1.10 $(dip 0) " vlan 10"
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
index f9d26a7911bb..08240d3e3c87 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -90,6 +90,7 @@ __ets_dwrr_test()
 
 	for stream in ${streams[@]}; do
 		ets_start_traffic $stream
+		defer stop_traffic $!
 	done
 
 	sleep 10
@@ -120,25 +121,24 @@ __ets_dwrr_test()
 				       ${d[0]} ${d[$i]}
 		fi
 	done
-
-	for stream in ${streams[@]}; do
-		stop_traffic
-	done
 }
 
 ets_dwrr_test_012()
 {
-	__ets_dwrr_test 0 1 2
+	in_defer_scope \
+		__ets_dwrr_test 0 1 2
 }
 
 ets_dwrr_test_01()
 {
-	__ets_dwrr_test 0 1
+	in_defer_scope \
+		__ets_dwrr_test 0 1
 }
 
 ets_dwrr_test_12()
 {
-	__ets_dwrr_test 1 2
+	in_defer_scope \
+		__ets_dwrr_test 1 2
 }
 
 ets_qdisc_setup()
-- 
cgit v1.2.3


From 979154e90ff83ab63bb1a52f48e442c953452adb Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:49 +0200
Subject: selftests: mlxsw: qos_mc_aware: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/mlxsw/qos_mc_aware.sh    | 138 ++++++++++-----------
 1 file changed, 64 insertions(+), 74 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh
index 6d892de43fa8..cd4a5c21360c 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh
@@ -73,122 +73,114 @@ source qos_lib.sh
 h1_create()
 {
 	simple_if_init $h1 192.0.2.65/28
-	mtu_set $h1 10000
-}
+	defer simple_if_fini $h1 192.0.2.65/28
 
-h1_destroy()
-{
-	mtu_restore $h1
-	simple_if_fini $h1 192.0.2.65/28
+	mtu_set $h1 10000
+	defer mtu_restore $h1
 }
 
 h2_create()
 {
 	simple_if_init $h2
+	defer simple_if_fini $h2
+
 	mtu_set $h2 10000
+	defer mtu_restore $h2
 
 	vlan_create $h2 111 v$h2 192.0.2.129/28
+	defer vlan_destroy $h2 111
 	ip link set dev $h2.111 type vlan egress-qos-map 0:1
 }
 
-h2_destroy()
-{
-	vlan_destroy $h2 111
-
-	mtu_restore $h2
-	simple_if_fini $h2
-}
-
 h3_create()
 {
 	simple_if_init $h3 192.0.2.66/28
+	defer simple_if_fini $h3 192.0.2.66/28
+
 	mtu_set $h3 10000
+	defer mtu_restore $h3
 
 	vlan_create $h3 111 v$h3 192.0.2.130/28
-}
-
-h3_destroy()
-{
-	vlan_destroy $h3 111
-
-	mtu_restore $h3
-	simple_if_fini $h3 192.0.2.66/28
+	defer vlan_destroy $h3 111
 }
 
 switch_create()
 {
 	ip link set dev $swp1 up
+	defer ip link set dev $swp1 down
+
 	mtu_set $swp1 10000
+	defer mtu_restore $swp1
 
 	ip link set dev $swp2 up
+	defer ip link set dev $swp2 down
+
 	mtu_set $swp2 10000
+	defer mtu_restore $swp2
 
 	ip link set dev $swp3 up
+	defer ip link set dev $swp3 down
+
 	mtu_set $swp3 10000
+	defer mtu_restore $swp3
 
 	vlan_create $swp2 111
+	defer vlan_destroy $swp2 111
+
 	vlan_create $swp3 111
+	defer vlan_destroy $swp3 111
 
 	tc qdisc replace dev $swp3 root handle 3: tbf rate 1gbit \
 		burst 128K limit 1G
+	defer tc qdisc del dev $swp3 root handle 3:
+
 	tc qdisc replace dev $swp3 parent 3:3 handle 33: \
 		prio bands 8 priomap 7 7 7 7 7 7 7 7
+	defer tc qdisc del dev $swp3 parent 3:3 handle 33:
 
 	ip link add name br1 type bridge vlan_filtering 0
+	defer ip link del dev br1
 	ip link set dev br1 addrgenmode none
 	ip link set dev br1 up
+
 	ip link set dev $swp1 master br1
+	defer ip link set dev $swp1 nomaster
+
 	ip link set dev $swp3 master br1
+	defer ip link set dev $swp3 nomaster
 
 	ip link add name br111 type bridge vlan_filtering 0
+	defer ip link del dev br111
 	ip link set dev br111 addrgenmode none
 	ip link set dev br111 up
+
 	ip link set dev $swp2.111 master br111
+	defer ip link set dev $swp2.111 nomaster
+
 	ip link set dev $swp3.111 master br111
+	defer ip link set dev $swp3.111 nomaster
 
 	# Make sure that ingress quotas are smaller than egress so that there is
 	# room for both streams of traffic to be admitted to shared buffer.
 	devlink_port_pool_th_save $swp1 0
 	devlink_port_pool_th_set $swp1 0 5
+	defer devlink_port_pool_th_restore $swp1 0
+
 	devlink_tc_bind_pool_th_save $swp1 0 ingress
 	devlink_tc_bind_pool_th_set $swp1 0 ingress 0 5
+	defer devlink_tc_bind_pool_th_restore $swp1 0 ingress
 
 	devlink_port_pool_th_save $swp2 0
 	devlink_port_pool_th_set $swp2 0 5
+	defer devlink_port_pool_th_restore $swp2 0
+
 	devlink_tc_bind_pool_th_save $swp2 1 ingress
 	devlink_tc_bind_pool_th_set $swp2 1 ingress 0 5
+	defer devlink_tc_bind_pool_th_restore $swp2 1 ingress
 
 	devlink_port_pool_th_save $swp3 4
 	devlink_port_pool_th_set $swp3 4 12
-}
-
-switch_destroy()
-{
-	devlink_port_pool_th_restore $swp3 4
-
-	devlink_tc_bind_pool_th_restore $swp2 1 ingress
-	devlink_port_pool_th_restore $swp2 0
-
-	devlink_tc_bind_pool_th_restore $swp1 0 ingress
-	devlink_port_pool_th_restore $swp1 0
-
-	ip link del dev br111
-	ip link del dev br1
-
-	tc qdisc del dev $swp3 parent 3:3 handle 33:
-	tc qdisc del dev $swp3 root handle 3:
-
-	vlan_destroy $swp3 111
-	vlan_destroy $swp2 111
-
-	mtu_restore $swp3
-	ip link set dev $swp3 down
-
-	mtu_restore $swp2
-	ip link set dev $swp2 down
-
-	mtu_restore $swp1
-	ip link set dev $swp1 down
+	defer devlink_port_pool_th_restore $swp3 4
 }
 
 setup_prepare()
@@ -205,6 +197,7 @@ setup_prepare()
 	h3mac=$(mac_get $h3)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -212,45 +205,45 @@ setup_prepare()
 	switch_create
 }
 
-cleanup()
+ping_ipv4()
 {
-	pre_cleanup
+	ping_test $h2 192.0.2.130
+}
 
-	switch_destroy
-	h3_destroy
-	h2_destroy
-	h1_destroy
+__run_uc_measure_rate()
+{
+	local what=$1; shift
+	local -a uc_rate
+
+	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
+	defer stop_traffic $!
+
+	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "$what"))
+	check_err $? "Could not get high enough $what ingress rate"
 
-	vrf_cleanup
+	echo ${uc_rate[@]}
 }
 
-ping_ipv4()
+run_uc_measure_rate()
 {
-	ping_test $h2 192.0.2.130
+	in_defer_scope __run_uc_measure_rate "$@"
 }
 
 test_mc_aware()
 {
 	RET=0
 
-	local -a uc_rate
-	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
-	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC-only"))
-	check_err $? "Could not get high enough UC-only ingress rate"
-	stop_traffic
+	local -a uc_rate=($(run_uc_measure_rate "UC-only"))
 	local ucth1=${uc_rate[1]}
 
 	start_traffic $h1 192.0.2.65 bc bc
+	defer stop_traffic $!
 
 	local d0=$(date +%s)
 	local t0=$(ethtool_stats_get $h3 rx_octets_prio_0)
 	local u0=$(ethtool_stats_get $swp1 rx_octets_prio_0)
 
-	local -a uc_rate_2
-	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
-	uc_rate_2=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC+MC"))
-	check_err $? "Could not get high enough UC+MC ingress rate"
-	stop_traffic
+	local -a uc_rate_2=($(run_uc_measure_rate "UC+MC"))
 	local ucth2=${uc_rate_2[1]}
 
 	local d1=$(date +%s)
@@ -272,8 +265,6 @@ test_mc_aware()
 	local mc_ir=$(rate $u0 $u1 $interval)
 	local mc_er=$(rate $t0 $t1 $interval)
 
-	stop_traffic
-
 	log_test "UC performance under MC overload"
 
 	echo "UC-only throughput  $(humanize $ucth1)"
@@ -297,6 +288,7 @@ test_uc_aware()
 	RET=0
 
 	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
+	defer stop_traffic $!
 
 	local d0=$(date +%s)
 	local t0=$(ethtool_stats_get $h3 rx_octets_prio_1)
@@ -326,8 +318,6 @@ test_uc_aware()
 	((attempts == passes))
 	check_err $?
 
-	stop_traffic
-
 	log_test "MC performance under UC overload"
 	echo "    ingress UC throughput $(humanize ${uc_ir})"
 	echo "    egress UC throughput  $(humanize ${uc_er})"
-- 
cgit v1.2.3


From 424745af5271a5990caa25358b9710db0349221c Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:50 +0200
Subject: selftests: mlxsw: qos_ets_strict: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/mlxsw/qos_ets_strict.sh  | 167 +++++++++++----------
 1 file changed, 85 insertions(+), 82 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh
index fee74f215cec..d5b6f2cc9a29 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh
@@ -58,65 +58,62 @@ source qos_lib.sh
 h1_create()
 {
 	simple_if_init $h1
+	defer simple_if_fini $h1
+
 	mtu_set $h1 10000
+	defer mtu_restore $h1
 
 	vlan_create $h1 111 v$h1 192.0.2.33/28
+	defer vlan_destroy $h1 111
 	ip link set dev $h1.111 type vlan egress-qos-map 0:1
 }
 
-h1_destroy()
-{
-	vlan_destroy $h1 111
-
-	mtu_restore $h1
-	simple_if_fini $h1
-}
-
 h2_create()
 {
 	simple_if_init $h2
+	defer simple_if_fini $h2
+
 	mtu_set $h2 10000
+	defer mtu_restore $h2
 
 	vlan_create $h2 222 v$h2 192.0.2.65/28
+	defer vlan_destroy $h2 222
 	ip link set dev $h2.222 type vlan egress-qos-map 0:2
 }
 
-h2_destroy()
-{
-	vlan_destroy $h2 222
-
-	mtu_restore $h2
-	simple_if_fini $h2
-}
-
 h3_create()
 {
 	simple_if_init $h3
+	defer simple_if_fini $h3
+
 	mtu_set $h3 10000
+	defer mtu_restore $h3
 
 	vlan_create $h3 111 v$h3 192.0.2.34/28
-	vlan_create $h3 222 v$h3 192.0.2.66/28
-}
-
-h3_destroy()
-{
-	vlan_destroy $h3 222
-	vlan_destroy $h3 111
+	defer vlan_destroy $h3 111
 
-	mtu_restore $h3
-	simple_if_fini $h3
+	vlan_create $h3 222 v$h3 192.0.2.66/28
+	defer vlan_destroy $h3 222
 }
 
 switch_create()
 {
 	ip link set dev $swp1 up
+	defer ip link set dev $swp1 down
+
 	mtu_set $swp1 10000
+	defer mtu_restore $swp1
 
 	ip link set dev $swp2 up
+	defer ip link set dev $swp2 down
+
 	mtu_set $swp2 10000
+	defer mtu_restore $swp2
 
 	# prio n -> TC n, strict scheduling
 	lldptool -T -i $swp3 -V ETS-CFG up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7
+	defer lldptool -T -i $swp3 -V ETS-CFG up2tc=0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0
+
 	lldptool -T -i $swp3 -V ETS-CFG tsa=$(
 			)"0:strict,"$(
 			)"1:strict,"$(
@@ -129,85 +126,90 @@ switch_create()
 	sleep 1
 
 	ip link set dev $swp3 up
+	defer ip link set dev $swp3 down
+
 	mtu_set $swp3 10000
+	defer mtu_restore $swp3
+
 	tc qdisc replace dev $swp3 root handle 101: tbf rate 1gbit \
 		burst 128K limit 1G
+	defer tc qdisc del dev $swp3 root handle 101:
 
 	vlan_create $swp1 111
+	defer vlan_destroy $swp1 111
+
 	vlan_create $swp2 222
+	defer vlan_destroy $swp2 222
+
 	vlan_create $swp3 111
+	defer vlan_destroy $swp3 111
+
 	vlan_create $swp3 222
+	defer vlan_destroy $swp3 222
 
 	ip link add name br111 type bridge vlan_filtering 0
+	defer ip link del dev br111
 	ip link set dev br111 addrgenmode none
+
 	ip link set dev br111 up
+	defer ip link set dev br111 down
+
 	ip link set dev $swp1.111 master br111
+	defer ip link set dev $swp1.111 nomaster
+
 	ip link set dev $swp3.111 master br111
+	defer ip link set dev $swp3.111 nomaster
 
 	ip link add name br222 type bridge vlan_filtering 0
+	defer ip link del dev br222
 	ip link set dev br222 addrgenmode none
+
 	ip link set dev br222 up
+	defer ip link set dev br222 down
+
 	ip link set dev $swp2.222 master br222
+	defer ip link set dev $swp2.222 nomaster
+
 	ip link set dev $swp3.222 master br222
+	defer ip link set dev $swp3.222 nomaster
 
 	# Make sure that ingress quotas are smaller than egress so that there is
 	# room for both streams of traffic to be admitted to shared buffer.
 	devlink_pool_size_thtype_save 0
 	devlink_pool_size_thtype_set 0 dynamic 10000000
+	defer devlink_pool_size_thtype_restore 0
+
 	devlink_pool_size_thtype_save 4
 	devlink_pool_size_thtype_set 4 dynamic 10000000
+	defer devlink_pool_size_thtype_restore 4
 
 	devlink_port_pool_th_save $swp1 0
 	devlink_port_pool_th_set $swp1 0 6
+	defer devlink_port_pool_th_restore $swp1 0
+
 	devlink_tc_bind_pool_th_save $swp1 1 ingress
 	devlink_tc_bind_pool_th_set $swp1 1 ingress 0 6
+	defer devlink_tc_bind_pool_th_restore $swp1 1 ingress
 
 	devlink_port_pool_th_save $swp2 0
 	devlink_port_pool_th_set $swp2 0 6
+	defer devlink_port_pool_th_restore $swp2 0
+
 	devlink_tc_bind_pool_th_save $swp2 2 ingress
 	devlink_tc_bind_pool_th_set $swp2 2 ingress 0 6
+	defer devlink_tc_bind_pool_th_restore $swp2 2 ingress
 
 	devlink_tc_bind_pool_th_save $swp3 1 egress
 	devlink_tc_bind_pool_th_set $swp3 1 egress 4 7
+	defer devlink_tc_bind_pool_th_restore $swp3 1 egress
+
 	devlink_tc_bind_pool_th_save $swp3 2 egress
 	devlink_tc_bind_pool_th_set $swp3 2 egress 4 7
+	defer devlink_tc_bind_pool_th_restore $swp3 2 egress
+
 	devlink_port_pool_th_save $swp3 4
 	devlink_port_pool_th_set $swp3 4 7
-}
-
-switch_destroy()
-{
-	devlink_port_pool_th_restore $swp3 4
-	devlink_tc_bind_pool_th_restore $swp3 2 egress
-	devlink_tc_bind_pool_th_restore $swp3 1 egress
-
-	devlink_tc_bind_pool_th_restore $swp2 2 ingress
-	devlink_port_pool_th_restore $swp2 0
-
-	devlink_tc_bind_pool_th_restore $swp1 1 ingress
-	devlink_port_pool_th_restore $swp1 0
-
-	devlink_pool_size_thtype_restore 4
-	devlink_pool_size_thtype_restore 0
-
-	ip link del dev br222
-	ip link del dev br111
-
-	vlan_destroy $swp3 222
-	vlan_destroy $swp3 111
-	vlan_destroy $swp2 222
-	vlan_destroy $swp1 111
-
-	tc qdisc del dev $swp3 root handle 101:
-	mtu_restore $swp3
-	ip link set dev $swp3 down
-	lldptool -T -i $swp3 -V ETS-CFG up2tc=0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0
-
-	mtu_restore $swp2
-	ip link set dev $swp2 down
-
-	mtu_restore $swp1
-	ip link set dev $swp1 down
+	defer devlink_port_pool_th_restore $swp3 4
 }
 
 setup_prepare()
@@ -224,6 +226,7 @@ setup_prepare()
 	h3mac=$(mac_get $h3)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -231,18 +234,6 @@ setup_prepare()
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h3_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1 192.0.2.34 " from H1"
@@ -261,21 +252,38 @@ rel()
 	"
 }
 
+__run_hi_measure_rate()
+{
+	local what=$1; shift
+	local -a uc_rate
+
+	start_traffic $h2.222 192.0.2.65 192.0.2.66 $h3mac
+	defer stop_traffic $!
+
+	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_2 "$what"))
+	check_err $? "Could not get high enough $what ingress rate"
+
+	echo ${uc_rate[@]}
+}
+
+run_hi_measure_rate()
+{
+	in_defer_scope __run_hi_measure_rate "$@"
+}
+
 test_ets_strict()
 {
 	RET=0
 
 	# Run high-prio traffic on its own.
-	start_traffic $h2.222 192.0.2.65 192.0.2.66 $h3mac
 	local -a rate_2
-	rate_2=($(measure_rate $swp2 $h3 rx_octets_prio_2 "prio 2"))
-	check_err $? "Could not get high enough prio-2 ingress rate"
+	rate_2=($(run_hi_measure_rate "prio 2"))
 	local rate_2_in=${rate_2[0]}
 	local rate_2_eg=${rate_2[1]}
-	stop_traffic # $h2.222
 
 	# Start low-prio stream.
 	start_traffic $h1.111 192.0.2.33 192.0.2.34 $h3mac
+	defer stop_traffic $!
 
 	local -a rate_1
 	rate_1=($(measure_rate $swp1 $h3 rx_octets_prio_1 "prio 1"))
@@ -290,14 +298,9 @@ test_ets_strict()
 	check_err $(bc <<< "$rel21 > 105")
 
 	# Start the high-prio stream--now both streams run.
-	start_traffic $h2.222 192.0.2.65 192.0.2.66 $h3mac
-	rate_3=($(measure_rate $swp2 $h3 rx_octets_prio_2 "prio 2 w/ 1"))
-	check_err $? "Could not get high enough prio-2 ingress rate with prio-1"
+	rate_3=($(run_hi_measure_rate "prio 2+1"))
 	local rate_3_in=${rate_3[0]}
 	local rate_3_eg=${rate_3[1]}
-	stop_traffic # $h2.222
-
-	stop_traffic # $h1.111
 
 	# High-prio should have about the same throughput whether or not
 	# low-prio is in the system.
-- 
cgit v1.2.3


From 919419a8870b33405ef9b0e34e837d407e3888f5 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:51 +0200
Subject: selftests: mlxsw: qos_max_descriptors: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../drivers/net/mlxsw/qos_max_descriptors.sh       | 118 +++++++--------------
 1 file changed, 41 insertions(+), 77 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
index 5ac4f795e333..2b5d2c2751d5 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
@@ -69,127 +69,103 @@ mlxsw_only_on_spectrum 2+ || exit
 h1_create()
 {
 	simple_if_init $h1
+	defer simple_if_fini $h1
 
 	vlan_create $h1 111 v$h1 192.0.2.33/28
+	defer vlan_destroy $h1 111
 	ip link set dev $h1.111 type vlan egress-qos-map 0:1
 }
 
-h1_destroy()
-{
-	vlan_destroy $h1 111
-
-	simple_if_fini $h1
-}
-
 h2_create()
 {
 	simple_if_init $h2
+	defer simple_if_fini $h2
 
 	vlan_create $h2 111 v$h2 192.0.2.34/28
-}
-
-h2_destroy()
-{
-	vlan_destroy $h2 111
-
-	simple_if_fini $h2
+	defer vlan_destroy $h2 111
 }
 
 switch_create()
 {
 	# pools
 	# -----
+	# devlink_pool_size_thtype_restore needs to be done first so that we can
+	# reset the various limits to values that are only valid for the
+	# original static / dynamic setting.
 
 	devlink_pool_size_thtype_save 1
-	devlink_pool_size_thtype_save 6
-
-	devlink_port_pool_th_save $swp1 1
-	devlink_port_pool_th_save $swp2 6
-
-	devlink_tc_bind_pool_th_save $swp1 1 ingress
-	devlink_tc_bind_pool_th_save $swp2 1 egress
-
 	devlink_pool_size_thtype_set 1 dynamic $MAX_POOL_SIZE
+	defer_prio devlink_pool_size_thtype_restore 1
+
+	devlink_pool_size_thtype_save 6
 	devlink_pool_size_thtype_set 6 static $MAX_POOL_SIZE
+	defer_prio devlink_pool_size_thtype_restore 6
 
 	# $swp1
 	# -----
 
 	ip link set dev $swp1 up
+	defer ip link set dev $swp1 down
+
 	vlan_create $swp1 111
+	defer vlan_destroy $swp1 111
 	ip link set dev $swp1.111 type vlan ingress-qos-map 0:0 1:1
 
+	devlink_port_pool_th_save $swp1 1
 	devlink_port_pool_th_set $swp1 1 16
+	defer devlink_tc_bind_pool_th_restore $swp1 1 ingress
+
+	devlink_tc_bind_pool_th_save $swp1 1 ingress
 	devlink_tc_bind_pool_th_set $swp1 1 ingress 1 16
+	defer devlink_port_pool_th_restore $swp1 1
 
 	tc qdisc replace dev $swp1 root handle 1: \
 	   ets bands 8 strict 8 priomap 7 6
+	defer tc qdisc del dev $swp1 root
+
 	dcb buffer set dev $swp1 prio-buffer all:0 1:1
+	defer dcb buffer set dev $swp1 prio-buffer all:0
 
 	# $swp2
 	# -----
 
 	ip link set dev $swp2 up
+	defer ip link set dev $swp2 down
+
 	vlan_create $swp2 111
+	defer vlan_destroy $swp2 111
 	ip link set dev $swp2.111 type vlan egress-qos-map 0:0 1:1
 
+	devlink_port_pool_th_save $swp2 6
 	devlink_port_pool_th_set $swp2 6 $MAX_POOL_SIZE
+	defer devlink_tc_bind_pool_th_restore $swp2 1 egress
+
+	devlink_tc_bind_pool_th_save $swp2 1 egress
 	devlink_tc_bind_pool_th_set $swp2 1 egress 6 $MAX_POOL_SIZE
+	defer devlink_port_pool_th_restore $swp2 6
 
 	tc qdisc replace dev $swp2 root handle 1: tbf rate $SHAPER_RATE \
 		burst 128K limit 500M
+	defer tc qdisc del dev $swp2 root
+
 	tc qdisc replace dev $swp2 parent 1:1 handle 11: \
 		ets bands 8 strict 8 priomap 7 6
+	defer tc qdisc del dev $swp2 parent 1:1 handle 11:
 
 	# bridge
 	# ------
 
 	ip link add name br1 type bridge vlan_filtering 0
+	defer ip link del dev br1
+
 	ip link set dev $swp1.111 master br1
+	defer ip link set dev $swp1.111 nomaster
+
 	ip link set dev br1 up
+	defer ip link set dev br1 down
 
 	ip link set dev $swp2.111 master br1
-}
-
-switch_destroy()
-{
-	# Do this first so that we can reset the limits to values that are only
-	# valid for the original static / dynamic setting.
-	devlink_pool_size_thtype_restore 6
-	devlink_pool_size_thtype_restore 1
-
-	# bridge
-	# ------
-
-	ip link set dev $swp2.111 nomaster
-
-	ip link set dev br1 down
-	ip link set dev $swp1.111 nomaster
-	ip link del dev br1
-
-	# $swp2
-	# -----
-
-	tc qdisc del dev $swp2 parent 1:1 handle 11:
-	tc qdisc del dev $swp2 root
-
-	devlink_tc_bind_pool_th_restore $swp2 1 egress
-	devlink_port_pool_th_restore $swp2 6
-
-	vlan_destroy $swp2 111
-	ip link set dev $swp2 down
-
-	# $swp1
-	# -----
-
-	dcb buffer set dev $swp1 prio-buffer all:0
-	tc qdisc del dev $swp1 root
-
-	devlink_tc_bind_pool_th_restore $swp1 1 ingress
-	devlink_port_pool_th_restore $swp1 1
-
-	vlan_destroy $swp1 111
-	ip link set dev $swp1 down
+	defer ip link set dev $swp2.111 nomaster
 }
 
 setup_prepare()
@@ -203,23 +179,13 @@ setup_prepare()
 	h2mac=$(mac_get $h2)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1 192.0.2.34 " h1->h2"
@@ -251,6 +217,7 @@ max_descriptors()
 
 	log_info "Send many small packets, packet size = $pktsize bytes"
 	start_traffic_pktsize $pktsize $h1.111 192.0.2.33 192.0.2.34 $h2mac
+	defer stop_traffic $!
 
 	# Sleep to wait for congestion.
 	sleep 5
@@ -268,9 +235,6 @@ max_descriptors()
 	check_err $(bc <<< "$perc_used < $exp_perc_used") \
 		"Expected > $exp_perc_used% of descriptors, handle $perc_used%"
 
-	stop_traffic
-	sleep 1
-
 	log_test "Maximum descriptors usage. The percentage used is $perc_used%"
 }
 
-- 
cgit v1.2.3


From cebd281f3c753dcbd48b455bdbc6549889196aa0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 17 Oct 2024 11:45:52 +0200
Subject: selftests: mlxsw: devlink_trap_policer: Use defer for test cleanup

Use the defer framework to schedule cleanups as soon as the command is
executed.

Note that the start_traffic commands in __burst_test() are each sending a
fixed number of packets (note the -c flag) and then ending. They therefore
do not need a matching stop_traffic.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../drivers/net/mlxsw/devlink_trap_policer.sh      | 85 +++++++++-------------
 1 file changed, 36 insertions(+), 49 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh
index 0bd5ffc218ac..29a672c2270f 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh
@@ -45,63 +45,52 @@ source $lib_dir/devlink_lib.sh
 h1_create()
 {
 	simple_if_init $h1 192.0.2.1/24
+	defer simple_if_fini $h1 192.0.2.1/24
+
 	mtu_set $h1 10000
+	defer mtu_restore $h1
 
 	ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
-}
-
-h1_destroy()
-{
-	ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
-
-	mtu_restore $h1
-	simple_if_fini $h1 192.0.2.1/24
+	defer ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
 }
 
 h2_create()
 {
 	simple_if_init $h2 198.51.100.1/24
+	defer simple_if_fini $h2 198.51.100.1/24
+
 	mtu_set $h2 10000
+	defer mtu_restore $h2
 
 	ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
-}
-
-h2_destroy()
-{
-	ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
-
-	mtu_restore $h2
-	simple_if_fini $h2 198.51.100.1/24
+	defer ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
 }
 
 router_create()
 {
 	ip link set dev $rp1 up
+	defer ip link set dev $rp1 down
+
 	ip link set dev $rp2 up
+	defer ip link set dev $rp2 down
 
 	__addr_add_del $rp1 add 192.0.2.2/24
+	defer __addr_add_del $rp1 del 192.0.2.2/24
+
 	__addr_add_del $rp2 add 198.51.100.2/24
+	defer __addr_add_del $rp2 del 198.51.100.2/24
+
 	mtu_set $rp1 10000
+	defer mtu_restore $rp1
+
 	mtu_set $rp2 10000
+	defer mtu_restore $rp2
 
 	ip -4 route add blackhole 198.51.100.100
+	defer ip -4 route del blackhole 198.51.100.100
 
 	devlink trap set $DEVLINK_DEV trap blackhole_route action trap
-}
-
-router_destroy()
-{
-	devlink trap set $DEVLINK_DEV trap blackhole_route action drop
-
-	ip -4 route del blackhole 198.51.100.100
-
-	mtu_restore $rp2
-	mtu_restore $rp1
-	__addr_add_del $rp2 del 198.51.100.2/24
-	__addr_add_del $rp1 del 192.0.2.2/24
-
-	ip link set dev $rp2 down
-	ip link set dev $rp1 down
+	defer devlink trap set $DEVLINK_DEV trap blackhole_route action drop
 }
 
 setup_prepare()
@@ -114,7 +103,11 @@ setup_prepare()
 
 	rp1_mac=$(mac_get $rp1)
 
+	# Reload to ensure devlink-trap settings are back to default.
+	defer devlink_reload
+
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -122,21 +115,6 @@ setup_prepare()
 	router_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	router_destroy
-
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-
-	# Reload to ensure devlink-trap settings are back to default.
-	devlink_reload
-}
-
 rate_limits_test()
 {
 	RET=0
@@ -214,7 +192,10 @@ __rate_test()
 	# by the policer. Make sure measured received rate is about 1000 pps
 	log_info "=== Tx rate: Highest, Policer rate: 1000 pps ==="
 
+	defer_scope_push
+
 	start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac
+	defer stop_traffic $!
 
 	sleep 5 # Take measurements when rate is stable
 
@@ -229,13 +210,16 @@ __rate_test()
 	check_err $? "Expected non-zero policer drop rate, got 0"
 	log_info "Measured policer drop rate of $drop_rate pps"
 
-	stop_traffic
+	defer_scope_pop
 
 	# Send packets at a rate of 1000 pps and make sure they are not dropped
 	# by the policer
 	log_info "=== Tx rate: 1000 pps, Policer rate: 1000 pps ==="
 
+	defer_scope_push
+
 	start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -d 1msec
+	defer stop_traffic $!
 
 	sleep 5 # Take measurements when rate is stable
 
@@ -244,7 +228,7 @@ __rate_test()
 	check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps"
 	log_info "Measured policer drop rate of $drop_rate pps"
 
-	stop_traffic
+	defer_scope_pop
 
 	# Unbind the policer and send packets at highest possible rate. Make
 	# sure they are not dropped by the policer and that the measured
@@ -253,7 +237,10 @@ __rate_test()
 
 	devlink trap group set $DEVLINK_DEV group l3_drops nopolicer
 
+	defer_scope_push
+
 	start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac
+	defer stop_traffic $!
 
 	rate=$(trap_rate_get)
 	(( rate > 1000 ))
@@ -265,7 +252,7 @@ __rate_test()
 	check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps"
 	log_info "Measured policer drop rate of $drop_rate pps"
 
-	stop_traffic
+	defer_scope_pop
 
 	log_test "Trap policer rate"
 }
-- 
cgit v1.2.3


From 637c730998b8f440dc5d0c6bae254fbf19143ea4 Mon Sep 17 00:00:00 2001
From: Michael Vetter <mvetter@suse.com>
Date: Thu, 17 Oct 2024 22:01:30 +0200
Subject: selftests: livepatch: rename KLP_SYSFS_DIR to SYSFS_KLP_DIR

This naming makes more sense given the directory structure, especially
when we later add more paths.

Additionally, replace `/sys/kernel/livepatch` with `$SYSFS_KLP_DIR` in
the livepatch test files.

Signed-off-by: Michael Vetter <mvetter@suse.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Joe Lawrence <joe.lawrence@redhat.com>
Tested-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20241017200132.21946-2-mvetter@suse.com
[pmladek@suse.com: Fix corrupted substitution]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 tools/testing/selftests/livepatch/functions.sh     | 16 +++++++--------
 .../testing/selftests/livepatch/test-callbacks.sh  | 24 +++++++++++-----------
 tools/testing/selftests/livepatch/test-ftrace.sh   |  2 +-
 .../testing/selftests/livepatch/test-livepatch.sh  | 12 +++++------
 tools/testing/selftests/livepatch/test-state.sh    |  8 ++++----
 tools/testing/selftests/livepatch/test-syscall.sh  |  2 +-
 tools/testing/selftests/livepatch/test-sysfs.sh    |  8 ++++----
 7 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index fc4c6a016d38..7bdfe668127b 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -6,7 +6,7 @@
 
 MAX_RETRIES=600
 RETRY_INTERVAL=".1"	# seconds
-KLP_SYSFS_DIR="/sys/kernel/livepatch"
+SYSFS_KLP_DIR="/sys/kernel/livepatch"
 
 # Kselftest framework requirement - SKIP code is 4
 ksft_skip=4
@@ -183,7 +183,7 @@ function load_lp_nowait() {
 	__load_mod "$mod" "$@"
 
 	# Wait for livepatch in sysfs ...
-	loop_until '[[ -e "/sys/kernel/livepatch/$mod" ]]' ||
+	loop_until '[[ -e "$SYSFS_KLP_DIR/$mod" ]]' ||
 		die "failed to load module $mod (sysfs)"
 }
 
@@ -196,7 +196,7 @@ function load_lp() {
 	load_lp_nowait "$mod" "$@"
 
 	# Wait until the transition finishes ...
-	loop_until 'grep -q '^0$' /sys/kernel/livepatch/$mod/transition' ||
+	loop_until 'grep -q '^0$' $SYSFS_KLP_DIR/$mod/transition' ||
 		die "failed to complete transition"
 }
 
@@ -246,12 +246,12 @@ function unload_lp() {
 function disable_lp() {
 	local mod="$1"
 
-	log "% echo 0 > /sys/kernel/livepatch/$mod/enabled"
-	echo 0 > /sys/kernel/livepatch/"$mod"/enabled
+	log "% echo 0 > $SYSFS_KLP_DIR/$mod/enabled"
+	echo 0 > "$SYSFS_KLP_DIR/$mod/enabled"
 
 	# Wait until the transition finishes and the livepatch gets
 	# removed from sysfs...
-	loop_until '[[ ! -e "/sys/kernel/livepatch/$mod" ]]' ||
+	loop_until '[[ ! -e "$SYSFS_KLP_DIR/$mod" ]]' ||
 		die "failed to disable livepatch $mod"
 }
 
@@ -322,7 +322,7 @@ function check_sysfs_rights() {
 	local rel_path="$1"; shift
 	local expected_rights="$1"; shift
 
-	local path="$KLP_SYSFS_DIR/$mod/$rel_path"
+	local path="$SYSFS_KLP_DIR/$mod/$rel_path"
 	local rights=$(/bin/stat --format '%A' "$path")
 	if test "$rights" != "$expected_rights" ; then
 		die "Unexpected access rights of $path: $expected_rights vs. $rights"
@@ -338,7 +338,7 @@ function check_sysfs_value() {
 	local rel_path="$1"; shift
 	local expected_value="$1"; shift
 
-	local path="$KLP_SYSFS_DIR/$mod/$rel_path"
+	local path="$SYSFS_KLP_DIR/$mod/$rel_path"
 	local value=`cat $path`
 	if test "$value" != "$expected_value" ; then
 		die "Unexpected value in $path: $expected_value vs. $value"
diff --git a/tools/testing/selftests/livepatch/test-callbacks.sh b/tools/testing/selftests/livepatch/test-callbacks.sh
index 32b150e25b10..37bbc3fb2780 100755
--- a/tools/testing/selftests/livepatch/test-callbacks.sh
+++ b/tools/testing/selftests/livepatch/test-callbacks.sh
@@ -46,7 +46,7 @@ livepatch: '$MOD_LIVEPATCH': completing patching transition
 $MOD_LIVEPATCH: post_patch_callback: vmlinux
 $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
@@ -94,7 +94,7 @@ livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
 $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
 $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
 $MOD_TARGET: ${MOD_TARGET}_init
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
@@ -146,7 +146,7 @@ $MOD_TARGET: ${MOD_TARGET}_exit
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
 livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
 $MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
@@ -195,7 +195,7 @@ $MOD_TARGET: ${MOD_TARGET}_exit
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
 livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
 $MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
@@ -227,7 +227,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition
 livepatch: '$MOD_LIVEPATCH': completing patching transition
 $MOD_LIVEPATCH: post_patch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
@@ -310,7 +310,7 @@ $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full fo
 livepatch: pre-patch callback failed for object '$MOD_TARGET'
 livepatch: patch '$MOD_LIVEPATCH' failed for module '$MOD_TARGET', refusing to load module '$MOD_TARGET'
 insmod: ERROR: could not insert module test_modules/$MOD_TARGET.ko: No such device
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
@@ -364,7 +364,7 @@ $MOD_TARGET: ${MOD_TARGET}_exit
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
 livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
 $MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
@@ -412,7 +412,7 @@ load_lp_nowait $MOD_LIVEPATCH
 
 # Wait until the livepatch reports in-transition state, i.e. that it's
 # stalled on $MOD_TARGET_BUSY::busymod_work_func()
-loop_until 'grep -q '^1$' /sys/kernel/livepatch/$MOD_LIVEPATCH/transition' ||
+loop_until 'grep -q '^1$' $SYSFS_KLP_DIR/$MOD_LIVEPATCH/transition' ||
 	die "failed to stall transition"
 
 load_mod $MOD_TARGET
@@ -438,7 +438,7 @@ $MOD_TARGET: ${MOD_TARGET}_init
 $MOD_TARGET: ${MOD_TARGET}_exit
 livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
 $MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': reversing transition from patching to unpatching
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH': completing unpatching transition
@@ -483,14 +483,14 @@ livepatch: '$MOD_LIVEPATCH2': starting patching transition
 livepatch: '$MOD_LIVEPATCH2': completing patching transition
 $MOD_LIVEPATCH2: post_patch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH2': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled
 livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
 $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
 $MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH2': unpatching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
@@ -539,7 +539,7 @@ livepatch: '$MOD_LIVEPATCH2': starting patching transition
 livepatch: '$MOD_LIVEPATCH2': completing patching transition
 $MOD_LIVEPATCH2: post_patch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH2': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled
 livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
 $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
 livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh
index 730218bce99c..fe14f248913a 100755
--- a/tools/testing/selftests/livepatch/test-ftrace.sh
+++ b/tools/testing/selftests/livepatch/test-ftrace.sh
@@ -53,7 +53,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition
 livepatch: '$MOD_LIVEPATCH': completing patching transition
 livepatch: '$MOD_LIVEPATCH': patching complete
 livepatch: sysctl: setting key \"kernel.ftrace_enabled\": Device or resource busy
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH': completing unpatching transition
diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh
index bd13257bfdfe..6673023d2b66 100755
--- a/tools/testing/selftests/livepatch/test-livepatch.sh
+++ b/tools/testing/selftests/livepatch/test-livepatch.sh
@@ -39,7 +39,7 @@ livepatch: '$MOD_LIVEPATCH1': initializing patching transition
 livepatch: '$MOD_LIVEPATCH1': starting patching transition
 livepatch: '$MOD_LIVEPATCH1': completing patching transition
 livepatch: '$MOD_LIVEPATCH1': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH1/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH1/enabled
 livepatch: '$MOD_LIVEPATCH1': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH1': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH1': completing unpatching transition
@@ -92,14 +92,14 @@ livepatch: '$MOD_REPLACE': completing patching transition
 livepatch: '$MOD_REPLACE': patching complete
 $MOD_LIVEPATCH1: this has been live patched
 $MOD_REPLACE: this has been live patched
-% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_REPLACE/enabled
 livepatch: '$MOD_REPLACE': initializing unpatching transition
 livepatch: '$MOD_REPLACE': starting unpatching transition
 livepatch: '$MOD_REPLACE': completing unpatching transition
 livepatch: '$MOD_REPLACE': unpatching complete
 % rmmod $MOD_REPLACE
 $MOD_LIVEPATCH1: this has been live patched
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH1/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH1/enabled
 livepatch: '$MOD_LIVEPATCH1': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH1': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH1': completing unpatching transition
@@ -128,7 +128,7 @@ for mod in $MOD_LIVEPATCH2 $MOD_LIVEPATCH3; do
 	load_lp "$mod"
 done
 
-mods=(/sys/kernel/livepatch/*)
+mods=($SYSFS_KLP_DIR/*)
 nmods=${#mods[@]}
 if [ "$nmods" -ne 3 ]; then
 	die "Expecting three modules listed, found $nmods"
@@ -139,7 +139,7 @@ load_lp $MOD_REPLACE replace=1
 grep 'live patched' /proc/cmdline > /dev/kmsg
 grep 'live patched' /proc/meminfo > /dev/kmsg
 
-loop_until 'mods=(/sys/kernel/livepatch/*); nmods=${#mods[@]}; [[ "$nmods" -eq 1 ]]' ||
+loop_until 'mods=($SYSFS_KLP_DIR/*); nmods=${#mods[@]}; [[ "$nmods" -eq 1 ]]' ||
         die "Expecting only one moduled listed, found $nmods"
 
 # These modules were disabled by the atomic replace
@@ -188,7 +188,7 @@ $MOD_REPLACE: this has been live patched
 % rmmod $MOD_LIVEPATCH2
 % rmmod $MOD_LIVEPATCH1
 $MOD_REPLACE: this has been live patched
-% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_REPLACE/enabled
 livepatch: '$MOD_REPLACE': initializing unpatching transition
 livepatch: '$MOD_REPLACE': starting unpatching transition
 livepatch: '$MOD_REPLACE': completing unpatching transition
diff --git a/tools/testing/selftests/livepatch/test-state.sh b/tools/testing/selftests/livepatch/test-state.sh
index 10a52ac06185..04b66380f8a0 100755
--- a/tools/testing/selftests/livepatch/test-state.sh
+++ b/tools/testing/selftests/livepatch/test-state.sh
@@ -29,7 +29,7 @@ livepatch: '$MOD_LIVEPATCH': completing patching transition
 $MOD_LIVEPATCH: post_patch_callback: vmlinux
 $MOD_LIVEPATCH: fix_console_loglevel: fixing console_loglevel
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH: restore_console_loglevel: restoring console_loglevel
@@ -72,7 +72,7 @@ $MOD_LIVEPATCH2: post_patch_callback: vmlinux
 $MOD_LIVEPATCH2: fix_console_loglevel: taking over the console_loglevel change
 livepatch: '$MOD_LIVEPATCH2': patching complete
 % rmmod $MOD_LIVEPATCH
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled
 livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
 $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
@@ -127,7 +127,7 @@ livepatch: '$MOD_LIVEPATCH2': completing patching transition
 $MOD_LIVEPATCH2: post_patch_callback: vmlinux
 $MOD_LIVEPATCH2: fix_console_loglevel: taking over the console_loglevel change
 livepatch: '$MOD_LIVEPATCH2': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled
 livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
 $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
@@ -162,7 +162,7 @@ livepatch: '$MOD_LIVEPATCH2': patching complete
 % insmod test_modules/$MOD_LIVEPATCH.ko
 livepatch: Livepatch patch ($MOD_LIVEPATCH) is not compatible with the already installed livepatches.
 insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: Invalid parameters
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled
 livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
 $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
 $MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
diff --git a/tools/testing/selftests/livepatch/test-syscall.sh b/tools/testing/selftests/livepatch/test-syscall.sh
index 289eb7d4c4b3..9347464a89a0 100755
--- a/tools/testing/selftests/livepatch/test-syscall.sh
+++ b/tools/testing/selftests/livepatch/test-syscall.sh
@@ -46,7 +46,7 @@ livepatch: '$MOD_SYSCALL': starting patching transition
 livepatch: '$MOD_SYSCALL': completing patching transition
 livepatch: '$MOD_SYSCALL': patching complete
 $MOD_SYSCALL: Remaining not livepatched processes: 0
-% echo 0 > /sys/kernel/livepatch/$MOD_SYSCALL/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_SYSCALL/enabled
 livepatch: '$MOD_SYSCALL': initializing unpatching transition
 livepatch: '$MOD_SYSCALL': starting unpatching transition
 livepatch: '$MOD_SYSCALL': completing unpatching transition
diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh
index 05a14f5a7bfb..2c91428d2997 100755
--- a/tools/testing/selftests/livepatch/test-sysfs.sh
+++ b/tools/testing/selftests/livepatch/test-sysfs.sh
@@ -34,7 +34,7 @@ livepatch: '$MOD_LIVEPATCH': initializing patching transition
 livepatch: '$MOD_LIVEPATCH': starting patching transition
 livepatch: '$MOD_LIVEPATCH': completing patching transition
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH': completing unpatching transition
@@ -75,7 +75,7 @@ test_klp_callbacks_mod: test_klp_callbacks_mod_exit
 test_klp_callbacks_demo: pre_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away
 livepatch: reverting patch 'test_klp_callbacks_demo' on unloading module 'test_klp_callbacks_mod'
 test_klp_callbacks_demo: post_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away
-% echo 0 > /sys/kernel/livepatch/test_klp_callbacks_demo/enabled
+% echo 0 > $SYSFS_KLP_DIR/test_klp_callbacks_demo/enabled
 livepatch: 'test_klp_callbacks_demo': initializing unpatching transition
 test_klp_callbacks_demo: pre_unpatch_callback: vmlinux
 livepatch: 'test_klp_callbacks_demo': starting unpatching transition
@@ -101,7 +101,7 @@ livepatch: '$MOD_LIVEPATCH': initializing patching transition
 livepatch: '$MOD_LIVEPATCH': starting patching transition
 livepatch: '$MOD_LIVEPATCH': completing patching transition
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH': completing unpatching transition
@@ -124,7 +124,7 @@ livepatch: '$MOD_LIVEPATCH': initializing patching transition
 livepatch: '$MOD_LIVEPATCH': starting patching transition
 livepatch: '$MOD_LIVEPATCH': completing patching transition
 livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
 livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
 livepatch: '$MOD_LIVEPATCH': starting unpatching transition
 livepatch: '$MOD_LIVEPATCH': completing unpatching transition
-- 
cgit v1.2.3


From 59766286b6e54f8ad334dcef7d2e743b7550df0e Mon Sep 17 00:00:00 2001
From: Michael Vetter <mvetter@suse.com>
Date: Thu, 17 Oct 2024 22:01:31 +0200
Subject: selftests: livepatch: save and restore kprobe state

Save the state of /sys/kernel/debug/kprobes/enabled
during setup_config() and restore it during cleanup().

This is in preparation for a future commit that will add a test
that should confirm that we cannot livepatch a kprobed function
if that kprobe has a post handler.

Signed-off-by: Michael Vetter <mvetter@suse.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Joe Lawrence <joe.lawrence@redhat.com>
Tested-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20241017200132.21946-3-mvetter@suse.com
[pmladek@suse.com: Added a few more substitutions in test-syscall.sh]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 tools/testing/selftests/livepatch/functions.sh    | 15 +++++++++++----
 tools/testing/selftests/livepatch/test-syscall.sh |  4 ++--
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index 7bdfe668127b..e5d06fb40233 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -6,7 +6,10 @@
 
 MAX_RETRIES=600
 RETRY_INTERVAL=".1"	# seconds
-SYSFS_KLP_DIR="/sys/kernel/livepatch"
+SYSFS_KERNEL_DIR="/sys/kernel"
+SYSFS_KLP_DIR="$SYSFS_KERNEL_DIR/livepatch"
+SYSFS_DEBUG_DIR="$SYSFS_KERNEL_DIR/debug"
+SYSFS_KPROBES_DIR="$SYSFS_DEBUG_DIR/kprobes"
 
 # Kselftest framework requirement - SKIP code is 4
 ksft_skip=4
@@ -55,22 +58,26 @@ function die() {
 }
 
 function push_config() {
-	DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \
+	DYNAMIC_DEBUG=$(grep '^kernel/livepatch' "$SYSFS_DEBUG_DIR/dynamic_debug/control" | \
 			awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}')
 	FTRACE_ENABLED=$(sysctl --values kernel.ftrace_enabled)
+	KPROBE_ENABLED=$(cat "$SYSFS_KPROBES_DIR/enabled")
 }
 
 function pop_config() {
 	if [[ -n "$DYNAMIC_DEBUG" ]]; then
-		echo -n "$DYNAMIC_DEBUG" > /sys/kernel/debug/dynamic_debug/control
+		echo -n "$DYNAMIC_DEBUG" > "$SYSFS_DEBUG_DIR/dynamic_debug/control"
 	fi
 	if [[ -n "$FTRACE_ENABLED" ]]; then
 		sysctl kernel.ftrace_enabled="$FTRACE_ENABLED" &> /dev/null
 	fi
+	if [[ -n "$KPROBE_ENABLED" ]]; then
+		echo "$KPROBE_ENABLED" > "$SYSFS_KPROBES_DIR/enabled"
+	fi
 }
 
 function set_dynamic_debug() {
-        cat <<-EOF > /sys/kernel/debug/dynamic_debug/control
+        cat <<-EOF > "$SYSFS_DEBUG_DIR/dynamic_debug/control"
 		file kernel/livepatch/* +p
 		func klp_try_switch_task -p
 		EOF
diff --git a/tools/testing/selftests/livepatch/test-syscall.sh b/tools/testing/selftests/livepatch/test-syscall.sh
index 9347464a89a0..5f9344277b62 100755
--- a/tools/testing/selftests/livepatch/test-syscall.sh
+++ b/tools/testing/selftests/livepatch/test-syscall.sh
@@ -27,9 +27,9 @@ pid_list=$(echo ${pids[@]} | tr ' ' ',')
 load_lp $MOD_SYSCALL klp_pids=$pid_list
 
 # wait for all tasks to transition to patched state
-loop_until 'grep -q '^0$' /sys/kernel/test_klp_syscall/npids'
+loop_until 'grep -q '^0$' $SYSFS_KERNEL_DIR/$MOD_SYSCALL/npids'
 
-pending_pids=$(cat /sys/kernel/test_klp_syscall/npids)
+pending_pids=$(cat $SYSFS_KERNEL_DIR/$MOD_SYSCALL/npids)
 log "$MOD_SYSCALL: Remaining not livepatched processes: $pending_pids"
 
 for pid in ${pids[@]}; do
-- 
cgit v1.2.3


From 62597edf6340191511bdf9a7f64fa315ddc58805 Mon Sep 17 00:00:00 2001
From: Michael Vetter <mvetter@suse.com>
Date: Thu, 17 Oct 2024 22:01:32 +0200
Subject: selftests: livepatch: test livepatching a kprobed function

The test proves that a function that is being kprobed and uses a
post_handler cannot be livepatched.

Only one ftrace_ops with FTRACE_OPS_FL_IPMODIFY set may be registered
to any given function at a time.

Note that the conflicting kprobe could not be created using the
tracefs interface, see Documentation/trace/kprobetrace.rst.
This interface uses only the pre_handler(), see alloc_trace_kprobe().
But FTRACE_OPS_FL_IPMODIFY is used only when the kprobe is using a
post_handler, see arm_kprobe_ftrace().

Signed-off-by: Michael Vetter <mvetter@suse.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Joe Lawrence <joe.lawrence@redhat.com>
Tested-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20241017200132.21946-4-mvetter@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 tools/testing/selftests/livepatch/Makefile         |  3 +-
 tools/testing/selftests/livepatch/test-kprobe.sh   | 62 ++++++++++++++++++++++
 .../selftests/livepatch/test_modules/Makefile      |  3 +-
 .../livepatch/test_modules/test_klp_kprobe.c       | 38 +++++++++++++
 4 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/livepatch/test-kprobe.sh
 create mode 100644 tools/testing/selftests/livepatch/test_modules/test_klp_kprobe.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile
index 35418a4790be..a080eb54a215 100644
--- a/tools/testing/selftests/livepatch/Makefile
+++ b/tools/testing/selftests/livepatch/Makefile
@@ -10,7 +10,8 @@ TEST_PROGS := \
 	test-state.sh \
 	test-ftrace.sh \
 	test-sysfs.sh \
-	test-syscall.sh
+	test-syscall.sh \
+	test-kprobe.sh
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/livepatch/test-kprobe.sh b/tools/testing/selftests/livepatch/test-kprobe.sh
new file mode 100755
index 000000000000..115065156016
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-kprobe.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2024 SUSE
+# Author: Michael Vetter <mvetter@suse.com>
+
+. $(dirname $0)/functions.sh
+
+MOD_LIVEPATCH=test_klp_livepatch
+MOD_KPROBE=test_klp_kprobe
+
+setup_config
+
+# Kprobe a function and verify that we can't livepatch that same function
+# when it uses a post_handler since only one IPMODIFY may be registered
+# to any given function at a time.
+
+start_test "livepatch interaction with kprobed function with post_handler"
+
+echo 1 > "$SYSFS_KPROBES_DIR/enabled"
+
+load_mod $MOD_KPROBE has_post_handler=true
+load_failing_mod $MOD_LIVEPATCH
+unload_mod $MOD_KPROBE
+
+check_result "% insmod test_modules/$MOD_KPROBE.ko has_post_handler=true
+% insmod test_modules/$MOD_LIVEPATCH.ko
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: failed to register ftrace handler for function 'cmdline_proc_show' (-16)
+livepatch: failed to patch object 'vmlinux'
+livepatch: failed to enable patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: Device or resource busy
+% rmmod $MOD_KPROBE"
+
+start_test "livepatch interaction with kprobed function without post_handler"
+# A kprobe without a post_handler does not use IPMODIFY, so livepatching works.
+load_mod $MOD_KPROBE has_post_handler=false
+load_lp $MOD_LIVEPATCH
+
+unload_mod $MOD_KPROBE
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+check_result "% insmod test_modules/$MOD_KPROBE.ko has_post_handler=false
+% insmod test_modules/$MOD_LIVEPATCH.ko
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+% rmmod $MOD_KPROBE
+% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+exit 0
diff --git a/tools/testing/selftests/livepatch/test_modules/Makefile b/tools/testing/selftests/livepatch/test_modules/Makefile
index e6e638c4bcba..939230e571f5 100644
--- a/tools/testing/selftests/livepatch/test_modules/Makefile
+++ b/tools/testing/selftests/livepatch/test_modules/Makefile
@@ -6,11 +6,12 @@ obj-m += test_klp_atomic_replace.o \
 	test_klp_callbacks_demo.o \
 	test_klp_callbacks_demo2.o \
 	test_klp_callbacks_mod.o \
+	test_klp_kprobe.o \
 	test_klp_livepatch.o \
+	test_klp_shadow_vars.o \
 	test_klp_state.o \
 	test_klp_state2.o \
 	test_klp_state3.o \
-	test_klp_shadow_vars.o \
 	test_klp_syscall.o
 
 # Ensure that KDIR exists, otherwise skip the compilation
diff --git a/tools/testing/selftests/livepatch/test_modules/test_klp_kprobe.c b/tools/testing/selftests/livepatch/test_modules/test_klp_kprobe.c
new file mode 100644
index 000000000000..67a8d29012f6
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test_modules/test_klp_kprobe.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2024 Marcos Paulo de Souza <mpdesouza@suse.com>
+// Copyright (C) 2024 Michael Vetter <mvetter@suse.com>
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+
+static bool has_post_handler = true;
+module_param(has_post_handler, bool, 0444);
+
+static void __kprobes post_handler(struct kprobe *p, struct pt_regs *regs,
+				unsigned long flags)
+{
+}
+
+static struct kprobe kp = {
+	.symbol_name = "cmdline_proc_show",
+};
+
+/* Probe cmdline_proc_show; the post handler is installed only on request. */
+static int __init kprobe_init(void)
+{
+	kp.post_handler = has_post_handler ? post_handler : NULL;
+
+	return register_kprobe(&kp);
+}
+
+static void __exit kprobe_exit(void)
+{
+	unregister_kprobe(&kp);
+}
+
+module_init(kprobe_init)
+module_exit(kprobe_exit)
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael Vetter <mvetter@suse.com>");
+MODULE_DESCRIPTION("Livepatch test: kprobe function");
-- 
cgit v1.2.3


From 94682d6ad9692855b2ae16bb93c408ad0a5bc9ba Mon Sep 17 00:00:00 2001
From: Jordan Rife <jrife@google.com>
Date: Tue, 22 Oct 2024 15:29:01 +0000
Subject: selftests/bpf: Migrate *_POST_BIND test cases to prog_tests

Move all BPF_CGROUP_INET6_POST_BIND and BPF_CGROUP_INET4_POST_BIND test
cases to a new prog_test, prog_tests/sock_post_bind.c, except for
LOAD_REJECT test cases.

Signed-off-by: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/r/20241022152913.574836-2-jrife@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/sock_post_bind.c      | 426 +++++++++++++++++++++
 tools/testing/selftests/bpf/test_sock.c            | 245 ------------
 2 files changed, 426 insertions(+), 245 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sock_post_bind.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/sock_post_bind.c b/tools/testing/selftests/bpf/prog_tests/sock_post_bind.c
new file mode 100644
index 000000000000..788135c9c673
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sock_post_bind.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+#define TEST_NS "sock_post_bind"
+
+static char bpf_log_buf[4096];
+
+static struct sock_post_bind_test {
+	const char			*descr;
+	/* BPF prog properties */
+	const struct bpf_insn		insns[64];
+	enum bpf_attach_type		attach_type;
+	enum bpf_attach_type		expected_attach_type;
+	/* Socket properties */
+	int				domain;
+	int				type;
+	/* Endpoint to bind() to */
+	const char *ip;
+	unsigned short port;
+	unsigned short port_retry;
+
+	/* Expected test result */
+	enum {
+		ATTACH_REJECT,
+		BIND_REJECT,
+		SUCCESS,
+		RETRY_SUCCESS,
+		RETRY_REJECT
+	} result;
+} tests[] = {
+	{
+		.descr = "attach type mismatch bind4 vs bind6",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.result = ATTACH_REJECT,
+	},
+	{
+		.descr = "attach type mismatch bind6 vs bind4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = ATTACH_REJECT,
+	},
+	{
+		.descr = "attach type mismatch default vs bind4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = 0,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = ATTACH_REJECT,
+	},
+	{
+		.descr = "attach type mismatch bind6 vs sock_create",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.result = ATTACH_REJECT,
+	},
+	{
+		.descr = "bind4 reject all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "0.0.0.0",
+		.result = BIND_REJECT,
+	},
+	{
+		.descr = "bind6 reject all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::",
+		.result = BIND_REJECT,
+	},
+	{
+		.descr = "bind6 deny specific IP & port",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip6[3])),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x00000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::1",
+		.port = 8193,
+		.result = BIND_REJECT,
+	},
+	{
+		.descr = "bind4 allow specific IP & port",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x7F000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "bind4 deny specific IP & port of TCP, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x7F000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.port_retry = 5000,
+		.result = RETRY_SUCCESS,
+	},
+	{
+		.descr = "bind4 deny specific IP & port of UDP, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x7F000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.port_retry = 5000,
+		.result = RETRY_SUCCESS,
+	},
+	{
+		.descr = "bind6 deny specific IP & port, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip6[3])),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x00000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::1",
+		.port = 8193,
+		.port_retry = 9000,
+		.result = RETRY_SUCCESS,
+	},
+	{
+		.descr = "bind4 allow all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "0.0.0.0",
+		.result = SUCCESS,
+	},
+	{
+		.descr = "bind6 allow all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::",
+		.result = SUCCESS,
+	},
+};
+
+/* Load @insns (terminated by the first BPF_EXIT) as a cgroup/sock program. */
+static int load_prog(const struct bpf_insn *insns,
+		     enum bpf_attach_type expected_attach_type)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		    .expected_attach_type = expected_attach_type,
+		    .log_level = 2,
+		    .log_buf = bpf_log_buf,
+		    .log_size = sizeof(bpf_log_buf),
+	);
+	int prog_fd, n = 0;
+
+	/* Count up to and including the first BPF_EXIT instruction. */
+	while (insns[n].code != (BPF_JMP | BPF_EXIT))
+		n++;
+	n++;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns,
+				n, &opts);
+	if (prog_fd < 0)
+		fprintf(stderr, "%s\n", bpf_log_buf);
+
+	return prog_fd;
+}
+
+/* Try bind() on @port, then on @port_retry after EPERM; returns a result code or -1. */
+static int bind_sock(int domain, int type, const char *ip,
+		     unsigned short port, unsigned short port_retry)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in6 *addr6;
+	struct sockaddr_in *addr4;
+	int sockfd, res = SUCCESS;
+	socklen_t len;
+
+	sockfd = socket(domain, type, 0);
+	if (sockfd < 0)
+		return -1;	/* no descriptor to close yet */
+
+	memset(&addr, 0, sizeof(addr));
+
+	if (domain == AF_INET) {
+		len = sizeof(struct sockaddr_in);
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = domain;
+		addr4->sin_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1)
+			goto err;
+	} else if (domain == AF_INET6) {
+		len = sizeof(struct sockaddr_in6);
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = domain;
+		addr6->sin6_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1)
+			goto err;
+	} else {
+		goto err;
+	}
+
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+		/* sys_bind() may fail for different reasons, errno has to be
+		 * checked to confirm that BPF program rejected it.
+		 */
+		if (errno != EPERM)
+			goto err;
+		if (port_retry)
+			goto retry;
+		res = BIND_REJECT;
+		goto out;
+	}
+
+	goto out;
+retry:
+	if (domain == AF_INET)
+		addr4->sin_port = htons(port_retry);
+	else
+		addr6->sin6_port = htons(port_retry);
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+		if (errno != EPERM)
+			goto err;
+		res = RETRY_REJECT;
+	} else {
+		res = RETRY_SUCCESS;
+	}
+	goto out;
+err:
+	res = -1;
+out:
+	close(sockfd);
+	return res;
+}
+
+/* Load, attach and exercise one table entry; 0 if the outcome matched, else -1. */
+static int run_test(int cgroup_fd, struct sock_post_bind_test *test)
+{
+	int err, prog_fd, res, ret = 0;
+
+	prog_fd = load_prog(test->insns, test->expected_attach_type);
+	if (prog_fd < 0)
+		return -1;	/* nothing was attached, nothing to undo */
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+	if (err < 0) {
+		if (test->result == ATTACH_REJECT)
+			goto out;
+		goto err;
+	}
+
+	res = bind_sock(test->domain, test->type, test->ip, test->port,
+			test->port_retry);
+	/* res > 0 excludes both bind_sock() errors (-1) and ATTACH_REJECT (0). */
+	if (res > 0 && test->result == res)
+		goto out;
+err:
+	ret = -1;
+out:
+	/* Detaching w/o checking return code: best effort attempt. */
+	bpf_prog_detach(cgroup_fd, test->attach_type);
+	close(prog_fd);
+	return ret;
+}
+
+/* Join the /post_bind cgroup, create the TEST_NS netns, run each case as a subtest. */
+void test_sock_post_bind(void)
+{
+	struct netns_obj *ns;
+	int cgroup_fd, i;
+
+	cgroup_fd = test__join_cgroup("/post_bind");
+	if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		struct sock_post_bind_test *t = &tests[i];
+
+		/* Subtests filtered out by the test runner are skipped. */
+		if (test__start_subtest(t->descr))
+			ASSERT_OK(run_test(cgroup_fd, t), t->descr);
+	}
+
+cleanup:
+	netns_free(ns);
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 810c3740b2cc..9ed908163d98 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -127,251 +127,6 @@ static struct sock_test tests[] = {
 		.port = 8097,
 		.result = SUCCESS,
 	},
-	{
-		.descr = "attach type mismatch bind4 vs bind6",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.result = ATTACH_REJECT,
-	},
-	{
-		.descr = "attach type mismatch bind6 vs bind4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.result = ATTACH_REJECT,
-	},
-	{
-		.descr = "attach type mismatch default vs bind4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = 0,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.result = ATTACH_REJECT,
-	},
-	{
-		.descr = "attach type mismatch bind6 vs sock_create",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.result = ATTACH_REJECT,
-	},
-	{
-		.descr = "bind4 reject all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "0.0.0.0",
-		.result = BIND_REJECT,
-	},
-	{
-		.descr = "bind6 reject all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.domain = AF_INET6,
-		.type = SOCK_STREAM,
-		.ip = "::",
-		.result = BIND_REJECT,
-	},
-	{
-		.descr = "bind6 deny specific IP & port",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-			/* if (ip == expected && port == expected) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip6[3])),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
-				    __bpf_constant_ntohl(0x00000001), 4),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.domain = AF_INET6,
-		.type = SOCK_STREAM,
-		.ip = "::1",
-		.port = 8193,
-		.result = BIND_REJECT,
-	},
-	{
-		.descr = "bind4 allow specific IP & port",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-			/* if (ip == expected && port == expected) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip4)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
-				    __bpf_constant_ntohl(0x7F000001), 4),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "127.0.0.1",
-		.port = 4098,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "bind4 deny specific IP & port of TCP, and retry",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-			/* if (ip == expected && port == expected) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip4)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
-				    __bpf_constant_ntohl(0x7F000001), 4),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "127.0.0.1",
-		.port = 4098,
-		.port_retry = 5000,
-		.result = RETRY_SUCCESS,
-	},
-	{
-		.descr = "bind4 deny specific IP & port of UDP, and retry",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-			/* if (ip == expected && port == expected) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip4)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
-				    __bpf_constant_ntohl(0x7F000001), 4),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.domain = AF_INET,
-		.type = SOCK_DGRAM,
-		.ip = "127.0.0.1",
-		.port = 4098,
-		.port_retry = 5000,
-		.result = RETRY_SUCCESS,
-	},
-	{
-		.descr = "bind6 deny specific IP & port, and retry",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-			/* if (ip == expected && port == expected) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip6[3])),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
-				    __bpf_constant_ntohl(0x00000001), 4),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.domain = AF_INET6,
-		.type = SOCK_STREAM,
-		.ip = "::1",
-		.port = 8193,
-		.port_retry = 9000,
-		.result = RETRY_SUCCESS,
-	},
-	{
-		.descr = "bind4 allow all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "0.0.0.0",
-		.result = SUCCESS,
-	},
-	{
-		.descr = "bind6 allow all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.domain = AF_INET6,
-		.type = SOCK_STREAM,
-		.ip = "::",
-		.result = SUCCESS,
-	},
 };
 
 static size_t probe_prog_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3


From c17f9734e35b4ca86c67d0a49780978a16effa94 Mon Sep 17 00:00:00 2001
From: Jordan Rife <jrife@google.com>
Date: Tue, 22 Oct 2024 15:29:02 +0000
Subject: selftests/bpf: Migrate LOAD_REJECT test cases to prog_tests

Move LOAD_REJECT test cases from test_sock.c to an equivalent set of
verifier tests in progs/verifier_sock.c.

Signed-off-by: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/r/20241022152913.574836-3-jrife@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_sock.c | 60 +++++++++++++++++++++++
 tools/testing/selftests/bpf/test_sock.c           | 52 --------------------
 2 files changed, 60 insertions(+), 52 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c
index ee76b51005ab..d3e70e38e442 100644
--- a/tools/testing/selftests/bpf/progs/verifier_sock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_sock.c
@@ -977,4 +977,64 @@ l1_%=:	r0 = *(u8*)(r7 + 0);				\
 	: __clobber_all);
 }
 
+SEC("cgroup/post_bind4")
+__description("sk->src_ip6[0] [load 1st byte]")
+__failure __msg("invalid bpf_context access off=28 size=2")
+__naked void post_bind4_read_src_ip6(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	r7 = *(u16*)(r6 + %[bpf_sock_src_ip6_0]);	\
+	r0 = 1;						\
+	exit;						\
+"	:
+	: __imm_const(bpf_sock_src_ip6_0, offsetof(struct bpf_sock, src_ip6[0]))
+	: __clobber_all);
+}
+
+SEC("cgroup/post_bind4")
+__description("sk->mark [load mark]")
+__failure __msg("invalid bpf_context access off=16 size=2")
+__naked void post_bind4_read_mark(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	r7 = *(u16*)(r6 + %[bpf_sock_mark]);		\
+	r0 = 1;						\
+	exit;						\
+"	:
+	: __imm_const(bpf_sock_mark, offsetof(struct bpf_sock, mark))
+	: __clobber_all);
+}
+
+SEC("cgroup/post_bind6")
+__description("sk->src_ip4 [load src_ip4]")
+__failure __msg("invalid bpf_context access off=24 size=2")
+__naked void post_bind6_read_src_ip4(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	r7 = *(u16*)(r6 + %[bpf_sock_src_ip4]);		\
+	r0 = 1;						\
+	exit;						\
+"	:
+	: __imm_const(bpf_sock_src_ip4, offsetof(struct bpf_sock, src_ip4))
+	: __clobber_all);
+}
+
+SEC("cgroup/sock_create")
+__description("sk->src_port [word load]")
+__failure __msg("invalid bpf_context access off=44 size=2")
+__naked void sock_create_read_src_port(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	r7 = *(u16*)(r6 + %[bpf_sock_src_port]);	\
+	r0 = 1;						\
+	exit;						\
+"	:
+	: __imm_const(bpf_sock_src_port, offsetof(struct bpf_sock, src_port))
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 9ed908163d98..26dff88abbaa 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -47,58 +47,6 @@ struct sock_test {
 };
 
 static struct sock_test tests[] = {
-	{
-		.descr = "bind4 load with invalid access: src_ip6",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip6[0])),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.result = LOAD_REJECT,
-	},
-	{
-		.descr = "bind4 load with invalid access: mark",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, mark)),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.attach_type = BPF_CGROUP_INET4_POST_BIND,
-		.result = LOAD_REJECT,
-	},
-	{
-		.descr = "bind6 load with invalid access: src_ip4",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_ip4)),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.attach_type = BPF_CGROUP_INET6_POST_BIND,
-		.result = LOAD_REJECT,
-	},
-	{
-		.descr = "sock_create load with invalid access: src_port",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-				    offsetof(struct bpf_sock, src_port)),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.result = LOAD_REJECT,
-	},
 	{
 		.descr = "sock_create load w/o expected_attach_type (compat mode)",
 		.insns = {
-- 
cgit v1.2.3


From af522f13e9177eca0111562a38cc8ebb6d55f820 Mon Sep 17 00:00:00 2001
From: Jordan Rife <jrife@google.com>
Date: Tue, 22 Oct 2024 15:29:03 +0000
Subject: selftests/bpf: Migrate BPF_CGROUP_INET_SOCK_CREATE test cases to
 prog_tests

Move the "load w/o expected_attach_type" test case to
prog_tests/sock_create.c and drop the remaining test case, as it is made
redundant by the existing coverage inside prog_tests/sock_create.c.

Signed-off-by: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/r/20241022152913.574836-4-jrife@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/sock_create.c | 35 +++++++++++++++-------
 tools/testing/selftests/bpf/test_sock.c            | 28 -----------------
 2 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/sock_create.c b/tools/testing/selftests/bpf/prog_tests/sock_create.c
index 17a3713621dd..187ffc5e60c4 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_create.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_create.c
@@ -237,6 +237,19 @@ static struct sock_create_test {
 
 		.error = DENY_CREATE,
 	},
+	{
+		.descr = "load w/o expected_attach_type (compat mode)",
+		.insns = {
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = 0,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+	},
 };
 
 static int load_prog(const struct bpf_insn *insns,
@@ -291,16 +304,18 @@ static int run_test(int cgroup_fd, struct sock_create_test *test)
 		goto detach_prog;
 	}
 
-	err = getsockopt(sock_fd, SOL_SOCKET, test->optname, &optval, &optlen);
-	if (err) {
-		log_err("Failed to call getsockopt");
-		goto cleanup;
-	}
-
-	if (optval != test->optval) {
-		errno = 0;
-		log_err("getsockopt returned unexpected optval");
-		goto cleanup;
+	if (test->optname) {
+		err = getsockopt(sock_fd, SOL_SOCKET, test->optname, &optval, &optlen);
+		if (err) {
+			log_err("Failed to call getsockopt");
+			goto cleanup;
+		}
+
+		if (optval != test->optval) {
+			errno = 0;
+			log_err("getsockopt returned unexpected optval");
+			goto cleanup;
+		}
 	}
 
 	ret = test->error != OK;
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 26dff88abbaa..f97850f1d84a 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -47,34 +47,6 @@ struct sock_test {
 };
 
 static struct sock_test tests[] = {
-	{
-		.descr = "sock_create load w/o expected_attach_type (compat mode)",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = 0,
-		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "127.0.0.1",
-		.port = 8097,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sock_create load w/ expected_attach_type",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.domain = AF_INET,
-		.type = SOCK_STREAM,
-		.ip = "127.0.0.1",
-		.port = 8097,
-		.result = SUCCESS,
-	},
 };
 
 static size_t probe_prog_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3


From eea6c14c10ce208e5f6ab309fb1c141a39446a1b Mon Sep 17 00:00:00 2001
From: Jordan Rife <jrife@google.com>
Date: Tue, 22 Oct 2024 15:29:04 +0000
Subject: selftests/bpf: Retire test_sock.c

Completely remove test_sock.c and associated config.

Signed-off-by: Jordan Rife <jrife@google.com>
Link: https://lore.kernel.org/r/20241022152913.574836-5-jrife@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore  |   1 -
 tools/testing/selftests/bpf/Makefile    |   3 +-
 tools/testing/selftests/bpf/test_sock.c | 231 --------------------------------
 3 files changed, 1 insertion(+), 234 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/test_sock.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index e6533b3400de..d45c9a9b304d 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -16,7 +16,6 @@ fixdep
 /test_progs-cpuv4
 test_verifier_log
 feature
-test_sock
 urandom_read
 test_sockmap
 test_lirc_mode2_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 28a76baa854d..c4fc9a3291a8 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -84,7 +84,7 @@ endif
 
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_sock test_sockmap \
+	test_sockmap \
 	test_tcpnotify_user test_sysctl \
 	test_progs-no_alu32
 TEST_INST_SUBDIRS := no_alu32
@@ -335,7 +335,6 @@ JSON_WRITER		:= $(OUTPUT)/json_writer.o
 CAP_HELPERS	:= $(OUTPUT)/cap_helpers.o
 NETWORK_HELPERS := $(OUTPUT)/network_helpers.o
 
-$(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS)
 $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS)
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
deleted file mode 100644
index f97850f1d84a..000000000000
--- a/tools/testing/selftests/bpf/test_sock.c
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2018 Facebook
-
-#include <stdio.h>
-#include <unistd.h>
-
-#include <arpa/inet.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-#include <linux/filter.h>
-
-#include <bpf/bpf.h>
-
-#include "cgroup_helpers.h"
-#include <bpf/bpf_endian.h>
-#include "bpf_util.h"
-
-#define CG_PATH		"/foo"
-#define MAX_INSNS	512
-
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
-static bool verbose = false;
-
-struct sock_test {
-	const char *descr;
-	/* BPF prog properties */
-	struct bpf_insn	insns[MAX_INSNS];
-	enum bpf_attach_type expected_attach_type;
-	enum bpf_attach_type attach_type;
-	/* Socket properties */
-	int domain;
-	int type;
-	/* Endpoint to bind() to */
-	const char *ip;
-	unsigned short port;
-	unsigned short port_retry;
-	/* Expected test result */
-	enum {
-		LOAD_REJECT,
-		ATTACH_REJECT,
-		BIND_REJECT,
-		SUCCESS,
-		RETRY_SUCCESS,
-		RETRY_REJECT
-	} result;
-};
-
-static struct sock_test tests[] = {
-};
-
-static size_t probe_prog_length(const struct bpf_insn *fp)
-{
-	size_t len;
-
-	for (len = MAX_INSNS - 1; len > 0; --len)
-		if (fp[len].code != 0 || fp[len].imm != 0)
-			break;
-	return len + 1;
-}
-
-static int load_sock_prog(const struct bpf_insn *prog,
-			  enum bpf_attach_type attach_type)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts);
-	int ret, insn_cnt;
-
-	insn_cnt = probe_prog_length(prog);
-
-	opts.expected_attach_type = attach_type;
-	opts.log_buf = bpf_log_buf;
-	opts.log_size = BPF_LOG_BUF_SIZE;
-	opts.log_level = 2;
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", prog, insn_cnt, &opts);
-	if (verbose && ret < 0)
-		fprintf(stderr, "%s\n", bpf_log_buf);
-
-	return ret;
-}
-
-static int attach_sock_prog(int cgfd, int progfd,
-			    enum bpf_attach_type attach_type)
-{
-	return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE);
-}
-
-static int bind_sock(int domain, int type, const char *ip,
-		     unsigned short port, unsigned short port_retry)
-{
-	struct sockaddr_storage addr;
-	struct sockaddr_in6 *addr6;
-	struct sockaddr_in *addr4;
-	int sockfd = -1;
-	socklen_t len;
-	int res = SUCCESS;
-
-	sockfd = socket(domain, type, 0);
-	if (sockfd < 0)
-		goto err;
-
-	memset(&addr, 0, sizeof(addr));
-
-	if (domain == AF_INET) {
-		len = sizeof(struct sockaddr_in);
-		addr4 = (struct sockaddr_in *)&addr;
-		addr4->sin_family = domain;
-		addr4->sin_port = htons(port);
-		if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1)
-			goto err;
-	} else if (domain == AF_INET6) {
-		len = sizeof(struct sockaddr_in6);
-		addr6 = (struct sockaddr_in6 *)&addr;
-		addr6->sin6_family = domain;
-		addr6->sin6_port = htons(port);
-		if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1)
-			goto err;
-	} else {
-		goto err;
-	}
-
-	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
-		/* sys_bind() may fail for different reasons, errno has to be
-		 * checked to confirm that BPF program rejected it.
-		 */
-		if (errno != EPERM)
-			goto err;
-		if (port_retry)
-			goto retry;
-		res = BIND_REJECT;
-		goto out;
-	}
-
-	goto out;
-retry:
-	if (domain == AF_INET)
-		addr4->sin_port = htons(port_retry);
-	else
-		addr6->sin6_port = htons(port_retry);
-	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
-		if (errno != EPERM)
-			goto err;
-		res = RETRY_REJECT;
-	} else {
-		res = RETRY_SUCCESS;
-	}
-	goto out;
-err:
-	res = -1;
-out:
-	close(sockfd);
-	return res;
-}
-
-static int run_test_case(int cgfd, const struct sock_test *test)
-{
-	int progfd = -1;
-	int err = 0;
-	int res;
-
-	printf("Test case: %s .. ", test->descr);
-	progfd = load_sock_prog(test->insns, test->expected_attach_type);
-	if (progfd < 0) {
-		if (test->result == LOAD_REJECT)
-			goto out;
-		else
-			goto err;
-	}
-
-	if (attach_sock_prog(cgfd, progfd, test->attach_type) < 0) {
-		if (test->result == ATTACH_REJECT)
-			goto out;
-		else
-			goto err;
-	}
-
-	res = bind_sock(test->domain, test->type, test->ip, test->port,
-			test->port_retry);
-	if (res > 0 && test->result == res)
-		goto out;
-
-err:
-	err = -1;
-out:
-	/* Detaching w/o checking return code: best effort attempt. */
-	if (progfd != -1)
-		bpf_prog_detach(cgfd, test->attach_type);
-	close(progfd);
-	printf("[%s]\n", err ? "FAIL" : "PASS");
-	return err;
-}
-
-static int run_tests(int cgfd)
-{
-	int passes = 0;
-	int fails = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
-		if (run_test_case(cgfd, &tests[i]))
-			++fails;
-		else
-			++passes;
-	}
-	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
-	return fails ? -1 : 0;
-}
-
-int main(int argc, char **argv)
-{
-	int cgfd = -1;
-	int err = 0;
-
-	cgfd = cgroup_setup_and_join(CG_PATH);
-	if (cgfd < 0)
-		goto err;
-
-	/* Use libbpf 1.0 API mode */
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	if (run_tests(cgfd))
-		goto err;
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(cgfd);
-	cleanup_cgroup_environment();
-	return err;
-}
-- 
cgit v1.2.3


From dca93d29845dfed60910ba13dbfb6ae6a0e19f6d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 23 Oct 2024 00:20:45 +0100
Subject: kselftest/arm64: Log fp-stress child startup errors to stdout

Currently if we encounter an error between fork() and exec() of a child
process we log the error to stderr. This means that the errors don't get
annotated with the child information which makes diagnostics harder and
means that if we miss the exit signal from the child we can deadlock
waiting for output from the child. Improve robustness and output quality
by logging to stdout instead.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241023-arm64-fp-stress-exec-fail-v1-1-ee3c62932c15@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index e62c9dbad501..13958e645afc 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -79,7 +79,7 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = dup2(pipefd[1], 1);
 		if (ret == -1) {
-			fprintf(stderr, "dup2() %d\n", errno);
+			printf("dup2() %d\n", errno);
 			exit(EXIT_FAILURE);
 		}
 
@@ -89,7 +89,7 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = dup2(startup_pipe[0], 3);
 		if (ret == -1) {
-			fprintf(stderr, "dup2() %d\n", errno);
+			printf("dup2() %d\n", errno);
 			exit(EXIT_FAILURE);
 		}
 
@@ -107,16 +107,15 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = read(3, &i, sizeof(i));
 		if (ret < 0)
-			fprintf(stderr, "read(startp pipe) failed: %s (%d)\n",
-				strerror(errno), errno);
+			printf("read(startp pipe) failed: %s (%d)\n",
+			       strerror(errno), errno);
 		if (ret > 0)
-			fprintf(stderr, "%d bytes of data on startup pipe\n",
-				ret);
+			printf("%d bytes of data on startup pipe\n", ret);
 		close(3);
 
 		ret = execl(program, program, NULL);
-		fprintf(stderr, "execl(%s) failed: %d (%s)\n",
-			program, errno, strerror(errno));
+		printf("execl(%s) failed: %d (%s)\n",
+		       program, errno, strerror(errno));
 
 		exit(EXIT_FAILURE);
 	} else {
-- 
cgit v1.2.3


From 1f7c33630724dfe47f99748bd2a9a56ec8bd337f Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Wed, 23 Oct 2024 16:53:14 +0100
Subject: selftests/bpf: Increase verifier log limit in veristat

The current default buffer size of 16MB allocated by veristat is no
longer sufficient to hold the verifier logs of some production BPF
programs. To address this issue, we need to increase the verifier log
limit.
Commit 7a9f5c65abcc ("bpf: increase verifier log limit") has already
increased the buffer size supported by the kernel, but veristat users
need to explicitly pass a log size argument to use the bigger log.

This patch adds a function to detect the maximum verifier log size
supported by the kernel and uses that by default in veristat.
This ensures that veristat can handle larger verifier logs without
requiring users to manually specify the log size.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241023155314.126255-1-mykyta.yatsenko5@gmail.com
---
 tools/testing/selftests/bpf/veristat.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index c8efd44590d9..e12ef953fba8 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -16,6 +16,7 @@
 #include <sys/stat.h>
 #include <bpf/libbpf.h>
 #include <bpf/btf.h>
+#include <bpf/bpf.h>
 #include <libelf.h>
 #include <gelf.h>
 #include <float.h>
@@ -1109,6 +1110,35 @@ skip_freplace_fixup:
 	return;
 }
 
+static int max_verifier_log_size(void)
+{
+	const int SMALL_LOG_SIZE = UINT_MAX >> 8;
+	const int BIG_LOG_SIZE = UINT_MAX >> 2;
+	struct bpf_insn insns[] = {
+		{ .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = BPF_REG_0, },
+		{ .code  = BPF_JMP | BPF_EXIT, },
+	};
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		    .log_size = BIG_LOG_SIZE,
+		    .log_buf = (void *)-1,
+		    .log_level = 4
+	);
+	int ret, insn_cnt = ARRAY_SIZE(insns);
+	static int log_size;
+
+	if (log_size != 0)
+		return log_size;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
+
+	if (ret == -EFAULT)
+		log_size = BIG_LOG_SIZE;
+	else /* ret == -EINVAL, big log size is not supported by the verifier */
+		log_size = SMALL_LOG_SIZE;
+
+	return log_size;
+}
+
 static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog)
 {
 	const char *base_filename = basename(strdupa(filename));
@@ -1132,7 +1162,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	memset(stats, 0, sizeof(*stats));
 
 	if (env.verbose || env.top_src_lines > 0) {
-		buf_sz = env.log_size ? env.log_size : 16 * 1024 * 1024;
+		buf_sz = env.log_size ? env.log_size : max_verifier_log_size();
 		buf = malloc(buf_sz);
 		if (!buf)
 			return -ENOMEM;
-- 
cgit v1.2.3


From da09a9e0c3eab164af950be44ee6bdea8527c3e5 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 18 Oct 2024 22:22:51 +0200
Subject: uprobe: Add data pointer to consumer handlers

Add a data pointer to both the entry and exit consumer handlers and all
their users. The functionality itself is coming in the following change.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241018202252.693462-2-jolsa@kernel.org
---
 include/linux/uprobes.h                               |  4 ++--
 kernel/events/uprobes.c                               |  4 ++--
 kernel/trace/bpf_trace.c                              |  6 ++++--
 kernel/trace/trace_uprobe.c                           | 12 ++++++++----
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c |  2 +-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'tools/testing')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2b294bf1881f..bb265a632b91 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -37,10 +37,10 @@ struct uprobe_consumer {
 	 * for the current process. If filter() is omitted or returns true,
 	 * UPROBE_HANDLER_REMOVE is effectively ignored.
 	 */
-	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
 	int (*ret_handler)(struct uprobe_consumer *self,
 				unsigned long func,
-				struct pt_regs *regs);
+				struct pt_regs *regs, __u64 *data);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
 	struct list_head cons_node;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2a0059464383..6b44c386a5df 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2090,7 +2090,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 		int rc = 0;
 
 		if (uc->handler) {
-			rc = uc->handler(uc, regs);
+			rc = uc->handler(uc, regs, NULL);
 			WARN(rc & ~UPROBE_HANDLER_MASK,
 				"bad rc=0x%x from %ps()\n", rc, uc->handler);
 		}
@@ -2128,7 +2128,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 	rcu_read_lock_trace();
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
 		if (uc->ret_handler)
-			uc->ret_handler(uc, ri->func, regs);
+			uc->ret_handler(uc, ri->func, regs, NULL);
 	}
 	rcu_read_unlock_trace();
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a582cd25ca87..fdab7ecd8dfa 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3244,7 +3244,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
 }
 
 static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+			  __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
@@ -3253,7 +3254,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+			      __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c40531d2cbad..5895eabe3581 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -89,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
 static int register_uprobe_event(struct trace_uprobe *tu);
 static int unregister_uprobe_event(struct trace_uprobe *tu);
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data);
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs);
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data);
 
 #ifdef CONFIG_STACK_GROWSUP
 static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -1517,7 +1519,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 	}
 }
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;
@@ -1548,7 +1551,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs)
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 8835761d9a12..12005e3dc3e4 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -461,7 +461,7 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 
 static int
 uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
-		   struct pt_regs *regs)
+		   struct pt_regs *regs, __u64 *data)
 
 {
 	regs->ax  = 0x12345678deadbeef;
-- 
cgit v1.2.3


From 2a027d6bb66002c8e50e974676f932b33c5fce10 Mon Sep 17 00:00:00 2001
From: Joseph Jang <jjang@nvidia.com>
Date: Sun, 20 Oct 2024 20:22:13 -0700
Subject: selftest: rtc: Add to check rtc alarm status for alarm related test

The alarm_wkalm_set and alarm_wkalm_set_minute tests use different
ioctls (RTC_ALM_SET/RTC_WKALM_SET) for alarm feature detection. They
skip testing if the RTC_ALM_SET/RTC_WKALM_SET ioctl returns an EINVAL
error code. This design may miss real problems when
efi.set_wakeup_time() returns an error and the RTC_ALM_SET/RTC_WKALM_SET
ioctl then returns an EINVAL error code with RTC_FEATURE_ALARM enabled.

In order to make rtctest more explicit and robust, we propose to use the
RTC_PARAM_GET ioctl interface to check the rtc alarm feature state before
running alarm related tests. If the kernel does not support the
RTC_PARAM_GET ioctl interface, we fall back to checking the error number
of the (RTC_ALM_SET/RTC_WKALM_SET) ioctl call for alarm feature detection.

Requires commit 101ca8d05913b ("rtc: efi: Enable SET/GET WAKEUP services
as optional")

Reviewed-by: Koba Ko <kobak@nvidia.com>
Reviewed-by: Matthew R. Ochs <mochs@nvidia.com>
Signed-off-by: Joseph Jang <jjang@nvidia.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/rtc/Makefile  |  2 +-
 tools/testing/selftests/rtc/rtctest.c | 64 +++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/rtc/Makefile b/tools/testing/selftests/rtc/Makefile
index 55198ecc04db..9dbb395c5c79 100644
--- a/tools/testing/selftests/rtc/Makefile
+++ b/tools/testing/selftests/rtc/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -O3 -Wl,-no-as-needed -Wall
+CFLAGS += -O3 -Wl,-no-as-needed -Wall -I$(top_srcdir)/usr/include
 LDLIBS += -lrt -lpthread -lm
 
 TEST_GEN_PROGS = rtctest
diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c
index 38a8e67de77d..e103097d0b5b 100644
--- a/tools/testing/selftests/rtc/rtctest.c
+++ b/tools/testing/selftests/rtc/rtctest.c
@@ -25,6 +25,12 @@
 
 static char *rtc_file = "/dev/rtc0";
 
+enum rtc_alarm_state {
+	RTC_ALARM_UNKNOWN,
+	RTC_ALARM_ENABLED,
+	RTC_ALARM_DISABLED,
+};
+
 FIXTURE(rtc) {
 	int fd;
 };
@@ -82,6 +88,24 @@ static void nanosleep_with_retries(long ns)
 	}
 }
 
+static enum rtc_alarm_state get_rtc_alarm_state(int fd)
+{
+	struct rtc_param param = { 0 };
+	int rc;
+
+	/* Validate kernel reflects unsupported RTC alarm state */
+	param.param = RTC_PARAM_FEATURES;
+	param.index = 0;
+	rc = ioctl(fd, RTC_PARAM_GET, &param);
+	if (rc < 0)
+		return RTC_ALARM_UNKNOWN;
+
+	if ((param.uvalue & _BITUL(RTC_FEATURE_ALARM)) == 0)
+		return RTC_ALARM_DISABLED;
+
+	return RTC_ALARM_ENABLED;
+}
+
 TEST_F_TIMEOUT(rtc, date_read_loop, READ_LOOP_DURATION_SEC + 2) {
 	int rc;
 	long iter_count = 0;
@@ -197,11 +221,16 @@ TEST_F(rtc, alarm_alm_set) {
 	fd_set readfds;
 	time_t secs, new;
 	int rc;
+	enum rtc_alarm_state alarm_state = RTC_ALARM_UNKNOWN;
 
 	if (self->fd == -1 && errno == ENOENT)
 		SKIP(return, "Skipping test since %s does not exist", rtc_file);
 	ASSERT_NE(-1, self->fd);
 
+	alarm_state = get_rtc_alarm_state(self->fd);
+	if (alarm_state == RTC_ALARM_DISABLED)
+		SKIP(return, "Skipping test since alarms are not supported.");
+
 	rc = ioctl(self->fd, RTC_RD_TIME, &tm);
 	ASSERT_NE(-1, rc);
 
@@ -210,6 +239,11 @@ TEST_F(rtc, alarm_alm_set) {
 
 	rc = ioctl(self->fd, RTC_ALM_SET, &tm);
 	if (rc == -1) {
+		/*
+		 * Report error if rtc alarm was enabled. Fallback to check ioctl
+		 * error number if rtc alarm state is unknown.
+		 */
+		ASSERT_EQ(RTC_ALARM_UNKNOWN, alarm_state);
 		ASSERT_EQ(EINVAL, errno);
 		TH_LOG("skip alarms are not supported.");
 		return;
@@ -255,11 +289,16 @@ TEST_F(rtc, alarm_wkalm_set) {
 	fd_set readfds;
 	time_t secs, new;
 	int rc;
+	enum rtc_alarm_state alarm_state = RTC_ALARM_UNKNOWN;
 
 	if (self->fd == -1 && errno == ENOENT)
 		SKIP(return, "Skipping test since %s does not exist", rtc_file);
 	ASSERT_NE(-1, self->fd);
 
+	alarm_state = get_rtc_alarm_state(self->fd);
+	if (alarm_state == RTC_ALARM_DISABLED)
+		SKIP(return, "Skipping test since alarms are not supported.");
+
 	rc = ioctl(self->fd, RTC_RD_TIME, &alarm.time);
 	ASSERT_NE(-1, rc);
 
@@ -270,6 +309,11 @@ TEST_F(rtc, alarm_wkalm_set) {
 
 	rc = ioctl(self->fd, RTC_WKALM_SET, &alarm);
 	if (rc == -1) {
+		/*
+		 * Report error if rtc alarm was enabled. Fallback to check ioctl
+		 * error number if rtc alarm state is unknown.
+		 */
+		ASSERT_EQ(RTC_ALARM_UNKNOWN, alarm_state);
 		ASSERT_EQ(EINVAL, errno);
 		TH_LOG("skip alarms are not supported.");
 		return;
@@ -307,11 +351,16 @@ TEST_F_TIMEOUT(rtc, alarm_alm_set_minute, 65) {
 	fd_set readfds;
 	time_t secs, new;
 	int rc;
+	enum rtc_alarm_state alarm_state = RTC_ALARM_UNKNOWN;
 
 	if (self->fd == -1 && errno == ENOENT)
 		SKIP(return, "Skipping test since %s does not exist", rtc_file);
 	ASSERT_NE(-1, self->fd);
 
+	alarm_state = get_rtc_alarm_state(self->fd);
+	if (alarm_state == RTC_ALARM_DISABLED)
+		SKIP(return, "Skipping test since alarms are not supported.");
+
 	rc = ioctl(self->fd, RTC_RD_TIME, &tm);
 	ASSERT_NE(-1, rc);
 
@@ -320,6 +369,11 @@ TEST_F_TIMEOUT(rtc, alarm_alm_set_minute, 65) {
 
 	rc = ioctl(self->fd, RTC_ALM_SET, &tm);
 	if (rc == -1) {
+		/*
+		 * Report error if rtc alarm was enabled. Fallback to check ioctl
+		 * error number if rtc alarm state is unknown.
+		 */
+		ASSERT_EQ(RTC_ALARM_UNKNOWN, alarm_state);
 		ASSERT_EQ(EINVAL, errno);
 		TH_LOG("skip alarms are not supported.");
 		return;
@@ -365,11 +419,16 @@ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) {
 	fd_set readfds;
 	time_t secs, new;
 	int rc;
+	enum rtc_alarm_state alarm_state = RTC_ALARM_UNKNOWN;
 
 	if (self->fd == -1 && errno == ENOENT)
 		SKIP(return, "Skipping test since %s does not exist", rtc_file);
 	ASSERT_NE(-1, self->fd);
 
+	alarm_state = get_rtc_alarm_state(self->fd);
+	if (alarm_state == RTC_ALARM_DISABLED)
+		SKIP(return, "Skipping test since alarms are not supported.");
+
 	rc = ioctl(self->fd, RTC_RD_TIME, &alarm.time);
 	ASSERT_NE(-1, rc);
 
@@ -380,6 +439,11 @@ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) {
 
 	rc = ioctl(self->fd, RTC_WKALM_SET, &alarm);
 	if (rc == -1) {
+		/*
+		 * Report error if rtc alarm was enabled. Fallback to check ioctl
+		 * error number if rtc alarm state is unknown.
+		 */
+		ASSERT_EQ(RTC_ALARM_UNKNOWN, alarm_state);
 		ASSERT_EQ(EINVAL, errno);
 		TH_LOG("skip alarms are not supported.");
 		return;
-- 
cgit v1.2.3


From 1b2bfc29695d273492c3dd8512775261f3272686 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 22 Oct 2024 21:39:06 -0700
Subject: selftests/bpf: fix test_spin_lock_fail.c's global vars usage

Global variables of special types (like `struct bpf_spin_lock`) make
underlying ARRAY maps non-mmapable. To make this work with libbpf's
mmaping logic, application is expected to declare such special variables
as static, so libbpf doesn't even attempt to mmap() such ARRAYs.

test_spin_lock_fail.c didn't follow this rule, but given it relied on
this test to trigger failures, this went unnoticed, as we never got to
the step of mmap()'ing these ARRAY maps.

It is fragile and relies on a specific sequence of libbpf steps, which are
internal implementation details.

Fix the test by marking lockA and lockB as static.

Fixes: c48748aea4f8 ("selftests/bpf: Add failure test cases for spin lock pairing")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241023043908.3834423-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_spin_lock_fail.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c
index 43f40c4fe241..1c8b678e2e9a 100644
--- a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c
+++ b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c
@@ -28,8 +28,8 @@ struct {
 	},
 };
 
-SEC(".data.A") struct bpf_spin_lock lockA;
-SEC(".data.B") struct bpf_spin_lock lockB;
+static struct bpf_spin_lock lockA SEC(".data.A");
+static struct bpf_spin_lock lockB SEC(".data.B");
 
 SEC("?tc")
 int lock_id_kptr_preserve(void *ctx)
-- 
cgit v1.2.3


From 80a54566b7f03351f77445ed3ac8d4eff3b04fcc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 22 Oct 2024 21:39:08 -0700
Subject: selftests/bpf: validate generic bpf_object and subskel APIs work
 together

Add a new subtest validating that bpf_object loaded and initialized
through generic APIs is still interoperable with BPF subskeleton,
including initialization and reading of global variables.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241023043908.3834423-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/subskeleton.c | 76 +++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/subskeleton.c b/tools/testing/selftests/bpf/prog_tests/subskeleton.c
index 9c31b7004f9c..fdf13ed0152a 100644
--- a/tools/testing/selftests/bpf/prog_tests/subskeleton.c
+++ b/tools/testing/selftests/bpf/prog_tests/subskeleton.c
@@ -46,7 +46,8 @@ static int subskeleton_lib_subresult(struct bpf_object *obj)
 	return result;
 }
 
-void test_subskeleton(void)
+/* initialize and load through skeleton, then instantiate subskeleton out of it */
+static void subtest_skel_subskeleton(void)
 {
 	int err, result;
 	struct test_subskeleton *skel;
@@ -76,3 +77,76 @@ void test_subskeleton(void)
 cleanup:
 	test_subskeleton__destroy(skel);
 }
+
+/* initialize and load through generic bpf_object API, then instantiate subskeleton out of it */
+static void subtest_obj_subskeleton(void)
+{
+	int err, result;
+	const void *elf_bytes;
+	size_t elf_bytes_sz = 0, rodata_sz = 0, bss_sz = 0;
+	struct bpf_object *obj;
+	const struct bpf_map *map;
+	const struct bpf_program *prog;
+	struct bpf_link *link = NULL;
+	struct test_subskeleton__rodata *rodata;
+	struct test_subskeleton__bss *bss;
+
+	elf_bytes = test_subskeleton__elf_bytes(&elf_bytes_sz);
+	if (!ASSERT_OK_PTR(elf_bytes, "elf_bytes"))
+		return;
+
+	obj = bpf_object__open_mem(elf_bytes, elf_bytes_sz, NULL);
+	if (!ASSERT_OK_PTR(obj, "obj_open_mem"))
+		return;
+
+	map = bpf_object__find_map_by_name(obj, ".rodata");
+	if (!ASSERT_OK_PTR(map, "rodata_map_by_name"))
+		goto cleanup;
+
+	rodata = bpf_map__initial_value(map, &rodata_sz);
+	if (!ASSERT_OK_PTR(rodata, "rodata_get"))
+		goto cleanup;
+
+	rodata->rovar1 = 10;
+	rodata->var1 = 1;
+	subskeleton_lib_setup(obj);
+
+	err = bpf_object__load(obj);
+	if (!ASSERT_OK(err, "obj_load"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(obj, "handler1");
+	if (!ASSERT_OK_PTR(prog, "prog_by_name"))
+		goto cleanup;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "prog_attach"))
+		goto cleanup;
+
+	/* trigger tracepoint */
+	usleep(1);
+
+	map = bpf_object__find_map_by_name(obj, ".bss");
+	if (!ASSERT_OK_PTR(map, "bss_map_by_name"))
+		goto cleanup;
+
+	bss = bpf_map__initial_value(map, &bss_sz);
+	if (!ASSERT_OK_PTR(bss, "bss_get")) /* validate the pointer dereferenced below */
+		goto cleanup;
+
+	result = subskeleton_lib_subresult(obj) * 10;
+	ASSERT_EQ(bss->out1, result, "out1");
+
+cleanup:
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+}
+
+
+void test_subskeleton(void)
+{
+	if (test__start_subtest("skel_subskel"))
+		subtest_skel_subskeleton();
+	if (test__start_subtest("obj_subskel"))
+		subtest_obj_subskeleton();
+}
-- 
cgit v1.2.3


From cdda1f26e74bac732eca537a69f19f6a37b641be Mon Sep 17 00:00:00 2001
From: Luca Boccassi <luca.boccassi@gmail.com>
Date: Thu, 10 Oct 2024 16:52:32 +0100
Subject: pidfd: add ioctl to retrieve pid info

A common pattern when using pid fds is having to get information
about the process, which currently requires /proc being mounted,
resolving the fd to a pid, and then do manual string parsing of
/proc/N/status and friends. This needs to be reimplemented over
and over in all userspace projects (e.g.: I have reimplemented
resolving in systemd, dbus, dbus-daemon, polkit so far), and
requires additional care in checking that the fd is still valid
after having parsed the data, to avoid races.

Having a programmatic API that can be used directly removes all
these requirements, including having /proc mounted.

As discussed at LPC24, add an ioctl with an extensible struct
so that more parameters can be added later if needed. Start with
returning pid/tgid/ppid and creds unconditionally, and cgroupid
optionally.

Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
Link: https://lore.kernel.org/r/20241010155401.2268522-1-luca.boccassi@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                                      | 86 ++++++++++++++++++++++++-
 include/uapi/linux/pidfd.h                      | 50 ++++++++++++++
 tools/testing/selftests/pidfd/pidfd_open_test.c | 82 ++++++++++++++++++++++-
 3 files changed, 214 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf884..618abb1fa1b8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/cgroup.h>
 #include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/pid.h>
@@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+	size_t usize = _IOC_SIZE(cmd);
+	struct pidfd_info kinfo = {};
+	struct user_namespace *user_ns;
+	const struct cred *c;
+	__u64 mask;
+#ifdef CONFIG_CGROUPS
+	struct cgroup *cgrp;
+#endif
+
+	if (!uinfo)
+		return -EINVAL;
+	if (usize < PIDFD_INFO_SIZE_VER0)
+		return -EINVAL; /* First version, no smaller struct possible */
+
+	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
+		return -EFAULT;
+
+	c = get_task_cred(task);
+	if (!c)
+		return -ESRCH;
+
+	/* Unconditionally return identifiers and credentials, the rest only on request */
+
+	user_ns = current_user_ns();
+	kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+	kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+	kinfo.euid = from_kuid_munged(user_ns, c->euid);
+	kinfo.egid = from_kgid_munged(user_ns, c->egid);
+	kinfo.suid = from_kuid_munged(user_ns, c->suid);
+	kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+	kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+	kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+	kinfo.mask |= PIDFD_INFO_CREDS;
+	put_cred(c);
+
+#ifdef CONFIG_CGROUPS
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	kinfo.cgroupid = cgroup_id(cgrp);
+	kinfo.mask |= PIDFD_INFO_CGROUPID;
+	rcu_read_unlock();
+#endif
+
+	/*
+	 * Copy pid/tgid last, to reduce the chances the information might be
+	 * stale. Note that it is not possible to ensure it will be valid as the
+	 * task might exit as soon as the copy_to_user finishes, but that's ok
+	 * and userspace expects that might happen and can act accordingly, so
+	 * this is just best-effort. What we can do however is checking that all
+	 * the fields are set correctly, or return ESRCH to avoid providing
+	 * incomplete information. */
+
+	kinfo.ppid = task_ppid_nr_ns(task, NULL);
+	kinfo.tgid = task_tgid_vnr(task);
+	kinfo.pid = task_pid_vnr(task);
+	kinfo.mask |= PIDFD_INFO_PID;
+
+	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+		return -ESRCH;
+
+	/*
+	 * If userspace and the kernel have the same struct size it can just
+	 * be copied. If userspace provides an older struct, only the bits that
+	 * userspace knows about will be copied. If userspace provides a new
+	 * struct, only the bits that the kernel knows about will be copied.
+	 */
+	if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct task_struct *task __free(put_task) = NULL;
@@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct ns_common *ns_common = NULL;
 	struct pid_namespace *pid_ns;
 
-	if (arg)
-		return -EINVAL;
-
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task)
 		return -ESRCH;
 
+	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+		return pidfd_info(task, cmd, arg);
+
+	if (arg)
+		return -EINVAL;
+
 	scoped_guard(task_lock, task) {
 		nsp = task->nsproxy;
 		if (nsp)
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 565fc0629fff..4540f6301b8c 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -16,6 +16,55 @@
 #define PIDFD_SIGNAL_THREAD_GROUP	(1UL << 1)
 #define PIDFD_SIGNAL_PROCESS_GROUP	(1UL << 2)
 
+/* Flags for pidfd_info. */
+#define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
+#define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
+#define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
+
+#define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
+
+struct pidfd_info {
+	/*
+	 * This mask is similar to the request_mask in statx(2).
+	 *
+	 * Userspace indicates what extensions or expensive-to-calculate fields
+	 * they want by setting the corresponding bits in mask. The kernel
+	 * will ignore bits that it does not know about.
+	 *
+	 * When filling the structure, the kernel will only set bits
+	 * corresponding to the fields that were actually filled by the kernel.
+	 * This also includes any future extensions that might be automatically
+	 * filled. If the structure size is too small to contain a field
+	 * (requested or not), to avoid confusion the mask will not
+	 * contain a bit for that field.
+	 *
+	 * As such, userspace MUST verify that mask contains the
+	 * corresponding flags after the ioctl(2) returns to ensure that it is
+	 * using valid data.
+	 */
+	__u64 mask;
+	/*
+	 * The information contained in the following fields might be stale at the
+	 * time it is received, as the target process might have exited as soon as
+	 * the IOCTL was processed, and there is no way to avoid that. However, it
+	 * is guaranteed that if the call was successful, then the information was
+	 * correct and referred to the intended process at the time the work was
+	 * performed. */
+	__u64 cgroupid;
+	__u32 pid;
+	__u32 tgid;
+	__u32 ppid;
+	__u32 ruid;
+	__u32 rgid;
+	__u32 euid;
+	__u32 egid;
+	__u32 suid;
+	__u32 sgid;
+	__u32 fsuid;
+	__u32 fsgid;
+	__u32 spare0[1];
+};
+
 #define PIDFS_IOCTL_MAGIC 0xFF
 
 #define PIDFD_GET_CGROUP_NAMESPACE            _IO(PIDFS_IOCTL_MAGIC, 1)
@@ -28,5 +77,6 @@
 #define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8)
 #define PIDFD_GET_USER_NAMESPACE              _IO(PIDFS_IOCTL_MAGIC, 9)
 #define PIDFD_GET_UTS_NAMESPACE               _IO(PIDFS_IOCTL_MAGIC, 10)
+#define PIDFD_GET_INFO                        _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
 
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
index c62564c264b1..ce413a221bac 100644
--- a/tools/testing/selftests/pidfd/pidfd_open_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <syscall.h>
+#include <sys/ioctl.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/wait.h>
@@ -21,6 +22,32 @@
 #include "pidfd.h"
 #include "../kselftest.h"
 
+#ifndef PIDFS_IOCTL_MAGIC
+#define PIDFS_IOCTL_MAGIC 0xFF
+#endif
+
+#ifndef PIDFD_GET_INFO
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
+#define PIDFD_INFO_CGROUPID		(1UL << 2) /* same bit as the uapi definition; bit 0 is PIDFD_INFO_PID */
+
+struct pidfd_info {
+	__u64 request_mask;
+	__u64 cgroupid;
+	__u32 pid;
+	__u32 tgid;
+	__u32 ppid;
+	__u32 ruid;
+	__u32 rgid;
+	__u32 euid;
+	__u32 egid;
+	__u32 suid;
+	__u32 sgid;
+	__u32 fsuid;
+	__u32 fsgid;
+	__u32 spare0[1];
+};
+#endif
+
 static int safe_int(const char *numstr, int *converted)
 {
 	char *err = NULL;
@@ -120,10 +147,13 @@ out:
 
 int main(int argc, char **argv)
 {
+	struct pidfd_info info = {
+		.request_mask = PIDFD_INFO_CGROUPID,
+	};
 	int pidfd = -1, ret = 1;
 	pid_t pid;
 
-	ksft_set_plan(3);
+	ksft_set_plan(4);
 
 	pidfd = sys_pidfd_open(-1, 0);
 	if (pidfd >= 0) {
@@ -153,6 +183,56 @@ int main(int argc, char **argv)
 	pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
 	ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
 
+	if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
+		ksft_print_msg("%s - failed to get info from pidfd\n", strerror(errno));
+		goto on_error;
+	}
+	if (info.pid != pid) {
+		ksft_print_msg("pid from fdinfo file %d does not match pid from ioctl %d\n",
+			       pid, info.pid);
+		goto on_error;
+	}
+	if (info.ppid != getppid()) {
+		ksft_print_msg("ppid %d does not match ppid from ioctl %d\n",
+			       getppid(), info.ppid);
+		goto on_error;
+	}
+	if (info.ruid != getuid()) {
+		ksft_print_msg("uid %d does not match uid from ioctl %d\n",
+			       getuid(), info.ruid);
+		goto on_error;
+	}
+	if (info.rgid != getgid()) {
+		ksft_print_msg("gid %d does not match gid from ioctl %d\n",
+			       getgid(), info.rgid);
+		goto on_error;
+	}
+	if (info.euid != geteuid()) {
+		ksft_print_msg("euid %d does not match euid from ioctl %d\n",
+			       geteuid(), info.euid);
+		goto on_error;
+	}
+	if (info.egid != getegid()) {
+		ksft_print_msg("egid %d does not match egid from ioctl %d\n",
+			       getegid(), info.egid);
+		goto on_error;
+	}
+	if (info.suid != geteuid()) {
+		ksft_print_msg("suid %d does not match suid from ioctl %d\n",
+			       geteuid(), info.suid);
+		goto on_error;
+	}
+	if (info.sgid != getegid()) {
+		ksft_print_msg("sgid %d does not match sgid from ioctl %d\n",
+			       getegid(), info.sgid);
+		goto on_error;
+	}
+	if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) {
+		ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n");
+		goto on_error;
+	}
+	ksft_test_result_pass("get info from pidfd test: passed\n");
+
 	ret = 0;
 
 on_error:
-- 
cgit v1.2.3


From 81bc949f640f78b507c7523de7c750bcc87c1bb8 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 18 Oct 2024 12:55:58 +0200
Subject: selftests: tls: add a selftest for wrapping rec_seq

Set the initial rec_seq to 0xffffffffffffffff so that it wraps
immediately. The send() call should fail with EBADMSG.

A bug in this code was fixed in commit cfaa80c91f6f ("net/tls: do not
free tls_rec on async operation in bpf_exec_tx_verdict()").

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20775fcfd0371422921ee60a42de170c0398ac10.1729244987.git.sd@queasysnail.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/tls.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index f27a12d2a2c9..1a706d03bb6b 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -266,6 +266,25 @@ TEST_F(tls_basic, bad_cipher)
 	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, sizeof(struct tls12_crypto_info_aes_gcm_128)), -1);
 }
 
+TEST_F(tls_basic, recseq_wrap)
+{
+	struct tls_crypto_info_keys tls12;
+	char const *test_str = "test_read";
+	int send_len = 10;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12);
+	memset(&tls12.aes128.rec_seq, 0xff, sizeof(tls12.aes128.rec_seq));
+
+	ASSERT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+	ASSERT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+}
+
 FIXTURE(tls)
 {
 	int fd, cfd;
-- 
cgit v1.2.3


From 84b4a51fce4ccc6605113ed8af41a3d91609a756 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Mon, 21 Oct 2024 12:11:44 -0700
Subject: selftests: add new kallsyms selftests

We lack find_symbol() selftests, so add one. This lets us stress test
improvements easily on find_symbol() or optimizations. It also inherently
allows us to test the limits of kallsyms on Linux today.

We test a pathological use case for kallsyms by introducing modules
which are automatically written for us with a larger number of symbols.
We have 4 kallsyms test modules:

A: has KALLSYMS_NUMSYMS exported symbols
B: uses one of A's symbols
C: adds KALLSYMS_SCALE_FACTOR * KALLSYMS_NUMSYMS exported symbols
D: adds twice as many symbols as C

By using a KALLSYMS_NUMSYMS much larger than 10,000 with a
KALLSYMS_SCALE_FACTOR of 8 we segfault today. So we're capped at
around 160000 symbols somehow today. We can inspect that issue at
our leisure later, but for now the real value to this test is that
this will easily allow us to test improvements on find_symbol().

We want to enable this test on allyesmodconfig builds so we can't
use this combination, so instead just use a safe value for now and
be informative on the Kconfig symbol documentation about where our
thresholds are for testers. We default then to KALLSYMS_NUMSYMS of
just 100 and KALLSYMS_SCALE_FACTOR of 8.

On x86_64 we can use perf, for other architectures we just use 'time'
and allow for customizations. For example a future enhancement could
be made for parisc to check for unaligned accesses, which trigger
special exception handler assembler code inside the kernel.
The negative impact on performance is so large on parisc that it
keeps track of its accesses on /proc/cpuinfo as UAH:

IRQ:       CPU0       CPU1
3:       1332          0         SuperIO  ttyS0
7:    1270013          0         SuperIO  pata_ns87415
64:  320023012  320021431             CPU  timer
65:   17080507   20624423             CPU  IPI
UAH:   10948640      58104   Unaligned access handler traps

While at it, this tidies up lib/ test modules to allow us to have
a new directory for them. The amount of test modules under lib/
is insane.

This should also hopefully showcase how to start doing basic
self module writing code, which may be more useful for more complex
cases later in the future.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 lib/Kconfig.debug                             | 105 +++++++++++++++++++++
 lib/Makefile                                  |   1 +
 lib/tests/Makefile                            |   1 +
 lib/tests/module/.gitignore                   |   4 +
 lib/tests/module/Makefile                     |  15 +++
 lib/tests/module/gen_test_kallsyms.sh         | 128 ++++++++++++++++++++++++++
 tools/testing/selftests/module/Makefile       |  12 +++
 tools/testing/selftests/module/config         |   3 +
 tools/testing/selftests/module/find_symbol.sh |  81 ++++++++++++++++
 9 files changed, 350 insertions(+)
 create mode 100644 lib/tests/Makefile
 create mode 100644 lib/tests/module/.gitignore
 create mode 100644 lib/tests/module/Makefile
 create mode 100755 lib/tests/module/gen_test_kallsyms.sh
 create mode 100644 tools/testing/selftests/module/Makefile
 create mode 100644 tools/testing/selftests/module/config
 create mode 100755 tools/testing/selftests/module/find_symbol.sh

(limited to 'tools/testing')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7315f643817a..b5929721fc63 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2903,6 +2903,111 @@ config TEST_KMOD
 
 	  If unsure, say N.
 
+config TEST_RUNTIME
+	bool
+
+config TEST_RUNTIME_MODULE
+	bool
+
+config TEST_KALLSYMS
+	tristate "module kallsyms find_symbol() test"
+	depends on m
+	select TEST_RUNTIME
+	select TEST_RUNTIME_MODULE
+	select TEST_KALLSYMS_A
+	select TEST_KALLSYMS_B
+	select TEST_KALLSYMS_C
+	select TEST_KALLSYMS_D
+	help
+	  This allows us to stress test find_symbol() through the kallsyms
+	  used to place symbols on the kernel ELF kallsyms and modules kallsyms
+	  where we place kernel symbols such as exported symbols.
+
+	  We have four test modules:
+
+	  A: has KALLSYMS_NUMSYMS exported symbols
+	  B: uses one of A's symbols
+	  C: adds KALLSYMS_SCALE_FACTOR * KALLSYMS_NUMSYMS exported symbols
+	  D: adds twice as many symbols as C
+
+	  We stress test find_symbol() through two means:
+
+	  1) Upon load of B it will trigger simplify_symbols() to look for the
+	  one symbol it uses from the module A with tons of symbols. This is an
+	  indirect way for us to have B call resolve_symbol_wait() upon module
+	  load. This will eventually call find_symbol() which will eventually
+	  try to find the symbols used with find_exported_symbol_in_section().
+	  find_exported_symbol_in_section() uses bsearch() so a binary search
+	  for each symbol. Binary search will at worst be O(log(n)) so the
+	  larger TEST_KALLSYMS_NUMSYMS the worse the search.
+
+	  2) The selftests should load C first, before B. Upon B's load towards
+	  the end right before we call module B's init routine we get
+	  complete_formation() called on the module. That will first check
+	  for duplicate symbols with the call to verify_exported_symbols().
+	  That is when we'll force iteration on module C's insane symbol list.
+	  Since it has KALLSYMS_SCALE_FACTOR * KALLSYMS_NUMSYMS symbols we can
+	  first test just loading B without C. The amount of time it takes to
+	  load C Vs B can give us an idea of the impact growth of the symbol
+	  space and give us projection. Module B only uses one symbol from A so
+	  to allow this scaling in module C to be proportional, if it used more
+	  symbols then the first test would be doing more and increasing just
+	  the search space would be slightly different. The last module,
+	  module D will just increase the search space by twice the number of
+	  symbols in C so to allow for full projections.
+
+	  tools/testing/selftests/module/find_symbol.sh
+
+	  The current defaults will incur a build delay of about 7 minutes
+	  on an x86_64 with only 8 cores. Enable this only if you want to
+	  stress test find_symbol() with thousands of symbols. At the same
+	  time this is also useful to test building modules with thousands of
+	  symbols, and if BTF is enabled this also stress tests adding BTF
+	  information for each module. Currently enabling many more symbols
+	  will segfault the build system.
+
+	  If unsure, say N.
+
+if TEST_KALLSYMS
+
+config TEST_KALLSYMS_A
+	tristate
+	depends on m
+
+config TEST_KALLSYMS_B
+	tristate
+	depends on m
+
+config TEST_KALLSYMS_C
+	tristate
+	depends on m
+
+config TEST_KALLSYMS_D
+	tristate
+	depends on m
+
+config TEST_KALLSYMS_NUMSYMS
+	int "test kallsyms number of symbols"
+	default 100
+	help
+	  The number of symbols to create on TEST_KALLSYMS_A, only one of which
+	  module TEST_KALLSYMS_B will use. This also will be used
+	  for how many symbols TEST_KALLSYMS_C will have, scaled up by
+	  TEST_KALLSYMS_SCALE_FACTOR. Note that setting this to 10,000 will
+	  trigger a segfault today, don't use anything close to it unless
+	  you are aware that this should not be used for automated build tests.
+
+config TEST_KALLSYMS_SCALE_FACTOR
+	int "test kallsyms scale factor"
+	default 8
+	help
+	  How many more unused symbols TEST_KALLSYMS_C will have than
+	  TEST_KALLSYMS_A. If 8, then module C will have 8 times as many
+	  symbols as module A. Then TEST_KALLSYMS_D will have double the
+	  amount of symbols of C so as to allow projections.
+
+endif # TEST_KALLSYMS
+
 config TEST_DEBUG_VIRTUAL
 	tristate "Test CONFIG_DEBUG_VIRTUAL feature"
 	depends on DEBUG_VIRTUAL
diff --git a/lib/Makefile b/lib/Makefile
index 773adf88af41..ae720c7eb996 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_TEST_XARRAY) += test_xarray.o
 obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o
 obj-$(CONFIG_TEST_PARMAN) += test_parman.o
 obj-$(CONFIG_TEST_KMOD) += test_kmod.o
+obj-$(CONFIG_TEST_RUNTIME) += tests/
 obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
new file mode 100644
index 000000000000..8e4f42cb9c54
--- /dev/null
+++ b/lib/tests/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_TEST_RUNTIME_MODULE)		+= module/
diff --git a/lib/tests/module/.gitignore b/lib/tests/module/.gitignore
new file mode 100644
index 000000000000..8be7891b250f
--- /dev/null
+++ b/lib/tests/module/.gitignore
@@ -0,0 +1,4 @@
+test_kallsyms_a.c
+test_kallsyms_b.c
+test_kallsyms_c.c
+test_kallsyms_d.c
diff --git a/lib/tests/module/Makefile b/lib/tests/module/Makefile
new file mode 100644
index 000000000000..af5c27b996cb
--- /dev/null
+++ b/lib/tests/module/Makefile
@@ -0,0 +1,15 @@
+obj-$(CONFIG_TEST_KALLSYMS_A) += test_kallsyms_a.o
+obj-$(CONFIG_TEST_KALLSYMS_B) += test_kallsyms_b.o
+obj-$(CONFIG_TEST_KALLSYMS_C) += test_kallsyms_c.o
+obj-$(CONFIG_TEST_KALLSYMS_D) += test_kallsyms_d.o
+
+$(obj)/%.c: FORCE
+	@$(kecho) "  GEN     $@"
+	$(Q)$(srctree)/lib/tests/module/gen_test_kallsyms.sh $@\
+		$(CONFIG_TEST_KALLSYMS_NUMSYMS) \
+		$(CONFIG_TEST_KALLSYMS_SCALE_FACTOR)
+
+clean-files += test_kallsyms_a.c
+clean-files += test_kallsyms_b.c
+clean-files += test_kallsyms_c.c
+clean-files += test_kallsyms_d.c
diff --git a/lib/tests/module/gen_test_kallsyms.sh b/lib/tests/module/gen_test_kallsyms.sh
new file mode 100755
index 000000000000..e85f10dc11bd
--- /dev/null
+++ b/lib/tests/module/gen_test_kallsyms.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+TARGET=$(basename "$1")
+DIR=lib/tests/module
+TARGET="$DIR/$TARGET"
+NUM_SYMS=$2
+SCALE_FACTOR=$3
+TEST_TYPE=$(echo $TARGET | sed -e 's|lib/tests/module/test_kallsyms_||g')
+TEST_TYPE=$(echo $TEST_TYPE | sed -e 's|\.c$||')
+
+gen_template_module_header()
+{
+	cat <<____END_MODULE
+// SPDX-License-Identifier: GPL-2.0-or-later OR copyleft-next-0.3.1
+/*
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ *
+ * Automatically generated code for testing, do not edit manually.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+____END_MODULE
+}
+
+gen_num_syms()
+{
+	PREFIX=$1
+	NUM=$2
+	for i in $(seq 1 $NUM); do
+		printf "int auto_test_%s_%010d = 0xff;\n" $PREFIX $i
+		printf "EXPORT_SYMBOL_GPL(auto_test_%s_%010d);\n" $PREFIX $i
+	done
+	echo
+}
+
+gen_template_module_data_a()
+{
+	gen_num_syms a $1
+	cat <<____END_MODULE
+static int auto_runtime_test(void)
+{
+	return 0;
+}
+
+____END_MODULE
+}
+
+gen_template_module_data_b()
+{
+	printf "\nextern int auto_test_a_%010d;\n\n" 28
+	echo "static int auto_runtime_test(void)"
+	echo "{"
+	printf "\nreturn auto_test_a_%010d;\n" 28
+	echo "}"
+}
+
+gen_template_module_data_c()
+{
+	gen_num_syms c $1
+	cat <<____END_MODULE
+static int auto_runtime_test(void)
+{
+	return 0;
+}
+
+____END_MODULE
+}
+
+gen_template_module_data_d()
+{
+	gen_num_syms d $1
+	cat <<____END_MODULE
+static int auto_runtime_test(void)
+{
+	return 0;
+}
+
+____END_MODULE
+}
+
+gen_template_module_exit()
+{
+	cat <<____END_MODULE
+static int __init auto_test_module_init(void)
+{
+	return auto_runtime_test();
+}
+module_init(auto_test_module_init);
+
+static void __exit auto_test_module_exit(void)
+{
+}
+module_exit(auto_test_module_exit);
+
+MODULE_AUTHOR("Luis Chamberlain <mcgrof@kernel.org>");
+MODULE_LICENSE("GPL");
+____END_MODULE
+}
+
+case $TEST_TYPE in
+	a)
+		gen_template_module_header > $TARGET
+		gen_template_module_data_a $NUM_SYMS >> $TARGET
+		gen_template_module_exit >> $TARGET
+		;;
+	b)
+		gen_template_module_header > $TARGET
+		gen_template_module_data_b >> $TARGET
+		gen_template_module_exit >> $TARGET
+		;;
+	c)
+		gen_template_module_header > $TARGET
+		gen_template_module_data_c $((NUM_SYMS * SCALE_FACTOR)) >> $TARGET
+		gen_template_module_exit >> $TARGET
+		;;
+	d)
+		gen_template_module_header > $TARGET
+		gen_template_module_data_d $((NUM_SYMS * SCALE_FACTOR * 2)) >> $TARGET
+		gen_template_module_exit >> $TARGET
+		;;
+	*)
+		;;
+esac
diff --git a/tools/testing/selftests/module/Makefile b/tools/testing/selftests/module/Makefile
new file mode 100644
index 000000000000..6132d7ddb08b
--- /dev/null
+++ b/tools/testing/selftests/module/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for module loading selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := find_symbol.sh
+
+include ../lib.mk
+
+# Nothing to clean up.
+clean:
diff --git a/tools/testing/selftests/module/config b/tools/testing/selftests/module/config
new file mode 100644
index 000000000000..b0c206b1ad47
--- /dev/null
+++ b/tools/testing/selftests/module/config
@@ -0,0 +1,3 @@
+CONFIG_TEST_RUNTIME=y
+CONFIG_TEST_RUNTIME_MODULE=y
+CONFIG_TEST_KALLSYMS=m
diff --git a/tools/testing/selftests/module/find_symbol.sh b/tools/testing/selftests/module/find_symbol.sh
new file mode 100755
index 000000000000..140364d3c49f
--- /dev/null
+++ b/tools/testing/selftests/module/find_symbol.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later OR copyleft-next-0.3.1
+# Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+#
+# This is a stress test script for kallsyms through find_symbol()
+
+set -e
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+test_reqs() # Exit with the kselftest SKIP code unless modprobe, kmod, perf and root are all available.
+{
+	if ! command -v modprobe > /dev/null 2>&1; then # builtin lookup; does not depend on 'which' being installed
+		echo "$0: You need modprobe installed" >&2
+		exit $ksft_skip
+	fi
+
+	if ! command -v kmod > /dev/null 2>&1; then
+		echo "$0: You need kmod installed" >&2
+		exit $ksft_skip
+	fi
+
+	if ! command -v perf > /dev/null 2>&1; then
+		echo "$0: You need perf installed" >&2
+		exit $ksft_skip
+	fi
+
+	uid=$(id -u)
+	if [ "$uid" -ne 0 ]; then
+		echo "$0: must be run as root" >&2 # previously expanded \$msg, which this script never sets
+		exit $ksft_skip
+	fi
+}
+
+load_mod() # Load module $1 with $MODPROBE and report how long the load took.
+{
+	local STATS="-e duration_time"
+	STATS="$STATS -e user_time"
+	STATS="$STATS -e system_time"
+	STATS="$STATS -e page-faults"
+	local MOD=$1
+
+	local ARCH="$(uname -m)"
+	case "${ARCH}" in
+	x86_64)
+		perf stat $STATS $MODPROBE "$MOD" # $STATS is left unquoted so it splits into separate -e options
+		;;
+	*)
+		time $MODPROBE "$MOD" # no perf events here, only wall/user/sys time
+		exit 1 # NOTE(review): ends the whole script with failure on every non-x86_64 machine - confirm intended
+		;;
+	esac
+}
+
+remove_all() # Unload test_kallsyms_a, _b, _c and _d with "$MODPROBE -r".
+{
+	$MODPROBE -r test_kallsyms_b # b is removed here and is also covered by the loop below
+	for i in a b c d; do
+		$MODPROBE -r test_kallsyms_$i # under 'set -e' a failing removal ends the script
+	done
+}
+test_reqs
+
+MODPROBE=$(</proc/sys/kernel/modprobe) # use the modprobe path read from /proc/sys/kernel/modprobe
+
+remove_all
+load_mod test_kallsyms_b # first measurement: none of the other test modules is loaded
+remove_all
+
+# Now pollute the namespace
+$MODPROBE test_kallsyms_c
+load_mod test_kallsyms_b
+
+# Now pollute the namespace further: load test_kallsyms_d in addition to test_kallsyms_c
+remove_all
+$MODPROBE test_kallsyms_c
+$MODPROBE test_kallsyms_d
+load_mod test_kallsyms_b
+
+exit 0
-- 
cgit v1.2.3


From 4579b4a4279ec7df9499943f764da03ae837021c Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Wed, 23 Oct 2024 16:47:55 -0700
Subject: selftests/bpf: Some basic __uptr tests

Make sure the memory of uptrs has been mapped to the kernel properly.
Also ensure the values of uptrs in the kernel haven't been copied
to userspace.

It also has the syscall update_elem/delete_elem test to test the
pin/unpin code paths.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-9-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/task_local_storage.c  | 142 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/task_ls_uptr.c   |  63 +++++++++
 tools/testing/selftests/bpf/uptr_test_common.h     |  35 +++++
 3 files changed, 240 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/task_ls_uptr.c
 create mode 100644 tools/testing/selftests/bpf/uptr_test_common.h

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index c33c05161a9e..4c8eadd1f083 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -7,12 +7,15 @@
 #include <pthread.h>
 #include <sys/syscall.h>   /* For SYS_xxx definitions */
 #include <sys/types.h>
+#include <sys/eventfd.h>
 #include <test_progs.h>
 #include "task_local_storage_helpers.h"
 #include "task_local_storage.skel.h"
 #include "task_local_storage_exit_creds.skel.h"
 #include "task_ls_recursion.skel.h"
 #include "task_storage_nodeadlock.skel.h"
+#include "uptr_test_common.h"
+#include "task_ls_uptr.skel.h"
 
 static void test_sys_enter_exit(void)
 {
@@ -227,6 +230,143 @@ done:
 	sched_setaffinity(getpid(), sizeof(old), &old);
 }
 
+static struct user_data udata __attribute__((aligned(16))) = {
+	.a = 1,
+	.b = 2,
+};
+
+static struct user_data udata2 __attribute__((aligned(16))) = {
+	.a = 3,
+	.b = 4,
+};
+
+static void check_udata2(int expected) /* expects both udata2 result fields to equal 'expected' */
+{
+	udata2.result = udata2.nested_result = 0; /* clear anything written before this check */
+	usleep(1); /* presumably only needed to enter a syscall so the sys_enter program runs - confirm */
+	ASSERT_EQ(udata2.result, expected, "udata2.result");
+	ASSERT_EQ(udata2.nested_result, expected, "udata2.nested_result");
+}
+
+static void test_uptr_basic(void) /* uptr round trip: update, BPF write-back, replace, clear, lookup, delete */
+{
+	int map_fd, parent_task_fd, ev_fd;
+	struct value_type value = {};
+	struct task_ls_uptr *skel;
+	pid_t child_pid, my_tid;
+	__u64 ev_dummy_data = 1;
+	int err;
+
+	my_tid = syscall(SYS_gettid);
+	parent_task_fd = sys_pidfd_open(my_tid, 0); /* this fd is the key for every map call below */
+	if (!ASSERT_OK_FD(parent_task_fd, "parent_task_fd"))
+		return;
+
+	ev_fd = eventfd(0, 0); /* the child blocks on this until the parent writes to it */
+	if (!ASSERT_OK_FD(ev_fd, "ev_fd")) {
+		close(parent_task_fd);
+		return;
+	}
+
+	skel = task_ls_uptr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		goto out;
+
+	map_fd = bpf_map__fd(skel->maps.datamap);
+	value.udata = &udata;
+	value.nested.udata = &udata; /* top-level and nested uptr point at the same object */
+	err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_NOEXIST);
+	if (!ASSERT_OK(err, "update_elem(udata)"))
+		goto out;
+
+	err = task_ls_uptr__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto out;
+
+	child_pid = fork();
+	if (!ASSERT_NEQ(child_pid, -1, "fork"))
+		goto out;
+
+	/* Call syscall in the child process, but access the map value of
+	 * the parent process in the BPF program to check if the uptr
+	 * is translated/mapped correctly.
+	 */
+	if (child_pid == 0) {
+		/* child */
+
+		/* Overwrite the user_data in the child process to check if
+		 * the BPF program accesses the user_data of the parent.
+		 */
+		udata.a = 0;
+		udata.b = 0;
+
+		/* Wait for the parent to store target_pid and parent_pid in the BPF program */
+		read(ev_fd, &ev_dummy_data, sizeof(ev_dummy_data));
+		exit(0);
+	}
+
+	skel->bss->parent_pid = my_tid;
+	skel->bss->target_pid = child_pid;
+
+	write(ev_fd, &ev_dummy_data, sizeof(ev_dummy_data)); /* release the child */
+
+	err = waitpid(child_pid, NULL, 0);
+	ASSERT_EQ(err, child_pid, "waitpid");
+	ASSERT_EQ(udata.result, MAGIC_VALUE + udata.a + udata.b, "udata.result");
+	ASSERT_EQ(udata.nested_result, MAGIC_VALUE + udata.a + udata.b, "udata.nested_result");
+
+	skel->bss->target_pid = my_tid; /* from here on the BPF program reacts to this thread's syscalls */
+
+	/* update_elem: uptr changes from udata to udata2 */
+	value.udata = &udata2;
+	value.nested.udata = &udata2;
+	err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST);
+	if (!ASSERT_OK(err, "update_elem(udata -> udata2)"))
+		goto out;
+	check_udata2(MAGIC_VALUE + udata2.a + udata2.b);
+
+	/* update_elem: uptr changes from udata2 uptr to NULL */
+	memset(&value, 0, sizeof(value));
+	err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST);
+	if (!ASSERT_OK(err, "update_elem(udata2 -> NULL)"))
+		goto out;
+	check_udata2(0);
+
+	/* update_elem: uptr changes from NULL to udata2 */
+	value.udata = &udata2;
+	value.nested.udata = &udata2;
+	err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST);
+	if (!ASSERT_OK(err, "update_elem(NULL -> udata2)"))
+		goto out;
+	check_udata2(MAGIC_VALUE + udata2.a + udata2.b);
+
+	/* Check if user programs can access the value of uptrs
+	 * through bpf_map_lookup_elem(). Make sure the kernel value is not
+	 * leaked.
+	 */
+	err = bpf_map_lookup_elem(map_fd, &parent_task_fd, &value);
+	if (!ASSERT_OK(err, "bpf_map_lookup_elem"))
+		goto out;
+	ASSERT_EQ(value.udata, NULL, "value.udata");
+	ASSERT_EQ(value.nested.udata, NULL, "value.nested.udata");
+
+	/* delete_elem */
+	err = bpf_map_delete_elem(map_fd, &parent_task_fd);
+	ASSERT_OK(err, "delete_elem(udata2)");
+	check_udata2(0);
+
+	/* update_elem: add uptr back to test map_free */
+	value.udata = &udata2;
+	value.nested.udata = &udata2;
+	err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_NOEXIST);
+	ASSERT_OK(err, "update_elem(udata2) before map_free");
+
+out:
+	task_ls_uptr__destroy(skel);
+	close(ev_fd);
+	close(parent_task_fd);
+}
+
 void test_task_local_storage(void)
 {
 	if (test__start_subtest("sys_enter_exit"))
@@ -237,4 +377,6 @@ void test_task_local_storage(void)
 		test_recursion();
 	if (test__start_subtest("nodeadlock"))
 		test_nodeadlock();
+	if (test__start_subtest("uptr_basic"))
+		test_uptr_basic();
 }
diff --git a/tools/testing/selftests/bpf/progs/task_ls_uptr.c b/tools/testing/selftests/bpf/progs/task_ls_uptr.c
new file mode 100644
index 000000000000..ddbe11b46eef
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_ls_uptr.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "uptr_test_common.h"
+
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct value_type);
+} datamap SEC(".maps");
+
+pid_t target_pid = 0;
+pid_t parent_pid = 0;
+
+SEC("tp_btf/sys_enter")
+int on_enter(__u64 *ctx)
+{
+	struct task_struct *task, *data_task;
+	struct value_type *ptr;
+	struct user_data *udata;
+	struct cgroup *cgrp;
+
+	task = bpf_get_current_task_btf();
+	if (task->pid != target_pid) /* only react to syscalls made by target_pid */
+		return 0;
+
+	data_task = bpf_task_from_pid(parent_pid); /* storage is looked up on parent_pid, not on current */
+	if (!data_task)
+		return 0;
+
+	ptr = bpf_task_storage_get(&datamap, data_task, 0, 0); /* flags 0: BPF_LOCAL_STORAGE_GET_F_CREATE is not passed */
+	bpf_task_release(data_task); /* drop the reference taken by bpf_task_from_pid() */
+	if (!ptr)
+		return 0;
+
+	cgrp = bpf_kptr_xchg(&ptr->cgrp, NULL); /* takes whatever cgroup kptr is stored and leaves NULL behind */
+	if (cgrp) {
+		int lvl = cgrp->level;
+
+		bpf_cgroup_release(cgrp);
+		return lvl;
+	}
+
+	udata = ptr->udata;
+	if (!udata || udata->result) /* nothing to do without a uptr or once a result is recorded */
+		return 0;
+	udata->result = MAGIC_VALUE + udata->a + udata->b;
+
+	udata = ptr->nested.udata;
+	if (udata && !udata->nested_result)
+		udata->nested_result = udata->result; /* written through the nested uptr */
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/uptr_test_common.h b/tools/testing/selftests/bpf/uptr_test_common.h
new file mode 100644
index 000000000000..feb41176888c
--- /dev/null
+++ b/tools/testing/selftests/bpf/uptr_test_common.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _UPTR_TEST_COMMON_H
+#define _UPTR_TEST_COMMON_H
+
+#define MAGIC_VALUE 0xabcd1234
+
+#ifdef __BPF__
+/* Avoid fwd btf type being generated for the following struct */
+struct user_data *dummy_data;
+struct cgroup *dummy_cgrp;
+#else
+#define __uptr
+#define __kptr
+#endif
+
+struct user_data { /* user-space object that the __uptr fields declared below point to */
+	int a;
+	int b;
+	int result;		/* the BPF programs store MAGIC_VALUE + a + b here */
+	int nested_result;	/* task_ls_uptr's on_enter copies result here via nested.udata */
+};
+
+struct nested_udata {
+	struct user_data __uptr *udata;
+};
+
+struct value_type {
+	struct user_data __uptr *udata;
+	struct cgroup __kptr *cgrp;
+	struct nested_udata nested;
+};
+
+#endif
-- 
cgit v1.2.3


From 51fff4083372381e680724dde7ac3e859f9e3a0a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:56 -0700
Subject: selftests/bpf: Test a uptr struct spanning across pages.

This patch tests the case when uptr has a struct spanning across two
pages. It is not supported now and EOPNOTSUPP is expected from the
syscall update_elem.

It also tests the whole uptr struct located exactly at the
end of a page and ensures that this case is accepted by update_elem.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-10-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/task_local_storage.c  | 43 ++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 4c8eadd1f083..b7af0921b3da 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -8,6 +8,7 @@
 #include <sys/syscall.h>   /* For SYS_xxx definitions */
 #include <sys/types.h>
 #include <sys/eventfd.h>
+#include <sys/mman.h>
 #include <test_progs.h>
 #include "task_local_storage_helpers.h"
 #include "task_local_storage.skel.h"
@@ -367,6 +368,46 @@ out:
 	close(parent_task_fd);
 }
 
+static void test_uptr_across_pages(void) /* uptr target straddling a page boundary vs. ending exactly on it */
+{
+	int page_size = getpagesize();
+	struct value_type value = {};
+	struct task_ls_uptr *skel;
+	int err, task_fd, map_fd;
+	void *mem;
+
+	task_fd = sys_pidfd_open(getpid(), 0);
+	if (!ASSERT_OK_FD(task_fd, "task_fd"))
+		return;
+
+	mem = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, /* two adjacent anonymous pages */
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (!ASSERT_OK_PTR(mem, "mmap(page_size * 2)")) {
+		close(task_fd);
+		return;
+	}
+
+	skel = task_ls_uptr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		goto out;
+
+	map_fd = bpf_map__fd(skel->maps.datamap);
+	value.udata = mem + page_size - offsetof(struct user_data, b); /* only field 'a' fits before the boundary */
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, 0);
+	if (!ASSERT_ERR(err, "update_elem(udata)")) /* the straddling struct must be refused */
+		goto out;
+	ASSERT_EQ(errno, EOPNOTSUPP, "errno");
+
+	value.udata = mem + page_size - sizeof(struct user_data); /* struct ends exactly at the end of the first page */
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, 0);
+	ASSERT_OK(err, "update_elem(udata)");
+
+out:
+	task_ls_uptr__destroy(skel);
+	close(task_fd);
+	munmap(mem, page_size * 2);
+}
+
 void test_task_local_storage(void)
 {
 	if (test__start_subtest("sys_enter_exit"))
@@ -379,4 +420,6 @@ void test_task_local_storage(void)
 		test_nodeadlock();
 	if (test__start_subtest("uptr_basic"))
 		test_uptr_basic();
+	if (test__start_subtest("uptr_across_pages"))
+		test_uptr_across_pages();
 }
-- 
cgit v1.2.3


From cbf9f849a3e86f1b7c041dfbeeae1c1fff0ddc8d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:57 -0700
Subject: selftests/bpf: Add update_elem failure test for task storage uptr

This patch tests the following failures in syscall update_elem
1. The first update_elem(BPF_F_LOCK) should be EOPNOTSUPP. syscall.c takes
   care of unpinning the uptr.
2. The second update_elem(BPF_EXIST) fails. syscall.c takes care of
   unpinning the uptr.
3. The fourth update_elem(BPF_NOEXIST) fails. bpf_local_storage_update
   takes care of unpinning the uptr.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-11-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/task_local_storage.c  | 45 ++++++++++++++++++++++
 .../selftests/bpf/progs/uptr_update_failure.c      | 42 ++++++++++++++++++++
 tools/testing/selftests/bpf/uptr_test_common.h     |  5 +++
 3 files changed, 92 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uptr_update_failure.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index b7af0921b3da..e985665efe7a 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -17,6 +17,7 @@
 #include "task_storage_nodeadlock.skel.h"
 #include "uptr_test_common.h"
 #include "task_ls_uptr.skel.h"
+#include "uptr_update_failure.skel.h"
 
 static void test_sys_enter_exit(void)
 {
@@ -408,6 +409,48 @@ out:
 	munmap(mem, page_size * 2);
 }
 
+static void test_uptr_update_failure(void) /* walks through update_elem calls that are expected to fail */
+{
+	struct value_lock_type value = {}; /* map value holding a uptr plus a bpf_spin_lock */
+	struct uptr_update_failure *skel;
+	int err, task_fd, map_fd;
+
+	task_fd = sys_pidfd_open(getpid(), 0);
+	if (!ASSERT_OK_FD(task_fd, "task_fd"))
+		return;
+
+	skel = uptr_update_failure__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		goto out;
+
+	map_fd = bpf_map__fd(skel->maps.datamap);
+
+	value.udata = &udata;
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, BPF_F_LOCK); /* 1st: BPF_F_LOCK update must fail */
+	if (!ASSERT_ERR(err, "update_elem(udata, BPF_F_LOCK)"))
+		goto out;
+	ASSERT_EQ(errno, EOPNOTSUPP, "errno");
+
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, BPF_EXIST); /* 2nd: no successful update has happened yet */
+	if (!ASSERT_ERR(err, "update_elem(udata, BPF_EXIST)"))
+		goto out;
+	ASSERT_EQ(errno, ENOENT, "errno");
+
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, BPF_NOEXIST); /* 3rd: the only update expected to succeed */
+	if (!ASSERT_OK(err, "update_elem(udata, BPF_NOEXIST)"))
+		goto out;
+
+	value.udata = &udata2;
+	err = bpf_map_update_elem(map_fd, &task_fd, &value, BPF_NOEXIST); /* 4th: same key again with BPF_NOEXIST */
+	if (!ASSERT_ERR(err, "update_elem(udata2, BPF_NOEXIST)"))
+		goto out;
+	ASSERT_EQ(errno, EEXIST, "errno");
+
+out:
+	uptr_update_failure__destroy(skel);
+	close(task_fd);
+}
+
 void test_task_local_storage(void)
 {
 	if (test__start_subtest("sys_enter_exit"))
@@ -422,4 +465,6 @@ void test_task_local_storage(void)
 		test_uptr_basic();
 	if (test__start_subtest("uptr_across_pages"))
 		test_uptr_across_pages();
+	if (test__start_subtest("uptr_update_failure"))
+		test_uptr_update_failure();
 }
diff --git a/tools/testing/selftests/bpf/progs/uptr_update_failure.c b/tools/testing/selftests/bpf/progs/uptr_update_failure.c
new file mode 100644
index 000000000000..86c3bb954abc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uptr_update_failure.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "uptr_test_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct value_lock_type);
+} datamap SEC(".maps");
+
+/* Load test only; the user-space test never runs this program. */
+SEC("syscall")
+int not_used(void *ctx)
+{
+	struct value_lock_type *ptr;
+	struct task_struct *task;
+	struct user_data *udata;
+
+	task = bpf_get_current_task_btf();
+	ptr = bpf_task_storage_get(&datamap, task, 0, 0);
+	if (!ptr)
+		return 0;
+
+	bpf_spin_lock(&ptr->lock); /* the uptr is read and written with the value's spin lock held */
+
+	udata = ptr->udata;
+	if (!udata) {
+		bpf_spin_unlock(&ptr->lock); /* unlock on the early-return path as well */
+		return 0;
+	}
+	udata->result = MAGIC_VALUE + udata->a + udata->b;
+
+	bpf_spin_unlock(&ptr->lock);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/uptr_test_common.h b/tools/testing/selftests/bpf/uptr_test_common.h
index feb41176888c..45c00c80d935 100644
--- a/tools/testing/selftests/bpf/uptr_test_common.h
+++ b/tools/testing/selftests/bpf/uptr_test_common.h
@@ -32,4 +32,9 @@ struct value_type {
 	struct nested_udata nested;
 };
 
+struct value_lock_type {
+	struct user_data __uptr *udata;
+	struct bpf_spin_lock lock;
+};
+
 #endif
-- 
cgit v1.2.3


From 898cbca4a7579bea3ab746cd8dc33027bff80dac Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:58 -0700
Subject: selftests/bpf: Add uptr failure verifier tests

Add verifier tests to ensure invalid uptr usages are rejected.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-12-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/task_local_storage.c  |   2 +
 tools/testing/selftests/bpf/progs/uptr_failure.c   | 105 +++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uptr_failure.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index e985665efe7a..772ed7ce4feb 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -18,6 +18,7 @@
 #include "uptr_test_common.h"
 #include "task_ls_uptr.skel.h"
 #include "uptr_update_failure.skel.h"
+#include "uptr_failure.skel.h"
 
 static void test_sys_enter_exit(void)
 {
@@ -467,4 +468,5 @@ void test_task_local_storage(void)
 		test_uptr_across_pages();
 	if (test__start_subtest("uptr_update_failure"))
 		test_uptr_update_failure();
+	RUN_TESTS(uptr_failure);
 }
diff --git a/tools/testing/selftests/bpf/progs/uptr_failure.c b/tools/testing/selftests/bpf/progs/uptr_failure.c
new file mode 100644
index 000000000000..0cfa1fd61440
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uptr_failure.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+#include "uptr_test_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct value_type);
+} datamap SEC(".maps");
+
+SEC("?syscall")
+__failure __msg("store to uptr disallowed")
+int uptr_write(const void *ctx)
+{
+	struct task_struct *task;
+	struct value_type *v;
+
+	task = bpf_get_current_task_btf();
+	v = bpf_task_storage_get(&datamap, task, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!v)
+		return 0;
+
+	v->udata = NULL; /* the store to a uptr field that the expected message refers to */
+	return 0;
+}
+
+SEC("?syscall")
+__failure __msg("store to uptr disallowed")
+int uptr_write_nested(const void *ctx)
+{
+	struct task_struct *task;
+	struct value_type *v;
+
+	task = bpf_get_current_task_btf();
+	v = bpf_task_storage_get(&datamap, task, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!v)
+		return 0;
+
+	v->nested.udata = NULL; /* same store as uptr_write, but to the uptr inside the nested struct */
+	return 0;
+}
+
+SEC("?syscall")
+__failure __msg("R1 invalid mem access 'mem_or_null'")
+int uptr_no_null_check(const void *ctx)
+{
+	struct task_struct *task;
+	struct value_type *v;
+
+	task = bpf_get_current_task_btf();
+	v = bpf_task_storage_get(&datamap, task, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!v)
+		return 0;
+
+	v->udata->result = 0; /* v->udata is dereferenced without first testing it for NULL */
+
+	return 0;
+}
+
+SEC("?syscall")
+__failure __msg("doesn't point to kptr")
+int uptr_kptr_xchg(const void *ctx)
+{
+	struct task_struct *task;
+	struct value_type *v;
+
+	task = bpf_get_current_task_btf();
+	v = bpf_task_storage_get(&datamap, task, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!v)
+		return 0;
+
+	bpf_kptr_xchg(&v->udata, NULL); /* udata is declared __uptr, not __kptr, in struct value_type */
+
+	return 0;
+}
+
+SEC("?syscall")
+__failure __msg("invalid mem access 'scalar'")
+int uptr_obj_new(const void *ctx)
+{
+	struct value_type *v;
+
+	v = bpf_obj_new(typeof(*v)); /* value_type allocated with bpf_obj_new() instead of coming from a map */
+	if (!v)
+		return 0;
+
+	if (v->udata)
+		v->udata->result = 0; /* presumably the access reported as 'scalar' in the expected message - confirm */
+
+	bpf_obj_drop(v);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From bd5879a6fe4be407bf36c212cd91ed1e4485a6f9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 23 Oct 2024 16:47:59 -0700
Subject: selftests/bpf: Create task_local_storage map with invalid uptr's
 struct

This patch tests the map creation failure when the map_value
has an unsupported uptr. The three cases are: the struct is larger
than one page, the struct is empty, and the struct is a kernel struct.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20241023234759.860539-13-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/task_local_storage.c  | 46 ++++++++++++++++++++++
 .../testing/selftests/bpf/progs/uptr_map_failure.c | 27 +++++++++++++
 tools/testing/selftests/bpf/test_progs.h           |  8 ++++
 tools/testing/selftests/bpf/uptr_test_common.h     | 23 +++++++++++
 4 files changed, 104 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uptr_map_failure.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 772ed7ce4feb..00cc9d0aee5d 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -10,6 +10,7 @@
 #include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <test_progs.h>
+#include <bpf/btf.h>
 #include "task_local_storage_helpers.h"
 #include "task_local_storage.skel.h"
 #include "task_local_storage_exit_creds.skel.h"
@@ -19,6 +20,7 @@
 #include "task_ls_uptr.skel.h"
 #include "uptr_update_failure.skel.h"
 #include "uptr_failure.skel.h"
+#include "uptr_map_failure.skel.h"
 
 static void test_sys_enter_exit(void)
 {
@@ -452,6 +454,40 @@ out:
 	close(task_fd);
 }
 
+static void test_uptr_map_failure(const char *map_name, int expected_errno) /* creating map_name must fail with expected_errno */
+{
+	LIBBPF_OPTS(bpf_map_create_opts, create_attr);
+	struct uptr_map_failure *skel;
+	struct bpf_map *map;
+	struct btf *btf;
+	int map_fd, err;
+
+	skel = uptr_map_failure__open(); /* opened but not loaded; the map is created by hand below */
+	if (!ASSERT_OK_PTR(skel, "uptr_map_failure__open"))
+		return;
+
+	map = bpf_object__find_map_by_name(skel->obj, map_name);
+	btf = bpf_object__btf(skel->obj);
+	err = btf__load_into_kernel(btf);
+	if (!ASSERT_OK_PTR(map, "find_map_by_name") || !ASSERT_OK(err, "btf__load_into_kernel"))
+		goto done; /* a missing map would otherwise be dereferenced by the bpf_map__*() calls below */
+
+	create_attr.map_flags = bpf_map__map_flags(map);
+	create_attr.btf_fd = btf__fd(btf);
+	create_attr.btf_key_type_id = bpf_map__btf_key_type_id(map);
+	create_attr.btf_value_type_id = bpf_map__btf_value_type_id(map);
+	map_fd = bpf_map_create(bpf_map__type(map), map_name, /* same type, sizes and BTF ids as the object's map */
+				bpf_map__key_size(map), bpf_map__value_size(map),
+				0, &create_attr);
+	if (ASSERT_ERR_FD(map_fd, "map_create"))
+		ASSERT_EQ(errno, expected_errno, "errno");
+	else
+		close(map_fd); /* creation unexpectedly succeeded: do not leak the fd */
+
+done:
+	uptr_map_failure__destroy(skel);
+}
+
 void test_task_local_storage(void)
 {
 	if (test__start_subtest("sys_enter_exit"))
@@ -468,5 +504,15 @@ void test_task_local_storage(void)
 		test_uptr_across_pages();
 	if (test__start_subtest("uptr_update_failure"))
 		test_uptr_update_failure();
+	if (test__start_subtest("uptr_map_failure_e2big")) {
+		if (getpagesize() == PAGE_SIZE)
+			test_uptr_map_failure("large_uptr_map", E2BIG);
+		else
+			test__skip();
+	}
+	if (test__start_subtest("uptr_map_failure_size0"))
+		test_uptr_map_failure("empty_uptr_map", EINVAL);
+	if (test__start_subtest("uptr_map_failure_kstruct"))
+		test_uptr_map_failure("kstruct_uptr_map", EINVAL);
 	RUN_TESTS(uptr_failure);
 }
diff --git a/tools/testing/selftests/bpf/progs/uptr_map_failure.c b/tools/testing/selftests/bpf/progs/uptr_map_failure.c
new file mode 100644
index 000000000000..417b763d76b4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uptr_map_failure.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "uptr_test_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct large_uptr);
+} large_uptr_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct empty_uptr);
+} empty_uptr_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct kstruct_uptr);
+} kstruct_uptr_map SEC(".maps");
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 7767d9a825ae..7a58895867c3 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -390,6 +390,14 @@ int test__join_cgroup(const char *path);
 	___ok;								\
 })
 
+#define ASSERT_ERR_FD(fd, name) ({					\
+	static int duration = 0;					\
+	int ___fd = (fd);		/* evaluate fd only once */	\
+	bool ___ok = ___fd < 0;		/* passes for a negative fd */	\
+	CHECK(!___ok, (name), "unexpected fd: %d\n", ___fd);		\
+	___ok;				/* value of the expression */	\
+})
+
 #define SYS(goto_label, fmt, ...)					\
 	({								\
 		char cmd[1024];						\
diff --git a/tools/testing/selftests/bpf/uptr_test_common.h b/tools/testing/selftests/bpf/uptr_test_common.h
index 45c00c80d935..f8a134ba12f9 100644
--- a/tools/testing/selftests/bpf/uptr_test_common.h
+++ b/tools/testing/selftests/bpf/uptr_test_common.h
@@ -5,9 +5,12 @@
 #define _UPTR_TEST_COMMON_H
 
 #define MAGIC_VALUE 0xabcd1234
+#define PAGE_SIZE 4096
 
 #ifdef __BPF__
 /* Avoid fwd btf type being generated for the following struct */
+struct large_data *dummy_large_data;
+struct empty_data *dummy_empty_data;
 struct user_data *dummy_data;
 struct cgroup *dummy_cgrp;
 #else
@@ -37,4 +40,24 @@ struct value_lock_type {
 	struct bpf_spin_lock lock;
 };
 
+struct large_data {
+	__u8 one_page[PAGE_SIZE];
+	int a;
+};
+
+struct large_uptr {
+	struct large_data __uptr *udata;
+};
+
+struct empty_data {
+};
+
+struct empty_uptr {
+	struct empty_data __uptr *udata;
+};
+
+struct kstruct_uptr {
+	struct cgroup __uptr *cgrp;
+};
+
 #endif
-- 
cgit v1.2.3


From 7470b5afd150e683c7aef03961d0c4c6f500de3b Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Wed, 16 Oct 2024 13:27:48 -0700
Subject: riscv: selftests: Add a pointer masking test

This test covers the behavior of the PR_SET_TAGGED_ADDR_CTRL and
PR_GET_TAGGED_ADDR_CTRL prctl() operations, their effects on the
userspace ABI, and their effects on the system call ABI.

Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://lore.kernel.org/r/20241016202814.4061541-8-samuel.holland@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 tools/testing/selftests/riscv/Makefile             |   2 +-
 tools/testing/selftests/riscv/abi/.gitignore       |   1 +
 tools/testing/selftests/riscv/abi/Makefile         |  10 +
 .../testing/selftests/riscv/abi/pointer_masking.c  | 332 +++++++++++++++++++++
 4 files changed, 344 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/riscv/abi/.gitignore
 create mode 100644 tools/testing/selftests/riscv/abi/Makefile
 create mode 100644 tools/testing/selftests/riscv/abi/pointer_masking.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile
index 7ce03d832b64..099b8c1f46f8 100644
--- a/tools/testing/selftests/riscv/Makefile
+++ b/tools/testing/selftests/riscv/Makefile
@@ -5,7 +5,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),riscv))
-RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn
+RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector
 else
 RISCV_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/riscv/abi/.gitignore b/tools/testing/selftests/riscv/abi/.gitignore
new file mode 100644
index 000000000000..b38358f91c4d
--- /dev/null
+++ b/tools/testing/selftests/riscv/abi/.gitignore
@@ -0,0 +1 @@
+pointer_masking
diff --git a/tools/testing/selftests/riscv/abi/Makefile b/tools/testing/selftests/riscv/abi/Makefile
new file mode 100644
index 000000000000..ed82ff9c664e
--- /dev/null
+++ b/tools/testing/selftests/riscv/abi/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I$(top_srcdir)/tools/include
+
+TEST_GEN_PROGS := pointer_masking
+
+include ../../lib.mk
+
+$(OUTPUT)/pointer_masking: pointer_masking.c
+	$(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^
diff --git a/tools/testing/selftests/riscv/abi/pointer_masking.c b/tools/testing/selftests/riscv/abi/pointer_masking.c
new file mode 100644
index 000000000000..dee41b7ee3e3
--- /dev/null
+++ b/tools/testing/selftests/riscv/abi/pointer_masking.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <errno.h>
+#include <fcntl.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../../kselftest.h"
+
+#ifndef PR_PMLEN_SHIFT
+#define PR_PMLEN_SHIFT			24
+#endif
+#ifndef PR_PMLEN_MASK
+#define PR_PMLEN_MASK			(0x7fUL << PR_PMLEN_SHIFT)
+#endif
+
+static int dev_zero;
+
+static int pipefd[2];
+
+static sigjmp_buf jmpbuf;
+
+static void sigsegv_handler(int sig)
+{
+	siglongjmp(jmpbuf, 1);	/* resume at sigsetjmp(jmpbuf, 1), which now returns 1 */
+}
+
+static int min_pmlen;
+static int max_pmlen;
+
+static inline bool valid_pmlen(int pmlen)
+{
+	return pmlen == 0 || pmlen == 7 || pmlen == 16;	/* the only PMLEN values this test accepts */
+}
+
+static void test_pmlen(void)
+{
+	ksft_print_msg("Testing available PMLEN values\n");
+
+	for (int req = 0; req <= 16; req++) {
+		int granted, rc;
+
+		rc = prctl(PR_SET_TAGGED_ADDR_CTRL, req << PR_PMLEN_SHIFT, 0, 0, 0);
+		if (rc != 0)
+			goto pr_set_error;
+
+		rc = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
+		ksft_test_result(rc >= 0, "PMLEN=%d PR_GET_TAGGED_ADDR_CTRL\n", req);
+		if (rc < 0)
+			goto pr_get_error;
+
+		granted = (rc & PR_PMLEN_MASK) >> PR_PMLEN_SHIFT;
+		ksft_test_result(granted >= req, "PMLEN=%d constraint\n", req);
+		ksft_test_result(valid_pmlen(granted), "PMLEN=%d validity\n", req);
+		/* Remember the first non-zero PMLEN granted and the largest one. */
+		if (!min_pmlen)
+			min_pmlen = granted;
+		if (granted > max_pmlen)
+			max_pmlen = granted;
+
+		continue;
+
+pr_set_error:
+		ksft_test_result_skip("PMLEN=%d PR_GET_TAGGED_ADDR_CTRL\n", req);
+pr_get_error:
+		ksft_test_result_skip("PMLEN=%d constraint\n", req);
+		ksft_test_result_skip("PMLEN=%d validity\n", req);
+	}
+
+	if (!max_pmlen)
+		ksft_exit_fail_msg("Failed to enable pointer masking\n");
+}
+
+static int set_tagged_addr_ctrl(int pmlen, bool tagged_addr_abi)
+{
+	int want = pmlen << PR_PMLEN_SHIFT | tagged_addr_abi;
+	int rc = prctl(PR_SET_TAGGED_ADDR_CTRL, want, 0, 0, 0);
+
+	if (rc == 0) {
+		rc = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
+		if (rc == want)
+			return 0;
+	}
+
+	/* Negative prctl() result: report errno; anything else is a readback mismatch. */
+	return rc < 0 ? -errno : -ENODATA;
+}
+
+static void test_dereference_pmlen(int pmlen)
+{
+	static volatile int i;
+	volatile int *p;
+	int ret;
+
+	ret = set_tagged_addr_ctrl(pmlen, false);
+	if (ret)
+		return ksft_test_result_error("PMLEN=%d setup (%d)\n", pmlen, ret);
+
+	i = pmlen;
+
+	if (pmlen) {
+		p = (volatile int *)((uintptr_t)&i | 1UL << (__riscv_xlen - pmlen));
+
+		/* Tag bit within the top PMLEN bits: these dereferences should succeed. */
+		if (sigsetjmp(jmpbuf, 1))
+			return ksft_test_result_fail("PMLEN=%d valid tag\n", pmlen);
+		if (*p != pmlen)
+			return ksft_test_result_fail("PMLEN=%d bad value\n", pmlen);
+		++*p;
+	}
+
+	p = (volatile int *)((uintptr_t)&i | 1UL << (__riscv_xlen - pmlen - 1));
+
+	/* Bit just below the top PMLEN bits set: these dereferences should raise SIGSEGV. */
+	if (sigsetjmp(jmpbuf, 1))
+		return ksft_test_result_pass("PMLEN=%d dereference\n", pmlen);
+	++*p;
+	ksft_test_result_fail("PMLEN=%d invalid tag\n", pmlen);
+}
+
+static void test_dereference(void)
+{
+	ksft_print_msg("Testing userspace pointer dereference\n");
+
+	signal(SIGSEGV, sigsegv_handler);	/* faults jump back into test_dereference_pmlen() */
+
+	test_dereference_pmlen(0);
+	test_dereference_pmlen(min_pmlen);
+	test_dereference_pmlen(max_pmlen);
+
+	signal(SIGSEGV, SIG_DFL);	/* restore the default action for the later tests */
+}
+
+static void execve_child_sigsegv_handler(int sig)
+{
+	exit(42);	/* 42 is the exit status test_fork_exec() checks for */
+}
+
+static int execve_child(void)
+{
+	static volatile int i;
+	volatile int *p = (volatile int *)((uintptr_t)&i | 1UL << (__riscv_xlen - 7));
+
+	signal(SIGSEGV, execve_child_sigsegv_handler);
+
+	/* Masking was enabled before execve(); this dereference should now raise SIGSEGV. */
+	return *p;
+}
+
+static void test_fork_exec(void)
+{
+	int ret, status;
+
+	ksft_print_msg("Testing fork/exec behavior\n");
+
+	ret = set_tagged_addr_ctrl(min_pmlen, false);
+	if (ret)
+		return ksft_test_result_error("setup (%d)\n", ret);
+
+	if (fork()) {
+		ret = wait(&status);	/* status is only valid if a child was reaped */
+		ksft_test_result(ret > 0 && WIFEXITED(status) && WEXITSTATUS(status) == 42,
+				 "dereference after fork\n");
+	} else {
+		static volatile int i = 42;
+		volatile int *p = (volatile int *)((uintptr_t)&i |
+						   1UL << (__riscv_xlen - min_pmlen));
+
+		/* This dereference should succeed. */
+		exit(*p);
+	}
+
+	if (fork()) {
+		ret = wait(&status);	/* a failed wait() must not reuse the stale status */
+		ksft_test_result(ret > 0 && WIFEXITED(status) && WEXITSTATUS(status) == 42,
+				 "dereference after fork+exec\n");
+	} else {
+		/* Will call execve_child(). */
+		execve("/proc/self/exe", (char *const []) { "", NULL }, NULL);
+		exit(1);	/* execve() failed: do not fall through into the parent's tests */
+	}
+}
+
+static void test_tagged_addr_abi_sysctl(void)
+{
+	char value;
+	int fd;
+
+	ksft_print_msg("Testing tagged address ABI sysctl\n");
+
+	fd = open("/proc/sys/abi/tagged_addr_disabled", O_WRONLY);
+	if (fd < 0) {
+		ksft_test_result_skip("failed to open sysctl file\n");	/* one per planned result */
+		ksft_test_result_skip("failed to open sysctl file\n");
+		return;
+	}
+
+	value = '1';
+	pwrite(fd, &value, 1, 0);	/* NOTE(review): write result is not checked */
+	ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == -EINVAL,
+			 "sysctl disabled\n");
+
+	value = '0';
+	pwrite(fd, &value, 1, 0);	/* NOTE(review): prior sysctl value is not restored */
+	ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == 0,
+			 "sysctl enabled\n");
+
+	set_tagged_addr_ctrl(0, false);	/* request PMLEN=0 with the tagged address ABI off */
+
+	close(fd);
+}
+
+static void test_tagged_addr_abi_pmlen(int pmlen)
+{
+	int i, *p, ret;
+
+	i = ~pmlen;	/* sentinel that a rejected read() must leave untouched */
+
+	if (pmlen) {
+		p = (int *)((uintptr_t)&i | 1UL << (__riscv_xlen - pmlen));
+
+		ret = set_tagged_addr_ctrl(pmlen, false);
+		if (ret)
+			return ksft_test_result_error("PMLEN=%d ABI disabled setup (%d)\n",
+						      pmlen, ret);
+
+		ret = write(pipefd[1], p, sizeof(*p));
+		if (ret >= 0 || errno != EFAULT)
+			return ksft_test_result_fail("PMLEN=%d ABI disabled write\n", pmlen);
+
+		ret = read(dev_zero, p, sizeof(*p));
+		if (ret >= 0 || errno != EFAULT)
+			return ksft_test_result_fail("PMLEN=%d ABI disabled read\n", pmlen);
+
+		if (i != ~pmlen)
+			return ksft_test_result_fail("PMLEN=%d ABI disabled value\n", pmlen);
+
+		ret = set_tagged_addr_ctrl(pmlen, true);
+		if (ret)
+			return ksft_test_result_error("PMLEN=%d ABI enabled setup (%d)\n",
+						      pmlen, ret);
+
+		ret = write(pipefd[1], p, sizeof(*p));
+		if (ret != sizeof(*p))
+			return ksft_test_result_fail("PMLEN=%d ABI enabled write\n", pmlen);
+
+		ret = read(dev_zero, p, sizeof(*p));
+		if (ret != sizeof(*p))
+			return ksft_test_result_fail("PMLEN=%d ABI enabled read\n", pmlen);
+
+		if (i)	/* the accepted read() from dev_zero must have cleared i */
+			return ksft_test_result_fail("PMLEN=%d ABI enabled value\n", pmlen);
+
+		i = ~pmlen;	/* re-arm the sentinel for the invalid-tag checks */
+	} else {
+		/* The tagged address ABI cannot be enabled when PMLEN == 0. */
+		ret = set_tagged_addr_ctrl(pmlen, true);
+		if (ret != -EINVAL)
+			return ksft_test_result_error("PMLEN=%d ABI setup (%d)\n",
+						      pmlen, ret);
+	}
+
+	p = (int *)((uintptr_t)&i | 1UL << (__riscv_xlen - pmlen - 1));
+
+	ret = write(pipefd[1], p, sizeof(*p));
+	if (ret >= 0 || errno != EFAULT)
+		return ksft_test_result_fail("PMLEN=%d invalid tag write (%d)\n", pmlen, errno);
+
+	ret = read(dev_zero, p, sizeof(*p));
+	if (ret >= 0 || errno != EFAULT)
+		return ksft_test_result_fail("PMLEN=%d invalid tag read\n", pmlen);
+
+	if (i != ~pmlen)
+		return ksft_test_result_fail("PMLEN=%d invalid tag value\n", pmlen);
+
+	ksft_test_result_pass("PMLEN=%d tagged address ABI\n", pmlen);
+}
+
+static void test_tagged_addr_abi(void)
+{
+	ksft_print_msg("Testing tagged address ABI\n");
+
+	test_tagged_addr_abi_pmlen(0);	/* expects enabling the ABI to fail with -EINVAL */
+	test_tagged_addr_abi_pmlen(min_pmlen);
+	test_tagged_addr_abi_pmlen(max_pmlen);
+}
+
+static struct test_info {
+	unsigned int nr_tests;	/* results the test reports; main() sums these into the plan */
+	void (*test_fn)(void);
+} tests[] = {
+	{ .nr_tests = 17 * 3, test_pmlen },	/* requests 0..16, three results each */
+	{ .nr_tests = 3, test_dereference },	/* PMLEN 0, min_pmlen, max_pmlen */
+	{ .nr_tests = 2, test_fork_exec },	/* after fork, after fork+exec */
+	{ .nr_tests = 2, test_tagged_addr_abi_sysctl },	/* sysctl disabled, sysctl enabled */
+	{ .nr_tests = 3, test_tagged_addr_abi },	/* PMLEN 0, min_pmlen, max_pmlen */
+};
+
+int main(int argc, char **argv)
+{
+	const struct test_info *t, *end = tests + ARRAY_SIZE(tests);
+	unsigned int total = 0;
+
+	/* test_fork_exec() re-executes this binary with an empty argv[0]. */
+	if (argv[0][0] == '\0')
+		return execve_child();
+
+	dev_zero = open("/dev/zero", O_RDWR);
+	if (dev_zero < 0)
+		return 1;
+
+	/* Pipe writes make the kernel dereference the user buffer pointer. */
+	if (pipe(pipefd) != 0)
+		return 1;
+
+	ksft_print_header();
+
+	/* Sum the per-test result counts to form the plan. */
+	for (t = tests; t < end; t++)
+		total += t->nr_tests;
+
+	ksft_set_plan(total);
+
+	for (t = tests; t < end; t++)
+		t->test_fn();
+
+	ksft_finished();
+}
-- 
cgit v1.2.3


From 036a1407b4d49790ca5b35436d02de62212bc790 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Wed, 16 Oct 2024 13:27:51 -0700
Subject: KVM: riscv: selftests: Add Smnpm and Ssnpm to get-reg-list test

Add testing for the pointer masking extensions exposed to KVM guests.

Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://lore.kernel.org/r/20241016202814.4061541-11-samuel.holland@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 tools/testing/selftests/kvm/riscv/get-reg-list.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index 8e34f7fa44e9..54ab484d0000 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -41,9 +41,11 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_I:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_M:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMNPM:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSNPM:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT:
@@ -414,9 +416,11 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(I),
 		KVM_ISA_EXT_ARR(M),
 		KVM_ISA_EXT_ARR(V),
+		KVM_ISA_EXT_ARR(SMNPM),
 		KVM_ISA_EXT_ARR(SMSTATEEN),
 		KVM_ISA_EXT_ARR(SSAIA),
 		KVM_ISA_EXT_ARR(SSCOFPMF),
+		KVM_ISA_EXT_ARR(SSNPM),
 		KVM_ISA_EXT_ARR(SSTC),
 		KVM_ISA_EXT_ARR(SVINVAL),
 		KVM_ISA_EXT_ARR(SVNAPOT),
@@ -946,8 +950,10 @@ KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA);
 KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F);
 KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D);
 KVM_ISA_EXT_SIMPLE_CONFIG(h, H);
+KVM_ISA_EXT_SIMPLE_CONFIG(smnpm, SMNPM);
 KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN);
 KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF);
+KVM_ISA_EXT_SIMPLE_CONFIG(ssnpm, SSNPM);
 KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC);
 KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL);
 KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT);
@@ -1009,8 +1015,10 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_fp_f,
 	&config_fp_d,
 	&config_h,
+	&config_smnpm,
 	&config_smstateen,
 	&config_sscofpmf,
+	&config_ssnpm,
 	&config_sstc,
 	&config_svinval,
 	&config_svnapot,
-- 
cgit v1.2.3


From 72be5aa6be4af29fa2d77737e634b9a4c0e02d69 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 19 Oct 2024 18:15:45 +0100
Subject: KVM: selftests: Add test for PSCI SYSTEM_OFF2

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lore.kernel.org/r/20241019172459.2241939-5-dwmw2@infradead.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/aarch64/psci_test.c | 92 +++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c
index 61731a950def..eaa7655fefc1 100644
--- a/tools/testing/selftests/kvm/aarch64/psci_test.c
+++ b/tools/testing/selftests/kvm/aarch64/psci_test.c
@@ -54,6 +54,15 @@ static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id)
 	return res.a0;
 }
 
+static uint64_t psci_system_off2(uint64_t type, uint64_t cookie)
+{
+	struct arm_smccc_res res;
+
+	smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2, type, cookie, 0, 0, 0, 0, 0, &res);
+
+	return res.a0;	/* a0 carries the PSCI return code the guest test asserts on */
+}
+
 static uint64_t psci_features(uint32_t func_id)
 {
 	struct arm_smccc_res res;
@@ -188,11 +197,94 @@ static void host_test_system_suspend(void)
 	kvm_vm_free(vm);
 }
 
+static void guest_test_system_off2(void)
+{
+	uint64_t ret;
+
+	/* SYSTEM_OFF2 must be discoverable under both its FN and FN64 IDs, with HIBERNATE_OFF set */
+	GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) &
+		     PSCI_1_3_OFF_TYPE_HIBERNATE_OFF);
+	GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) &
+		     PSCI_1_3_OFF_TYPE_HIBERNATE_OFF);
+
+	/* A non-zero 'cookie' must be rejected with INVALID_PARAMS */
+	ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 1);
+	GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS);
+
+	/*
+	 * This would normally never return, so KVM sets the return value
+	 * to PSCI_RET_INTERNAL_FAILURE. The test case *does* return, so
+	 * that it can test both values for HIBERNATE_OFF.
+	 */
+	ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 0);
+	GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE);
+
+	/*
+	 * Revision F.b of the PSCI v1.3 specification documents zero as an
+	 * alias for HIBERNATE_OFF, since that's the value used in earlier
+	 * revisions of the spec and some implementations in the field.
+	 */
+	ret = psci_system_off2(0, 1);
+	GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS);
+
+	ret = psci_system_off2(0, 0);	/* expected to behave like HIBERNATE_OFF */
+	GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE);
+
+	GUEST_DONE();
+}
+
+static void host_test_system_off2(void)
+{
+	struct kvm_vcpu *source, *target;
+	struct kvm_mp_state mps;
+	uint64_t psci_version = 0;
+	int nr_shutdowns = 0;
+	struct kvm_run *run;
+	struct ucall uc;
+
+	setup_vm(guest_test_system_off2, &source, &target); /* NOTE(review): VM is never freed */
+
+	vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION, &psci_version);
+
+	TEST_ASSERT(psci_version >= PSCI_VERSION(1, 3),
+		    "Unexpected PSCI version %lu.%lu",
+		    PSCI_VERSION_MAJOR(psci_version),
+		    PSCI_VERSION_MINOR(psci_version));
+
+	vcpu_power_off(target);
+	run = source->run;
+
+	enter_guest(source);
+	while (run->exit_reason == KVM_EXIT_SYSTEM_EVENT) {
+		TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SHUTDOWN,
+			    "Unhandled system event: %u (expected: %u)",
+			    run->system_event.type, KVM_SYSTEM_EVENT_SHUTDOWN);
+		TEST_ASSERT(run->system_event.ndata >= 1,
+			    "Unexpected amount of system event data: %u (expected, >= 1)",
+			    run->system_event.ndata);
+		TEST_ASSERT(run->system_event.data[0] & KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2,
+			    "PSCI_OFF2 flag not set. Flags %llu (expected %llu)",
+			    run->system_event.data[0], KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2);
+
+		nr_shutdowns++;
+
+		/* Restart the vCPU so the guest can issue its next SYSTEM_OFF2 call */
+		mps.mp_state = KVM_MP_STATE_RUNNABLE;
+		vcpu_mp_state_set(source, &mps);
+
+		enter_guest(source);
+	}
+
+	TEST_ASSERT(get_ucall(source, &uc) == UCALL_DONE, "Guest did not exit cleanly");
+	TEST_ASSERT(nr_shutdowns == 2, "Two shutdown events were expected, but saw %d", nr_shutdowns);
+}
+
 int main(void)
 {
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND));
 
 	host_test_cpu_on();
 	host_test_system_suspend();
+	host_test_system_off2();
 	return 0;
 }
-- 
cgit v1.2.3


From c660d334b3a54f22836955ca5255edd946771614 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Fri, 25 Oct 2024 20:31:05 +0000
Subject: KVM: arm64: selftests: Convert to kernel's ESR terminology

Drop the KVM selftests specific flavoring of ESR in favor of the kernel
header.

Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20241025203106.3529261-4-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/aarch64/debug-exceptions.c    | 10 +++++-----
 tools/testing/selftests/kvm/aarch64/no-vgic-v3.c          |  2 +-
 tools/testing/selftests/kvm/aarch64/page_fault_test.c     |  4 ++--
 tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c | 12 ++++++------
 tools/testing/selftests/kvm/include/aarch64/processor.h   | 15 ++-------------
 tools/testing/selftests/kvm/lib/aarch64/processor.c       |  6 +++---
 6 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
index 2582c49e525a..ff7a949fc96a 100644
--- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
@@ -433,15 +433,15 @@ static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bp
 	vcpu_init_descriptor_tables(vcpu);
 
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_BRK_INS, guest_sw_bp_handler);
+				ESR_ELx_EC_BRK64, guest_sw_bp_handler);
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_HW_BP_CURRENT, guest_hw_bp_handler);
+				ESR_ELx_EC_BREAKPT_CUR, guest_hw_bp_handler);
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_WP_CURRENT, guest_wp_handler);
+				ESR_ELx_EC_WATCHPT_CUR, guest_wp_handler);
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_SSTEP_CURRENT, guest_ss_handler);
+				ESR_ELx_EC_SOFTSTP_CUR, guest_ss_handler);
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_SVC64, guest_svc_handler);
+				ESR_ELx_EC_SVC64, guest_svc_handler);
 
 	/* Specify bpn/wpn/ctx_bpn to be tested */
 	vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn);
diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c
index 943d65fc6b0b..58304bbc2036 100644
--- a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c
+++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c
@@ -150,7 +150,7 @@ static void test_guest_no_gicv3(void)
 	vcpu_init_descriptor_tables(vcpu);
 
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_UNKNOWN, guest_undef_handler);
+				ESR_ELx_EC_UNKNOWN, guest_undef_handler);
 
 	test_run_vcpu(vcpu);
 
diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
index d29b08198b42..ec33a8f9c908 100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -544,9 +544,9 @@ static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
 	vcpu_init_descriptor_tables(vcpu);
 
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_DABT, no_dabt_handler);
+				ESR_ELx_EC_DABT_CUR, no_dabt_handler);
 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_EC_IABT, no_iabt_handler);
+				ESR_ELx_EC_IABT_CUR, no_iabt_handler);
 }
 
 static void setup_gva_maps(struct kvm_vm *vm)
diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
index d31b9f64ba14..f9c0c86d7e85 100644
--- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
+++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
@@ -300,7 +300,7 @@ static void guest_sync_handler(struct ex_regs *regs)
 	uint64_t esr, ec;
 
 	esr = read_sysreg(esr_el1);
-	ec = (esr >> ESR_EC_SHIFT) & ESR_EC_MASK;
+	ec = ESR_ELx_EC(esr);
 
 	__GUEST_ASSERT(expected_ec == ec,
 			"PC: 0x%lx; ESR: 0x%lx; EC: 0x%lx; EC expected: 0x%lx",
@@ -338,10 +338,10 @@ static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx)
 	 * Reading/writing the event count/type registers should cause
 	 * an UNDEFINED exception.
 	 */
-	TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_cntr(pmc_idx));
-	TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0));
-	TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_typer(pmc_idx));
-	TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_typer(pmc_idx, 0));
+	TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx));
+	TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0));
+	TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx));
+	TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0));
 	/*
 	 * The bit corresponding to the (unimplemented) counter in
 	 * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ.
@@ -425,7 +425,7 @@ static void create_vpmu_vm(void *guest_code)
 
 	vpmu_vm.vm = vm_create(1);
 	vm_init_descriptor_tables(vpmu_vm.vm);
-	for (ec = 0; ec < ESR_EC_NUM; ec++) {
+	for (ec = 0; ec < ESR_ELx_EC_MAX + 1; ec++) {
 		vm_install_sync_handler(vpmu_vm.vm, VECTOR_SYNC_CURRENT, ec,
 					guest_sync_handler);
 	}
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
index de977d131082..1e8d0d531fbd 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -12,6 +12,8 @@
 
 #include <linux/stringify.h>
 #include <linux/types.h>
+#include <asm/brk-imm.h>
+#include <asm/esr.h>
 #include <asm/sysreg.h>
 
 
@@ -100,19 +102,6 @@ enum {
 			   (v) == VECTOR_SYNC_LOWER_64    || \
 			   (v) == VECTOR_SYNC_LOWER_32)
 
-#define ESR_EC_NUM		64
-#define ESR_EC_SHIFT		26
-#define ESR_EC_MASK		(ESR_EC_NUM - 1)
-
-#define ESR_EC_UNKNOWN		0x0
-#define ESR_EC_SVC64		0x15
-#define ESR_EC_IABT		0x21
-#define ESR_EC_DABT		0x25
-#define ESR_EC_HW_BP_CURRENT	0x31
-#define ESR_EC_SSTEP_CURRENT	0x33
-#define ESR_EC_WP_CURRENT	0x35
-#define ESR_EC_BRK_INS		0x3c
-
 /* Access flag */
 #define PTE_AF			(1ULL << 10)
 
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index fe4dc3693112..698e34f39241 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -450,7 +450,7 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
 }
 
 struct handlers {
-	handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM];
+	handler_fn exception_handlers[VECTOR_NUM][ESR_ELx_EC_MAX + 1];
 };
 
 void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu)
@@ -469,7 +469,7 @@ void route_exception(struct ex_regs *regs, int vector)
 	switch (vector) {
 	case VECTOR_SYNC_CURRENT:
 	case VECTOR_SYNC_LOWER_64:
-		ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK;
+		ec = ESR_ELx_EC(read_sysreg(esr_el1));
 		valid_ec = true;
 		break;
 	case VECTOR_IRQ_CURRENT:
@@ -508,7 +508,7 @@ void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec,
 
 	assert(VECTOR_IS_SYNC(vector));
 	assert(vector < VECTOR_NUM);
-	assert(ec < ESR_EC_NUM);
+	assert(ec <= ESR_ELx_EC_MAX);
 	handlers->exception_handlers[vector][ec] = handler;
 }
 
-- 
cgit v1.2.3


From 3eb09a3e028e26fd26bc132c7aa0577f00de2d05 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Fri, 25 Oct 2024 20:31:06 +0000
Subject: KVM: arm64: selftests: Add tests for MMIO external abort injection

Test that the plumbing exposed to userspace for injecting aborts in
response to unexpected MMIO works as intended in two different flavors:

 - A 'normal' MMIO instruction (i.e. ESR_ELx.ISV=1)

 - An ISV=0 MMIO instruction with/without KVM_CAP_ARM_NISV_TO_USER
   enabled

Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20241025203106.3529261-5-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/Makefile             |   1 +
 tools/testing/selftests/kvm/aarch64/mmio_abort.c | 159 +++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/mmio_abort.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 960cf6a77198..98957a99ead6 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -156,6 +156,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs
 TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
+TEST_GEN_PROGS_aarch64 += aarch64/mmio_abort
 TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test
 TEST_GEN_PROGS_aarch64 += aarch64/psci_test
 TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs
diff --git a/tools/testing/selftests/kvm/aarch64/mmio_abort.c b/tools/testing/selftests/kvm/aarch64/mmio_abort.c
new file mode 100644
index 000000000000..8b7a80a51b1c
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/mmio_abort.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mmio_abort - Tests for userspace MMIO abort injection
+ *
+ * Copyright (c) 2024 Google LLC
+ */
+#include "processor.h"
+#include "test_util.h"
+
+#define MMIO_ADDR	0x8000000ULL
+
+static u64 expected_abort_pc;
+
+static void expect_sea_handler(struct ex_regs *regs)
+{
+	u64 esr = read_sysreg(esr_el1);
+
+	GUEST_ASSERT_EQ(regs->pc, expected_abort_pc);	/* abort must be taken on the MMIO load */
+	GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR);
+	GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+
+	GUEST_DONE();	/* the injected abort arrived as expected: end the guest */
+}
+
+static void unexpected_dabt_handler(struct ex_regs *regs)
+{
+	GUEST_FAIL("Unexpected data abort at PC: %lx\n", regs->pc);	/* no abort was injected */
+}
+
+static struct kvm_vm *vm_create_with_dabt_handler(struct kvm_vcpu **vcpu, void *guest_code,
+						  handler_fn dabt_handler)
+{
+	struct kvm_vm *vm = vm_create_with_one_vcpu(vcpu, guest_code);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_ELx_EC_DABT_CUR, dabt_handler);
+
+	virt_map(vm, MMIO_ADDR, MMIO_ADDR, 1);	/* GVA == GPA; last arg presumably a page count */
+
+	return vm;
+}
+
+static void vcpu_inject_extabt(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_events events = {};
+
+	events.exception.ext_dabt_pending = true;	/* make an external data abort pending */
+	vcpu_events_set(vcpu, &events);
+}
+
+static void vcpu_run_expect_done(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	vcpu_run(vcpu);
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);	/* surface the guest-side assertion failure */
+		break;
+	case UCALL_DONE:	/* the only outcome treated as success */
+		break;
+	default:
+		TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+	}
+}
+
+extern char test_mmio_abort_insn;
+
+static void test_mmio_abort_guest(void)
+{
+	WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_abort_insn);
+
+	asm volatile("test_mmio_abort_insn:\n\t"
+		     "ldr x0, [%0]\n\t"	/* host expects this to exit as an MMIO read */
+		     : : "r" (MMIO_ADDR) : "x0", "memory");
+
+	GUEST_FAIL("MMIO instruction should not retire");
+}
+
+/*
+ * Test that KVM doesn't complete MMIO emulation when userspace has made an
+ * external abort pending for the instruction.
+ */
+static void test_mmio_abort(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_abort_guest,
+							expect_sea_handler);
+	struct kvm_run *run = vcpu->run;
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
+	TEST_ASSERT_EQ(run->mmio.phys_addr, MMIO_ADDR);
+	TEST_ASSERT_EQ(run->mmio.len, sizeof(unsigned long));
+	TEST_ASSERT(!run->mmio.is_write, "Expected MMIO read");
+
+	vcpu_inject_extabt(vcpu);	/* abort the access instead of completing the read */
+	vcpu_run_expect_done(vcpu);
+	kvm_vm_free(vm);
+}
+
+extern char test_mmio_nisv_insn;
+
+static void test_mmio_nisv_guest(void)
+{
+	u64 addr = MMIO_ADDR;	/* the post-indexed load writes its base register back */
+
+	WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_nisv_insn);
+	asm volatile("test_mmio_nisv_insn:\n\t"
+		     "ldr x0, [%0], #8\n\t"
+		     : "+r" (addr) : : "x0", "memory");
+	GUEST_FAIL("MMIO instruction should not retire");
+}
+
+/*
+ * Test that the KVM_RUN ioctl fails for ESR_EL2.ISV=0 MMIO aborts if userspace
+ * hasn't enabled KVM_CAP_ARM_NISV_TO_USER.
+ */
+static void test_mmio_nisv(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest,
+							unexpected_dabt_handler);
+
+	TEST_ASSERT(_vcpu_run(vcpu), "Expected nonzero return code from KVM_RUN");
+	TEST_ASSERT_EQ(errno, ENOSYS);	/* KVM_RUN itself fails; no exit is reported */
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Test that ESR_EL2.ISV=0 MMIO aborts reach userspace and that an injected SEA
+ * reaches the guest.
+ */
+static void test_mmio_nisv_abort(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest,
+							expect_sea_handler);
+	struct kvm_run *run = vcpu->run;
+
+	vm_enable_cap(vm, KVM_CAP_ARM_NISV_TO_USER, 1);	/* opt in to KVM_EXIT_ARM_NISV exits */
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_NISV);
+	TEST_ASSERT_EQ(run->arm_nisv.fault_ipa, MMIO_ADDR);
+
+	vcpu_inject_extabt(vcpu);
+	vcpu_run_expect_done(vcpu);
+	kvm_vm_free(vm);
+}
+
+int main(void)
+{
+	test_mmio_abort();
+	test_mmio_nisv();	/* runs without KVM_CAP_ARM_NISV_TO_USER */
+	test_mmio_nisv_abort();	/* same guest code, with the capability enabled */
+}
-- 
cgit v1.2.3


From 0bcceb1f51c77f6b98a7aab00847ed340bf36e35 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 25 Oct 2024 06:11:59 -0700
Subject: iommufd: Selftest coverage for IOMMU_IOAS_MAP_FILE

Add test cases to exercise IOMMU_IOAS_MAP_FILE.

Link: https://patch.msgid.link/r/1729861919-234514-10-git-send-email-steven.sistare@oracle.com
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/iommufd.c          | 124 ++++++++++++++++++++---
 tools/testing/selftests/iommu/iommufd_fail_nth.c |  39 +++++++
 tools/testing/selftests/iommu/iommufd_utils.h    |  57 +++++++++++
 3 files changed, 205 insertions(+), 15 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 4927b9add5ad..88b92bb69756 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#include <asm/unistd.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <sys/eventfd.h>
@@ -49,6 +50,9 @@ static __attribute__((constructor)) void setup_sizes(void)
 	vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE,
 		   MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
 	assert(vrc == buffer);
+
+	mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+				&mfd);
 }
 
 FIXTURE(iommufd)
@@ -128,6 +132,7 @@ TEST_F(iommufd, cmd_length)
 	TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length);
 	TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
 	TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
+	TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
 #undef TEST_LENGTH
 }
 
@@ -1372,6 +1377,7 @@ FIXTURE_VARIANT(iommufd_mock_domain)
 {
 	unsigned int mock_domains;
 	bool hugepages;
+	bool file;
 };
 
 FIXTURE_SETUP(iommufd_mock_domain)
@@ -1410,26 +1416,45 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain)
 {
 	.mock_domains = 1,
 	.hugepages = false,
+	.file = false,
 };
 
 FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains)
 {
 	.mock_domains = 2,
 	.hugepages = false,
+	.file = false,
 };
 
 FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage)
 {
 	.mock_domains = 1,
 	.hugepages = true,
+	.file = false,
 };
 
 FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
 {
 	.mock_domains = 2,
 	.hugepages = true,
+	.file = false,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file)
+{
+	.mock_domains = 1,
+	.hugepages = false,
+	.file = true,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file_hugepage)
+{
+	.mock_domains = 1,
+	.hugepages = true,
+	.file = true,
 };
 
+
 /* Have the kernel check that the user pages made it to the iommu_domain */
 #define check_mock_iova(_ptr, _iova, _length)                                \
 	({                                                                   \
@@ -1455,7 +1480,10 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
 		}                                                            \
 	})
 
-TEST_F(iommufd_mock_domain, basic)
+static void
+test_basic_mmap(struct __test_metadata *_metadata,
+		struct _test_data_iommufd_mock_domain *self,
+		const struct _fixture_variant_iommufd_mock_domain *variant)
 {
 	size_t buf_size = self->mmap_buf_size;
 	uint8_t *buf;
@@ -1478,6 +1506,40 @@ TEST_F(iommufd_mock_domain, basic)
 	test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova);
 }
 
+/* "basic" for the file variants: map memfd ranges via IOMMU_IOAS_MAP_FILE */
+static void
+test_basic_file(struct __test_metadata *_metadata,
+		struct _test_data_iommufd_mock_domain *self,
+		const struct _fixture_variant_iommufd_mock_domain *variant)
+{
+	size_t buf_size = self->mmap_buf_size;
+	uint8_t *buf;
+	__u64 iova;
+	int mfd_tmp;
+
+	/* Simple one page map */
+	test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+	check_mock_iova(mfd_buffer, iova, PAGE_SIZE);
+
+	buf = memfd_mmap(buf_size, PROT_READ | PROT_WRITE, MAP_SHARED, &mfd_tmp);
+	ASSERT_NE(MAP_FAILED, buf);
+
+	/* A range ending past EOF is refused, also once the file is emptied */
+	test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size + 1, &iova);
+	ASSERT_EQ(0, ftruncate(mfd_tmp, 0));
+	test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size, &iova);
+	ASSERT_EQ(0, munmap(buf, buf_size));
+	close(mfd_tmp);
+}
+
+TEST_F(iommufd_mock_domain, basic)
+{
+	if (variant->file)
+		test_basic_file(_metadata, self, variant);
+	else
+		test_basic_mmap(_metadata, self, variant);
+}
+
 TEST_F(iommufd_mock_domain, ro_unshare)
 {
 	uint8_t *buf;
@@ -1513,9 +1575,13 @@ TEST_F(iommufd_mock_domain, all_aligns)
 	unsigned int start;
 	unsigned int end;
 	uint8_t *buf;
+	int prot = PROT_READ | PROT_WRITE;
+	int mfd;
 
-	buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
-		   0);
+	if (variant->file)
+		buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+	else
+		buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
 	ASSERT_NE(MAP_FAILED, buf);
 	check_refs(buf, buf_size, 0);
 
@@ -1532,7 +1598,12 @@ TEST_F(iommufd_mock_domain, all_aligns)
 			size_t length = end - start;
 			__u64 iova;
 
-			test_ioctl_ioas_map(buf + start, length, &iova);
+			if (variant->file) {
+				test_ioctl_ioas_map_file(mfd, start, length,
+							 &iova);
+			} else {
+				test_ioctl_ioas_map(buf + start, length, &iova);
+			}
 			check_mock_iova(buf + start, iova, length);
 			check_refs(buf + start / PAGE_SIZE * PAGE_SIZE,
 				   end / PAGE_SIZE * PAGE_SIZE -
@@ -1544,6 +1615,8 @@ TEST_F(iommufd_mock_domain, all_aligns)
 	}
 	check_refs(buf, buf_size, 0);
 	ASSERT_EQ(0, munmap(buf, buf_size));
+	if (variant->file)
+		close(mfd);
 }
 
 TEST_F(iommufd_mock_domain, all_aligns_copy)
@@ -1554,9 +1627,13 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
 	unsigned int start;
 	unsigned int end;
 	uint8_t *buf;
+	int prot = PROT_READ | PROT_WRITE;
+	int mfd;
 
-	buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
-		   0);
+	if (variant->file)
+		buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+	else
+		buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
 	ASSERT_NE(MAP_FAILED, buf);
 	check_refs(buf, buf_size, 0);
 
@@ -1575,7 +1652,12 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
 			uint32_t mock_stdev_id;
 			__u64 iova;
 
-			test_ioctl_ioas_map(buf + start, length, &iova);
+			if (variant->file) {
+				test_ioctl_ioas_map_file(mfd, start, length,
+							 &iova);
+			} else {
+				test_ioctl_ioas_map(buf + start, length, &iova);
+			}
 
 			/* Add and destroy a domain while the area exists */
 			old_id = self->hwpt_ids[1];
@@ -1596,15 +1678,18 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
 	}
 	check_refs(buf, buf_size, 0);
 	ASSERT_EQ(0, munmap(buf, buf_size));
+	if (variant->file)
+		close(mfd);
 }
 
 TEST_F(iommufd_mock_domain, user_copy)
 {
+	void *buf = variant->file ? mfd_buffer : buffer;
 	struct iommu_test_cmd access_cmd = {
 		.size = sizeof(access_cmd),
 		.op = IOMMU_TEST_OP_ACCESS_PAGES,
 		.access_pages = { .length = BUFFER_SIZE,
-				  .uptr = (uintptr_t)buffer },
+				  .uptr = (uintptr_t)buf },
 	};
 	struct iommu_ioas_copy copy_cmd = {
 		.size = sizeof(copy_cmd),
@@ -1623,9 +1708,13 @@ TEST_F(iommufd_mock_domain, user_copy)
 
 	/* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */
 	test_ioctl_ioas_alloc(&ioas_id);
-	test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE,
-			       &copy_cmd.src_iova);
-
+	if (variant->file) {
+		test_ioctl_ioas_map_id_file(ioas_id, mfd, 0, BUFFER_SIZE,
+					    &copy_cmd.src_iova);
+	} else {
+		test_ioctl_ioas_map_id(ioas_id, buf, BUFFER_SIZE,
+				       &copy_cmd.src_iova);
+	}
 	test_cmd_create_access(ioas_id, &access_cmd.id,
 			       MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
 
@@ -1635,12 +1724,17 @@ TEST_F(iommufd_mock_domain, user_copy)
 			&access_cmd));
 	copy_cmd.src_ioas_id = ioas_id;
 	ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
-	check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+	check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
 
 	/* Now replace the ioas with a new one */
 	test_ioctl_ioas_alloc(&new_ioas_id);
-	test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE,
-			       &copy_cmd.src_iova);
+	if (variant->file) {
+		test_ioctl_ioas_map_id_file(new_ioas_id, mfd, 0, BUFFER_SIZE,
+					    &copy_cmd.src_iova);
+	} else {
+		test_ioctl_ioas_map_id(new_ioas_id, buf, BUFFER_SIZE,
+				       &copy_cmd.src_iova);
+	}
 	test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id);
 
 	/* Destroy the old ioas and cleanup copied mapping */
@@ -1654,7 +1748,7 @@ TEST_F(iommufd_mock_domain, user_copy)
 			&access_cmd));
 	copy_cmd.src_ioas_id = new_ioas_id;
 	ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
-	check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+	check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
 
 	test_cmd_destroy_access_pages(
 		access_cmd.id, access_cmd.access_pages.out_access_pages_id);
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index c5d5e69452b0..2d7d01638be8 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -47,6 +47,9 @@ static __attribute__((constructor)) void setup_buffer(void)
 
 	buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
 		      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+	mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+				&mfd);
 }
 
 /*
@@ -331,6 +334,42 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)
 	return 0;
 }
 
+/* iopt_area_fill_domains() and iopt_area_fill_domain() */
+TEST_FAIL_NTH(basic_fail_nth, map_file_domain)
+{
+	uint32_t ioas_id;
+	__u32 stdev_id;
+	__u32 hwpt_id;
+	__u64 iova;
+
+	self->fd = open("/dev/iommu", O_RDWR);
+	if (self->fd == -1)
+		return -1;
+
+	if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+		return -1;
+
+	if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+		return -1;
+
+	fail_nth_enable();
+
+	if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+		return -1;
+
+	if (_test_ioctl_ioas_map_file(self->fd, ioas_id, mfd, 0, 262144, &iova,
+				      IOMMU_IOAS_MAP_WRITEABLE |
+					      IOMMU_IOAS_MAP_READABLE))
+		return -1;
+
+	if (_test_ioctl_destroy(self->fd, stdev_id))
+		return -1;
+
+	if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+		return -1;
+	return 0;
+}
+
 TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
 {
 	uint32_t ioas_id;
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 40f6f14ce136..6a11c26370f3 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -40,12 +40,28 @@ static inline bool test_bit(unsigned int nr, unsigned long *addr)
 static void *buffer;
 static unsigned long BUFFER_SIZE;
 
+static void *mfd_buffer;
+static int mfd;
+
 static unsigned long PAGE_SIZE;
 
 #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
 #define offsetofend(TYPE, MEMBER) \
 	(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
 
+/* Create a @length byte memfd and mmap() it; the fd is stored in *@mfd_p. */
+static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p)
+{
+	int mfd = memfd_create("buffer", (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0);
+
+	if (mfd < 0) /* only a negative return is an error, fd 0 is valid */
+		return MAP_FAILED;
+	if (ftruncate(mfd, length))
+		return MAP_FAILED;
+	*mfd_p = mfd;
+	return mmap(0, length, prot, flags, mfd, 0);
+}
+
 /*
  * Have the kernel check the refcount on pages. I don't know why a freshly
  * mmap'd anon non-compound page starts out with a ref of 3
@@ -589,6 +605,47 @@ static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova,
 	EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \
 						    iova, length, NULL))
 
+static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd,
+				     size_t start, size_t length, __u64 *iova,
+				     unsigned int flags)
+{
+	struct iommu_ioas_map_file cmd = {
+		.size = sizeof(cmd),
+		.flags = flags,
+		.ioas_id = ioas_id,
+		.fd = mfd,
+		.start = start,
+		.length = length,
+	};
+	int ret;
+
+	if (flags & IOMMU_IOAS_MAP_FIXED_IOVA)
+		cmd.iova = *iova;
+
+	ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &cmd);
+	*iova = cmd.iova;
+	return ret;
+}
+
+#define test_ioctl_ioas_map_file(mfd, start, length, iova_p)                   \
+	ASSERT_EQ(0,                                                           \
+		  _test_ioctl_ioas_map_file(                                   \
+			  self->fd, self->ioas_id, mfd, start, length, iova_p, \
+			  IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_err_ioctl_ioas_map_file(_errno, mfd, start, length, iova_p)     \
+	EXPECT_ERRNO(                                                        \
+		_errno,                                                      \
+		_test_ioctl_ioas_map_file(                                   \
+			self->fd, self->ioas_id, mfd, start, length, iova_p, \
+			IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_ioctl_ioas_map_id_file(ioas_id, mfd, start, length, iova_p)     \
+	ASSERT_EQ(0,                                                         \
+		  _test_ioctl_ioas_map_file(                                 \
+			  self->fd, ioas_id, mfd, start, length, iova_p,     \
+			  IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
 static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit)
 {
 	struct iommu_test_cmd memlimit_cmd = {
-- 
cgit v1.2.3


From 47e99f30730c0167cd32c9a2fd4a74f0a024cb2b Mon Sep 17 00:00:00 2001
From: Leo Stone <leocstone@gmail.com>
Date: Mon, 21 Oct 2024 10:46:44 -0700
Subject: selftest/tcp-ao: Add filter tests

Add tests that check if getsockopt(TCP_AO_GET_KEYS) returns the right
keys when using different filters.

Sample output:

> # ok 114 filter keys: by sndid, rcvid, address
> # ok 115 filter keys: by is_current
> # ok 116 filter keys: by is_rnext
> # ok 117 filter keys: by sndid, rcvid
> # ok 118 filter keys: correct nkeys when in.nkeys < matches

Acked-by: Dmitry Safonov <0x7f454c46@gmail.com>
Signed-off-by: Leo Stone <leocstone@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241021174652.6949-1-leocstone@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/tcp_ao/setsockopt-closed.c       | 186 ++++++++++++++++++++-
 1 file changed, 181 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
index 084db4ecdff6..0abb9807d742 100644
--- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
+++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
@@ -6,6 +6,8 @@
 
 static union tcp_addr tcp_md5_client;
 
+#define FILTER_TEST_NKEYS 16
+
 static int test_port = 7788;
 static void make_listen(int sk)
 {
@@ -813,23 +815,197 @@ static void duplicate_tests(void)
 	setsockopt_checked(sk, TCP_AO_ADD_KEY, &ao, EEXIST, "duplicate: SendID differs");
 }
 
+static void fetch_all_keys(int sk, struct tcp_ao_getsockopt *keys)
+{
+	socklen_t optlen = sizeof(struct tcp_ao_getsockopt);
+
+	memset(keys, 0, sizeof(struct tcp_ao_getsockopt) * FILTER_TEST_NKEYS);
+	keys[0].get_all = 1;
+	keys[0].nkeys = FILTER_TEST_NKEYS;
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, &keys[0], &optlen))
+		test_error("getsockopt");
+}
+
+/* Open a socket holding FILTER_TEST_NKEYS keys and dump them into @keys. */
+static int prepare_test_keys(struct tcp_ao_getsockopt *keys)
+{
+	const char *test_password = "Test password number ";
+	struct tcp_ao_add test_ao[FILTER_TEST_NKEYS];
+	char test_password_scratch[64] = {};
+	u8 rcvid = 100, sndid = 100;
+	int sk;
+
+	sk = socket(test_family, SOCK_STREAM, IPPROTO_TCP);
+	if (sk < 0)
+		test_error("socket()");
+
+	for (int i = 0; i < FILTER_TEST_NKEYS; i++) {
+		snprintf(test_password_scratch, 64, "%s %d", test_password, i);
+		test_prepare_key(&test_ao[i], DEFAULT_TEST_ALGO, this_ip_dest,
+			  false, false, DEFAULT_TEST_PREFIX, 0, sndid++,
+			  rcvid++, 0, 0, strlen(test_password_scratch),
+			  test_password_scratch);
+	}
+	test_ao[0].set_current = 1;
+	test_ao[1].set_rnext = 1;
+	/* One key with a different addr and overlapping sndid, rcvid */
+	tcp_addr_to_sockaddr_in(&test_ao[2].addr, &this_ip_addr, 0);
+	test_ao[2].sndid = 100;
+	test_ao[2].rcvid = 100;
+
+	/* Add keys in random order; the used slot is refilled from the tail */
+	for (int i = 0; i < FILTER_TEST_NKEYS; i++) {
+		int randidx = rand() % (FILTER_TEST_NKEYS - i);
+
+		if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY,
+			       &test_ao[randidx], sizeof(struct tcp_ao_add)))
+			test_error("setsockopt()");
+		test_ao[randidx] = test_ao[FILTER_TEST_NKEYS - 1 - i];
+	}
+
+	fetch_all_keys(sk, keys);
+
+	return sk;
+}
+
+/* Assumes passwords are unique */
+static int compare_mkts(struct tcp_ao_getsockopt *expected, int nexpected,
+			struct tcp_ao_getsockopt *actual, int nactual)
+{
+	int matches = 0;
+
+	for (int i = 0; i < nexpected; i++) {
+		for (int j = 0; j < nactual; j++) {
+			if (memcmp(expected[i].key, actual[j].key,
+				   TCP_AO_MAXKEYLEN) == 0)
+				matches++;
+		}
+	}
+	return nexpected - matches;
+}
+
+/* Query TCP_AO_GET_KEYS with @filter and check the reply against @expected. */
+static void filter_keys_checked(int sk, struct tcp_ao_getsockopt *filter,
+				struct tcp_ao_getsockopt *expected,
+				unsigned int nexpected, const char *tst)
+{
+	struct tcp_ao_getsockopt filtered_keys[FILTER_TEST_NKEYS] = {};
+	socklen_t len = sizeof(struct tcp_ao_getsockopt);
+
+	memcpy(&filtered_keys[0], filter, sizeof(struct tcp_ao_getsockopt));
+	filtered_keys[0].nkeys = FILTER_TEST_NKEYS;
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, filtered_keys, &len))
+		test_error("getsockopt");
+	if (filtered_keys[0].nkeys != nexpected) {
+		test_fail("wrong nr of keys, expected %u got %u", nexpected,
+			  filtered_keys[0].nkeys);
+		goto out_close;
+	}
+	if (compare_mkts(expected, nexpected, filtered_keys,
+			 filtered_keys[0].nkeys)) {
+		test_fail("got wrong keys back");
+		goto out_close;
+	}
+	test_ok("filter keys: %s", tst);
+
+out_close:
+	/* Each check is single use: @sk is closed and @filter is zeroed */
+	close(sk);
+	memset(filter, 0, sizeof(struct tcp_ao_getsockopt));
+}
+
+/* Run the TCP_AO_GET_KEYS filter checks, each on a freshly keyed socket. */
+static void filter_tests(void)
+{
+	struct tcp_ao_getsockopt original_keys[FILTER_TEST_NKEYS];
+	struct tcp_ao_getsockopt expected_keys[FILTER_TEST_NKEYS];
+	struct tcp_ao_getsockopt filter = {};
+	int sk, f, nmatches;
+	socklen_t len = sizeof(struct tcp_ao_getsockopt);
+
+	f = 2;
+	sk = prepare_test_keys(original_keys);
+	filter.rcvid = original_keys[f].rcvid;
+	filter.sndid = original_keys[f].sndid;
+	memcpy(&filter.addr, &original_keys[f].addr,
+	       sizeof(original_keys[f].addr));
+	filter.prefix = original_keys[f].prefix;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1,
+			    "by sndid, rcvid, address");
+
+	f = -1;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].is_current) {
+			f = i;
+			break;
+		}
+	}
+	if (f < 0)
+		test_error("No current key after adding one");
+	filter.is_current = 1;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1, "by is_current");
+
+	f = -1;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].is_rnext) {
+			f = i;
+			break;
+		}
+	}
+	if (f < 0)
+		test_error("No rnext key after adding one");
+	filter.is_rnext = 1;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1, "by is_rnext");
+
+	f = -1;
+	nmatches = 0;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].sndid == 100) {
+			f = i;
+			expected_keys[nmatches] = original_keys[i];
+			nmatches++;
+		}
+	}
+	if (f < 0)
+		test_error("No key for sndid 100");
+	if (nmatches != 2)
+		test_error("Should have 2 keys with sndid 100");
+	filter.rcvid = original_keys[f].rcvid;
+	filter.sndid = original_keys[f].sndid;
+	filter.addr.ss_family = test_family;
+	filter_keys_checked(sk, &filter, expected_keys, nmatches,
+			    "by sndid, rcvid");
+
+	sk = prepare_test_keys(original_keys);
+	filter.get_all = 1;
+	filter.nkeys = FILTER_TEST_NKEYS / 2;
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, &filter, &len))
+		test_error("getsockopt");
+	if (filter.nkeys == FILTER_TEST_NKEYS)
+		test_ok("filter keys: correct nkeys when in.nkeys < matches");
+	else
+		test_fail("filter keys: wrong nkeys, expected %u got %u",
+			  FILTER_TEST_NKEYS, filter.nkeys);
+	close(sk);
+}
+
 static void *client_fn(void *arg)
 {
 	if (inet_pton(TEST_FAMILY, __TEST_CLIENT_IP(2), &tcp_md5_client) != 1)
 		test_error("Can't convert ip address");
 	extend_tests();
 	einval_tests();
+	filter_tests();
 	duplicate_tests();
-	/*
-	 * TODO: check getsockopt(TCP_AO_GET_KEYS) with different filters
-	 * returning proper nr & keys;
-	 */
 
 	return NULL;
 }
 
 int main(int argc, char *argv[])
 {
-	test_init(121, client_fn, NULL);
+	test_init(126, client_fn, NULL);
 	return 0;
 }
-- 
cgit v1.2.3


From 11f0c8e0468a8bc625164f68dd5ff2a9436658db Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Wed, 9 Oct 2024 10:44:23 +0530
Subject: selftests: Rename sigaltstack to generic signal

Rename sigaltstack to generic signal directory, to allow adding more
signal tests in the future.

Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/Makefile                   |   2 +-
 tools/testing/selftests/sigaltstack/.gitignore     |   2 -
 tools/testing/selftests/sigaltstack/Makefile       |   6 -
 .../selftests/sigaltstack/current_stack_pointer.h  |  23 ---
 tools/testing/selftests/sigaltstack/sas.c          | 197 ---------------------
 tools/testing/selftests/signal/.gitignore          |   2 +
 tools/testing/selftests/signal/Makefile            |   6 +
 .../selftests/signal/current_stack_pointer.h       |  23 +++
 tools/testing/selftests/signal/sas.c               | 197 +++++++++++++++++++++
 9 files changed, 229 insertions(+), 229 deletions(-)
 delete mode 100644 tools/testing/selftests/sigaltstack/.gitignore
 delete mode 100644 tools/testing/selftests/sigaltstack/Makefile
 delete mode 100644 tools/testing/selftests/sigaltstack/current_stack_pointer.h
 delete mode 100644 tools/testing/selftests/sigaltstack/sas.c
 create mode 100644 tools/testing/selftests/signal/.gitignore
 create mode 100644 tools/testing/selftests/signal/Makefile
 create mode 100644 tools/testing/selftests/signal/current_stack_pointer.h
 create mode 100644 tools/testing/selftests/signal/sas.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index b38199965f99..3cfad04d0b5c 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -90,7 +90,7 @@ TARGETS += rtc
 TARGETS += rust
 TARGETS += seccomp
 TARGETS += sgx
-TARGETS += sigaltstack
+TARGETS += signal
 TARGETS += size
 TARGETS += sparc64
 TARGETS += splice
diff --git a/tools/testing/selftests/sigaltstack/.gitignore b/tools/testing/selftests/sigaltstack/.gitignore
deleted file mode 100644
index 50a19a8888ce..000000000000
--- a/tools/testing/selftests/sigaltstack/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-sas
diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile
deleted file mode 100644
index 3e96d5d47036..000000000000
--- a/tools/testing/selftests/sigaltstack/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-CFLAGS = -Wall
-TEST_GEN_PROGS = sas
-
-include ../lib.mk
-
diff --git a/tools/testing/selftests/sigaltstack/current_stack_pointer.h b/tools/testing/selftests/sigaltstack/current_stack_pointer.h
deleted file mode 100644
index 09da8f1011ce..000000000000
--- a/tools/testing/selftests/sigaltstack/current_stack_pointer.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#if __alpha__
-register unsigned long sp asm("$30");
-#elif __arm__ || __aarch64__ || __csky__ || __m68k__ || __mips__ || __riscv
-register unsigned long sp asm("sp");
-#elif __i386__
-register unsigned long sp asm("esp");
-#elif __loongarch64
-register unsigned long sp asm("$sp");
-#elif __powerpc__
-register unsigned long sp asm("r1");
-#elif __s390x__
-register unsigned long sp asm("%15");
-#elif __sh__
-register unsigned long sp asm("r15");
-#elif __x86_64__
-register unsigned long sp asm("rsp");
-#elif __XTENSA__
-register unsigned long sp asm("a1");
-#else
-#error "implement current_stack_pointer equivalent"
-#endif
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
deleted file mode 100644
index 07227fab1cc9..000000000000
--- a/tools/testing/selftests/sigaltstack/sas.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Stas Sergeev <stsp@users.sourceforge.net>
- *
- * test sigaltstack(SS_ONSTACK | SS_AUTODISARM)
- * If that succeeds, then swapcontext() can be used inside sighandler safely.
- *
- */
-
-#define _GNU_SOURCE
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <ucontext.h>
-#include <alloca.h>
-#include <string.h>
-#include <assert.h>
-#include <errno.h>
-#include <sys/auxv.h>
-
-#include "../kselftest.h"
-#include "current_stack_pointer.h"
-
-#ifndef SS_AUTODISARM
-#define SS_AUTODISARM  (1U << 31)
-#endif
-
-#ifndef AT_MINSIGSTKSZ
-#define AT_MINSIGSTKSZ	51
-#endif
-
-static unsigned int stack_size;
-static void *sstack, *ustack;
-static ucontext_t uc, sc;
-static const char *msg = "[OK]\tStack preserved";
-static const char *msg2 = "[FAIL]\tStack corrupted";
-struct stk_data {
-	char msg[128];
-	int flag;
-};
-
-void my_usr1(int sig, siginfo_t *si, void *u)
-{
-	char *aa;
-	int err;
-	stack_t stk;
-	struct stk_data *p;
-
-	if (sp < (unsigned long)sstack ||
-			sp >= (unsigned long)sstack + stack_size) {
-		ksft_exit_fail_msg("SP is not on sigaltstack\n");
-	}
-	/* put some data on stack. other sighandler will try to overwrite it */
-	aa = alloca(1024);
-	assert(aa);
-	p = (struct stk_data *)(aa + 512);
-	strcpy(p->msg, msg);
-	p->flag = 1;
-	ksft_print_msg("[RUN]\tsignal USR1\n");
-	err = sigaltstack(NULL, &stk);
-	if (err) {
-		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
-		exit(EXIT_FAILURE);
-	}
-	if (stk.ss_flags != SS_DISABLE)
-		ksft_test_result_fail("tss_flags=%x, should be SS_DISABLE\n",
-				stk.ss_flags);
-	else
-		ksft_test_result_pass(
-				"sigaltstack is disabled in sighandler\n");
-	swapcontext(&sc, &uc);
-	ksft_print_msg("%s\n", p->msg);
-	if (!p->flag) {
-		ksft_exit_fail_msg("[RUN]\tAborting\n");
-		exit(EXIT_FAILURE);
-	}
-}
-
-void my_usr2(int sig, siginfo_t *si, void *u)
-{
-	char *aa;
-	struct stk_data *p;
-
-	ksft_print_msg("[RUN]\tsignal USR2\n");
-	aa = alloca(1024);
-	/* dont run valgrind on this */
-	/* try to find the data stored by previous sighandler */
-	p = memmem(aa, 1024, msg, strlen(msg));
-	if (p) {
-		ksft_test_result_fail("sigaltstack re-used\n");
-		/* corrupt the data */
-		strcpy(p->msg, msg2);
-		/* tell other sighandler that his data is corrupted */
-		p->flag = 0;
-	}
-}
-
-static void switch_fn(void)
-{
-	ksft_print_msg("[RUN]\tswitched to user ctx\n");
-	raise(SIGUSR2);
-	setcontext(&sc);
-}
-
-int main(void)
-{
-	struct sigaction act;
-	stack_t stk;
-	int err;
-
-	/* Make sure more than the required minimum. */
-	stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
-	ksft_print_msg("[NOTE]\tthe stack size is %u\n", stack_size);
-
-	ksft_print_header();
-	ksft_set_plan(3);
-
-	sigemptyset(&act.sa_mask);
-	act.sa_flags = SA_ONSTACK | SA_SIGINFO;
-	act.sa_sigaction = my_usr1;
-	sigaction(SIGUSR1, &act, NULL);
-	act.sa_sigaction = my_usr2;
-	sigaction(SIGUSR2, &act, NULL);
-	sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
-		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
-	if (sstack == MAP_FAILED) {
-		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
-		return EXIT_FAILURE;
-	}
-
-	err = sigaltstack(NULL, &stk);
-	if (err) {
-		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
-		exit(EXIT_FAILURE);
-	}
-	if (stk.ss_flags == SS_DISABLE) {
-		ksft_test_result_pass(
-				"Initial sigaltstack state was SS_DISABLE\n");
-	} else {
-		ksft_exit_fail_msg("Initial sigaltstack state was %x; "
-		       "should have been SS_DISABLE\n", stk.ss_flags);
-		return EXIT_FAILURE;
-	}
-
-	stk.ss_sp = sstack;
-	stk.ss_size = stack_size;
-	stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
-	err = sigaltstack(&stk, NULL);
-	if (err) {
-		if (errno == EINVAL) {
-			ksft_test_result_skip(
-				"[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n");
-			/*
-			 * If test cases for the !SS_AUTODISARM variant were
-			 * added, we could still run them.  We don't have any
-			 * test cases like that yet, so just exit and report
-			 * success.
-			 */
-			return 0;
-		} else {
-			ksft_exit_fail_msg(
-				"sigaltstack(SS_ONSTACK | SS_AUTODISARM)  %s\n",
-					strerror(errno));
-			return EXIT_FAILURE;
-		}
-	}
-
-	ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
-		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
-	if (ustack == MAP_FAILED) {
-		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
-		return EXIT_FAILURE;
-	}
-	getcontext(&uc);
-	uc.uc_link = NULL;
-	uc.uc_stack.ss_sp = ustack;
-	uc.uc_stack.ss_size = stack_size;
-	makecontext(&uc, switch_fn, 0);
-	raise(SIGUSR1);
-
-	err = sigaltstack(NULL, &stk);
-	if (err) {
-		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
-		exit(EXIT_FAILURE);
-	}
-	if (stk.ss_flags != SS_AUTODISARM) {
-		ksft_exit_fail_msg("ss_flags=%x, should be SS_AUTODISARM\n",
-				stk.ss_flags);
-		exit(EXIT_FAILURE);
-	}
-	ksft_test_result_pass(
-			"sigaltstack is still SS_AUTODISARM after signal\n");
-
-	ksft_exit_pass();
-	return 0;
-}
diff --git a/tools/testing/selftests/signal/.gitignore b/tools/testing/selftests/signal/.gitignore
new file mode 100644
index 000000000000..50a19a8888ce
--- /dev/null
+++ b/tools/testing/selftests/signal/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sas
diff --git a/tools/testing/selftests/signal/Makefile b/tools/testing/selftests/signal/Makefile
new file mode 100644
index 000000000000..3e96d5d47036
--- /dev/null
+++ b/tools/testing/selftests/signal/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS = -Wall
+TEST_GEN_PROGS = sas
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/signal/current_stack_pointer.h b/tools/testing/selftests/signal/current_stack_pointer.h
new file mode 100644
index 000000000000..09da8f1011ce
--- /dev/null
+++ b/tools/testing/selftests/signal/current_stack_pointer.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if __alpha__
+register unsigned long sp asm("$30");
+#elif __arm__ || __aarch64__ || __csky__ || __m68k__ || __mips__ || __riscv
+register unsigned long sp asm("sp");
+#elif __i386__
+register unsigned long sp asm("esp");
+#elif __loongarch64
+register unsigned long sp asm("$sp");
+#elif __powerpc__
+register unsigned long sp asm("r1");
+#elif __s390x__
+register unsigned long sp asm("%15");
+#elif __sh__
+register unsigned long sp asm("r15");
+#elif __x86_64__
+register unsigned long sp asm("rsp");
+#elif __XTENSA__
+register unsigned long sp asm("a1");
+#else
+#error "implement current_stack_pointer equivalent"
+#endif
diff --git a/tools/testing/selftests/signal/sas.c b/tools/testing/selftests/signal/sas.c
new file mode 100644
index 000000000000..07227fab1cc9
--- /dev/null
+++ b/tools/testing/selftests/signal/sas.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Stas Sergeev <stsp@users.sourceforge.net>
+ *
+ * test sigaltstack(SS_ONSTACK | SS_AUTODISARM)
+ * If that succeeds, then swapcontext() can be used inside sighandler safely.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <ucontext.h>
+#include <alloca.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/auxv.h>
+
+#include "../kselftest.h"
+#include "current_stack_pointer.h"
+
+#ifndef SS_AUTODISARM
+#define SS_AUTODISARM  (1U << 31)
+#endif
+
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ	51
+#endif
+
+static unsigned int stack_size;
+static void *sstack, *ustack;
+static ucontext_t uc, sc;
+static const char *msg = "[OK]\tStack preserved";
+static const char *msg2 = "[FAIL]\tStack corrupted";
+struct stk_data {
+	char msg[128];
+	int flag;
+};
+
+void my_usr1(int sig, siginfo_t *si, void *u)
+{
+	char *aa;
+	int err;
+	stack_t stk;
+	struct stk_data *p;
+
+	if (sp < (unsigned long)sstack ||
+			sp >= (unsigned long)sstack + stack_size) {
+		ksft_exit_fail_msg("SP is not on sigaltstack\n");
+	}
+	/* put some data on stack. other sighandler will try to overwrite it */
+	aa = alloca(1024);
+	assert(aa);
+	p = (struct stk_data *)(aa + 512);
+	strcpy(p->msg, msg);
+	p->flag = 1;
+	ksft_print_msg("[RUN]\tsignal USR1\n");
+	err = sigaltstack(NULL, &stk);
+	if (err) {
+		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+	if (stk.ss_flags != SS_DISABLE)
+		ksft_test_result_fail("tss_flags=%x, should be SS_DISABLE\n",
+				stk.ss_flags);
+	else
+		ksft_test_result_pass(
+				"sigaltstack is disabled in sighandler\n");
+	swapcontext(&sc, &uc);
+	ksft_print_msg("%s\n", p->msg);
+	if (!p->flag) {
+		ksft_exit_fail_msg("[RUN]\tAborting\n");
+		exit(EXIT_FAILURE);
+	}
+}
+
+void my_usr2(int sig, siginfo_t *si, void *u)
+{
+	char *aa;
+	struct stk_data *p;
+
+	ksft_print_msg("[RUN]\tsignal USR2\n");
+	aa = alloca(1024);
+	/* dont run valgrind on this */
+	/* try to find the data stored by previous sighandler */
+	p = memmem(aa, 1024, msg, strlen(msg));
+	if (p) {
+		ksft_test_result_fail("sigaltstack re-used\n");
+		/* corrupt the data */
+		strcpy(p->msg, msg2);
+		/* tell other sighandler that his data is corrupted */
+		p->flag = 0;
+	}
+}
+
+static void switch_fn(void)
+{
+	ksft_print_msg("[RUN]\tswitched to user ctx\n");
+	raise(SIGUSR2);
+	setcontext(&sc);
+}
+
+int main(void)
+{
+	struct sigaction act;
+	stack_t stk;
+	int err;
+
+	/* Make sure more than the required minimum. */
+	stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+	ksft_print_msg("[NOTE]\tthe stack size is %u\n", stack_size);
+
+	ksft_print_header();
+	ksft_set_plan(3);
+
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_ONSTACK | SA_SIGINFO;
+	act.sa_sigaction = my_usr1;
+	sigaction(SIGUSR1, &act, NULL);
+	act.sa_sigaction = my_usr2;
+	sigaction(SIGUSR2, &act, NULL);
+	sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (sstack == MAP_FAILED) {
+		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	err = sigaltstack(NULL, &stk);
+	if (err) {
+		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+	if (stk.ss_flags == SS_DISABLE) {
+		ksft_test_result_pass(
+				"Initial sigaltstack state was SS_DISABLE\n");
+	} else {
+		ksft_exit_fail_msg("Initial sigaltstack state was %x; "
+		       "should have been SS_DISABLE\n", stk.ss_flags);
+		return EXIT_FAILURE;
+	}
+
+	stk.ss_sp = sstack;
+	stk.ss_size = stack_size;
+	stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
+	err = sigaltstack(&stk, NULL);
+	if (err) {
+		if (errno == EINVAL) {
+			ksft_test_result_skip(
+				"[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n");
+			/*
+			 * If test cases for the !SS_AUTODISARM variant were
+			 * added, we could still run them.  We don't have any
+			 * test cases like that yet, so just exit and report
+			 * success.
+			 */
+			return 0;
+		} else {
+			ksft_exit_fail_msg(
+				"sigaltstack(SS_ONSTACK | SS_AUTODISARM)  %s\n",
+					strerror(errno));
+			return EXIT_FAILURE;
+		}
+	}
+
+	ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (ustack == MAP_FAILED) {
+		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+	getcontext(&uc);
+	uc.uc_link = NULL;
+	uc.uc_stack.ss_sp = ustack;
+	uc.uc_stack.ss_size = stack_size;
+	makecontext(&uc, switch_fn, 0);
+	raise(SIGUSR1);
+
+	err = sigaltstack(NULL, &stk);
+	if (err) {
+		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+	if (stk.ss_flags != SS_AUTODISARM) {
+		ksft_exit_fail_msg("ss_flags=%x, should be SS_AUTODISARM\n",
+				stk.ss_flags);
+		exit(EXIT_FAILURE);
+	}
+	ksft_test_result_pass(
+			"sigaltstack is still SS_AUTODISARM after signal\n");
+
+	ksft_exit_pass();
+	return 0;
+}
-- 
cgit v1.2.3


From cecc795329fc3e0ea2e84567ee57570cc050cf6b Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Wed, 9 Oct 2024 10:44:24 +0530
Subject: selftests: Add a test mangling with uc_sigmask

The test is motivated by the following observation:

Raise a signal, jump to signal handler. The ucontext_t structure dumped
by kernel to userspace has a uc_sigmask field having the mask of blocked
signals. If you run a fresh minimalistic program doing this, this field
is empty, even if you block some signals while registering the handler
with sigaction().

Here is what the man-pages have to say:

sigaction(2): "sa_mask specifies a mask of signals which should be blocked
(i.e., added to the signal mask of the thread in which the signal handler
is invoked) during execution of the signal handler. In addition, the
signal which triggered the handler will be blocked, unless the SA_NODEFER
flag is used."

signal(7): Under "Execution of signal handlers", (1.3) implies:

"The thread's current signal mask is accessible via the ucontext_t
object that is pointed to by the third argument of the signal handler."

But, (1.4) states:

"Any signals specified in act->sa_mask when registering the handler with
sigprocmask(2) are added to the thread's signal mask.  The signal being
delivered is also added to the signal mask, unless SA_NODEFER was
specified when registering the handler.  These signals are thus blocked
while the handler executes."

There clearly is no distinction being made in the man pages between
"Thread's signal mask" and ucontext_t; this logically should imply
that a signal blocked by populating struct sigaction should be visible
in ucontext_t.

Here is what the kernel code does (for AArch64):

do_signal() -> handle_signal() -> sigmask_to_save(), which returns
&current->blocked, is passed to setup_rt_frame() -> setup_sigframe() ->
__copy_to_user(). Hence, &current->blocked is copied to ucontext_t
exposed to userspace. Returning back to handle_signal(),
signal_setup_done() -> signal_delivered() -> sigorsets() and
set_current_blocked() are responsible for using information from
struct ksignal ksig, which was populated through the sigaction()
system call in kernel/signal.c:
copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)),
to update &current->blocked; hence, the set of blocked signals for the
current thread is updated AFTER the kernel dumps ucontext_t to
userspace.

Assuming that the above is indeed the intended behaviour (it makes
sense semantically: the signals blocked using sigaction() remain blocked
only during the execution of the handler, and not in the context present
before jumping to the handler, although nothing can be confirmed from
the man-pages), this patch introduces a test that mangles uc_sigmask.

The test asserts the relation between blocked signals, delivered signals,
and ucontext. The ucontext is mangled by adding a signal to its
uc_sigmask; on return from the handler, the thread must block the
corresponding signal.

In the test description, I have also described signal delivery and blockage,
for ease of understanding what the test does.

Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/signal/.gitignore          |   1 +
 tools/testing/selftests/signal/Makefile            |   3 +-
 tools/testing/selftests/signal/mangle_uc_sigmask.c | 184 +++++++++++++++++++++
 3 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/signal/mangle_uc_sigmask.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/signal/.gitignore b/tools/testing/selftests/signal/.gitignore
index 50a19a8888ce..3f339865a3b6 100644
--- a/tools/testing/selftests/signal/.gitignore
+++ b/tools/testing/selftests/signal/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
+mangle_uc_sigmask
 sas
diff --git a/tools/testing/selftests/signal/Makefile b/tools/testing/selftests/signal/Makefile
index 3e96d5d47036..e0bf7058d19c 100644
--- a/tools/testing/selftests/signal/Makefile
+++ b/tools/testing/selftests/signal/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS = -Wall
-TEST_GEN_PROGS = sas
+TEST_GEN_PROGS = mangle_uc_sigmask
+TEST_GEN_PROGS += sas
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/signal/mangle_uc_sigmask.c b/tools/testing/selftests/signal/mangle_uc_sigmask.c
new file mode 100644
index 000000000000..b79ab92178a8
--- /dev/null
+++ b/tools/testing/selftests/signal/mangle_uc_sigmask.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ *
+ * Author: Dev Jain <dev.jain@arm.com>
+ *
+ * Test describing a clear distinction between signal states - delivered and
+ * blocked, and their relation with ucontext.
+ *
+ * A process can request blocking of a signal by masking it into its set of
+ * blocked signals; such a signal, when sent to the process by the kernel,
+ * will get blocked by the process and it may later unblock it and take an
+ * action. At that point, the signal will be delivered.
+ *
+ * We test the following functionalities of the kernel:
+ *
+ * ucontext_t describes the interrupted context of the thread; this implies
+ * that, in case of registering a handler and catching the corresponding
+ * signal, that state is before what was jumping into the handler.
+ *
+ * The thread's mask of blocked signals can be permanently changed, i.e, not
+ * just during the execution of the handler, by mangling with uc_sigmask
+ * from inside the handler.
+ *
+ * Assume that we block the set of signals, S1, by sigaction(), and say, the
+ * signal for which the handler was installed, is S2. When S2 is sent to the
+ * program, it will be considered "delivered", since we will act on the
+ * signal and jump to the handler. Any instances of S1 or S2 raised, while the
+ * program is executing inside the handler, will be blocked; they will be
+ * delivered immediately upon termination of the handler.
+ *
+ * For standard signals (also see real-time signals in the man page), multiple
+ * blocked instances of the same signal are not queued; such a signal will
+ * be delivered just once.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <ucontext.h>
+
+#include "../kselftest.h"
+
+void handler_verify_ucontext(int signo, siginfo_t *info, void *uc)
+{
+	int ret;
+
+	/* Kernel dumps ucontext with USR2 blocked */
+	ret = sigismember(&(((ucontext_t *)uc)->uc_sigmask), SIGUSR2);
+	ksft_test_result(ret == 1, "USR2 blocked in ucontext\n");
+
+	/*
+	 * USR2 is blocked; can be delivered neither here, nor after
+	 * exit from handler
+	 */
+	if (raise(SIGUSR2))
+		ksft_exit_fail_perror("raise");
+}
+
+void handler_segv(int signo, siginfo_t *info, void *uc)
+{
+	/*
+	 * Three cases possible:
+	 * 1. Program already terminated due to segmentation fault.
+	 * 2. SEGV was blocked even after returning from handler_usr.
+	 * 3. SEGV was delivered on returning from handler_usr.
+	 * The last option must happen.
+	 */
+	ksft_test_result_pass("SEGV delivered\n");
+}
+
+static int cnt;
+
+void handler_usr(int signo, siginfo_t *info, void *uc)
+{
+	int ret;
+
+	/*
+	 * Break out of infinite recursion caused by raise(SIGUSR1) invoked
+	 * from inside the handler
+	 */
+	++cnt;
+	if (cnt > 1)
+		return;
+
+	/* SEGV blocked during handler execution, delivered on return */
+	if (raise(SIGSEGV))
+		ksft_exit_fail_perror("raise");
+
+	ksft_print_msg("SEGV bypassed successfully\n");
+
+	/*
+	 * Signal responsible for handler invocation is blocked by default;
+	 * delivered on return, leading to recursion
+	 */
+	if (raise(SIGUSR1))
+		ksft_exit_fail_perror("raise");
+
+	ksft_test_result(cnt == 1,
+			 "USR1 is blocked, cannot invoke handler right now\n");
+
+	/* Raise USR1 again; only one instance must be delivered upon exit */
+	if (raise(SIGUSR1))
+		ksft_exit_fail_perror("raise");
+
+	/* SEGV has been blocked in sa_mask, but ucontext is empty */
+	ret = sigismember(&(((ucontext_t *)uc)->uc_sigmask), SIGSEGV);
+	ksft_test_result(ret == 0, "SEGV not blocked in ucontext\n");
+
+	/* USR1 has been blocked, but ucontext is empty */
+	ret = sigismember(&(((ucontext_t *)uc)->uc_sigmask), SIGUSR1);
+	ksft_test_result(ret == 0, "USR1 not blocked in ucontext\n");
+
+	/*
+	 * Mangle ucontext; this will be copied back into &current->blocked
+	 * on return from the handler.
+	 */
+	if (sigaddset(&((ucontext_t *)uc)->uc_sigmask, SIGUSR2))
+		ksft_exit_fail_perror("sigaddset");
+}
+
+int main(int argc, char *argv[])
+{
+	struct sigaction act, act2;
+	sigset_t set, oldset;
+
+	ksft_print_header();
+	ksft_set_plan(7);
+
+	act.sa_flags = SA_SIGINFO;
+	act.sa_sigaction = &handler_usr;
+
+	/* Add SEGV to blocked mask */
+	if (sigemptyset(&act.sa_mask) || sigaddset(&act.sa_mask, SIGSEGV)
+	    || (sigismember(&act.sa_mask, SIGSEGV) != 1))
+		ksft_exit_fail_msg("Cannot add SEGV to blocked mask\n");
+
+	if (sigaction(SIGUSR1, &act, NULL))
+		ksft_exit_fail_perror("Cannot install handler");
+
+	act2.sa_flags = SA_SIGINFO;
+	act2.sa_sigaction = &handler_segv;
+
+	if (sigaction(SIGSEGV, &act2, NULL))
+		ksft_exit_fail_perror("Cannot install handler");
+
+	/* Invoke handler */
+	if (raise(SIGUSR1))
+		ksft_exit_fail_perror("raise");
+
+	/* USR1 must not be queued */
+	ksft_test_result(cnt == 2, "handler invoked only twice\n");
+
+	/* Mangled ucontext implies USR2 is blocked for current thread */
+	if (raise(SIGUSR2))
+		ksft_exit_fail_perror("raise");
+
+	ksft_print_msg("USR2 bypassed successfully\n");
+
+	act.sa_sigaction = &handler_verify_ucontext;
+	if (sigaction(SIGUSR1, &act, NULL))
+		ksft_exit_fail_perror("Cannot install handler");
+
+	if (raise(SIGUSR1))
+		ksft_exit_fail_perror("raise");
+
+	/*
+	 * Raising USR2 in handler_verify_ucontext is redundant since it
+	 * is blocked
+	 */
+	ksft_print_msg("USR2 still blocked on return from handler\n");
+
+	/* Confirm USR2 blockage by sigprocmask() too */
+	if (sigemptyset(&set))
+		ksft_exit_fail_perror("sigemptyset");
+
+	if (sigprocmask(SIG_BLOCK, &set, &oldset))
+		ksft_exit_fail_perror("sigprocmask");
+
+	ksft_test_result(sigismember(&oldset, SIGUSR2) == 1,
+			 "USR2 present in &current->blocked\n");
+
+	ksft_finished();
+}
-- 
cgit v1.2.3


From 42602e3a06f8e5b9a059344e305c9bee2dcc87c8 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Tue, 29 Oct 2024 15:46:27 +0800
Subject: bpf: handle implicit declaration of function gettid in bpf_iter.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As the title says, compiling selftests/bpf fails with the following
error:
implicit declaration of function ‘gettid’ ; did you mean ‘getgid’? [-Werror=implicit-function-declaration]
  skel->bss->tid = gettid();
                   ^~~~~~
                   getgid

Calling the syscall directly solves this issue.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/r/20241029074627.80289-1-kerneljasonxing@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index f0a3a9c18e9e..9006549a1294 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -226,7 +226,7 @@ static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts,
 	ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL),
 		  "pthread_create");
 
-	skel->bss->tid = gettid();
+	skel->bss->tid = syscall(SYS_gettid);
 
 	do_dummy_read_opts(skel->progs.dump_task, opts);
 
@@ -255,10 +255,10 @@ static void *run_test_task_tid(void *arg)
 	union bpf_iter_link_info linfo;
 	int num_unknown_tid, num_known_tid;
 
-	ASSERT_NEQ(getpid(), gettid(), "check_new_thread_id");
+	ASSERT_NEQ(getpid(), syscall(SYS_gettid), "check_new_thread_id");
 
 	memset(&linfo, 0, sizeof(linfo));
-	linfo.task.tid = gettid();
+	linfo.task.tid = syscall(SYS_gettid);
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	test_task_common(&opts, 0, 1);
-- 
cgit v1.2.3


From 9a1036389fa25cf90d58e3ae1bb0b1ad877ef1c8 Mon Sep 17 00:00:00 2001
From: Karan Sanghavi <karansanghvi98@gmail.com>
Date: Tue, 22 Oct 2024 18:30:52 +0000
Subject: selftests: tc-testing: Fix typo error

Correct the typos in the JSON files:

- "diffferent" is corrected to "different".
- "muliple" and "miltiple" are corrected to "multiple".

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Karan Sanghavi <karansanghvi98@gmail.com>
Link: https://patch.msgid.link/20241022-multiple_spell_error-v2-1-7e5036506fe5@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/tc-testing/tc-tests/filters/basic.json  | 6 +++---
 tools/testing/selftests/tc-testing/tc-tests/filters/cgroup.json | 6 +++---
 tools/testing/selftests/tc-testing/tc-tests/filters/flow.json   | 2 +-
 tools/testing/selftests/tc-testing/tc-tests/filters/route.json  | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json
index d1278de8ebc3..c9309a44a87e 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json
@@ -67,7 +67,7 @@
     },
     {
         "id": "4943",
-        "name": "Add basic filter with cmp ematch u32/link layer and miltiple actions",
+        "name": "Add basic filter with cmp ematch u32/link layer and multiple actions",
         "category": [
             "filter",
             "basic"
@@ -155,7 +155,7 @@
     },
     {
         "id": "32d8",
-        "name": "Add basic filter with cmp ematch u32/network layer and miltiple actions",
+        "name": "Add basic filter with cmp ematch u32/network layer and multiple actions",
         "category": [
             "filter",
             "basic"
@@ -243,7 +243,7 @@
     },
     {
         "id": "62d7",
-        "name": "Add basic filter with cmp ematch u32/transport layer and miltiple actions",
+        "name": "Add basic filter with cmp ematch u32/transport layer and multiple actions",
         "category": [
             "filter",
             "basic"
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/cgroup.json b/tools/testing/selftests/tc-testing/tc-tests/filters/cgroup.json
index 03723cf84379..35c9a7dbe1c4 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/cgroup.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/cgroup.json
@@ -67,7 +67,7 @@
     },
     {
         "id": "0234",
-        "name": "Add cgroup filter with cmp ematch u32/link layer and miltiple actions",
+        "name": "Add cgroup filter with cmp ematch u32/link layer and multiple actions",
         "category": [
             "filter",
             "cgroup"
@@ -155,7 +155,7 @@
     },
     {
         "id": "2733",
-        "name": "Add cgroup filter with cmp ematch u32/network layer and miltiple actions",
+        "name": "Add cgroup filter with cmp ematch u32/network layer and multiple actions",
         "category": [
             "filter",
             "cgroup"
@@ -1189,7 +1189,7 @@
     },
     {
         "id": "4319",
-        "name": "Replace cgroup filter with diffferent match",
+        "name": "Replace cgroup filter with different match",
         "category": [
             "filter",
             "cgroup"
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
index 58189327f644..996448afe31b 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
@@ -507,7 +507,7 @@
     },
     {
         "id": "4341",
-        "name": "Add flow filter with muliple ops",
+        "name": "Add flow filter with multiple ops",
         "category": [
             "filter",
             "flow"
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/route.json b/tools/testing/selftests/tc-testing/tc-tests/filters/route.json
index 8d8de8f65aef..05cedca67cca 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/route.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/route.json
@@ -111,7 +111,7 @@
     },
     {
         "id": "7994",
-        "name": "Add route filter with miltiple actions",
+        "name": "Add route filter with multiple actions",
         "category": [
             "filter",
             "route"
-- 
cgit v1.2.3


From 0ab7cd1f18648ab50c4685553ca92e8cdc4a42da Mon Sep 17 00:00:00 2001
From: Vincent Li <vincent.mc.li@gmail.com>
Date: Fri, 25 Oct 2024 03:19:52 +0000
Subject: selftests/bpf: remove xdp_synproxy IP_DF check

On real-world production websites, the IP_DF flag is not always set
on every packet sent by these websites. The IP_DF flag check therefore
breaks Internet connections to these websites for home-based firewalls
such as BPFire when the XDP synproxy program is attached to the
firewall's Internet-facing interface. See [0].

[0] https://github.com/vincentmli/BPFire/issues/59

Signed-off-by: Vincent Li <vincent.mc.li@gmail.com>
Link: https://lore.kernel.org/r/20241025031952.1351150-1-vincent.mc.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
index f8f5dc9f72b8..62b8e29ced9f 100644
--- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
+++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
@@ -21,7 +21,6 @@
 
 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
 
-#define IP_DF 0x4000
 #define IP_MF 0x2000
 #define IP_OFFSET 0x1fff
 
@@ -442,7 +441,7 @@ static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bo
 		/* TCP doesn't normally use fragments, and XDP can't reassemble
 		 * them.
 		 */
-		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
+		if ((hdr->ipv4->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
 			return XDP_DROP;
 
 		tup.ipv4.saddr = hdr->ipv4->saddr;
-- 
cgit v1.2.3


From e626a13f6fbb4697f8734333432dca577628d09a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 29 Oct 2024 13:39:19 -0700
Subject: selftests/bpf: drop unnecessary bpf_iter.h type duplication

Drop the bpf_iter.h header, which uses vmlinux.h but re-defines a bunch
of iterator structures and some BPF constants for use in BPF iterator
selftests.

None of that is necessary when a fresh vmlinux.h header is generated for
a vmlinux image that matches the latest selftests. So drop the ugly hacks
and have nice, plain vmlinux.h usage everywhere.

We could do the same with all the kfunc __ksym redefinitions, but that
has dependency on very fresh pahole, so I'm not addressing that here.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20241029203919.1948941-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/bpf_iter.h       | 174 ---------------------
 .../selftests/bpf/progs/bpf_iter_bpf_array_map.c   |   2 +-
 .../selftests/bpf/progs/bpf_iter_bpf_hash_map.c    |   2 +-
 .../selftests/bpf/progs/bpf_iter_bpf_link.c        |   2 +-
 .../testing/selftests/bpf/progs/bpf_iter_bpf_map.c |   2 +-
 .../bpf/progs/bpf_iter_bpf_percpu_array_map.c      |   2 +-
 .../bpf/progs/bpf_iter_bpf_percpu_hash_map.c       |   2 +-
 .../bpf/progs/bpf_iter_bpf_sk_storage_helpers.c    |   2 +-
 .../bpf/progs/bpf_iter_bpf_sk_storage_map.c        |   2 +-
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c      |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_ksym.c  |   2 +-
 .../testing/selftests/bpf/progs/bpf_iter_netlink.c |   2 +-
 .../selftests/bpf/progs/bpf_iter_setsockopt.c      |   2 +-
 .../selftests/bpf/progs/bpf_iter_setsockopt_unix.c |   2 +-
 .../testing/selftests/bpf/progs/bpf_iter_sockmap.c |   2 +-
 .../selftests/bpf/progs/bpf_iter_task_btf.c        |   2 +-
 .../selftests/bpf/progs/bpf_iter_task_file.c       |   2 +-
 .../selftests/bpf/progs/bpf_iter_task_stack.c      |   2 +-
 .../selftests/bpf/progs/bpf_iter_task_vmas.c       |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_tasks.c |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c  |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c  |   2 +-
 .../selftests/bpf/progs/bpf_iter_test_kern3.c      |   2 +-
 .../selftests/bpf/progs/bpf_iter_test_kern4.c      |   2 +-
 .../selftests/bpf/progs/bpf_iter_test_kern5.c      |   2 +-
 .../selftests/bpf/progs/bpf_iter_test_kern6.c      |   2 +-
 .../bpf/progs/bpf_iter_test_kern_common.h          |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_udp4.c  |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_udp6.c  |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_unix.c  |   2 +-
 .../selftests/bpf/progs/bpf_iter_vma_offset.c      |   2 +-
 tools/testing/selftests/bpf/progs/cgroup_iter.c    |   3 +-
 .../selftests/bpf/progs/cgrp_ls_sleepable.c        |   3 +-
 .../testing/selftests/bpf/progs/kmem_cache_iter.c  |   3 +-
 34 files changed, 33 insertions(+), 210 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/progs/bpf_iter.h

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
deleted file mode 100644
index 3305dc3a74b3..000000000000
--- a/tools/testing/selftests/bpf/progs/bpf_iter.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2020 Facebook */
-/* "undefine" structs in vmlinux.h, because we "override" them below */
-#define bpf_iter_meta bpf_iter_meta___not_used
-#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
-#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
-#define bpf_iter__netlink bpf_iter__netlink___not_used
-#define bpf_iter__task bpf_iter__task___not_used
-#define bpf_iter__task_file bpf_iter__task_file___not_used
-#define bpf_iter__task_vma bpf_iter__task_vma___not_used
-#define bpf_iter__tcp bpf_iter__tcp___not_used
-#define tcp6_sock tcp6_sock___not_used
-#define bpf_iter__udp bpf_iter__udp___not_used
-#define udp6_sock udp6_sock___not_used
-#define bpf_iter__unix bpf_iter__unix___not_used
-#define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used
-#define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used
-#define bpf_iter__sockmap bpf_iter__sockmap___not_used
-#define bpf_iter__bpf_link bpf_iter__bpf_link___not_used
-#define bpf_iter__cgroup bpf_iter__cgroup___not_used
-#define btf_ptr btf_ptr___not_used
-#define BTF_F_COMPACT BTF_F_COMPACT___not_used
-#define BTF_F_NONAME BTF_F_NONAME___not_used
-#define BTF_F_PTR_RAW BTF_F_PTR_RAW___not_used
-#define BTF_F_ZERO BTF_F_ZERO___not_used
-#define bpf_iter__ksym bpf_iter__ksym___not_used
-#define bpf_iter__kmem_cache bpf_iter__kmem_cache___not_used
-#include "vmlinux.h"
-#undef bpf_iter_meta
-#undef bpf_iter__bpf_map
-#undef bpf_iter__ipv6_route
-#undef bpf_iter__netlink
-#undef bpf_iter__task
-#undef bpf_iter__task_file
-#undef bpf_iter__task_vma
-#undef bpf_iter__tcp
-#undef tcp6_sock
-#undef bpf_iter__udp
-#undef udp6_sock
-#undef bpf_iter__unix
-#undef bpf_iter__bpf_map_elem
-#undef bpf_iter__bpf_sk_storage_map
-#undef bpf_iter__sockmap
-#undef bpf_iter__bpf_link
-#undef bpf_iter__cgroup
-#undef btf_ptr
-#undef BTF_F_COMPACT
-#undef BTF_F_NONAME
-#undef BTF_F_PTR_RAW
-#undef BTF_F_ZERO
-#undef bpf_iter__ksym
-#undef bpf_iter__kmem_cache
-
-struct bpf_iter_meta {
-	struct seq_file *seq;
-	__u64 session_id;
-	__u64 seq_num;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__ipv6_route {
-	struct bpf_iter_meta *meta;
-	struct fib6_info *rt;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__netlink {
-	struct bpf_iter_meta *meta;
-	struct netlink_sock *sk;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__task {
-	struct bpf_iter_meta *meta;
-	struct task_struct *task;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__task_file {
-	struct bpf_iter_meta *meta;
-	struct task_struct *task;
-	__u32 fd;
-	struct file *file;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__task_vma {
-	struct bpf_iter_meta *meta;
-	struct task_struct *task;
-	struct vm_area_struct *vma;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__bpf_map {
-	struct bpf_iter_meta *meta;
-	struct bpf_map *map;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__tcp {
-	struct bpf_iter_meta *meta;
-	struct sock_common *sk_common;
-	uid_t uid;
-} __attribute__((preserve_access_index));
-
-struct tcp6_sock {
-	struct tcp_sock	tcp;
-	struct ipv6_pinfo inet6;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__udp {
-	struct bpf_iter_meta *meta;
-	struct udp_sock *udp_sk;
-	uid_t uid __attribute__((aligned(8)));
-	int bucket __attribute__((aligned(8)));
-} __attribute__((preserve_access_index));
-
-struct udp6_sock {
-	struct udp_sock	udp;
-	struct ipv6_pinfo inet6;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__unix {
-	struct bpf_iter_meta *meta;
-	struct unix_sock *unix_sk;
-	uid_t uid;
-} __attribute__((preserve_access_index));
-
-struct bpf_iter__bpf_map_elem {
-	struct bpf_iter_meta *meta;
-	struct bpf_map *map;
-	void *key;
-	void *value;
-};
-
-struct bpf_iter__bpf_sk_storage_map {
-	struct bpf_iter_meta *meta;
-	struct bpf_map *map;
-	struct sock *sk;
-	void *value;
-};
-
-struct bpf_iter__sockmap {
-	struct bpf_iter_meta *meta;
-	struct bpf_map *map;
-	void *key;
-	struct sock *sk;
-};
-
-struct bpf_iter__bpf_link {
-	struct bpf_iter_meta *meta;
-	struct bpf_link *link;
-};
-
-struct bpf_iter__cgroup {
-	struct bpf_iter_meta *meta;
-	struct cgroup *cgroup;
-} __attribute__((preserve_access_index));
-
-struct btf_ptr {
-	void *ptr;
-	__u32 type_id;
-	__u32 flags;
-};
-
-enum {
-	BTF_F_COMPACT	=	(1ULL << 0),
-	BTF_F_NONAME	=	(1ULL << 1),
-	BTF_F_PTR_RAW	=	(1ULL << 2),
-	BTF_F_ZERO	=	(1ULL << 3),
-};
-
-struct bpf_iter__ksym {
-	struct bpf_iter_meta *meta;
-	struct kallsym_iter *ksym;
-};
-
-struct bpf_iter__kmem_cache {
-	struct bpf_iter_meta *meta;
-	struct kmem_cache *s;
-} __attribute__((preserve_access_index));
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c
index 564835ba7d51..19710cc0f250 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
index d7a69217fb68..f47da665f7e0 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_link.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_link.c
index e1af2f8f75a6..7b69e1887705 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_link.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_link.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022 Red Hat, Inc. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
index 6c39e86b666f..c868ffb8080f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
index 9f0e0705b2bf..9fdea8cd4c6f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c
index 5014a17d6c02..aa529f76c7fc 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
index 6cecab2b32ba..e88dab196e0f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Google LLC. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c
index c7b8e006b171..eb9642923e1c 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
index 784a610ce039..73a5cf3ba3d3 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c
index 521267818f4d..3e725b1fce37 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022, Oracle and/or its affiliates. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
index a28e51e2dcee..00b2ceae81fb 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
index ec7f91850dec..774d4dbe8189 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c
index eafc877ea460..d92631ec6161 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt_unix.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright Amazon.com Inc. or its affiliates. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <limits.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c
index f3af0e30cead..317fe49760cc 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Cloudflare */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
index bca8b889cb10..ef2f7c8d9373 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020, Oracle and/or its affiliates. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_core_read.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index b0255080662d..959a8d899eaf 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
index 442f4ca39fd7..f5a309455490 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
index 423b39e60b6f..d64ba7ddaed5 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c b/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c
index 6cbb3393f243..bc10c4e4b4fa 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
index 92267abb462f..d22449c69363 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
index 943f7bba180e..8b072666f9d9 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
index 2a4647f20c46..6b17e7e86a48 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
index dbf61c44acac..56177508798f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c
index e3a7575e81d2..9d8b7310d2c2 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c
index 1c7304f56b1e..b150bd468824 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
index d5e3df66ad9a..6a4c50497c5e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
index cf0c485b1ed7..ffbd4b116d17 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
index 5031e21c433f..47ff7754f4fd 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
index e6aefae38894..fea275df9e22 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright Amazon.com Inc. or its affiliates. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c
index ee7455d2623a..174298e122d3 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter.c b/tools/testing/selftests/bpf/progs/cgroup_iter.c
index de03997322a7..f30841997a8d 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_iter.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_iter.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022 Google */
-
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
index 5e282c16eadc..a2de95f85648 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
-
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
diff --git a/tools/testing/selftests/bpf/progs/kmem_cache_iter.c b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
index 72c9dafecd98..e775d5cd99fc 100644
--- a/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
+++ b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2024 Google */
-
-#include "bpf_iter.h"
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-- 
cgit v1.2.3


From aab154a442f9ba2a08fc130dbc8d178a33e10345 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 21 Oct 2024 13:54:46 +0200
Subject: selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor

Add a simple file stressor that lives directly in-tree. This will create
a bunch of processes that each open 500 file descriptors and then use
close_range() to close them all.

Concurrently, other processes read /proc/<pid>/fd/ which roughly does

    f = fget_task_next(p, &fd);
    if (!f)
           break;
    data.mode = f->f_mode;
    fput(f);

Which means that it'll try to get a reference to a file in another
task's file descriptor table.

Under heavy file load it is increasingly likely that the other task will
manage to close @file and @file will be recycled due to
SLAB_TYPESAFE_BY_RCU concurrently. This will trigger various warnings in
the file reference counting code.

Link: https://lore.kernel.org/r/20241021-vergab-streuen-924df15dceb9@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/filesystems/.gitignore     |   1 +
 tools/testing/selftests/filesystems/Makefile       |   2 +-
 .../testing/selftests/filesystems/file_stressor.c  | 194 +++++++++++++++++++++
 3 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/filesystems/file_stressor.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index f0c0ff20d6cf..828b66a10c63 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 dnotify_test
 devpts_pts
+file_stressor
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index c647fd6a0446..66305fc34c60 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts
+TEST_GEN_PROGS := devpts_pts file_stressor
 TEST_GEN_PROGS_EXTENDED := dnotify_test
 
 include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c
new file mode 100644
index 000000000000..1136f93a9977
--- /dev/null
+++ b/tools/testing/selftests/filesystems/file_stressor.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+	return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+			       const char *value, int aux)
+{
+	return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+			      unsigned int attr_flags)
+{
+	return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+				 int to_dfd, const char *to_pathname,
+				 unsigned int flags)
+{
+	return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+		       to_pathname, flags);
+}
+
+FIXTURE(file_stressor) {
+	int fd_tmpfs;		/* mount fd returned by sys_fsmount() for the tmpfs */
+	int nr_procs;		/* online CPU count; sizes the three arrays below */
+	int max_fds;		/* files each opener creates per round */
+	pid_t *pids_openers;	/* pids of the file-opening children */
+	pid_t *pids_getdents;	/* pids of the /proc/<pid>/fd/ reading children */
+	int *fd_proc_pid;	/* fds of the openers' /proc/<pid>/fd/ directories */
+};
+
+FIXTURE_SETUP(file_stressor)
+{
+	int fd_context;
+	/* New mount namespace, "/" made recursively slave, fresh mountpoint. */
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+	ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0);
+	/* Create a tmpfs through fsopen()/fsconfig()/fsmount(). */
+	fd_context = sys_fsopen("tmpfs", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+	self->fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(self->fd_tmpfs, 0);
+	ASSERT_EQ(close(fd_context), 0);
+	/* Attach the new mount at /slab_typesafe_by_rcu. */
+	ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0);
+	/* One slot per online CPU in each bookkeeping array. */
+	self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN);
+	self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs);
+	ASSERT_NE(self->pids_openers, NULL);
+	self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs);
+	ASSERT_NE(self->pids_getdents, NULL);
+	self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs);
+	ASSERT_NE(self->fd_proc_pid, NULL);
+	self->max_fds = 500;
+}
+
+FIXTURE_TEARDOWN(file_stressor)
+{
+	/* Reap every worker, then release what FIXTURE_SETUP() acquired. */
+	for (int i = 0; i < self->nr_procs; i++) {
+		int wstatus;
+		pid_t pid;
+
+		pid = waitpid(self->pids_openers[i], &wstatus, 0);
+		ASSERT_EQ(pid, self->pids_openers[i]);
+		ASSERT_TRUE(WIFEXITED(wstatus) || WIFSIGNALED(wstatus));
+		pid = waitpid(self->pids_getdents[i], &wstatus, 0);
+		ASSERT_EQ(pid, self->pids_getdents[i]);
+		ASSERT_TRUE(WIFEXITED(wstatus) || WIFSIGNALED(wstatus));
+	}
+	free(self->pids_openers);
+	free(self->pids_getdents);
+	free(self->fd_proc_pid); /* allocated in setup, was never freed */
+	ASSERT_EQ(close(self->fd_tmpfs), 0);
+	umount2("/slab_typesafe_by_rcu", 0);
+	ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0);
+}
+
+TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
+{
+	for (int i = 0; i < self->nr_procs; i++) {
+		pid_t pid_self;
+
+		self->pids_openers[i] = fork();
+		ASSERT_GE(self->pids_openers[i], 0);
+
+		if (self->pids_openers[i] != 0)
+			continue;
+		/* Opener child: open max_fds files, close them all, repeat. */
+		pid_self = self->pids_openers[i] = getpid();
+		for (;;) {
+			for (int j = 0; j < self->max_fds; j++) {
+				char path[PATH_MAX];
+				int fd;
+				/* Name by own pid; pids_openers[] has only nr_procs slots. */
+				sprintf(path, "/slab_typesafe_by_rcu/file-%d-%d", pid_self, j);
+				fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644);
+				if (fd < 0)
+					continue;
+			}
+
+			close_range(3, ~0U, 0);
+		}
+
+		exit(0);
+	}
+	/* Parent: hold each opener's /proc/<pid>/fd/ open for the readers. */
+	for (int i = 0; i < self->nr_procs; i++) {
+		char path[PATH_MAX];
+
+		sprintf(path, "/proc/%d/fd/", self->pids_openers[i]);
+		self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(self->fd_proc_pid[i], 0);
+	}
+
+	for (int i = 0; i < self->nr_procs; i++) {
+		self->pids_getdents[i] = fork();
+		ASSERT_GE(self->pids_getdents[i], 0);
+
+		if (self->pids_getdents[i] != 0)
+			continue;
+		/* Reader child: list opener i's fd directory over and over. */
+		self->pids_getdents[i] = getpid();
+		for (;;) {
+			char ents[1024];
+			ssize_t nr_read;
+
+			/*
+			 * Concurrently read /proc/<pid>/fd/ which roughly does:
+			 *
+			 * f = fget_task_next(p, &fd);
+			 * if (!f)
+			 *	break;
+			 * data.mode = f->f_mode;
+			 * fput(f);
+			 *
+			 * Which means that it'll try to get a reference to a
+			 * file in another task's file descriptor table.
+			 *
+			 * Under heavy file load it is increasingly likely that
+			 * the other task will manage to close @file and @file
+			 * is being recycled due to SLAB_TYPESAFE_BY_RCU
+			 * concurrently. This will trigger various warnings in
+			 * the file reference counting code.
+			 */
+			do {
+				nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents));
+			} while (nr_read > 0);
+			/* 0 is end of directory, < 0 an error: rewind and rescan. */
+			lseek(self->fd_proc_pid[i], 0, SEEK_SET);
+		}
+
+		exit(0);
+	}
+	/* Let the workers race for 15 minutes, then kill them all. */
+	ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0);
+
+	for (int i = 0; i < self->nr_procs; i++) {
+		kill(self->pids_openers[i], SIGKILL);
+		kill(self->pids_getdents[i], SIGKILL);
+	}
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From b87f584024e1df289027cb7671de6af0d97b5de9 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Sat, 26 Oct 2024 12:53:38 +0000
Subject: selftests/bpf: Don't mask result of bpf_csum_diff() in test_verifier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_csum_diff() helper has been fixed to return a 16-bit value for
all archs, so now we don't need to mask the result.

This commit is basically reverting the below:

commit 6185266c5a85 ("selftests/bpf: Mask bpf_csum_diff() return value
to 16 bits in test_verifier")

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20241026125339.26459-4-puranjay@kernel.org
---
 tools/testing/selftests/bpf/progs/verifier_array_access.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_array_access.c b/tools/testing/selftests/bpf/progs/verifier_array_access.c
index 95d7ecc12963..4195aa824ba5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_array_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_array_access.c
@@ -368,8 +368,7 @@ __naked void a_read_only_array_2_1(void)
 	r4 = 0;						\
 	r5 = 0;						\
 	call %[bpf_csum_diff];				\
-l0_%=:	r0 &= 0xffff;					\
-	exit;						\
+l0_%=:	exit;						\
 "	:
 	: __imm(bpf_csum_diff),
 	  __imm(bpf_map_lookup_elem),
-- 
cgit v1.2.3


From 00c1f3dc66a38cf65c3cfd0cb4fe7acfc7f60e37 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Sat, 26 Oct 2024 12:53:39 +0000
Subject: selftests/bpf: Add a selftest for bpf_csum_diff()

Add a selftest for the bpf_csum_diff() helper. This selftest runs the
helper in all three configurations (push, pull, and diff) and verifies
its output. The correct results have been computed by hand and by the
helper's older implementation.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20241026125339.26459-5-puranjay@kernel.org
---
 .../selftests/bpf/prog_tests/test_csum_diff.c      | 408 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/csum_diff_test.c |  42 +++
 2 files changed, 450 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_csum_diff.c
 create mode 100644 tools/testing/selftests/bpf/progs/csum_diff_test.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_csum_diff.c b/tools/testing/selftests/bpf/prog_tests/test_csum_diff.c
new file mode 100644
index 000000000000..107b20d43e83
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_csum_diff.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates */
+#include <test_progs.h>
+#include "csum_diff_test.skel.h"
+
+#define BUFF_SZ 512
+
+struct testcase {
+	unsigned long long to_buff[BUFF_SZ / 8];	/* copied into the program's to_buff */
+	unsigned int to_buff_len;			/* bytes of to_buff to use */
+	unsigned long long from_buff[BUFF_SZ / 8];	/* copied into the program's from_buff */
+	unsigned int from_buff_len;			/* bytes of from_buff to use */
+	unsigned short seed;				/* seed handed to the program */
+	unsigned short result;				/* expected checksum */
+};
+
+#define NUM_PUSH_TESTS 4
+
+struct testcase push_tests[NUM_PUSH_TESTS] = {
+	{
+		.to_buff = {
+			0xdeadbeefdeadbeef,
+		},
+		.to_buff_len = 8,
+		.from_buff = {},
+		.from_buff_len = 0,
+		.seed = 0,
+		.result = 0x3b3b
+	},
+	{
+		.to_buff = {
+			0xdeadbeefdeadbeef,
+			0xbeefdeadbeefdead,
+		},
+		.to_buff_len = 16,
+		.from_buff = {},
+		.from_buff_len = 0,
+		.seed = 0x1234,
+		.result = 0x88aa
+	},
+	{
+		.to_buff = {
+			0xdeadbeefdeadbeef,
+			0xbeefdeadbeefdead,
+		},
+		.to_buff_len = 15,
+		.from_buff = {},
+		.from_buff_len = 0,
+		.seed = 0x1234,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		.result = 0xcaa9
+#else
+		.result = 0x87fd
+#endif
+	},
+	{
+		.to_buff = {
+			0x327b23c66b8b4567,
+			0x66334873643c9869,
+			0x19495cff74b0dc51,
+			0x625558ec2ae8944a,
+			0x46e87ccd238e1f29,
+			0x507ed7ab3d1b58ba,
+			0x41b71efb2eb141f2,
+			0x7545e14679e2a9e3,
+			0x5bd062c2515f007c,
+			0x4db127f812200854,
+			0x1f16e9e80216231b,
+			0x66ef438d1190cde7,
+			0x3352255a140e0f76,
+			0x0ded7263109cf92e,
+			0x1befd79f7fdcc233,
+			0x6b68079a41a7c4c9,
+			0x25e45d324e6afb66,
+			0x431bd7b7519b500d,
+			0x7c83e4583f2dba31,
+			0x62bbd95a257130a3,
+			0x628c895d436c6125,
+			0x721da317333ab105,
+			0x2d1d5ae92443a858,
+			0x75a2a8d46763845e,
+			0x79838cb208edbdab,
+			0x0b03e0c64353d0cd,
+			0x54e49eb4189a769b,
+			0x2ca8861171f32454,
+			0x02901d820836c40e,
+			0x081386413a95f874,
+			0x7c3dbd3d1e7ff521,
+			0x6ceaf087737b8ddc,
+			0x4516dde922221a70,
+			0x614fd4a13006c83e,
+			0x5577f8e1419ac241,
+			0x05072367440badfc,
+			0x77465f013804823e,
+			0x5c482a977724c67e,
+			0x5e884adc2463b9ea,
+			0x2d51779651ead36b,
+			0x153ea438580bd78f,
+			0x70a64e2a3855585c,
+			0x2a487cb06a2342ec,
+			0x725a06fb1d4ed43b,
+			0x57e4ccaf2cd89a32,
+			0x4b588f547a6d8d3c,
+			0x6de91b18542289ec,
+			0x7644a45c38437fdb,
+			0x684a481a32fff902,
+			0x749abb43579478fe,
+			0x1ba026fa3dc240fb,
+			0x75c6c33a79a1deaa,
+			0x70c6a52912e685fb,
+			0x374a3fe6520eedd1,
+			0x23f9c13c4f4ef005,
+			0x275ac794649bb77c,
+			0x1cf10fd839386575,
+			0x235ba861180115be,
+			0x354fe9f947398c89,
+			0x741226bb15b5af5c,
+			0x10233c990d34b6a8,
+			0x615740953f6ab60f,
+			0x77ae35eb7e0c57b1,
+			0x310c50b3579be4f1,
+		},
+		.to_buff_len = 512,
+		.from_buff = {},
+		.from_buff_len = 0,
+		.seed = 0xffff,
+		.result = 0xca45
+	},
+};
+
+#define NUM_PULL_TESTS 4
+
+struct testcase pull_tests[NUM_PULL_TESTS] = {
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef,
+		},
+		.from_buff_len = 8,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0,
+		.result = 0xc4c4
+	},
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef,
+			0xbeefdeadbeefdead,
+		},
+		.from_buff_len = 16,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0x1234,
+		.result = 0x9bbd
+	},
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef,
+			0xbeefdeadbeefdead,
+		},
+		.from_buff_len = 15,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0x1234,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		.result = 0x59be
+#else
+		.result = 0x9c6a
+#endif
+	},
+	{
+		.from_buff = {
+			0x327b23c66b8b4567,
+			0x66334873643c9869,
+			0x19495cff74b0dc51,
+			0x625558ec2ae8944a,
+			0x46e87ccd238e1f29,
+			0x507ed7ab3d1b58ba,
+			0x41b71efb2eb141f2,
+			0x7545e14679e2a9e3,
+			0x5bd062c2515f007c,
+			0x4db127f812200854,
+			0x1f16e9e80216231b,
+			0x66ef438d1190cde7,
+			0x3352255a140e0f76,
+			0x0ded7263109cf92e,
+			0x1befd79f7fdcc233,
+			0x6b68079a41a7c4c9,
+			0x25e45d324e6afb66,
+			0x431bd7b7519b500d,
+			0x7c83e4583f2dba31,
+			0x62bbd95a257130a3,
+			0x628c895d436c6125,
+			0x721da317333ab105,
+			0x2d1d5ae92443a858,
+			0x75a2a8d46763845e,
+			0x79838cb208edbdab,
+			0x0b03e0c64353d0cd,
+			0x54e49eb4189a769b,
+			0x2ca8861171f32454,
+			0x02901d820836c40e,
+			0x081386413a95f874,
+			0x7c3dbd3d1e7ff521,
+			0x6ceaf087737b8ddc,
+			0x4516dde922221a70,
+			0x614fd4a13006c83e,
+			0x5577f8e1419ac241,
+			0x05072367440badfc,
+			0x77465f013804823e,
+			0x5c482a977724c67e,
+			0x5e884adc2463b9ea,
+			0x2d51779651ead36b,
+			0x153ea438580bd78f,
+			0x70a64e2a3855585c,
+			0x2a487cb06a2342ec,
+			0x725a06fb1d4ed43b,
+			0x57e4ccaf2cd89a32,
+			0x4b588f547a6d8d3c,
+			0x6de91b18542289ec,
+			0x7644a45c38437fdb,
+			0x684a481a32fff902,
+			0x749abb43579478fe,
+			0x1ba026fa3dc240fb,
+			0x75c6c33a79a1deaa,
+			0x70c6a52912e685fb,
+			0x374a3fe6520eedd1,
+			0x23f9c13c4f4ef005,
+			0x275ac794649bb77c,
+			0x1cf10fd839386575,
+			0x235ba861180115be,
+			0x354fe9f947398c89,
+			0x741226bb15b5af5c,
+			0x10233c990d34b6a8,
+			0x615740953f6ab60f,
+			0x77ae35eb7e0c57b1,
+			0x310c50b3579be4f1,
+		},
+		.from_buff_len = 512,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0xffff,
+		.result = 0x35ba
+	},
+};
+
+#define NUM_DIFF_TESTS 4
+
+struct testcase diff_tests[NUM_DIFF_TESTS] = {
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef,
+		},
+		.from_buff_len = 8,
+		.to_buff = {
+			0xabababababababab,
+		},
+		.to_buff_len = 8,
+		.seed = 0,
+		.result = 0x7373
+	},
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef,
+		},
+		.from_buff_len = 7,
+		.to_buff = {
+			0xabababababababab,
+		},
+		.to_buff_len = 7,
+		.seed = 0,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		.result = 0xa673
+#else
+		.result = 0x73b7
+#endif
+	},
+	{
+		.from_buff = {
+			0,
+		},
+		.from_buff_len = 8,
+		.to_buff = {
+			0xabababababababab,
+		},
+		.to_buff_len = 8,
+		.seed = 0,
+		.result = 0xaeae
+	},
+	{
+		.from_buff = {
+			0xdeadbeefdeadbeef
+		},
+		.from_buff_len = 8,
+		.to_buff = {
+			0,
+		},
+		.to_buff_len = 8,
+		.seed = 0xffff,
+		.result = 0xc4c4
+	},
+};
+
+#define NUM_EDGE_TESTS 4
+
+struct testcase edge_tests[NUM_EDGE_TESTS] = {
+	{
+		.from_buff = {},
+		.from_buff_len = 0,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0,
+		.result = 0
+	},
+	{
+		.from_buff = {
+			0x1234
+		},
+		.from_buff_len = 0,
+		.to_buff = {
+			0x1234
+		},
+		.to_buff_len = 0,
+		.seed = 0,
+		.result = 0
+	},
+	{
+		.from_buff = {},
+		.from_buff_len = 0,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0x1234,
+		.result = 0x1234
+	},
+	{
+		.from_buff = {},
+		.from_buff_len = 512,
+		.to_buff = {},
+		.to_buff_len = 0,
+		.seed = 0xffff,
+		.result = 0xffff
+	},
+};
+
+static unsigned short trigger_csum_diff(const struct csum_diff_test *skel)
+{
+	u8 tmp_out[64 << 2] = {};
+	u8 tmp_in[64] = {};
+	int err;
+	int pfd;
+	/* Run compute_checksum once on zeroed input; it stores into .bss. */
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = tmp_in,
+		.data_size_in = sizeof(tmp_in),
+		.data_out = tmp_out,
+		.data_size_out = sizeof(tmp_out),
+		.repeat = 1,
+	);
+	pfd = bpf_program__fd(skel->progs.compute_checksum);
+	err = bpf_prog_test_run_opts(pfd, &topts);
+	if (!ASSERT_OK(err, "test_run"))
+		return -1;
+	/* -1 reads as 0xffff, a valid checksum, hence the ASSERT_OK above. */
+	return skel->bss->result;
+}
+
+static void test_csum_diff(struct testcase *tests, int num_tests)
+{
+	struct csum_diff_test *skel;
+	unsigned short got;
+	int err;
+	/* Run each case on a freshly opened and loaded skeleton. */
+	for (int i = 0; i < num_tests; i++) {
+		skel = csum_diff_test__open();
+		if (!ASSERT_OK_PTR(skel, "csum_diff_test open"))
+			return;
+		/* The lengths are .rodata and are set before loading. */
+		skel->rodata->to_buff_len = tests[i].to_buff_len;
+		skel->rodata->from_buff_len = tests[i].from_buff_len;
+
+		err = csum_diff_test__load(skel);
+		if (!ASSERT_EQ(err, 0, "csum_diff_test load"))
+			goto out;
+		/* Buffers and seed are .bss and are filled in after loading. */
+		memcpy(skel->bss->to_buff, tests[i].to_buff, tests[i].to_buff_len);
+		memcpy(skel->bss->from_buff, tests[i].from_buff, tests[i].from_buff_len);
+		skel->bss->seed = tests[i].seed;
+
+		got = trigger_csum_diff(skel);
+		ASSERT_EQ(got, tests[i].result, "csum_diff result");
+
+		csum_diff_test__destroy(skel);
+	}
+
+	return;
+out:
+	csum_diff_test__destroy(skel);
+}
+
+void test_test_csum_diff(void)
+{
+	if (test__start_subtest("csum_diff_push"))
+		test_csum_diff(push_tests, NUM_PUSH_TESTS);
+	if (test__start_subtest("csum_diff_pull"))
+		test_csum_diff(pull_tests, NUM_PULL_TESTS);
+	if (test__start_subtest("csum_diff_diff"))
+		test_csum_diff(diff_tests, NUM_DIFF_TESTS);
+	if (test__start_subtest("csum_diff_edge"))
+		test_csum_diff(edge_tests, NUM_EDGE_TESTS);
+}
diff --git a/tools/testing/selftests/bpf/progs/csum_diff_test.c b/tools/testing/selftests/bpf/progs/csum_diff_test.c
new file mode 100644
index 000000000000..9438f1773a58
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/csum_diff_test.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates */
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define BUFF_SZ 512
+
+/* Lengths are set by the test before program loading; buffers and seed after it */
+char to_buff[BUFF_SZ];
+const volatile unsigned int to_buff_len = 0;
+char from_buff[BUFF_SZ];
+const volatile unsigned int from_buff_len = 0;
+unsigned short seed = 0;
+
+short result;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("tc")
+int compute_checksum(void *ctx)
+{
+	int to_len_half = to_buff_len / 2;
+	int from_len_half = from_buff_len / 2;
+	short result2;
+
+	/* Calculate checksum in one go */
+	result2 = bpf_csum_diff((void *)from_buff, from_buff_len,
+				(void *)to_buff, to_buff_len, seed);
+
+	/* Calculate the same checksum in two chained calls: leading part first */
+	result = bpf_csum_diff((void *)from_buff, from_buff_len - from_len_half,
+			       (void *)to_buff, to_buff_len - to_len_half, seed);
+	/* ... then the trailing part, seeded with the first call's result */
+	result = bpf_csum_diff((void *)from_buff + (from_buff_len - from_len_half), from_len_half,
+			       (void *)to_buff + (to_buff_len - to_len_half), to_len_half, result);
+	/* Keep the checksum only when both ways agree, otherwise store 0 */
+	result = (result == result2) ? result : 0;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 600aa88014e9fca778449316a85ffbb81ef03b89 Mon Sep 17 00:00:00 2001
From: Ba Jing <bajing@cmss.chinamobile.com>
Date: Tue, 3 Sep 2024 12:31:35 +0800
Subject: KVM: selftests: Remove unused macro in the hardware disable test

The macro GUEST_CODE_PIO_PORT is never referenced in the code,
just remove it.

Signed-off-by: Ba Jing <bajing@cmss.chinamobile.com>
Link: https://lore.kernel.org/r/20240903043135.11087-1-bajing@cmss.chinamobile.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/hardware_disable_test.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c
index bce73bcb973c..94bd6ed24cf3 100644
--- a/tools/testing/selftests/kvm/hardware_disable_test.c
+++ b/tools/testing/selftests/kvm/hardware_disable_test.c
@@ -20,7 +20,6 @@
 #define SLEEPING_THREAD_NUM (1 << 4)
 #define FORK_NUM (1ULL << 9)
 #define DELAY_US_MAX 2000
-#define GUEST_CODE_PIO_PORT 4
 
 sem_t *sem;
 
-- 
cgit v1.2.3


From f8912210eb21b6288634fca4ee60bcc5f7c58756 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 13 Sep 2024 13:43:15 +0800
Subject: KVM: selftests: Use ARRAY_SIZE for array length

Use of macro ARRAY_SIZE to calculate array size minimizes
the redundant code and improves code reusability.

./tools/testing/selftests/kvm/x86_64/debug_regs.c:169:32-33: WARNING: Use ARRAY_SIZE.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=10847
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240913054315.130832-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/debug_regs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
index 76cc2df9238a..2d814c1d1dc4 100644
--- a/tools/testing/selftests/kvm/x86_64/debug_regs.c
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -166,7 +166,7 @@ int main(void)
 	/* Test single step */
 	target_rip = CAST_TO_RIP(ss_start);
 	target_dr6 = 0xffff4ff0ULL;
-	for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
+	for (i = 0; i < ARRAY_SIZE(ss_size); i++) {
 		target_rip += ss_size[i];
 		memset(&debug, 0, sizeof(debug));
 		debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
-- 
cgit v1.2.3


From 365836e010a131cce62e3e0bab0348d08ca08490 Mon Sep 17 00:00:00 2001
From: Yunshui Jiang <jiangyunshui@kylinos.cn>
Date: Mon, 28 Oct 2024 16:27:56 +0800
Subject: tests: hsr: Increase timeout to 50 seconds

The HSR test, hsr_ping.sh, actually needs 7 min to run. Around 375s to
be exact, and even more on a debug kernel or kernel with other network
security limits. The timeout setting for the kselftest is currently 45
seconds, which is way too short to integrate hsr tests to run_kselftest
infrastructure. However, timeout of hundreds of seconds is quite a long
time, especially in a CI/CD environment. It seems that we need to
accelerate the test and balance that against the timeout setting.

The most time-consuming func is do_ping_long, where the ping command
sends 10 packets to the given address. The default interval between two
ping packets is 1s according to the ping manual. There isn't any
operation between pings, thus we could pass -i 0.1 to ping to make it 10
times faster.

Even with this short interval, the test still needs about 46.4
seconds to finish because of the two HSR interfaces, each of which is
tested by calling do_ping func 12 times and do_ping_long func 19 times
and sleep for 3s.

So, an explicit setting is also needed to slightly increase the
timeout. And to leave us some slack, use 50 as default timeout.

Signed-off-by: Yunshui Jiang <jiangyunshui@kylinos.cn>
Link: https://patch.msgid.link/20241028082757.2945232-1-jiangyunshui@kylinos.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/hsr/hsr_common.sh | 4 ++--
 tools/testing/selftests/net/hsr/settings      | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/hsr/settings

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh
index 8e97b1f2e7e5..1dc882ac1c74 100644
--- a/tools/testing/selftests/net/hsr/hsr_common.sh
+++ b/tools/testing/selftests/net/hsr/hsr_common.sh
@@ -15,7 +15,7 @@ do_ping()
 {
 	local netns="$1"
 	local connect_addr="$2"
-	local ping_args="-q -c 2"
+	local ping_args="-q -c 2 -i 0.1"
 
 	if is_v6 "${connect_addr}"; then
 		$ipv6 || return 0
@@ -36,7 +36,7 @@ do_ping_long()
 {
 	local netns="$1"
 	local connect_addr="$2"
-	local ping_args="-q -c 10"
+	local ping_args="-q -c 10 -i 0.1"
 
 	if is_v6 "${connect_addr}"; then
 		$ipv6 || return 0
diff --git a/tools/testing/selftests/net/hsr/settings b/tools/testing/selftests/net/hsr/settings
new file mode 100644
index 000000000000..0fbc037f2aa8
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/settings
@@ -0,0 +1 @@
+timeout=50
-- 
cgit v1.2.3


From 75cd027cbcb161e277209e20df14f0818c62d9e7 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 30 Oct 2024 16:03:17 +0000
Subject: KVM: arm64: selftests: Test ID_AA64PFR0.MPAM isn't completely ignored

The ID_AA64PFR0.MPAM bit was previously accidentally exposed to guests,
and is ignored by KVM. KVM will always present the guest with 0 here,
and trap the MPAM system registers to inject an undef.

But, this value is still needed to prevent migration when the value
is incompatible with the target hardware. Add a kvm unit test to try
and write multiple values to ID_AA64PFR0.MPAM. Only the hardware value
previously exposed should be ignored, all other values should be
rejected.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20241030160317.2528209-8-joey.gouly@arm.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/aarch64/set_id_regs.c | 99 ++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
index b87e53580bfc..a79b7f18452d 100644
--- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
@@ -443,6 +443,101 @@ static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only)
 	}
 }
 
+#define MPAM_IDREG_TEST	6
+static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu)
+{
+	uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE];
+	struct reg_mask_range range = {
+		.addr = (__u64)masks,
+	};
+	uint64_t val;
+	int idx, err;
+
+	/*
+	 * If ID_AA64PFR0.MPAM is _not_ officially modifiable and is zero,
+	 * check that if it can be set to 1, (i.e. it is supported by the
+	 * hardware), that it can't be set to other values.
+	 */
+
+	/* Get writable masks for feature ID registers */
+	memset(range.reserved, 0, sizeof(range.reserved));
+	vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range);
+
+	/* Writeable? Nothing to test! NOTE(review): early returns report fewer results than planned */
+	idx = encoding_to_range_idx(SYS_ID_AA64PFR0_EL1);
+	if ((masks[idx] & ID_AA64PFR0_EL1_MPAM_MASK) == ID_AA64PFR0_EL1_MPAM_MASK) {
+		ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is officially writable, nothing to test\n");
+		return;
+	}
+
+	/* Get the id register value */
+	vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val);
+
+	/* Try to set MPAM=0. This should always be possible. */
+	val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+	val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 0);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val);
+	if (err)
+		ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM=0 was not accepted\n");
+	else
+		ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=0 worked\n");
+
+	/* Try to set MPAM=1 */
+	val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+	val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 1);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val);
+	if (err)
+		ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is not writable, nothing to test\n");
+	else
+		ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=1 was writable\n");
+
+	/* Try to set MPAM=2 */
+	val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+	val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 2);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val);
+	if (err)
+		ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM not arbitrarily modifiable\n");
+	else
+		ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM value should not be ignored\n");
+
+	/* And again for ID_AA64PFR1_EL1.MPAM_frac */
+	idx = encoding_to_range_idx(SYS_ID_AA64PFR1_EL1);
+	if ((masks[idx] & ID_AA64PFR1_EL1_MPAM_frac_MASK) == ID_AA64PFR1_EL1_MPAM_frac_MASK) {
+		ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is officially writable, nothing to test\n");
+		return;
+	}
+
+	/* Get the id register value */
+	vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), &val);
+
+	/* Try to set MPAM_frac=0. This should always be possible. */
+	val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+	val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 0);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val);
+	if (err)
+		ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac=0 was not accepted\n");
+	else
+		ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac=0 worked\n");
+
+	/* Try to set MPAM_frac=1 */
+	val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+	val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 1);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val);
+	if (err)
+		ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is not writable, nothing to test\n");
+	else
+		ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac=1 was writable\n");
+
+	/* Try to set MPAM_frac=2 */
+	val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+	val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 2);
+	err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val);
+	if (err)
+		ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac not arbitrarily modifiable\n");
+	else
+		ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac value should not be ignored\n");
+}
+
 static void test_guest_reg_read(struct kvm_vcpu *vcpu)
 {
 	bool done = false;
@@ -581,12 +676,14 @@ int main(void)
 		   ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) +
 		   ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) +
 		   ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) +
-		   ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2;
+		   ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 +
+		   MPAM_IDREG_TEST;
 
 	ksft_set_plan(test_cnt);
 
 	test_vm_ftr_id_regs(vcpu, aarch64_only);
 	test_vcpu_ftr_id_regs(vcpu);
+	test_user_set_mpam_reg(vcpu);
 
 	test_guest_reg_read(vcpu);
 
-- 
cgit v1.2.3


From 7f66456d776a0c44b5e2b932bf8ed186ccec3bc1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 29 Oct 2024 12:26:03 -0700
Subject: selftests: netdevsim: add fib_notifications to Makefile

Commit 19d36d2971e6 ("selftests: netdevsim: Add fib_notifications test")
added the test but didn't include it in the Makefile.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Link: https://patch.msgid.link/20241029192603.509295-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/netdevsim/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index 5bace0b7fb57..cc08b220323f 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -8,6 +8,7 @@ TEST_PROGS = devlink.sh \
 	ethtool-pause.sh \
 	ethtool-ring.sh \
 	fib.sh \
+	fib_notifications.sh \
 	hw_stats_l3.sh \
 	nexthop.sh \
 	peer.sh \
-- 
cgit v1.2.3


From d3774a4b21e98c336d71d67b7605d91f344524c9 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Wed, 30 Oct 2024 08:59:43 +0800
Subject: selftests/net: Fix ./ns-XXXXXX not cleanup

```
readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
readonly BASE=`basename $STATS`
```
It could be a mistake to write to $BASE rather than $STATS, where $STATS
is used to save the NSTAT_HISTORY and it will be cleaned up before exit.

However, since we've been creating the wrong file this whole time and
everything worked, it's fine to remove these 2 lines completely.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Link: https://patch.msgid.link/20241030005943.400225-1-lizhijian@fujitsu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/veth.sh | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
index 4f1edbafb946..6bb7dfaa30b6 100755
--- a/tools/testing/selftests/net/veth.sh
+++ b/tools/testing/selftests/net/veth.sh
@@ -46,8 +46,6 @@ create_ns() {
 		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
 		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
 	done
-	echo "#kernel" > $BASE
-	chmod go-rw $BASE
 }
 
 __chk_flag() {
-- 
cgit v1.2.3


From 8b55572e51805184353ee7d587c720a51818fb82 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Wed, 30 Oct 2024 10:17:50 -0700
Subject: tracing/selftests: Add tracefs mount options test

Add a selftest to check that the tracefs gid mount option is applied
correctly.

   ./ftracetest test.d/00basic/mount_options.tc

Use the new readme string "[gid=<gid>]" as a requirement and also update
test_ownership.tc requirements to use this.

Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Ali Zahraee <ahzahraee@gmail.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/20241030171928.4168869-4-kaleshsingh@google.com
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 .../ftrace/test.d/00basic/mount_options.tc         | 101 +++++++++++++++++++++
 .../ftrace/test.d/00basic/test_ownership.tc        |  16 +---
 tools/testing/selftests/ftrace/test.d/functions    |  25 +++++
 3 files changed, 129 insertions(+), 13 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc
new file mode 100644
index 000000000000..35e8d47d6072
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc
@@ -0,0 +1,101 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test tracefs GID mount option
+# requires: "[gid=<gid>]":README
+
+fail() {
+	local msg="$1"
+
+	echo "FAILED: $msg"
+	exit_fail
+}
+
+find_alternate_gid() {
+	local original_gid="$1"
+	tac /etc/group | grep -v ":$original_gid:" | head -1 | cut -d: -f3
+}
+
+mount_tracefs_with_options() {
+	local mount_point="$1"
+	local options="$2"
+
+	mount -t tracefs -o "$options" nodev "$mount_point"
+
+	setup
+}
+
+unmount_tracefs() {
+	local mount_point="$1"
+
+	# Need to make sure the mount isn't busy so that we can umount it
+	(cd $mount_point; finish_ftrace;)
+
+	cleanup
+}
+
+create_instance() {
+	local mount_point="$1"
+	local instance="$mount_point/instances/$(mktemp -u test-XXXXXX)"
+
+	mkdir "$instance"
+	echo "$instance"
+}
+
+remove_instance() {
+	local instance="$1"
+
+	rmdir "$instance"
+}
+
+check_gid() {
+	local mount_point="$1"
+	local expected_gid="$2"
+
+	echo "Checking permission group ..."
+
+	cd "$mount_point"
+
+	for file in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable"; do
+		local gid=`stat -c "%g" $file`
+		if [ "$gid" -ne "$expected_gid" ]; then
+			cd - # Return to the previous working directory (tracefs root)
+			fail "$(realpath $file): Expected group $expected_gid; Got group $gid"
+		fi
+	done
+
+	cd - # Return to the previous working directory (tracefs root)
+}
+
+test_gid_mount_option() {
+	local mount_point=$(get_mount_point)
+	local mount_options=$(get_mnt_options "$mount_point")
+	local original_group=$(stat -c "%g" .)
+	local other_group=$(find_alternate_gid "$original_group")
+
+	# Set up mount options with new GID for testing
+	local new_options=`echo "$mount_options" | sed -e "s/gid=[0-9]*/gid=$other_group/"`
+	if [ "$new_options" = "$mount_options" ]; then
+		new_options="$mount_options,gid=$other_group"
+		mount_options="$mount_options,gid=$original_group"
+	fi
+
+	# Unmount existing tracefs instance and mount with new GID
+	unmount_tracefs "$mount_point"
+	mount_tracefs_with_options "$mount_point" "$new_options"
+
+	check_gid "$mount_point" "$other_group"
+
+	# Check that files created after the mount inherit the GID
+	local instance=$(create_instance "$mount_point")
+	check_gid "$instance" "$other_group"
+	remove_instance "$instance"
+
+	# Unmount and remount with the original GID
+	unmount_tracefs "$mount_point"
+	mount_tracefs_with_options "$mount_point" "$mount_options"
+	check_gid "$mount_point" "$original_group"
+}
+
+test_gid_mount_option
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc
index 094419e190c2..e71cc3ad0bdf 100644
--- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc
+++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc
@@ -1,24 +1,14 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 # description: Test file and directory ownership changes for eventfs
+# requires: "[gid=<gid>]":README
 
 original_group=`stat -c "%g" .`
 original_owner=`stat -c "%u" .`
 
-mount_point=`stat -c '%m' .`
+local mount_point=$(get_mount_point)
 
-# If stat -c '%m' does not work (e.g. busybox) or failed, try to use the
-# current working directory (which should be a tracefs) as the mount point.
-if [ ! -d "$mount_point" ]; then
-	if mount | grep -qw $PWD ; then
-		mount_point=$PWD
-	else
-		# If PWD doesn't work, that is an environmental problem.
-		exit_unresolved
-	fi
-fi
-
-mount_options=`mount | grep "$mount_point" | sed -e 's/.*(\(.*\)).*/\1/'`
+mount_options=$(get_mnt_options "$mount_point")
 
 # find another owner and group that is not the original
 other_group=`tac /etc/group | grep -v ":$original_group:" | head -1 | cut -d: -f3`
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 779f3e62ec90..84d6a9c7ad67 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -193,3 +193,28 @@ ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file
     # "  Command: " and "^\n" => 13
     test $(expr 13 + $pos) -eq $N
 }
+
+# Helper to get the tracefs mount point
+get_mount_point() {
+	local mount_point=`stat -c '%m' .`
+
+	# If stat -c '%m' does not work (e.g. busybox) or failed, try to use the
+	# current working directory (which should be a tracefs) as the mount point.
+	if [ ! -d "$mount_point" ]; then
+		if mount | grep -qw "$PWD"; then
+			mount_point=$PWD
+		else
+			# If PWD doesn't work, that is an environmental problem.
+			exit_unresolved
+		fi
+	fi
+	echo "$mount_point"
+}
+
+# Helper function to retrieve mount options for a given mount point
+get_mnt_options() {
+	local mnt_point="$1"
+	local opts=$(mount | grep -m1 "$mnt_point" | sed -e 's/.*(\(.*\)).*/\1/')
+
+	echo "$opts"
+}
\ No newline at end of file
-- 
cgit v1.2.3


From 17a2409783f141c9fcd03b140d17bbb75e98c630 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 29 Oct 2024 12:34:21 +0000
Subject: kselftest/arm64: Use ksft_perror() to log MTE failures

The logging in the allocation helpers variously uses ksft_print_msg() with
very intermittent logging of errno and perror() (which won't produce KTAP
conformant output) when logging the result of API calls that set errno.
Standardise on using the ksft_perror() helper in these cases so that more
information is available should the tests fail.

Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/r/20241029-arm64-mte-test-logging-v1-1-a128e732e36e@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/mte_common_util.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
index 17fbe5cfe472..a1dc2fe5285b 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.c
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -150,13 +150,13 @@ static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping,
 		map_flag |= MAP_PRIVATE;
 	ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0);
 	if (ptr == MAP_FAILED) {
-		ksft_print_msg("FAIL: mmap allocation\n");
+		ksft_perror("mmap()");
 		return NULL;
 	}
 	if (mem_type == USE_MPROTECT) {
 		if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) {
+			ksft_perror("mprotect(PROT_MTE)");
 			munmap(ptr, size);
-			ksft_print_msg("FAIL: mprotect PROT_MTE property\n");
 			return NULL;
 		}
 	}
@@ -190,13 +190,13 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags
 	lseek(fd, 0, SEEK_SET);
 	for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) {
 		if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
-			perror("initialising buffer");
+			ksft_perror("initialising buffer");
 			return NULL;
 		}
 	}
 	index -= INIT_BUFFER_SIZE;
 	if (write(fd, buffer, size - index) != size - index) {
-		perror("initialising buffer");
+		ksft_perror("initialising buffer");
 		return NULL;
 	}
 	return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd);
@@ -217,12 +217,12 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping,
 	lseek(fd, 0, SEEK_SET);
 	for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE)
 		if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
-			perror("initialising buffer");
+			ksft_perror("initialising buffer");
 			return NULL;
 		}
 	index -= INIT_BUFFER_SIZE;
 	if (write(fd, buffer, map_size - index) != map_size - index) {
-		perror("initialising buffer");
+		ksft_perror("initialising buffer");
 		return NULL;
 	}
 	return __mte_allocate_memory_range(size, mem_type, mapping, range_before,
@@ -358,7 +358,7 @@ int create_temp_file(void)
 	/* Create a file in the tmpfs filesystem */
 	fd = mkstemp(&filename[0]);
 	if (fd == -1) {
-		perror(filename);
+		ksft_perror(filename);
 		ksft_print_msg("FAIL: Unable to open temporary file\n");
 		return 0;
 	}
-- 
cgit v1.2.3


From bc2ca3680b30869ee9a764ab72c143070f1afec8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Aug 2024 11:55:04 -0700
Subject: KVM: x86: Disallow changing MSR_PLATFORM_INFO after vCPU has run

Tag MSR_PLATFORM_INFO as a feature MSR (because it is), i.e. disallow it
from being modified after the vCPU has run.

To make KVM's selftest compliant, simply delete the userspace MSR write
that restores KVM's original value at the end of the test.  Verifying that
userspace can write back what it originally read is uninteresting in this
particular case, because KVM doesn't enforce _any_ bits in the MSR, i.e.
userspace should be able to write any arbitrary value.

Link: https://lore.kernel.org/r/20240802185511.305849-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c                                      | 1 +
 tools/testing/selftests/kvm/x86_64/platform_info_test.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3787e6a71743..16804637ba97 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -451,6 +451,7 @@ static const u32 msr_based_features_all_except_vmx[] = {
 	MSR_IA32_UCODE_REV,
 	MSR_IA32_ARCH_CAPABILITIES,
 	MSR_IA32_PERF_CAPABILITIES,
+	MSR_PLATFORM_INFO,
 };
 
 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
index eda88080c186..9cbf283ebc55 100644
--- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c
+++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
@@ -72,8 +72,6 @@ int main(int argc, char *argv[])
 	}
 
 done:
-	vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info);
-
 	kvm_vm_free(vm);
 
 	return 0;
-- 
cgit v1.2.3


From b799e3e7da2c8b2ae03c977307b2f082fac6140e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Aug 2024 11:55:10 -0700
Subject: KVM: selftests: Verify get/set PERF_CAPABILITIES w/o guest PDCM
 behavior

Add another testcase to x86's PMU capabilities test to verify KVM's
handling of userspace accesses to PERF_CAPABILITIES when the vCPU doesn't
support the MSR (per the vCPU's CPUID).  KVM's (newly established) ABI is
that userspace MSR accesses are subject to architectural existence checks,
but that if the MSR is advertised as supported _by KVM_, "bad" reads get
'0' and writes of '0' are always allowed.

Link: https://lore.kernel.org/r/20240802185511.305849-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/vmx_pmu_caps_test.c       | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
index 7c92536551cc..a1f5ff45d518 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
@@ -207,6 +207,29 @@ KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code)
 	TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU");
 }
 
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, perf_capabilities_unsupported, guest_code)
+{
+	uint64_t val;
+	int i, r;
+
+	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+	val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES);
+	TEST_ASSERT_EQ(val, host_cap.capabilities);
+
+	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_PDCM);
+
+	val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES);
+	TEST_ASSERT_EQ(val, 0);
+
+	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0);
+
+	for (i = 0; i < 64; i++) {
+		r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(i));
+		TEST_ASSERT(!r, "Setting PERF_CAPABILITIES bit %d (= 0x%llx) should fail without PDCM",
+			    i, BIT_ULL(i));
+	}
+}
+
 int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_is_pmu_enabled());
-- 
cgit v1.2.3


From 0581dfbad9542061406c40eccab9037e59ea62c8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Aug 2024 11:55:11 -0700
Subject: KVM: selftests: Add a testcase for disabling feature MSRs init quirk

Expand and rename the feature MSRs test to verify KVM's ABI and quirk
for initializing feature MSRs.

Exempt VMX_CR{0,4}_FIXED1 from most tests as KVM intentionally takes full
control of the MSRs, e.g. to prevent L1 from running L2 with bogus CR0
and/or CR4 values.

Link: https://lore.kernel.org/r/20240802185511.305849-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile               |   2 +-
 .../selftests/kvm/x86_64/feature_msrs_test.c       | 113 +++++++++++++++++++++
 .../selftests/kvm/x86_64/get_msr_index_features.c  |  35 -------
 3 files changed, 114 insertions(+), 36 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86_64/feature_msrs_test.c
 delete mode 100644 tools/testing/selftests/kvm/x86_64/get_msr_index_features.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 156fbfae940f..f186888f0e00 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -67,7 +67,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
-TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
+TEST_GEN_PROGS_x86_64 += x86_64/feature_msrs_test
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
 TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test
diff --git a/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c b/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c
new file mode 100644
index 000000000000..a72f13ae2edb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+static bool is_kvm_controlled_msr(uint32_t msr)
+{
+	return msr == MSR_IA32_VMX_CR0_FIXED1 || msr == MSR_IA32_VMX_CR4_FIXED1;
+}
+
+/*
+ * For VMX MSRs with a "true" variant, KVM requires userspace to set the "true"
+ * MSR, and doesn't allow setting the hidden version.
+ */
+static bool is_hidden_vmx_msr(uint32_t msr)
+{
+	switch (msr) {
+	case MSR_IA32_VMX_PINBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool is_quirked_msr(uint32_t msr)
+{
+	return msr != MSR_AMD64_DE_CFG;
+}
+
+static void test_feature_msr(uint32_t msr)
+{
+	const uint64_t supported_mask = kvm_get_feature_msr(msr);
+	uint64_t reset_value = is_quirked_msr(msr) ? supported_mask : 0;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	/*
+	 * Don't bother testing KVM-controlled MSRs beyond verifying that the
+	 * MSR can be read from userspace.  Any value is effectively legal, as
+	 * KVM is bound by x86 architecture, not by ABI.
+	 */
+	if (is_kvm_controlled_msr(msr))
+		return;
+
+	/*
+	 * More goofy behavior.  KVM reports the host CPU's actual revision ID,
+	 * but initializes the vCPU's revision ID to an arbitrary value.
+	 */
+	if (msr == MSR_IA32_UCODE_REV)
+		reset_value = host_cpu_is_intel ? 0x100000000ULL : 0x01000065;
+
+	/*
+	 * For quirked MSRs, KVM's ABI is to initialize the vCPU's value to the
+	 * full set of features supported by KVM.  For non-quirked MSRs, and
+	 * when the quirk is disabled, KVM must zero-initialize the MSR and let
+	 * userspace do the configuration.
+	 */
+	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+	TEST_ASSERT(vcpu_get_msr(vcpu, msr) == reset_value,
+		    "Wanted 0x%lx for %squirked MSR 0x%x, got 0x%lx",
+		    reset_value, is_quirked_msr(msr) ? "" : "non-", msr,
+		    vcpu_get_msr(vcpu, msr));
+	if (!is_hidden_vmx_msr(msr))
+		vcpu_set_msr(vcpu, msr, supported_mask);
+	kvm_vm_free(vm);
+
+	if (is_hidden_vmx_msr(msr))
+		return;
+
+	if (!kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2) ||
+	    !(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+		return;
+
+	vm = vm_create(1);
+	vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_STUFF_FEATURE_MSRS);
+
+	vcpu = vm_vcpu_add(vm, 0, NULL);
+	TEST_ASSERT(!vcpu_get_msr(vcpu, msr),
+		    "Quirk disabled, wanted '0' for MSR 0x%x, got 0x%lx",
+		    msr, vcpu_get_msr(vcpu, msr));
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	const struct kvm_msr_list *feature_list;
+	int i;
+
+	/*
+	 * Skip the entire test if MSR_FEATURES isn't supported, other tests
+	 * will cover the "regular" list of MSRs, the coverage here is purely
+	 * opportunistic and not interesting on its own.
+	 */
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES));
+
+	(void)kvm_get_msr_index_list();
+
+	feature_list = kvm_get_feature_msr_index_list();
+	for (i = 0; i < feature_list->nmsrs; i++)
+		test_feature_msr(feature_list->indices[i]);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
deleted file mode 100644
index d09b3cbcadc6..000000000000
--- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Test that KVM_GET_MSR_INDEX_LIST and
- * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended
- *
- * Copyright (C) 2020, Red Hat, Inc.
- */
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "processor.h"
-
-int main(int argc, char *argv[])
-{
-	const struct kvm_msr_list *feature_list;
-	int i;
-
-	/*
-	 * Skip the entire test if MSR_FEATURES isn't supported, other tests
-	 * will cover the "regular" list of MSRs, the coverage here is purely
-	 * opportunistic and not interesting on its own.
-	 */
-	TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES));
-
-	(void)kvm_get_msr_index_list();
-
-	feature_list = kvm_get_feature_msr_index_list();
-	for (i = 0; i < feature_list->nmsrs; i++)
-		kvm_get_feature_msr(feature_list->indices[i]);
-}
-- 
cgit v1.2.3


From f2c5aa31670d8532dc820c110faf96d0cdbba7d9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:28 -0700
Subject: KVM: selftests: Precisely mask off dynamic fields in CPUID test

When comparing vCPU CPUID entries against KVM's supported CPUID, mask off
only the dynamic fields/bits instead of skipping the entire entry.
Precisely masking bits isn't meaningfully more difficult than skipping
entire entries, and will be necessary to maintain test coverage when a
future commit enables OSXSAVE by default, i.e. makes one bit in all of
CPUID.0x1 dynamic.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/cpuid_test.c | 63 +++++++++++++++----------
 1 file changed, 37 insertions(+), 26 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index fec03b11b059..f7fdcef5fa59 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -12,17 +12,16 @@
 #include "kvm_util.h"
 #include "processor.h"
 
-/* CPUIDs known to differ */
-struct {
-	u32 function;
-	u32 index;
-} mangled_cpuids[] = {
-	/*
-	 * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
-	 * which are not controlled for by this test.
-	 */
-	{.function = 0xd, .index = 0},
-	{.function = 0xd, .index = 1},
+struct cpuid_mask {
+	union {
+		struct {
+			u32 eax;
+			u32 ebx;
+			u32 ecx;
+			u32 edx;
+		};
+		u32 regs[4];
+	};
 };
 
 static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
@@ -56,17 +55,23 @@ static void guest_main(struct kvm_cpuid2 *guest_cpuid)
 	GUEST_DONE();
 }
 
-static bool is_cpuid_mangled(const struct kvm_cpuid_entry2 *entrie)
+static struct cpuid_mask get_const_cpuid_mask(const struct kvm_cpuid_entry2 *entry)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(mangled_cpuids); i++) {
-		if (mangled_cpuids[i].function == entrie->function &&
-		    mangled_cpuids[i].index == entrie->index)
-			return true;
+	struct cpuid_mask mask;
+
+	memset(&mask, 0xff, sizeof(mask));
+
+	switch (entry->function) {
+	case 0xd:
+		/*
+		 * CPUID.0xD.{0,1}.EBX enumerate XSAVE size based on the current
+		 * XCR0 and IA32_XSS MSR values.
+		 */
+		if (entry->index < 2)
+			mask.ebx = 0;
+		break;
 	}
-
-	return false;
+	return mask;
 }
 
 static void compare_cpuids(const struct kvm_cpuid2 *cpuid1,
@@ -79,6 +84,8 @@ static void compare_cpuids(const struct kvm_cpuid2 *cpuid1,
 		    "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent);
 
 	for (i = 0; i < cpuid1->nent; i++) {
+		struct cpuid_mask mask;
+
 		e1 = &cpuid1->entries[i];
 		e2 = &cpuid2->entries[i];
 
@@ -88,15 +95,19 @@ static void compare_cpuids(const struct kvm_cpuid2 *cpuid1,
 			    i, e1->function, e1->index, e1->flags,
 			    e2->function, e2->index, e2->flags);
 
-		if (is_cpuid_mangled(e1))
-			continue;
+		/* Mask off dynamic bits, e.g. OSXSAVE, when comparing entries. */
+		mask = get_const_cpuid_mask(e1);
 
-		TEST_ASSERT(e1->eax == e2->eax && e1->ebx == e2->ebx &&
-			    e1->ecx == e2->ecx && e1->edx == e2->edx,
+		TEST_ASSERT((e1->eax & mask.eax) == (e2->eax & mask.eax) &&
+			    (e1->ebx & mask.ebx) == (e2->ebx & mask.ebx) &&
+			    (e1->ecx & mask.ecx) == (e2->ecx & mask.ecx) &&
+			    (e1->edx & mask.edx) == (e2->edx & mask.edx),
 			    "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
 			    e1->function, e1->index,
-			    e1->eax, e1->ebx, e1->ecx, e1->edx,
-			    e2->eax, e2->ebx, e2->ecx, e2->edx);
+			    e1->eax & mask.eax, e1->ebx & mask.ebx,
+			    e1->ecx & mask.ecx, e1->edx & mask.edx,
+			    e2->eax & mask.eax, e2->ebx & mask.ebx,
+			    e2->ecx & mask.ecx, e2->edx & mask.edx);
 	}
 }
 
-- 
cgit v1.2.3


From 164cea33bfedf883651d8d8b2db2fa867b48ecb0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:29 -0700
Subject: KVM: selftests: Mask off OSPKE and OSXSAVE when comparing CPUID
 entries

Mask off OSPKE and OSXSAVE, which are toggled based on corresponding CR4
enabling bits, when comparing vCPU CPUID against KVM's supported CPUID.
This will allow setting OSXSAVE by default when creating vCPUs, without
causing test failures (KVM doesn't enumerate OSXSAVE=1).

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/cpuid_test.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index f7fdcef5fa59..7b3fda6842bc 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -62,6 +62,12 @@ static struct cpuid_mask get_const_cpuid_mask(const struct kvm_cpuid_entry2 *ent
 	memset(&mask, 0xff, sizeof(mask));
 
 	switch (entry->function) {
+	case 0x1:
+		mask.regs[X86_FEATURE_OSXSAVE.reg] &= ~BIT(X86_FEATURE_OSXSAVE.bit);
+		break;
+	case 0x7:
+		mask.regs[X86_FEATURE_OSPKE.reg] &= ~BIT(X86_FEATURE_OSPKE.bit);
+		break;
 	case 0xd:
 		/*
 		 * CPUID.0xD.{0,1}.EBX enumerate XSAVE size based on the current
-- 
cgit v1.2.3


From 2b9a126a2986fef604275d7d198163c9c3ced172 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:30 -0700
Subject: KVM: selftests: Rework OSXSAVE CR4=>CPUID test to play nice with AVX
 insns

Rework the CR4/CPUID sync test to clear CR4.OSXSAVE, do CPUID, and restore
CR4.OSXSAVE in assembly, so that there is zero chance of AVX instructions
being executed while CR4.OSXSAVE is disabled.  This will allow enabling
CR4.OSXSAVE by default for selftests vCPUs as a general means of playing
nice with AVX instructions.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c     | 50 +++++++++++++++-------
 1 file changed, 34 insertions(+), 16 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index 624dc725e14d..da818afb7031 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -19,15 +19,14 @@
 #include "kvm_util.h"
 #include "processor.h"
 
-static inline bool cr4_cpuid_is_sync(void)
-{
-	uint64_t cr4 = get_cr4();
-
-	return (this_cpu_has(X86_FEATURE_OSXSAVE) == !!(cr4 & X86_CR4_OSXSAVE));
-}
+#define MAGIC_HYPERCALL_PORT	0x80
 
 static void guest_code(void)
 {
+	u32 regs[4] = {
+		[KVM_CPUID_EAX] = X86_FEATURE_OSXSAVE.function,
+		[KVM_CPUID_ECX] = X86_FEATURE_OSXSAVE.index,
+	};
 	uint64_t cr4;
 
 	/* turn on CR4.OSXSAVE */
@@ -36,13 +35,29 @@ static void guest_code(void)
 	set_cr4(cr4);
 
 	/* verify CR4.OSXSAVE == CPUID.OSXSAVE */
-	GUEST_ASSERT(cr4_cpuid_is_sync());
-
-	/* notify hypervisor to change CR4 */
-	GUEST_SYNC(0);
-
-	/* check again */
-	GUEST_ASSERT(cr4_cpuid_is_sync());
+	GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
+
+	/*
+	 * Notify hypervisor to clear CR4.OSXSAVE, do CPUID and save output,
+	 * and then restore CR4.  Do this all in assembly to ensure no AVX
+	 * instructions are executed while OSXSAVE=0.
+	 */
+	asm volatile (
+		"out %%al, $" __stringify(MAGIC_HYPERCALL_PORT) "\n\t"
+		"cpuid\n\t"
+		"mov %%rdi, %%cr4\n\t"
+		: "+a" (regs[KVM_CPUID_EAX]),
+		  "=b" (regs[KVM_CPUID_EBX]),
+		  "+c" (regs[KVM_CPUID_ECX]),
+		  "=d" (regs[KVM_CPUID_EDX])
+		: "D" (get_cr4())
+	);
+
+	/* Verify KVM cleared OSXSAVE in CPUID when it was cleared in CR4. */
+	GUEST_ASSERT(!(regs[X86_FEATURE_OSXSAVE.reg] & BIT(X86_FEATURE_OSXSAVE.bit)));
+
+	/* Verify restoring CR4 also restored OSXSAVE in CPUID. */
+	GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
 
 	GUEST_DONE();
 }
@@ -62,13 +77,16 @@ int main(int argc, char *argv[])
 		vcpu_run(vcpu);
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
-		switch (get_ucall(vcpu, &uc)) {
-		case UCALL_SYNC:
+		if (vcpu->run->io.port == MAGIC_HYPERCALL_PORT &&
+		    vcpu->run->io.direction == KVM_EXIT_IO_OUT) {
 			/* emulate hypervisor clearing CR4.OSXSAVE */
 			vcpu_sregs_get(vcpu, &sregs);
 			sregs.cr4 &= ~X86_CR4_OSXSAVE;
 			vcpu_sregs_set(vcpu, &sregs);
-			break;
+			continue;
+		}
+
+		switch (get_ucall(vcpu, &uc)) {
 		case UCALL_ABORT:
 			REPORT_GUEST_ASSERT(uc);
 			break;
-- 
cgit v1.2.3


From 8b14c4d85d031f7700fa4e042aebf99d933971f0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:31 -0700
Subject: KVM: selftests: Configure XCR0 to max supported value by default

To play nice with compilers generating AVX instructions, set CR4.OSXSAVE
and configure XCR0 by default when creating selftests vCPUs.  Some distros
have switched gcc to '-march=x86-64-v3' by default, and while it's hard to
find a CPU which doesn't support AVX today, many KVM selftests fail with

  ==== Test Assertion Failure ====
    lib/x86_64/processor.c:570: Unhandled exception in guest
    pid=72747 tid=72747 errno=4 - Interrupted system call
    Unhandled exception '0x6' at guest RIP '0x4104f7'

due to selftests not enabling AVX by default for the guest.  The failure
is easy to reproduce elsewhere with:

   $ make clean && CFLAGS='-march=x86-64-v3' make -j && ./x86_64/kvm_pv_test

E.g. gcc-13 with -march=x86-64-v3 compiles this chunk from selftests'
kvm_fixup_exception():

        regs->rip = regs->r11;
        regs->r9 = regs->vector;
        regs->r10 = regs->error_code;

into this monstrosity (which is clever, but oof):

  405313:       c4 e1 f9 6e c8          vmovq  %rax,%xmm1
  405318:       48 89 68 08             mov    %rbp,0x8(%rax)
  40531c:       48 89 e8                mov    %rbp,%rax
  40531f:       c4 c3 f1 22 c4 01       vpinsrq $0x1,%r12,%xmm1,%xmm0
  405325:       49 89 6d 38             mov    %rbp,0x38(%r13)
  405329:       c5 fa 7f 45 00          vmovdqu %xmm0,0x0(%rbp)

Alternatively, KVM selftests could explicitly restrict the compiler to
-march=x86-64-v2, but odds are very good that punting on AVX enabling will
simply result in tests that "need" AVX doing their own thing, e.g. there
are already three or so additional cleanups that can be done on top.

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Closes: https://lore.kernel.org/all/20240920154422.2890096-1-vkuznets@redhat.com
Reviewed-and-tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/include/x86_64/processor.h       |  5 +++++
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 24 ++++++++++++++++++++++
 .../testing/selftests/kvm/x86_64/xcr0_cpuid_test.c |  6 +++---
 3 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index e247f99e0473..645200e95f89 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -1049,6 +1049,11 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid);
 }
 
+static inline void vcpu_get_cpuid(struct kvm_vcpu *vcpu)
+{
+	vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid);
+}
+
 void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
 			     struct kvm_x86_cpu_property property,
 			     uint32_t value);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 974bcd2df6d7..636b29ba8985 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -506,6 +506,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 
 	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
 	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
+	if (kvm_cpu_has(X86_FEATURE_XSAVE))
+		sregs.cr4 |= X86_CR4_OSXSAVE;
 	sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
 
 	kvm_seg_set_unusable(&sregs.ldt);
@@ -519,6 +521,20 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 	vcpu_sregs_set(vcpu, &sregs);
 }
 
+static void vcpu_init_xcrs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	struct kvm_xcrs xcrs = {
+		.nr_xcrs = 1,
+		.xcrs[0].xcr = 0,
+		.xcrs[0].value = kvm_cpu_supported_xcr0(),
+	};
+
+	if (!kvm_cpu_has(X86_FEATURE_XSAVE))
+		return;
+
+	vcpu_xcrs_set(vcpu, &xcrs);
+}
+
 static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
 			  int dpl, unsigned short selector)
 {
@@ -675,6 +691,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 	vcpu = __vm_vcpu_add(vm, vcpu_id);
 	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
 	vcpu_init_sregs(vm, vcpu);
+	vcpu_init_xcrs(vm, vcpu);
 
 	/* Setup guest general purpose registers */
 	vcpu_regs_get(vcpu, &regs);
@@ -686,6 +703,13 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 	mp_state.mp_state = 0;
 	vcpu_mp_state_set(vcpu, &mp_state);
 
+	/*
+	 * Refresh CPUID after setting SREGS and XCR0, so that KVM's "runtime"
+	 * updates to guest CPUID, e.g. for OSXSAVE and XSAVE state size, are
+	 * reflected into selftests' vCPU CPUID cache, i.e. so that the cache
+	 * is consistent with vCPU state.
+	 */
+	vcpu_get_cpuid(vcpu);
 	return vcpu;
 }
 
diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
index 95ce192d0753..a4aecdc77da5 100644
--- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
@@ -48,16 +48,16 @@ do {									\
 
 static void guest_code(void)
 {
-	uint64_t xcr0_reset;
+	uint64_t initial_xcr0;
 	uint64_t supported_xcr0;
 	int i, vector;
 
 	set_cr4(get_cr4() | X86_CR4_OSXSAVE);
 
-	xcr0_reset = xgetbv(0);
+	initial_xcr0 = xgetbv(0);
 	supported_xcr0 = this_cpu_supported_xcr0();
 
-	GUEST_ASSERT(xcr0_reset == XFEATURE_MASK_FP);
+	GUEST_ASSERT(initial_xcr0 == supported_xcr0);
 
 	/* Check AVX */
 	ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0,
-- 
cgit v1.2.3


From 8ae01bf64caaea5562f3af40a2fbe404a1e79403 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:32 -0700
Subject: KVM: selftests: Verify XCR0 can be "downgraded" and "upgraded"

Now that KVM selftests enable all supported XCR0 features by default, add
a testcase to the XCR0 vs. CPUID test to verify that the guest can disable
everything except the legacy FPU in XCR0, and then re-enable the full
feature set, which is kinda sorta what the test did before XCR0 was set up
by default.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
index a4aecdc77da5..c8a5c5e51661 100644
--- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
@@ -79,6 +79,11 @@ static void guest_code(void)
 	ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0,
 				    XFEATURE_MASK_XTILE);
 
+	vector = xsetbv_safe(0, XFEATURE_MASK_FP);
+	__GUEST_ASSERT(!vector,
+		       "Expected success on XSETBV(FP), got vector '0x%x'",
+		       vector);
+
 	vector = xsetbv_safe(0, supported_xcr0);
 	__GUEST_ASSERT(!vector,
 		       "Expected success on XSETBV(0x%lx), got vector '0x%x'",
-- 
cgit v1.2.3


From 3678c7f6114f6fc8614c7e9a249d60f8e1678bad Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:33 -0700
Subject: KVM: selftests: Drop manual CR4.OSXSAVE enabling from CR4/CPUID sync
 test

Now that CR4.OSXSAVE is enabled by default, drop the manual enabling from
CR4/CPUID sync test and instead assert that CR4.OSXSAVE is enabled.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index da818afb7031..28cc66454601 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -27,12 +27,9 @@ static void guest_code(void)
 		[KVM_CPUID_EAX] = X86_FEATURE_OSXSAVE.function,
 		[KVM_CPUID_ECX] = X86_FEATURE_OSXSAVE.index,
 	};
-	uint64_t cr4;
 
-	/* turn on CR4.OSXSAVE */
-	cr4 = get_cr4();
-	cr4 |= X86_CR4_OSXSAVE;
-	set_cr4(cr4);
+	/* CR4.OSXSAVE should be enabled by default (for selftests vCPUs). */
+	GUEST_ASSERT(get_cr4() & X86_CR4_OSXSAVE);
 
 	/* verify CR4.OSXSAVE == CPUID.OSXSAVE */
 	GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
-- 
cgit v1.2.3


From d87331890a38240d0743b7fb04c25d92b255ec46 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:34 -0700
Subject: KVM: selftests: Drop manual XCR0 configuration from AMX test

Now that CR4.OSXSAVE and XCR0 are set up by default, drop the manual
enabling of OSXSAVE and XTILE from the AMX test.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/amx_test.c | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c
index 903940c54d2d..f4ce5a185a7d 100644
--- a/tools/testing/selftests/kvm/x86_64/amx_test.c
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -86,6 +86,8 @@ static inline void __xsavec(struct xstate *xstate, uint64_t rfbm)
 
 static void check_xtile_info(void)
 {
+	GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE);
+
 	GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0));
 	GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE);
 
@@ -122,29 +124,12 @@ static void set_tilecfg(struct tile_config *cfg)
 	}
 }
 
-static void init_regs(void)
-{
-	uint64_t cr4, xcr0;
-
-	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE));
-
-	/* turn on CR4.OSXSAVE */
-	cr4 = get_cr4();
-	cr4 |= X86_CR4_OSXSAVE;
-	set_cr4(cr4);
-	GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
-
-	xcr0 = xgetbv(0);
-	xcr0 |= XFEATURE_MASK_XTILE;
-	xsetbv(0x0, xcr0);
-	GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE);
-}
-
 static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 						    struct tile_data *tiledata,
 						    struct xstate *xstate)
 {
-	init_regs();
+	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
+		     this_cpu_has(X86_FEATURE_OSXSAVE));
 	check_xtile_info();
 	GUEST_SYNC(1);
 
-- 
cgit v1.2.3


From 28439090ece61e71c2c2b75c3567446e5aea7519 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:35 -0700
Subject: KVM: selftests: Drop manual XCR0 configuration from state test

Now that CR4.OSXSAVE and XCR0 are set up by default, drop the manual
enabling from the state test, which is fully redundant with the default
behavior.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/state_test.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 1c756db329e5..141b7fc0c965 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -145,11 +145,6 @@ static void __attribute__((__flatten__)) guest_code(void *arg)
 
 		memset(buffer, 0xcc, sizeof(buffer));
 
-		set_cr4(get_cr4() | X86_CR4_OSXSAVE);
-		GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
-
-		xsetbv(0, xgetbv(0) | supported_xcr0);
-
 		/*
 		 * Modify state for all supported xfeatures to take them out of
 		 * their "init" state, i.e. to make them show up in XSTATE_BV.
-- 
cgit v1.2.3


From 3c4c128d02ed81ddbf9b374a77bc0cb5a91e0a87 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:36 -0700
Subject: KVM: selftests: Drop manual XCR0 configuration from SEV smoke test

Now that CR4.OSXSAVE and XCR0 are set up by default, drop the manual
enabling from the SEV smoke test that validates FPU state can be
transferred into the VMSA.

In guest_code_xsave(), explicitly set the Requested-Feature Bitmask (RFBM)
to exactly XFEATURE_MASK_X87_AVX instead of relying on the host side of
things to enable only X87_AVX features in guest XCR0.  I.e. match the RFBM
for the host XSAVE.

Link: https://lore.kernel.org/r/20241003234337.273364-11-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/sev_smoke_test.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
index 2e9197eb1652..965fc362dee3 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -41,8 +41,8 @@ static void guest_sev_code(void)
 /* Stash state passed via VMSA before any compiled code runs.  */
 extern void guest_code_xsave(void);
 asm("guest_code_xsave:\n"
-    "mov $-1, %eax\n"
-    "mov $-1, %edx\n"
+    "mov $" __stringify(XFEATURE_MASK_X87_AVX) ", %eax\n"
+    "xor %edx, %edx\n"
     "xsave (%rdi)\n"
     "jmp guest_sev_es_code");
 
@@ -70,12 +70,6 @@ static void test_sync_vmsa(uint32_t policy)
 
 	double x87val = M_PI;
 	struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 };
-	struct kvm_sregs sregs;
-	struct kvm_xcrs xcrs = {
-		.nr_xcrs = 1,
-		.xcrs[0].xcr = 0,
-		.xcrs[0].value = XFEATURE_MASK_X87_AVX,
-	};
 
 	vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu);
 	gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
@@ -84,11 +78,6 @@ static void test_sync_vmsa(uint32_t policy)
 
 	vcpu_args_set(vcpu, 1, gva);
 
-	vcpu_sregs_get(vcpu, &sregs);
-	sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE;
-	vcpu_sregs_set(vcpu, &sregs);
-
-	vcpu_xcrs_set(vcpu, &xcrs);
 	asm("fninit\n"
 	    "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n"
 	    "fldl %3\n"
-- 
cgit v1.2.3


From 89f8869835e4da836bc60ab20568b7864706f94b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Oct 2024 16:43:37 -0700
Subject: KVM: selftests: Ensure KVM supports AVX for SEV-ES VMSA FPU test

Verify that KVM's supported XCR0 includes AVX (and earlier features) when
running the SEV-ES VMSA XSAVE test.  In practice, the issue will likely
never pop up, since KVM support for AVX predates KVM support for SEV-ES,
but checking for KVM support makes the requirement more obvious.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241003234337.273364-12-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/sev_smoke_test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
index 965fc362dee3..ae77698e6e97 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -181,6 +181,8 @@ static void test_sev_es_shutdown(void)
 
 int main(int argc, char *argv[])
 {
+	const u64 xf_mask = XFEATURE_MASK_X87_AVX;
+
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
 
 	test_sev(guest_sev_code, SEV_POLICY_NO_DBG);
@@ -193,7 +195,7 @@ int main(int argc, char *argv[])
 		test_sev_es_shutdown();
 
 		if (kvm_has_cap(KVM_CAP_XCRS) &&
-		    (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
+		    (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) {
 			test_sync_vmsa(0);
 			test_sync_vmsa(SEV_POLICY_NO_DBG);
 		}
-- 
cgit v1.2.3


From 69c0d824779843b51ca2339b2163db4d3b40c54c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 28 Oct 2024 20:22:31 +0000
Subject: kselftest/arm64: Fix encoding for SVE B16B16 test

The test for SVE_B16B16 had a cut'n'paste of an SME instruction; fix it with
a relevant SVE instruction.

Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241028-arm64-b16b16-test-v1-1-59a4a7449bdf@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index 7e95ba5fd496..265654ec48b9 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -361,8 +361,8 @@ static void sveaes_sigill(void)
 
 static void sveb16b16_sigill(void)
 {
-	/* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */
-	asm volatile(".inst 0xC1E41C00" : : : );
+	/* BFADD Z0.H, Z0.H, Z0.H */
+	asm volatile(".inst 0x65000000" : : : );
 }
 
 static void svepmull_sigill(void)
-- 
cgit v1.2.3


From e5e4799e2ac3619a10a9e6db7d2a51d7cc6a69ef Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 30 Oct 2024 15:28:19 -0700
Subject: selftests/bpf: Add a test for open coded kmem_cache iter

The new subtest runs with bpf_prog_test_run_opts() as a syscall prog.
It iterates the kmem_cache using a bpf_for_each loop and counts the number
of entries.  Finally it checks that count against the number of entries from
the regular iterator.

  $ ./vmtest.sh -- ./test_progs -t kmem_cache_iter
  ...
  #130/1   kmem_cache_iter/check_task_struct:OK
  #130/2   kmem_cache_iter/check_slabinfo:OK
  #130/3   kmem_cache_iter/open_coded_iter:OK
  #130     kmem_cache_iter:OK
  Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED

Also simplify the code by using the attach routine of the skeleton.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20241030222819.1800667-2-namhyung@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/bpf_experimental.h     |  6 ++++
 .../selftests/bpf/prog_tests/kmem_cache_iter.c     | 35 ++++++++++++++--------
 .../testing/selftests/bpf/progs/kmem_cache_iter.c  | 22 ++++++++++++++
 3 files changed, 51 insertions(+), 12 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index b0668f29f7b3..cd8ecd39c3f3 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -582,4 +582,10 @@ extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
 		unsigned int flags__k, void *aux__ign) __ksym;
 #define bpf_wq_set_callback(timer, cb, flags) \
 	bpf_wq_set_callback_impl(timer, cb, flags, NULL)
+
+struct bpf_iter_kmem_cache;
+extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym;
+extern struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) __weak __ksym;
+extern void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) __weak __ksym;
+
 #endif
diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
index 848d8fc9171f..8e13a3416a21 100644
--- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
@@ -68,12 +68,27 @@ static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel)
 	fclose(fp);
 }
 
+static void subtest_kmem_cache_iter_open_coded(struct kmem_cache_iter *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, fd;
+
+	/* No need to attach it, just run it directly */
+	fd = bpf_program__fd(skel->progs.open_coded_iter);
+
+	err = bpf_prog_test_run_opts(fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	/* It should be same as we've seen from the explicit iterator */
+	ASSERT_EQ(skel->bss->open_coded_seen, skel->bss->kmem_cache_seen, "open_code_seen_eq");
+}
+
 void test_kmem_cache_iter(void)
 {
-	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 	struct kmem_cache_iter *skel = NULL;
-	union bpf_iter_link_info linfo = {};
-	struct bpf_link *link;
 	char buf[256];
 	int iter_fd;
 
@@ -81,16 +96,12 @@ void test_kmem_cache_iter(void)
 	if (!ASSERT_OK_PTR(skel, "kmem_cache_iter__open_and_load"))
 		return;
 
-	opts.link_info = &linfo;
-	opts.link_info_len = sizeof(linfo);
-
-	link = bpf_program__attach_iter(skel->progs.slab_info_collector, &opts);
-	if (!ASSERT_OK_PTR(link, "attach_iter"))
+	if (!ASSERT_OK(kmem_cache_iter__attach(skel), "skel_attach"))
 		goto destroy;
 
-	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	iter_fd = bpf_iter_create(bpf_link__fd(skel->links.slab_info_collector));
 	if (!ASSERT_GE(iter_fd, 0, "iter_create"))
-		goto free_link;
+		goto destroy;
 
 	memset(buf, 0, sizeof(buf));
 	while (read(iter_fd, buf, sizeof(buf) > 0)) {
@@ -105,11 +116,11 @@ void test_kmem_cache_iter(void)
 		subtest_kmem_cache_iter_check_task_struct(skel);
 	if (test__start_subtest("check_slabinfo"))
 		subtest_kmem_cache_iter_check_slabinfo(skel);
+	if (test__start_subtest("open_coded_iter"))
+		subtest_kmem_cache_iter_open_coded(skel);
 
 	close(iter_fd);
 
-free_link:
-	bpf_link__destroy(link);
 destroy:
 	kmem_cache_iter__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/kmem_cache_iter.c b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
index e775d5cd99fc..b9c8f9457492 100644
--- a/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
+++ b/tools/testing/selftests/bpf/progs/kmem_cache_iter.c
@@ -3,6 +3,7 @@
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_experimental.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -32,6 +33,7 @@ extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym;
 /* Result, will be checked by userspace */
 int task_struct_found;
 int kmem_cache_seen;
+int open_coded_seen;
 
 SEC("iter/kmem_cache")
 int slab_info_collector(struct bpf_iter__kmem_cache *ctx)
@@ -84,3 +86,23 @@ int BPF_PROG(check_task_struct)
 		task_struct_found = -2;
 	return 0;
 }
+
+SEC("syscall")
+int open_coded_iter(const void *ctx)
+{
+	struct kmem_cache *s;
+
+	bpf_for_each(kmem_cache, s) {
+		struct kmem_cache_result *r;
+
+		r = bpf_map_lookup_elem(&slab_result, &open_coded_seen);
+		if (!r)
+			break;
+
+		if (r->obj_size != s->size)
+			break;
+
+		open_coded_seen++;
+	}
+	return 0;
+}
-- 
cgit v1.2.3


From 77017b9c46820d72596e50a3986bd0734c1340a9 Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 1 Nov 2024 09:27:13 +0100
Subject: selftests/bpf: Disable warnings on unused flags for Clang builds

There exist compiler flags supported by GCC but not supported by Clang
(e.g. -specs=...). Currently, these cannot be passed to BPF selftests
builds, even when building with GCC, as some binaries (urandom_read and
liburandom_read.so) are always built with Clang and the unsupported
flags make the compilation fail (as -Werror is turned on).

Add -Wno-unused-command-line-argument to these rules to suppress such
errors.

This makes it possible to do things like:

    $ CFLAGS="-specs=/usr/lib/rpm/redhat/redhat-hardened-cc1" \
      make -C tools/testing/selftests/bpf

Without this patch, the compilation would fail with:

    [...]
    clang: error: argument unused during compilation: '-specs=/usr/lib/rpm/redhat/redhat-hardened-cc1' [-Werror,-Wunused-command-line-argument]
    make: *** [Makefile:273: /bpf-next/tools/testing/selftests/bpf/liburandom_read.so] Error 1
    [...]

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/2d349e9d5eb0a79dd9ff94b496769d64e6ff7654.1730449390.git.vmalik@redhat.com
---
 tools/testing/selftests/bpf/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a226d0647c4e..d5aaa674dab5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -274,6 +274,7 @@ $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom
 	$(Q)$(CLANG) $(CLANG_TARGET_ARCH) \
 		     $(filter-out -static,$(CFLAGS) $(LDFLAGS)) \
 		     $(filter %.c,$^) $(filter-out -static,$(LDLIBS)) \
+		     -Wno-unused-command-line-argument \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
 		     -Wl,--version-script=liburandom_read.map \
 		     -fPIC -shared -o $@
@@ -282,6 +283,7 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r
 	$(call msg,BINARY,,$@)
 	$(Q)$(CLANG) $(CLANG_TARGET_ARCH) \
 		     $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \
+		     -Wno-unused-command-line-argument \
 		     -lurandom_read $(filter-out -static,$(LDLIBS)) -L$(OUTPUT) \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
 		     -Wl,-rpath=. -o $@
-- 
cgit v1.2.3


From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 31 Oct 2024 13:04:07 +0100
Subject: timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING

Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which
would lead to 64bit multiplication overflows correctly. It's also protected
against negative motion of the clocksource unconditionally, which was
exclusive to x86 before.

timekeeping_advance() handles large offsets already correctly.

That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases
is very close to zero. Remove all of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de
---
 arch/riscv/configs/defconfig                       |   1 -
 include/linux/timekeeper_internal.h                |  16 ---
 kernel/time/timekeeping.c                          | 108 +--------------------
 lib/Kconfig.debug                                  |  13 ---
 .../testing/selftests/wireguard/qemu/debug.config  |   1 -
 5 files changed, 3 insertions(+), 136 deletions(-)

(limited to 'tools/testing')

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 2341393cfac1..26c01b9e3434 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -301,7 +301,6 @@ CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_PER_CPU_MAPS=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_WQ_WATCHDOG=y
-CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index a3b6380a7777..e39d4d563b19 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -76,9 +76,6 @@ struct tk_read_base {
  *				ntp shifted nano seconds.
  * @ntp_err_mult:		Multiplication factor for scaled math conversion
  * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
- * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -147,19 +144,6 @@ struct timekeeper {
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
 	u32			skip_second_overflow;
-
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-	long			last_warning;
-	/*
-	 * These simple flag variables are managed
-	 * without locks, which is racy, but they are
-	 * ok since we don't really care about being
-	 * super precise about how many events were
-	 * seen, just that a problem was observed.
-	 */
-	int			underflow_seen;
-	int			overflow_seen;
-#endif
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 17cae886ca82..d115adebc418 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -226,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr)
 	return clock->read(clock);
 }
 
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-
-static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-
-	u64 max_cycles = tk->tkr_mono.clock->max_cycles;
-	const char *name = tk->tkr_mono.clock->name;
-
-	if (offset > max_cycles) {
-		printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
-				offset, name, max_cycles);
-		printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
-	} else {
-		if (offset > (max_cycles >> 1)) {
-			printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
-					offset, name, max_cycles >> 1);
-			printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
-		}
-	}
-
-	if (tk->underflow_seen) {
-		if (jiffies - tk->last_warning > WARNING_FREQ) {
-			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
-			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
-			printk_deferred("         Your kernel is probably still fine.\n");
-			tk->last_warning = jiffies;
-		}
-		tk->underflow_seen = 0;
-	}
-
-	if (tk->overflow_seen) {
-		if (jiffies - tk->last_warning > WARNING_FREQ) {
-			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
-			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
-			printk_deferred("         Your kernel is probably still fine.\n");
-			tk->last_warning = jiffies;
-		}
-		tk->overflow_seen = 0;
-	}
-}
-
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);
-
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	u64 now, last, mask, max, delta;
-	unsigned int seq;
-
-	/*
-	 * Since we're called holding a seqcount, the data may shift
-	 * under us while we're doing the calculation. This can cause
-	 * false positives, since we'd note a problem but throw the
-	 * results away. So nest another seqcount here to atomically
-	 * grab the points we are checking with.
-	 */
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-		now = tk_clock_read(tkr);
-		last = tkr->cycle_last;
-		mask = tkr->mask;
-		max = tkr->clock->max_cycles;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	delta = clocksource_delta(now, last, mask);
-
-	/*
-	 * Try to catch underflows by checking if we are seeing small
-	 * mask-relative negative values.
-	 */
-	if (unlikely((~delta & mask) < (mask >> 3)))
-		tk->underflow_seen = 1;
-
-	/* Check for multiplication overflows */
-	if (unlikely(delta > max))
-		tk->overflow_seen = 1;
-
-	/* timekeeping_cycles_to_ns() handles both under and overflow */
-	return timekeeping_cycles_to_ns(tkr, now);
-}
-#else
-static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-}
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
-	BUG();
-}
-#endif
-
 /**
  * tk_setup_internals - Set up internals to use clocksource clock.
  *
@@ -421,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
 	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
 }
 
-static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
+static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
 	return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
 }
 
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
-{
-	if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
-		return timekeeping_debug_get_ns(tkr);
-
-	return __timekeeping_get_ns(tkr);
-}
-
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
  * @tkr: Timekeeping readout base from which we take the update
@@ -477,7 +378,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
-		now += __timekeeping_get_ns(tkr);
+		now += timekeeping_get_ns(tkr);
 	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
@@ -593,7 +494,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
 		tkr = tkf->base + (seq & 0x01);
 		basem = ktime_to_ns(tkr->base);
 		baser = ktime_to_ns(tkr->base_real);
-		delta = __timekeeping_get_ns(tkr);
+		delta = timekeeping_get_ns(tkr);
 	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	if (mono)
@@ -2333,9 +2234,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
 		return false;
 
-	/* Do some additional sanity checking */
-	timekeeping_check_update(tk, offset);
-
 	/*
 	 * With NO_HZ we may have to accumulate many cycle_intervals
 	 * (think "ticks") worth of time at once. To do this efficiently,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7315f643817a..14977b9fc254 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1328,19 +1328,6 @@ config SCHEDSTATS
 
 endmenu
 
-config DEBUG_TIMEKEEPING
-	bool "Enable extra timekeeping sanity checking"
-	help
-	  This option will enable additional timekeeping sanity checks
-	  which may be helpful when diagnosing issues where timekeeping
-	  problems are suspected.
-
-	  This may include checks in the timekeeping hotpaths, so this
-	  option may have a (very small) performance impact to some
-	  workloads.
-
-	  If unsure, say N.
-
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT
diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config
index 9d172210e2c6..139fd9aa8b12 100644
--- a/tools/testing/selftests/wireguard/qemu/debug.config
+++ b/tools/testing/selftests/wireguard/qemu/debug.config
@@ -31,7 +31,6 @@ CONFIG_SCHED_DEBUG=y
 CONFIG_SCHED_INFO=y
 CONFIG_SCHEDSTATS=y
 CONFIG_SCHED_STACK_END_CHECK=y
-CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_DEBUG_PREEMPT=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK=y
-- 
cgit v1.2.3


From d051cd72dcb769c842494b1dbe29067aba45474f Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 29 Oct 2024 02:00:28 -0700
Subject: net: netconsole: selftests: Change the IP subnet

Use a less populated IP range to run the tests, as suggested by Petr in
Link: https://lore.kernel.org/netdev/87ikvukv3s.fsf@nvidia.com/.

Suggested-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20241029090030.1793551-2-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/netcons_basic.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index 06021b2059b7..8d28e5189e91 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -20,9 +20,9 @@ SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
 
 # Simple script to test dynamic targets in netconsole
 SRCIF="" # to be populated later
-SRCIP=192.168.1.1
+SRCIP=192.0.2.1
 DSTIF="" # to be populated later
-DSTIP=192.168.1.2
+DSTIP=192.0.2.2
 
 PORT="6666"
 MSG="netconsole selftest"
-- 
cgit v1.2.3


From afa4ceb0fb648655c9f04921ccc801feb034109c Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 29 Oct 2024 02:00:29 -0700
Subject: net: netconsole: selftests: Add userdata validation

Extend netcons_basic selftest to verify the userdata functionality by:
 1. Creating a test key in the userdata configfs directory
 2. Writing a known value to the key
 3. Validating the key-value pair appears in the captured network output

This ensures the userdata feature is properly tested during selftests.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20241029090030.1793551-3-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../testing/selftests/drivers/net/netcons_basic.sh | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index 8d28e5189e91..182eb1a97e59 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -26,10 +26,13 @@ DSTIP=192.0.2.2
 
 PORT="6666"
 MSG="netconsole selftest"
+USERDATA_KEY="key"
+USERDATA_VALUE="value"
 TARGET=$(mktemp -u netcons_XXXXX)
 DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk)
 NETCONS_CONFIGFS="/sys/kernel/config/netconsole"
 NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
+KEY_PATH="${NETCONS_PATH}/userdata/${USERDATA_KEY}"
 # NAMESPACE will be populated by setup_ns with a random value
 NAMESPACE=""
 
@@ -122,6 +125,8 @@ function cleanup() {
 
 	# delete netconsole dynamic reconfiguration
 	echo 0 > "${NETCONS_PATH}"/enabled
+	# Remove key
+	rmdir "${KEY_PATH}"
 	# Remove the configfs entry
 	rmdir "${NETCONS_PATH}"
 
@@ -136,6 +141,18 @@ function cleanup() {
 	echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk
 }
 
+function set_user_data() {
+	if [[ ! -d "${NETCONS_PATH}""/userdata" ]]
+	then
+		echo "Userdata path not available in ${NETCONS_PATH}/userdata"
+		exit "${ksft_skip}"
+	fi
+
+	mkdir -p "${KEY_PATH}"
+	VALUE_PATH="${KEY_PATH}""/value"
+	echo "${USERDATA_VALUE}" > "${VALUE_PATH}"
+}
+
 function listen_port_and_save_to() {
 	local OUTPUT=${1}
 	# Just wait for 2 seconds
@@ -146,6 +163,10 @@ function listen_port_and_save_to() {
 function validate_result() {
 	local TMPFILENAME="$1"
 
+	# TMPFILENAME will contain something like:
+	# 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM
+	#  key=value
+
 	# Check if the file exists
 	if [ ! -f "$TMPFILENAME" ]; then
 		echo "FAIL: File was not generated." >&2
@@ -158,6 +179,12 @@ function validate_result() {
 		exit "${ksft_fail}"
 	fi
 
+	if ! grep -q "${USERDATA_KEY}=${USERDATA_VALUE}" "${TMPFILENAME}"; then
+		echo "FAIL: ${USERDATA_KEY}=${USERDATA_VALUE} not found in ${TMPFILENAME}" >&2
+		cat "${TMPFILENAME}" >&2
+		exit "${ksft_fail}"
+	fi
+
 	# Delete the file once it is validated, otherwise keep it
 	# for debugging purposes
 	rm "${TMPFILENAME}"
@@ -220,6 +247,8 @@ trap cleanup EXIT
 set_network
 # Create a dynamic target for netconsole
 create_dynamic_target
+# Set userdata "key" with the "value" value
+set_user_data
 # Listed for netconsole port inside the namespace and destination interface
 listen_port_and_save_to "${OUTPUT_FILE}" &
 # Wait for socat to start and listen to the port.
-- 
cgit v1.2.3


From 9ff75a23dff3622451057b2ccd88c19bbb293841 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Fri, 1 Nov 2024 11:31:48 -0300
Subject: selftests/tc-testing: add tests for qdisc_tree_reduce_backlog

Add 3 tests to check for the expected behaviour of
qdisc_tree_reduce_backlog in special scenarios.

- The first test checks if the qdisc class is notified of deletion for
major handle 'ffff:'.
- The second test checks the same as the first test but with 'ffff:' as the root
qdisc.
- The third test checks if everything works if ingress is active.

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Acked-by: Cong Wang <cong.wang@bytedance.com>
Link: https://patch.msgid.link/20241101143148.1218890-1-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../tc-testing/tc-tests/infra/qdiscs.json          | 98 ++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
new file mode 100644
index 000000000000..d3dd65b05b5f
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -0,0 +1,98 @@
+[
+    {
+        "id": "ca5e",
+        "name": "Check class delete notification for ffff:",
+        "category": [
+            "qdisc"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.10.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: drr",
+            "$TC filter add dev $DUMMY parent 1: basic classid 1:1",
+            "$TC class add dev $DUMMY parent 1: classid 1:1 drr",
+            "$TC qdisc add dev $DUMMY parent 1:1 handle ffff: drr",
+            "$TC filter add dev $DUMMY parent ffff: basic classid ffff:1",
+            "$TC class add dev $DUMMY parent ffff: classid ffff:1 drr",
+            "$TC qdisc add dev $DUMMY parent ffff:1 netem delay 1s",
+            "ping -c1 -W0.01 -I $DUMMY 10.10.10.1 || true",
+            "$TC class del dev $DUMMY classid ffff:1",
+            "$TC class add dev $DUMMY parent ffff: classid ffff:1 drr"
+        ],
+        "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1",
+        "expExitCode": "1",
+        "verifyCmd": "$TC -s qdisc ls dev $DUMMY",
+        "matchPattern": "drr 1: root",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY root handle 1: drr",
+            "$IP addr del 10.10.10.10/24 dev $DUMMY"
+        ]
+    },
+    {
+        "id": "e4b7",
+        "name": "Check class delete notification for root ffff:",
+        "category": [
+            "qdisc"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.10.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle ffff: drr",
+            "$TC filter add dev $DUMMY parent ffff: basic classid ffff:1",
+            "$TC class add dev $DUMMY parent ffff: classid ffff:1 drr",
+            "$TC qdisc add dev $DUMMY parent ffff:1 netem delay 1s",
+            "ping -c1 -W0.01 -I $DUMMY 10.10.10.1 || true",
+            "$TC class del dev $DUMMY classid ffff:1",
+            "$TC class add dev $DUMMY parent ffff: classid ffff:1 drr"
+        ],
+        "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1",
+        "expExitCode": "1",
+        "verifyCmd": "$TC qdisc ls dev $DUMMY",
+        "matchPattern": "drr ffff: root",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY root handle ffff: drr",
+            "$IP addr del 10.10.10.10/24 dev $DUMMY"
+        ]
+    },
+    {
+        "id": "33a9",
+        "name": "Check ingress is not searchable on backlog update",
+        "category": [
+            "qdisc"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.10.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY ingress",
+            "$TC qdisc add dev $DUMMY root handle 1: drr",
+            "$TC filter add dev $DUMMY parent 1: basic classid 1:1",
+            "$TC class add dev $DUMMY parent 1: classid 1:1 drr",
+            "$TC qdisc add dev $DUMMY parent 1:1 handle 2: drr",
+            "$TC filter add dev $DUMMY parent 2: basic classid 2:1",
+            "$TC class add dev $DUMMY parent 2: classid 2:1 drr",
+            "$TC qdisc add dev $DUMMY parent 2:1 netem delay 1s",
+            "ping -c1 -W0.01 -I $DUMMY 10.10.10.1 || true"
+        ],
+        "cmdUnderTest": "$TC class del dev $DUMMY classid 2:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc ls dev $DUMMY",
+        "matchPattern": "drr 1: root",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY root handle 1: drr",
+            "$TC qdisc del dev $DUMMY ingress",
+            "$IP addr del 10.10.10.10/24 dev $DUMMY"
+        ]
+    }
+]
-- 
cgit v1.2.3


From d402755ced2ea8fc1f0513136f074002d509bfa0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 3 Nov 2024 14:59:39 -0800
Subject: bpf: Unify resource leak checks

There are similar checks for covering locks, references, RCU read
sections and preempt_disable sections in 3 places in the verifier, i.e.
for tail calls, bpf_ld_[abs, ind], and exit path (for BPF_EXIT and
bpf_throw). Unify all of these into a common check_resource_leak
function to avoid code duplication.

Also update the error strings in selftests to the new ones in the same
change to ensure clean bisection.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241103225940.1408302-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 90 ++++++++--------------
 .../testing/selftests/bpf/progs/exceptions_fail.c  |  4 +-
 tools/testing/selftests/bpf/progs/preempt_lock.c   | 14 ++--
 .../selftests/bpf/progs/verifier_ref_tracking.c    |  4 +-
 .../selftests/bpf/progs/verifier_spin_lock.c       |  2 +-
 5 files changed, 46 insertions(+), 68 deletions(-)

(limited to 'tools/testing')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0844b4383ff3..ba800c7611e3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10352,6 +10352,34 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 	return refs_lingering ? -EINVAL : 0;
 }
 
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+	int err;
+
+	if (check_lock && env->cur_state->active_lock.ptr) {
+		verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	err = check_reference_leak(env, exception_exit);
+	if (err) {
+		verbose(env, "%s would lead to reference leak\n", prefix);
+		return err;
+	}
+
+	if (check_lock && env->cur_state->active_rcu_lock) {
+		verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	if (check_lock && env->cur_state->active_preempt_lock) {
+		verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *regs)
 {
@@ -10620,26 +10648,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
-		if (env->cur_state->active_lock.ptr) {
-			verbose(env, "tail_call cannot be used inside bpf_spin_lock-ed region\n");
-			return -EINVAL;
-		}
-
-		err = check_reference_leak(env, false);
-		if (err) {
-			verbose(env, "tail_call would lead to reference leak\n");
+		err = check_resource_leak(env, false, true, "tail_call");
+		if (err)
 			return err;
-		}
-
-		if (env->cur_state->active_rcu_lock) {
-			verbose(env, "tail_call cannot be used inside bpf_rcu_read_lock-ed region\n");
-			return -EINVAL;
-		}
-
-		if (env->cur_state->active_preempt_lock) {
-			verbose(env, "tail_call cannot be used inside bpf_preempt_disable-ed region\n");
-			return -EINVAL;
-		}
 		break;
 	case BPF_FUNC_get_local_storage:
 		/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -15801,26 +15812,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * gen_ld_abs() may terminate the program at runtime, leading to
 	 * reference leak.
 	 */
-	err = check_reference_leak(env, false);
-	if (err) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+	err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+	if (err)
 		return err;
-	}
-
-	if (env->cur_state->active_lock.ptr) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
-		return -EINVAL;
-	}
-
-	if (env->cur_state->active_rcu_lock) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
-		return -EINVAL;
-	}
-
-	if (env->cur_state->active_preempt_lock) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
-		return -EINVAL;
-	}
 
 	if (regs[ctx_reg].type != PTR_TO_CTX) {
 		verbose(env,
@@ -18606,30 +18600,14 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 process_bpf_exit_full:
-				if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
-					verbose(env, "bpf_spin_unlock is missing\n");
-					return -EINVAL;
-				}
-
-				if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
-					verbose(env, "bpf_rcu_read_unlock is missing\n");
-					return -EINVAL;
-				}
-
-				if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
-					verbose(env, "%d bpf_preempt_enable%s missing\n",
-						env->cur_state->active_preempt_lock,
-						env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
-					return -EINVAL;
-				}
-
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
 				 * function, for which reference_state must
 				 * match caller reference state when it exits.
 				 */
-				err = check_reference_leak(env, exception_exit);
+				err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
+							  "BPF_EXIT instruction");
 				if (err)
 					return err;
 
diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c
index 9cceb6521143..fe0f3fa5aab6 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_fail.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c
@@ -131,7 +131,7 @@ int reject_subprog_with_lock(void *ctx)
 }
 
 SEC("?tc")
-__failure __msg("bpf_rcu_read_unlock is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_rcu_read_lock-ed region")
 int reject_with_rcu_read_lock(void *ctx)
 {
 	bpf_rcu_read_lock();
@@ -147,7 +147,7 @@ __noinline static int throwing_subprog(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("bpf_rcu_read_unlock is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_rcu_read_lock-ed region")
 int reject_subprog_with_rcu_read_lock(void *ctx)
 {
 	bpf_rcu_read_lock();
diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c
index 672fc368d9c4..885377e83607 100644
--- a/tools/testing/selftests/bpf/progs/preempt_lock.c
+++ b/tools/testing/selftests/bpf/progs/preempt_lock.c
@@ -6,7 +6,7 @@
 #include "bpf_experimental.h"
 
 SEC("?tc")
-__failure __msg("1 bpf_preempt_enable is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_1(struct __sk_buff *ctx)
 {
 	bpf_preempt_disable();
@@ -14,7 +14,7 @@ int preempt_lock_missing_1(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("2 bpf_preempt_enable(s) are missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_2(struct __sk_buff *ctx)
 {
 	bpf_preempt_disable();
@@ -23,7 +23,7 @@ int preempt_lock_missing_2(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("3 bpf_preempt_enable(s) are missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_3(struct __sk_buff *ctx)
 {
 	bpf_preempt_disable();
@@ -33,7 +33,7 @@ int preempt_lock_missing_3(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("1 bpf_preempt_enable is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx)
 {
 	bpf_preempt_disable();
@@ -55,7 +55,7 @@ static __noinline void preempt_enable(void)
 }
 
 SEC("?tc")
-__failure __msg("1 bpf_preempt_enable is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_1_subprog(struct __sk_buff *ctx)
 {
 	preempt_disable();
@@ -63,7 +63,7 @@ int preempt_lock_missing_1_subprog(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("2 bpf_preempt_enable(s) are missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_2_subprog(struct __sk_buff *ctx)
 {
 	preempt_disable();
@@ -72,7 +72,7 @@ int preempt_lock_missing_2_subprog(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("1 bpf_preempt_enable is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region")
 int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx)
 {
 	preempt_disable();
diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
index c4c6da21265e..683a882b3e6d 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
@@ -791,7 +791,7 @@ l0_%=:	r0 = *(u8*)skb[0];				\
 
 SEC("tc")
 __description("reference tracking: forbid LD_ABS while holding reference")
-__failure __msg("BPF_LD_[ABS|IND] cannot be mixed with socket references")
+__failure __msg("BPF_LD_[ABS|IND] would lead to reference leak")
 __naked void ld_abs_while_holding_reference(void)
 {
 	asm volatile ("					\
@@ -836,7 +836,7 @@ l0_%=:	r7 = 1;						\
 
 SEC("tc")
 __description("reference tracking: forbid LD_IND while holding reference")
-__failure __msg("BPF_LD_[ABS|IND] cannot be mixed with socket references")
+__failure __msg("BPF_LD_[ABS|IND] would lead to reference leak")
 __naked void ld_ind_while_holding_reference(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
index fb316c080c84..3f679de73229 100644
--- a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c
@@ -187,7 +187,7 @@ l0_%=:	r6 = r0;					\
 
 SEC("cgroup/skb")
 __description("spin_lock: test6 missing unlock")
-__failure __msg("unlock is missing")
+__failure __msg("BPF_EXIT instruction cannot be used inside bpf_spin_lock-ed region")
 __failure_unpriv __msg_unpriv("")
 __naked void spin_lock_test6_missing_unlock(void)
 {
-- 
cgit v1.2.3


From 711df091dea9b6f9e83ed738967cb0763f4d362c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 3 Nov 2024 14:59:40 -0800
Subject: selftests/bpf: Add tests for tail calls with locks and refs

Add failure tests to ensure bugs don't slip through for tail calls, and
that lingering locks, RCU sections, preemption-disabled sections, and
references prevent tail calls.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241103225940.1408302-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/tailcalls.c |  8 +++
 tools/testing/selftests/bpf/progs/tailcall_fail.c  | 64 ++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tailcall_fail.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 40f22454cf05..544144620ca6 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -7,6 +7,7 @@
 #include "tailcall_bpf2bpf_hierarchy3.skel.h"
 #include "tailcall_freplace.skel.h"
 #include "tc_bpf2bpf.skel.h"
+#include "tailcall_fail.skel.h"
 
 /* test_tailcall_1 checks basic functionality by patching multiple locations
  * in a single program for a single tail call slot with nop->jmp, jmp->nop
@@ -1646,6 +1647,11 @@ out:
 	tc_bpf2bpf__destroy(tc_skel);
 }
 
+static void test_tailcall_failure()
+{
+	RUN_TESTS(tailcall_fail);
+}
+
 void test_tailcalls(void)
 {
 	if (test__start_subtest("tailcall_1"))
@@ -1698,4 +1704,6 @@ void test_tailcalls(void)
 		test_tailcall_freplace();
 	if (test__start_subtest("tailcall_bpf2bpf_freplace"))
 		test_tailcall_bpf2bpf_freplace();
+	if (test__start_subtest("tailcall_failure"))
+		test_tailcall_failure();
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_fail.c b/tools/testing/selftests/bpf/progs/tailcall_fail.c
new file mode 100644
index 000000000000..bc77921d2bb0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_fail.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+extern void bpf_rcu_read_lock(void) __ksym;
+extern void bpf_rcu_read_unlock(void) __ksym;
+
+#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
+
+private(A) struct bpf_spin_lock lock;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 3);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+SEC("?tc")
+__failure __msg("function calls are not allowed while holding a lock")
+int reject_tail_call_spin_lock(struct __sk_buff *ctx)
+{
+	bpf_spin_lock(&lock);
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("tail_call cannot be used inside bpf_rcu_read_lock-ed region")
+int reject_tail_call_rcu_lock(struct __sk_buff *ctx)
+{
+	bpf_rcu_read_lock();
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	bpf_rcu_read_unlock();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("tail_call cannot be used inside bpf_preempt_disable-ed region")
+int reject_tail_call_preempt_lock(struct __sk_buff *ctx)
+{
+	bpf_guard_preempt();
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("tail_call would lead to reference leak")
+int reject_tail_call_ref(struct __sk_buff *ctx)
+{
+	struct foo { int i; } *p;
+
+	p = bpf_obj_new(typeof(*p));
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 6e182dc9f2680681ffb0b6d9757927f1bd321b38 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Tue, 29 Oct 2024 14:45:38 +0000
Subject: selftests/mm: Use generic pkey register manipulation

pkey_sighandler_tests.c currently hardcodes x86 PKRU encodings. The
first step towards running those tests on arm64 is to abstract away
the pkey register values.

Since those tests want to deny access to all keys except a few,
we have each arch define PKEY_REG_ALLOW_NONE, the pkey register value
denying access to all keys. We then use the existing set_pkey_bits()
helper to grant access to specific keys.

Because pkeys may also remove the execute permission on arm64, we
need to be a little careful: all code is mapped with pkey 0, and we
need it to remain executable. pkey_reg_restrictive_default() is
introduced for that purpose: the value it returns prevents RW access
to all pkeys, but retains X permission for pkey 0.

test_pkru_preserved_after_sigusr1() only checks that the pkey
register value remains unchanged after a signal is delivered, so the
particular value is irrelevant. We enable pkey 0 and a few more
arbitrary keys in the smallest range available on all architectures
(8 keys on arm64).

Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/r/20241029144539.111155-5-kevin.brodsky@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/mm/pkey-arm64.h            |  1 +
 tools/testing/selftests/mm/pkey-x86.h              |  2 +
 tools/testing/selftests/mm/pkey_sighandler_tests.c | 53 ++++++++++++++++++----
 3 files changed, 47 insertions(+), 9 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h
index 580e1b0bb38e..d57fbeace38f 100644
--- a/tools/testing/selftests/mm/pkey-arm64.h
+++ b/tools/testing/selftests/mm/pkey-arm64.h
@@ -31,6 +31,7 @@
 #define NR_RESERVED_PKEYS	1 /* pkey-0 */
 
 #define PKEY_ALLOW_ALL		0x77777777
+#define PKEY_REG_ALLOW_NONE	0x0
 
 #define PKEY_BITS_PER_PKEY	4
 #define PAGE_SIZE		sysconf(_SC_PAGESIZE)
diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
index 5f28e26a2511..ac91777c8917 100644
--- a/tools/testing/selftests/mm/pkey-x86.h
+++ b/tools/testing/selftests/mm/pkey-x86.h
@@ -34,6 +34,8 @@
 #define PAGE_SIZE		4096
 #define MB			(1<<20)
 
+#define PKEY_REG_ALLOW_NONE	0x55555555
+
 static inline void __page_o_noops(void)
 {
 	/* 8-bytes of instruction * 512 bytes = 1 page */
diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c
index a8088b645ad6..501880dbdc37 100644
--- a/tools/testing/selftests/mm/pkey_sighandler_tests.c
+++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c
@@ -11,6 +11,7 @@
  */
 #define _GNU_SOURCE
 #define __SANE_USERSPACE_TYPES__
+#include <linux/mman.h>
 #include <errno.h>
 #include <sys/syscall.h>
 #include <string.h>
@@ -65,6 +66,20 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
 	return ret;
 }
 
+/*
+ * Returns the most restrictive pkey register value that can be used by the
+ * tests.
+ */
+static inline u64 pkey_reg_restrictive_default(void)
+{
+	/*
+	 * Disallow everything except execution on pkey 0, so that each caller
+	 * doesn't need to enable it explicitly (the selftest code runs with
+	 * its code mapped with pkey 0).
+	 */
+	return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS);
+}
+
 static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
 {
 	pthread_mutex_lock(&mutex);
@@ -113,7 +128,7 @@ static void raise_sigusr2(void)
 static void *thread_segv_with_pkey0_disabled(void *ptr)
 {
 	/* Disable MPK 0 (and all others too) */
-	__write_pkey_reg(0x55555555);
+	__write_pkey_reg(pkey_reg_restrictive_default());
 
 	/* Segfault (with SEGV_MAPERR) */
 	*(int *) (0x1) = 1;
@@ -123,7 +138,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr)
 static void *thread_segv_pkuerr_stack(void *ptr)
 {
 	/* Disable MPK 0 (and all others too) */
-	__write_pkey_reg(0x55555555);
+	__write_pkey_reg(pkey_reg_restrictive_default());
 
 	/* After we disable MPK 0, we can't access the stack to return */
 	return NULL;
@@ -133,6 +148,7 @@ static void *thread_segv_maperr_ptr(void *ptr)
 {
 	stack_t *stack = ptr;
 	int *bad = (int *)1;
+	u64 pkey_reg;
 
 	/*
 	 * Setup alternate signal stack, which should be pkey_mprotect()ed by
@@ -142,7 +158,9 @@ static void *thread_segv_maperr_ptr(void *ptr)
 	syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
 
 	/* Disable MPK 0.  Only MPK 1 is enabled. */
-	__write_pkey_reg(0x55555551);
+	pkey_reg = pkey_reg_restrictive_default();
+	pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+	__write_pkey_reg(pkey_reg);
 
 	/* Segfault */
 	*bad = 1;
@@ -240,6 +258,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
 	int pkey;
 	int parent_pid = 0;
 	int child_pid = 0;
+	u64 pkey_reg;
 
 	sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
 
@@ -257,7 +276,10 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
 	assert(stack != MAP_FAILED);
 
 	/* Allow access to MPK 0 and MPK 1 */
-	__write_pkey_reg(0x55555550);
+	pkey_reg = pkey_reg_restrictive_default();
+	pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+	pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+	__write_pkey_reg(pkey_reg);
 
 	/* Protect the new stack with MPK 1 */
 	pkey = pkey_alloc(0, 0);
@@ -307,7 +329,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
 static void test_pkru_preserved_after_sigusr1(void)
 {
 	struct sigaction sa;
-	unsigned long pkru = 0x45454544;
+	u64 pkey_reg;
+
+	/* Allow access to MPK 0 and an arbitrary set of keys */
+	pkey_reg = pkey_reg_restrictive_default();
+	pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+	pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED);
+	pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED);
 
 	sa.sa_flags = SA_SIGINFO;
 
@@ -320,7 +348,7 @@ static void test_pkru_preserved_after_sigusr1(void)
 
 	memset(&siginfo, 0, sizeof(siginfo));
 
-	__write_pkey_reg(pkru);
+	__write_pkey_reg(pkey_reg);
 
 	raise(SIGUSR1);
 
@@ -330,7 +358,7 @@ static void test_pkru_preserved_after_sigusr1(void)
 	pthread_mutex_unlock(&mutex);
 
 	/* Ensure the pkru value is the same after returning from signal. */
-	ksft_test_result(pkru == __read_pkey_reg() &&
+	ksft_test_result(pkey_reg == __read_pkey_reg() &&
 			 siginfo.si_signo == SIGUSR1,
 			 "%s\n", __func__);
 }
@@ -347,6 +375,7 @@ static noinline void *thread_sigusr2_self(void *ptr)
 		'S', 'I', 'G', 'U', 'S', 'R', '2',
 		'.', '.', '.', '\n', '\0'};
 	stack_t *stack = ptr;
+	u64 pkey_reg;
 
 	/*
 	 * Setup alternate signal stack, which should be pkey_mprotect()ed by
@@ -356,7 +385,9 @@ static noinline void *thread_sigusr2_self(void *ptr)
 	syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
 
 	/* Disable MPK 0.  Only MPK 2 is enabled. */
-	__write_pkey_reg(0x55555545);
+	pkey_reg = pkey_reg_restrictive_default();
+	pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+	__write_pkey_reg(pkey_reg);
 
 	raise_sigusr2();
 
@@ -384,6 +415,7 @@ static void test_pkru_sigreturn(void)
 	int pkey;
 	int parent_pid = 0;
 	int child_pid = 0;
+	u64 pkey_reg;
 
 	sa.sa_handler = SIG_DFL;
 	sa.sa_flags = 0;
@@ -418,7 +450,10 @@ static void test_pkru_sigreturn(void)
 	 * the current thread's stack is protected by the default MPK 0. Hence
 	 * both need to be enabled.
 	 */
-	__write_pkey_reg(0x55555544);
+	pkey_reg = pkey_reg_restrictive_default();
+	pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+	pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+	__write_pkey_reg(pkey_reg);
 
 	/* Protect the stack with MPK 2 */
 	pkey = pkey_alloc(0, 0);
-- 
cgit v1.2.3


From 49f59573e9e06093ba23caf4ea1641b16e7e497e Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Tue, 29 Oct 2024 14:45:39 +0000
Subject: selftests/mm: Enable pkey_sighandler_tests on arm64

pkey_sighandler_tests.c makes raw syscalls using its own helper,
syscall_raw(). One of those syscalls is clone, which is problematic
as every architecture has a different opinion on the order of its
arguments.

To complete arm64 support, we therefore add an appropriate
implementation in syscall_raw(), and introduce a clone_raw() helper
that shuffles arguments as needed for each arch.

Having done this, we enable building pkey_sighandler_tests for arm64
in the Makefile.

Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Link: https://lore.kernel.org/r/20241029144539.111155-6-kevin.brodsky@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/mm/Makefile                |  8 +--
 tools/testing/selftests/mm/pkey_sighandler_tests.c | 62 ++++++++++++++++------
 2 files changed, 50 insertions(+), 20 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 02e1204971b0..0f8c110e0805 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -105,12 +105,12 @@ endif
 ifeq ($(CAN_BUILD_X86_64),1)
 TEST_GEN_FILES += $(BINARIES_64)
 endif
-else
 
-ifneq (,$(filter $(ARCH),arm64 powerpc))
+else ifeq ($(ARCH),arm64)
+TEST_GEN_FILES += protection_keys
+TEST_GEN_FILES += pkey_sighandler_tests
+else ifeq ($(ARCH),powerpc)
 TEST_GEN_FILES += protection_keys
-endif
-
 endif
 
 ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c
index 501880dbdc37..c593a426341c 100644
--- a/tools/testing/selftests/mm/pkey_sighandler_tests.c
+++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c
@@ -60,12 +60,44 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
 		      : "=a"(ret)
 		      : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
 		      : "memory");
+#elif defined __aarch64__
+	register long x0 asm("x0") = a1;
+	register long x1 asm("x1") = a2;
+	register long x2 asm("x2") = a3;
+	register long x3 asm("x3") = a4;
+	register long x4 asm("x4") = a5;
+	register long x5 asm("x5") = a6;
+	register long x8 asm("x8") = n;
+	asm volatile ("svc #0"
+		      : "=r"(x0)
+		      : "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8)
+		      : "memory");
+	ret = x0;
 #else
 # error syscall_raw() not implemented
 #endif
 	return ret;
 }
 
+static inline long clone_raw(unsigned long flags, void *stack,
+			     int *parent_tid, int *child_tid)
+{
+	long a1 = flags;
+	long a2 = (long)stack;
+	long a3 = (long)parent_tid;
+#if defined(__x86_64__) || defined(__i386)
+	long a4 = (long)child_tid;
+	long a5 = 0;
+#elif defined(__aarch64__)
+	long a4 = 0;
+	long a5 = (long)child_tid;
+#else
+# error clone_raw() not implemented
+#endif
+
+	return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0);
+}
+
 /*
  * Returns the most restrictive pkey register value that can be used by the
  * tests.
@@ -294,14 +326,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
 	memset(&siginfo, 0, sizeof(siginfo));
 
 	/* Use clone to avoid newer glibcs using rseq on new threads */
-	long ret = syscall_raw(SYS_clone,
-			       CLONE_VM | CLONE_FS | CLONE_FILES |
-			       CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
-			       CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
-			       CLONE_DETACHED,
-			       (long) ((char *)(stack) + STACK_SIZE),
-			       (long) &parent_pid,
-			       (long) &child_pid, 0, 0);
+	long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+			     CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+			     CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+			     CLONE_DETACHED,
+			     stack + STACK_SIZE,
+			     &parent_pid,
+			     &child_pid);
 
 	if (ret < 0) {
 		errno = -ret;
@@ -466,14 +497,13 @@ static void test_pkru_sigreturn(void)
 	sigstack.ss_size = STACK_SIZE;
 
 	/* Use clone to avoid newer glibcs using rseq on new threads */
-	long ret = syscall_raw(SYS_clone,
-			       CLONE_VM | CLONE_FS | CLONE_FILES |
-			       CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
-			       CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
-			       CLONE_DETACHED,
-			       (long) ((char *)(stack) + STACK_SIZE),
-			       (long) &parent_pid,
-			       (long) &child_pid, 0, 0);
+	long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+			     CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+			     CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+			     CLONE_DETACHED,
+			     stack + STACK_SIZE,
+			     &parent_pid,
+			     &child_pid);
 
 	if (ret < 0) {
 		errno = -ret;
-- 
cgit v1.2.3


From cb4158ce8ec8a5bb528cc1693356a5eb8058094d Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 4 Nov 2024 09:19:57 -0800
Subject: bpf: Mark raw_tp arguments with PTR_MAYBE_NULL

Arguments to a raw tracepoint are tagged as trusted, which carries the
semantics that the pointer will be non-NULL.  However, in certain cases,
a raw tracepoint argument may end up being NULL. More context about this
issue is available in [0].

Thus, there is a discrepancy between the reality, that raw_tp arguments
can actually be NULL, and the verifier's knowledge, that they are never
NULL, causing explicit NULL checks to be deleted, and accesses to such
pointers potentially crashing the kernel.

To fix this, mark raw_tp arguments as PTR_MAYBE_NULL, and then special
case the dereference and pointer arithmetic to permit it, and allow
passing them into helpers/kfuncs; these exceptions are made for raw_tp
programs only. Ensure that we don't do this when ref_obj_id > 0, as in
that case this is an acquired object and doesn't need such adjustment.

The reason we do the mask_raw_tp_reg logic is that other places will
recheck whether the register is a trusted_reg, and then consider our
register as untrusted when detecting the presence of the PTR_MAYBE_NULL
flag.

To allow safe dereference, we enable PROBE_MEM marking when we see loads
into trusted pointers with PTR_MAYBE_NULL.

While trusted raw_tp arguments can also be passed into helpers or kfuncs
where such broken assumption may cause issues, a future patch set will
tackle their case separately, as PTR_TO_BTF_ID (without PTR_TRUSTED) can
already be passed into helpers and causes similar problems. Thus, they
are left alone for now.

It is possible that these checks also permit passing non-raw_tp args
that are trusted PTR_TO_BTF_ID with null marking. In such a case,
allowing dereference when pointer is NULL expands allowed behavior, so
won't regress existing programs, and the case of passing these into
helpers is the same as above and will be dealt with later.

Also update the failure case in tp_btf_nullable selftest to capture the
new behavior, as the verifier will no longer cause an error when
directly dereferencing a raw tracepoint argument marked as __nullable.

  [0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb

Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Reported-by: Juri Lelli <juri.lelli@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241104171959.2938862-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  6 ++
 kernel/bpf/btf.c                                   |  5 +-
 kernel/bpf/verifier.c                              | 79 ++++++++++++++++++++--
 .../selftests/bpf/progs/test_tp_btf_nullable.c     |  6 +-
 4 files changed, 87 insertions(+), 9 deletions(-)

(limited to 'tools/testing')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..1b84613b10ac 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3495,4 +3495,10 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog)
+{
+	return prog->type == BPF_PROG_TYPE_TRACING &&
+	       prog->expected_attach_type == BPF_TRACE_RAW_TP;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ed3219da7181..e7a59e6462a9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6588,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	if (prog_args_trusted(prog))
 		info->reg_type |= PTR_TRUSTED;
 
-	if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+	/* Raw tracepoint arguments always get marked as maybe NULL */
+	if (bpf_prog_is_raw_tp(prog))
+		info->reg_type |= PTR_MAYBE_NULL;
+	else if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
 		info->reg_type |= PTR_MAYBE_NULL;
 
 	if (tgt_prog) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ba800c7611e3..7958d6ff6b73 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -418,6 +418,25 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 	return rec;
 }
 
+static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) {
+	return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) &&
+	       bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id;
+}
+
+static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	if (!mask_raw_tp_reg_cond(env, reg))
+		return false;
+	reg->type &= ~PTR_MAYBE_NULL;
+	return true;
+}
+
+static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result)
+{
+	if (result)
+		reg->type |= PTR_MAYBE_NULL;
+}
+
 static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
@@ -6622,6 +6641,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	const char *field_name = NULL;
 	enum bpf_type_flag flag = 0;
 	u32 btf_id = 0;
+	bool mask;
 	int ret;
 
 	if (!env->allow_ptr_leaks) {
@@ -6693,7 +6713,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 	if (ret < 0)
 		return ret;
-
+	/* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL
+	 * trusted PTR_TO_BTF_ID, these are the ones that are possibly
+	 * arguments to the raw_tp. Since internal checks for trusted
+	 * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL
+	 * modifier as problematic, mask it out temporarily for the
+	 * check. Don't apply this to pointers with ref_obj_id > 0, as
+	 * those won't be raw_tp args.
+	 *
+	 * We may end up applying this relaxation to other trusted
+	 * PTR_TO_BTF_ID with maybe null flag, since we cannot
+	 * distinguish PTR_MAYBE_NULL tagged for arguments vs normal
+	 * tagging, but that should expand allowed behavior, and not
+	 * cause regression for existing behavior.
+	 */
+	mask = mask_raw_tp_reg(env, reg);
 	if (ret != PTR_TO_BTF_ID) {
 		/* just mark; */
 
@@ -6754,8 +6788,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		clear_trusted_flags(&flag);
 	}
 
-	if (atype == BPF_READ && value_regno >= 0)
+	if (atype == BPF_READ && value_regno >= 0) {
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+		/* We've assigned a new type to regno, so don't undo masking. */
+		if (regno == value_regno)
+			mask = false;
+	}
+	unmask_raw_tp_reg(reg, mask);
 
 	return 0;
 }
@@ -7140,7 +7179,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
-		   !type_may_be_null(reg->type)) {
+		   (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) {
 		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
 					      value_regno);
 	} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -8833,6 +8872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	enum bpf_reg_type type = reg->type;
 	u32 *arg_btf_id = NULL;
 	int err = 0;
+	bool mask;
 
 	if (arg_type == ARG_DONTCARE)
 		return 0;
@@ -8873,11 +8913,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	    base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
 		arg_btf_id = fn->arg_btf_id[arg];
 
+	mask = mask_raw_tp_reg(env, reg);
 	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
-	if (err)
-		return err;
 
-	err = check_func_arg_reg_off(env, reg, regno, arg_type);
+	err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type);
+	unmask_raw_tp_reg(reg, mask);
 	if (err)
 		return err;
 
@@ -9672,14 +9712,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				return ret;
 		} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
 			struct bpf_call_arg_meta meta;
+			bool mask;
 			int err;
 
 			if (register_is_null(reg) && type_may_be_null(arg->arg_type))
 				continue;
 
 			memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+			mask = mask_raw_tp_reg(env, reg);
 			err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
 			err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+			unmask_raw_tp_reg(reg, mask);
 			if (err)
 				return err;
 		} else {
@@ -12007,6 +12050,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		enum bpf_arg_type arg_type = ARG_DONTCARE;
 		u32 regno = i + 1, ref_id, type_size;
 		bool is_ret_buf_sz = false;
+		bool mask = false;
 		int kf_arg_type;
 
 		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
@@ -12065,12 +12109,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EINVAL;
 		}
 
+		mask = mask_raw_tp_reg(env, reg);
 		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
 		    (register_is_null(reg) || type_may_be_null(reg->type)) &&
 			!is_kfunc_arg_nullable(meta->btf, &args[i])) {
 			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+			unmask_raw_tp_reg(reg, mask);
 			return -EACCES;
 		}
+		unmask_raw_tp_reg(reg, mask);
 
 		if (reg->ref_obj_id) {
 			if (is_kfunc_release(meta) && meta->ref_obj_id) {
@@ -12128,16 +12175,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
 				break;
 
+			/* Allow passing maybe NULL raw_tp arguments to
+			 * kfuncs for compatibility. Don't apply this to
+			 * arguments with ref_obj_id > 0.
+			 */
+			mask = mask_raw_tp_reg(env, reg);
 			if (!is_trusted_reg(reg)) {
 				if (!is_kfunc_rcu(meta)) {
 					verbose(env, "R%d must be referenced or trusted\n", regno);
+					unmask_raw_tp_reg(reg, mask);
 					return -EINVAL;
 				}
 				if (!is_rcu_reg(reg)) {
 					verbose(env, "R%d must be a rcu pointer\n", regno);
+					unmask_raw_tp_reg(reg, mask);
 					return -EINVAL;
 				}
 			}
+			unmask_raw_tp_reg(reg, mask);
 			fallthrough;
 		case KF_ARG_PTR_TO_CTX:
 		case KF_ARG_PTR_TO_DYNPTR:
@@ -12160,7 +12215,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 		if (is_kfunc_release(meta) && reg->ref_obj_id)
 			arg_type |= OBJ_RELEASE;
+		mask = mask_raw_tp_reg(env, reg);
 		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		unmask_raw_tp_reg(reg, mask);
 		if (ret < 0)
 			return ret;
 
@@ -12337,6 +12394,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 			fallthrough;
 		case KF_ARG_PTR_TO_BTF_ID:
+			mask = mask_raw_tp_reg(env, reg);
 			/* Only base_type is checked, further checks are done here */
 			if ((base_type(reg->type) != PTR_TO_BTF_ID ||
 			     (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
@@ -12345,9 +12403,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verbose(env, "expected %s or socket\n",
 					reg_type_str(env, base_type(reg->type) |
 							  (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+				unmask_raw_tp_reg(reg, mask);
 				return -EINVAL;
 			}
 			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+			unmask_raw_tp_reg(reg, mask);
 			if (ret < 0)
 				return ret;
 			break;
@@ -13320,7 +13380,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
  */
 static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn,
-				   const struct bpf_reg_state *ptr_reg,
+				   struct bpf_reg_state *ptr_reg,
 				   const struct bpf_reg_state *off_reg)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
@@ -13334,6 +13394,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_sanitize_info info = {};
 	u8 opcode = BPF_OP(insn->code);
 	u32 dst = insn->dst_reg;
+	bool mask;
 	int ret;
 
 	dst_reg = &regs[dst];
@@ -13360,11 +13421,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	mask = mask_raw_tp_reg(env, ptr_reg);
 	if (ptr_reg->type & PTR_MAYBE_NULL) {
 		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
 			dst, reg_type_str(env, ptr_reg->type));
+		unmask_raw_tp_reg(ptr_reg, mask);
 		return -EACCES;
 	}
+	unmask_raw_tp_reg(ptr_reg, mask);
 
 	switch (base_type(ptr_reg->type)) {
 	case PTR_TO_CTX:
@@ -19866,6 +19930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 * for this case.
 		 */
 		case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+		case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
 			if (type == BPF_READ) {
 				if (BPF_MODE(insn->code) == BPF_MEM)
 					insn->code = BPF_LDX | BPF_PROBE_MEM |
diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
index bba3e37f749b..5aaf2b065f86 100644
--- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
+++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
@@ -7,7 +7,11 @@
 #include "bpf_misc.h"
 
 SEC("tp_btf/bpf_testmod_test_nullable_bare")
-__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")
+/* This used to be a failure test, but raw_tp nullable arguments can now
+ * directly be dereferenced, whether they have nullable annotation or not,
+ * and don't need to be explicitly checked.
+ */
+__success
 int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx)
 {
 	return nullable_ctx->len;
-- 
cgit v1.2.3


From 0e2fb011a0ba8e2258ce776fdf89fbd589c2a3a6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 4 Nov 2024 09:19:58 -0800
Subject: selftests/bpf: Clean up open-coded gettid syscall invocations

Availability of the gettid definition across glibc versions supported by
BPF selftests is not certain. Currently, all users in the tree open-code
syscall to gettid. Convert them to a common macro definition.

Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241104171959.2938862-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/benchs/bench_trigger.c           |  3 ++-
 tools/testing/selftests/bpf/bpf_util.h                       |  9 +++++++++
 tools/testing/selftests/bpf/map_tests/task_storage_map.c     |  3 ++-
 tools/testing/selftests/bpf/prog_tests/bpf_cookie.c          |  2 +-
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c            |  6 +++---
 tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c  | 10 +++++-----
 tools/testing/selftests/bpf/prog_tests/core_reloc.c          |  2 +-
 tools/testing/selftests/bpf/prog_tests/linked_funcs.c        |  2 +-
 tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c       |  4 ++--
 tools/testing/selftests/bpf/prog_tests/task_local_storage.c  | 10 +++++-----
 tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c   |  2 +-
 12 files changed, 33 insertions(+), 22 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 2ed0ef6f21ee..32e9f194d449 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -4,6 +4,7 @@
 #include <argp.h>
 #include <unistd.h>
 #include <stdint.h>
+#include "bpf_util.h"
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
@@ -72,7 +73,7 @@ static __always_inline void inc_counter(struct counter *counters)
 	unsigned slot;
 
 	if (unlikely(tid == 0))
-		tid = syscall(SYS_gettid);
+		tid = sys_gettid();
 
 	/* multiplicative hashing, it's fast */
 	slot = 2654435769U * tid;
diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index 10587a29b967..feff92219e21 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -6,6 +6,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <syscall.h>
 #include <bpf/libbpf.h> /* libbpf_num_possible_cpus */
 
 static inline unsigned int bpf_num_possible_cpus(void)
@@ -59,4 +60,12 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz)
 	(offsetof(TYPE, MEMBER)	+ sizeof_field(TYPE, MEMBER))
 #endif
 
+/* Availability of gettid across glibc versions is hit-and-miss, therefore
+ * fallback to syscall in this macro and use it everywhere.
+ */
+#ifndef sys_gettid
+#define sys_gettid() syscall(SYS_gettid)
+#endif
+
+
 #endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
index 7d050364efca..62971dbf2996 100644
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
@@ -12,6 +12,7 @@
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 
+#include "bpf_util.h"
 #include "test_maps.h"
 #include "task_local_storage_helpers.h"
 #include "read_bpf_task_storage_busy.skel.h"
@@ -115,7 +116,7 @@ void test_task_storage_map_stress_lookup(void)
 	CHECK(err, "attach", "error %d\n", err);
 
 	/* Trigger program */
-	syscall(SYS_gettid);
+	sys_gettid();
 	skel->bss->pid = 0;
 
 	CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy);
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
index 070c52c312e5..6befa870434b 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
@@ -690,7 +690,7 @@ void test_bpf_cookie(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	skel->bss->my_tid = syscall(SYS_gettid);
+	skel->bss->my_tid = sys_gettid();
 
 	if (test__start_subtest("kprobe"))
 		kprobe_subtest(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 9006549a1294..b8e1224cfd19 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -226,7 +226,7 @@ static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts,
 	ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL),
 		  "pthread_create");
 
-	skel->bss->tid = syscall(SYS_gettid);
+	skel->bss->tid = sys_gettid();
 
 	do_dummy_read_opts(skel->progs.dump_task, opts);
 
@@ -255,10 +255,10 @@ static void *run_test_task_tid(void *arg)
 	union bpf_iter_link_info linfo;
 	int num_unknown_tid, num_known_tid;
 
-	ASSERT_NEQ(getpid(), syscall(SYS_gettid), "check_new_thread_id");
+	ASSERT_NEQ(getpid(), sys_gettid(), "check_new_thread_id");
 
 	memset(&linfo, 0, sizeof(linfo));
-	linfo.task.tid = syscall(SYS_gettid);
+	linfo.task.tid = sys_gettid();
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 	test_task_common(&opts, 0, 1);
diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
index 747761572098..9015e2c2ab12 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
@@ -63,14 +63,14 @@ static void test_tp_btf(int cgroup_fd)
 	if (!ASSERT_OK(err, "map_delete_elem"))
 		goto out;
 
-	skel->bss->target_pid = syscall(SYS_gettid);
+	skel->bss->target_pid = sys_gettid();
 
 	err = cgrp_ls_tp_btf__attach(skel);
 	if (!ASSERT_OK(err, "skel_attach"))
 		goto out;
 
-	syscall(SYS_gettid);
-	syscall(SYS_gettid);
+	sys_gettid();
+	sys_gettid();
 
 	skel->bss->target_pid = 0;
 
@@ -154,7 +154,7 @@ static void test_recursion(int cgroup_fd)
 		goto out;
 
 	/* trigger sys_enter, make sure it does not cause deadlock */
-	syscall(SYS_gettid);
+	sys_gettid();
 
 out:
 	cgrp_ls_recursion__destroy(skel);
@@ -224,7 +224,7 @@ static void test_yes_rcu_lock(__u64 cgroup_id)
 		return;
 
 	CGROUP_MODE_SET(skel);
-	skel->bss->target_pid = syscall(SYS_gettid);
+	skel->bss->target_pid = sys_gettid();
 
 	bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
 	err = cgrp_ls_sleepable__load(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 26019313e1fc..1c682550e0e7 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -1010,7 +1010,7 @@ static void run_core_reloc_tests(bool use_btfgen)
 	struct data *data;
 	void *mmap_data = NULL;
 
-	my_pid_tgid = getpid() | ((uint64_t)syscall(SYS_gettid) << 32);
+	my_pid_tgid = getpid() | ((uint64_t)sys_gettid() << 32);
 
 	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
 		char btf_file[] = "/tmp/core_reloc.btf.XXXXXX";
diff --git a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c
index cad664546912..fa639b021f7e 100644
--- a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c
@@ -20,7 +20,7 @@ void test_linked_funcs(void)
 	bpf_program__set_autoload(skel->progs.handler1, true);
 	bpf_program__set_autoload(skel->progs.handler2, true);
 
-	skel->rodata->my_tid = syscall(SYS_gettid);
+	skel->rodata->my_tid = sys_gettid();
 	skel->bss->syscall_id = SYS_getpgid;
 
 	err = linked_funcs__load(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
index c29787e092d6..761ce24bce38 100644
--- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
+++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
@@ -23,7 +23,7 @@ static int get_pid_tgid(pid_t *pid, pid_t *tgid,
 	struct stat st;
 	int err;
 
-	*pid = syscall(SYS_gettid);
+	*pid = sys_gettid();
 	*tgid = getpid();
 
 	err = stat("/proc/self/ns/pid", &st);
diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
index a1f7e7378a64..ebe0c12b5536 100644
--- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
@@ -21,7 +21,7 @@ static void test_success(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	skel->bss->target_pid = syscall(SYS_gettid);
+	skel->bss->target_pid = sys_gettid();
 
 	bpf_program__set_autoload(skel->progs.get_cgroup_id, true);
 	bpf_program__set_autoload(skel->progs.task_succ, true);
@@ -58,7 +58,7 @@ static void test_rcuptr_acquire(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	skel->bss->target_pid = syscall(SYS_gettid);
+	skel->bss->target_pid = sys_gettid();
 
 	bpf_program__set_autoload(skel->progs.task_acquire, true);
 	err = rcu_read_lock__load(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 00cc9d0aee5d..60f474d965a9 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -31,14 +31,14 @@ static void test_sys_enter_exit(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
 		return;
 
-	skel->bss->target_pid = syscall(SYS_gettid);
+	skel->bss->target_pid = sys_gettid();
 
 	err = task_local_storage__attach(skel);
 	if (!ASSERT_OK(err, "skel_attach"))
 		goto out;
 
-	syscall(SYS_gettid);
-	syscall(SYS_gettid);
+	sys_gettid();
+	sys_gettid();
 
 	/* 3x syscalls: 1x attach and 2x gettid */
 	ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt");
@@ -107,7 +107,7 @@ static void test_recursion(void)
 
 	/* trigger sys_enter, make sure it does not cause deadlock */
 	skel->bss->test_pid = getpid();
-	syscall(SYS_gettid);
+	sys_gettid();
 	skel->bss->test_pid = 0;
 	task_ls_recursion__detach(skel);
 
@@ -262,7 +262,7 @@ static void test_uptr_basic(void)
 	__u64 ev_dummy_data = 1;
 	int err;
 
-	my_tid = syscall(SYS_gettid);
+	my_tid = sys_gettid();
 	parent_task_fd = sys_pidfd_open(my_tid, 0);
 	if (!ASSERT_OK_FD(parent_task_fd, "parent_task_fd"))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 2c39902b8a09..619b31cd24a1 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -125,7 +125,7 @@ static void *child_thread(void *ctx)
 	struct child *child = ctx;
 	int c = 0, err;
 
-	child->tid = syscall(SYS_gettid);
+	child->tid = sys_gettid();
 
 	/* let parent know we are ready */
 	err = write(child->c2p[1], &c, 1);
-- 
cgit v1.2.3


From d798ce3f4cab1b0d886b19ec5cc8e6b3d7e35081 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 4 Nov 2024 09:19:59 -0800
Subject: selftests/bpf: Add tests for raw_tp null handling

Ensure that trusted PTR_TO_BTF_ID accesses perform PROBE_MEM handling in
raw_tp programs. Without the previous fix, this selftest crashes the
kernel due to a NULL-pointer dereference. Also ensure that dead code
elimination does not kick in for checks on the pointer.

Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241104171959.2938862-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/bpf_testmod/bpf_testmod-events.h |  8 ++++++
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  2 ++
 .../testing/selftests/bpf/prog_tests/raw_tp_null.c | 25 +++++++++++++++++
 tools/testing/selftests/bpf/progs/raw_tp_null.c    | 32 ++++++++++++++++++++++
 4 files changed, 67 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_null.c
 create mode 100644 tools/testing/selftests/bpf/progs/raw_tp_null.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h
index 6c3b4d4f173a..aeef86b3da74 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h
@@ -40,6 +40,14 @@ DECLARE_TRACE(bpf_testmod_test_nullable_bare,
 	TP_ARGS(ctx__nullable)
 );
 
+struct sk_buff;
+
+DECLARE_TRACE(bpf_testmod_test_raw_tp_null,
+	TP_PROTO(struct sk_buff *skb),
+	TP_ARGS(skb)
+);
+
+
 #undef BPF_TESTMOD_DECLARE_TRACE
 #ifdef DECLARE_TRACE_WRITABLE
 #define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 8835761d9a12..4e6a9e9c0368 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -380,6 +380,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 
 	(void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2);
 
+	(void)trace_bpf_testmod_test_raw_tp_null(NULL);
+
 	struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) +
 				sizeof(int)), GFP_KERNEL);
 	if (struct_arg3 != NULL) {
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c
new file mode 100644
index 000000000000..6fa19449297e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "raw_tp_null.skel.h"
+
+void test_raw_tp_null(void)
+{
+	struct raw_tp_null *skel;
+
+	skel = raw_tp_null__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "raw_tp_null__open_and_load"))
+		return;
+
+	skel->bss->tid = sys_gettid();
+
+	if (!ASSERT_OK(raw_tp_null__attach(skel), "raw_tp_null__attach"))
+		goto end;
+
+	ASSERT_OK(trigger_module_test_read(2), "trigger testmod read");
+	ASSERT_EQ(skel->bss->i, 3, "invocations");
+
+end:
+	raw_tp_null__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c
new file mode 100644
index 000000000000..457f34c151e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int tid;
+int i;
+
+SEC("tp_btf/bpf_testmod_test_raw_tp_null")
+int BPF_PROG(test_raw_tp_null, struct sk_buff *skb)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+
+	if (task->pid != tid)
+		return 0;
+
+	i = i + skb->mark + 1;
+	/* The compiler may move the NULL check before this deref, which causes
+	 * the load to fail as deref of scalar. Prevent that by using a barrier.
+	 */
+	barrier();
+	/* If dead code elimination kicks in, the increment below will
+	 * be removed. For raw_tp programs, we mark input arguments as
+	 * PTR_MAYBE_NULL, so branch prediction should never kick in.
+	 */
+	if (!skb)
+		i += 2;
+	return 0;
+}
-- 
cgit v1.2.3


From 18f5744e8200e3c2bfeb896d6d3c10c3f0946318 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:38 -0700
Subject: selftests/resctrl: Make functions only used in same file static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following sparse warnings:
 tools/testing/selftests/resctrl/resctrl_val.c:47:6: warning: symbol 'membw_initialize_perf_event_attr' was not declared. Should it be static?
 tools/testing/selftests/resctrl/resctrl_val.c:64:6: warning: symbol 'membw_ioctl_perf_event_ioc_reset_enable' was not declared. Should it be static?
 tools/testing/selftests/resctrl/resctrl_val.c:70:6: warning: symbol 'membw_ioctl_perf_event_ioc_disable' was not declared. Should it be static?
 tools/testing/selftests/resctrl/resctrl_val.c:81:6: warning: symbol 'get_event_and_umask' was not declared. Should it be static?

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl_val.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 8c275f6b4dd7..70e8e31f5d1a 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -44,7 +44,7 @@ static int imcs;
 static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
 static const struct resctrl_test *current_test;
 
-void membw_initialize_perf_event_attr(int i, int j)
+static void membw_initialize_perf_event_attr(int i, int j)
 {
 	memset(&imc_counters_config[i][j].pe, 0,
 	       sizeof(struct perf_event_attr));
@@ -61,13 +61,13 @@ void membw_initialize_perf_event_attr(int i, int j)
 		PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
 }
 
-void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
+static void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
 {
 	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0);
 	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0);
 }
 
-void membw_ioctl_perf_event_ioc_disable(int i, int j)
+static void membw_ioctl_perf_event_ioc_disable(int i, int j)
 {
 	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0);
 }
@@ -78,7 +78,7 @@ void membw_ioctl_perf_event_ioc_disable(int i, int j)
  * @count:		iMC number
  * @op:			Operation (read/write)
  */
-void get_event_and_umask(char *cas_count_cfg, int count, bool op)
+static void get_event_and_umask(char *cas_count_cfg, int count, bool op)
 {
 	char *token[MAX_TOKENS];
 	int i = 0;
-- 
cgit v1.2.3


From 1b4840395f08e9723a15fea42c2d31090e8375f3 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:39 -0700
Subject: selftests/resctrl: Print accurate buffer size as part of MBM results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By default the MBM test uses the "fill_buf" benchmark to keep reading
from a buffer with size DEFAULT_SPAN while measuring memory bandwidth.
User space can provide an alternate benchmark or amend the size of
the buffer "fill_buf" should use.

Analysis of the MBM measurements does not require that a buffer be used
and thus does not require knowing the size of the buffer if it was used
during testing. Even so, the buffer size is printed as informational
as part of the MBM test results. What is printed as buffer size is
hardcoded as DEFAULT_SPAN, even if the test relied on another benchmark
(that may or may not use a buffer) or if user space amended the buffer
size.

Ensure that accurate buffer size is printed when using "fill_buf"
benchmark and omit the buffer size information if another benchmark
is used.

Fixes: ecdbb911f22d ("selftests/resctrl: Add MBM test")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mbm_test.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index 6b5a3b52d861..cf08ba5e314e 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -40,7 +40,8 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span)
 	ksft_print_msg("%s Check MBM diff within %d%%\n",
 		       ret ? "Fail:" : "Pass:", MAX_DIFF_PERCENT);
 	ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per);
-	ksft_print_msg("Span (MB): %zu\n", span / MB);
+	if (span)
+		ksft_print_msg("Span (MB): %zu\n", span / MB);
 	ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc);
 	ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc);
 
@@ -138,15 +139,26 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param
 		.setup		= mbm_setup,
 		.measure	= mbm_measure,
 	};
+	char *endptr = NULL;
+	size_t span = 0;
 	int ret;
 
 	remove(RESULT_FILE_NAME);
 
+	if (uparams->benchmark_cmd[0] && strcmp(uparams->benchmark_cmd[0], "fill_buf") == 0) {
+		if (uparams->benchmark_cmd[1] && *uparams->benchmark_cmd[1] != '\0') {
+			errno = 0;
+			span = strtoul(uparams->benchmark_cmd[1], &endptr, 10);
+			if (errno || *endptr != '\0')
+				return -EINVAL;
+		}
+	}
+
 	ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param);
 	if (ret)
 		return ret;
 
-	ret = check_results(DEFAULT_SPAN);
+	ret = check_results(span);
 	if (ret && (get_vendor() == ARCH_INTEL))
 		ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n");
 
-- 
cgit v1.2.3


From caf02626b2bf164a02c808240f19dbf97aced664 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:40 -0700
Subject: selftests/resctrl: Fix memory overflow due to unhandled wraparound
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

alloc_buffer() allocates and initializes (with random data) a
buffer of requested size. The initialization starts from the beginning
of the allocated buffer and incrementally assigns sizeof(uint64_t) random
data to each cache line. The initialization uses the size of the
buffer to control the initialization flow, decrementing the amount of
buffer needing to be initialized after each iteration.

The size of the buffer is stored in an unsigned (size_t) variable s64
and the test "s64 > 0" is used to decide if initialization is complete.
The problem is that decrementing the buffer size may wrap around
if the buffer size is not divisible by "CL_SIZE / sizeof(uint64_t)"
resulting in the "s64 > 0" test being true and memory beyond the buffer
"initialized".

Use a signed value for the buffer size to support all buffer sizes.

Fixes: a2561b12fe39 ("selftests/resctrl: Add built in benchmark")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index ae120f1735c0..34e5df721430 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -127,7 +127,7 @@ unsigned char *alloc_buffer(size_t buf_size, int memflush)
 {
 	void *buf = NULL;
 	uint64_t *p64;
-	size_t s64;
+	ssize_t s64;
 	int ret;
 
 	ret = posix_memalign(&buf, PAGE_SIZE, buf_size);
-- 
cgit v1.2.3


From 48ed4e799e8fbebae838dca404a8527763d41191 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:41 -0700
Subject: selftests/resctrl: Protect against array overrun during iMC config
 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MBM and MBA tests need to discover the event and umask with which to
configure the performance event used to measure read memory bandwidth.
This is done by parsing the
/sys/bus/event_source/devices/uncore_imc_<imc instance>/events/cas_count_read
file for each iMC instance that contains the formatted
output: "event=<event>,umask=<umask>"

Parsing of cas_count_read contents is done by initializing an array of
MAX_TOKENS elements with tokens (delimited by "=,") from this file.
Remove the unnecessary append of a delimiter to the string needing to be
parsed. Per the strtok() man page: "delimiter bytes at the start or end of
the string are ignored". This has no impact on the token placement within
the array.

After initialization, the actual event and umask are determined by
parsing the tokens directly following the "event" and "umask" tokens
respectively.

Iterating through the array up to index "i < MAX_TOKENS" but then
accessing index "i + 1" risks array overrun during the final iteration.
Avoid array overrun by ensuring that the index used within for
loop will always be valid.

Fixes: 1d3f08687d76 ("selftests/resctrl: Read memory bandwidth from perf IMC counter and from resctrl file system")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl_val.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 70e8e31f5d1a..e88d5ca30517 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -83,13 +83,12 @@ static void get_event_and_umask(char *cas_count_cfg, int count, bool op)
 	char *token[MAX_TOKENS];
 	int i = 0;
 
-	strcat(cas_count_cfg, ",");
 	token[0] = strtok(cas_count_cfg, "=,");
 
 	for (i = 1; i < MAX_TOKENS; i++)
 		token[i] = strtok(NULL, "=,");
 
-	for (i = 0; i < MAX_TOKENS; i++) {
+	for (i = 0; i < MAX_TOKENS - 1; i++) {
 		if (!token[i])
 			break;
 		if (strcmp(token[i], "event") == 0) {
-- 
cgit v1.2.3


From 46058430fc5d39c114f7e1b9c6ff14c9f41bd531 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:42 -0700
Subject: selftests/resctrl: Protect against array overflow when reading
 strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

resctrl selftests discover system properties via a variety of sysfs files.
The MBM and MBA tests need to discover the event and umask with which to
configure the performance event used to measure read memory bandwidth.
This is done by parsing the contents of
/sys/bus/event_source/devices/uncore_imc_<imc instance>/events/cas_count_read
Similarly, the resctrl selftests discover the cache size via
/sys/bus/cpu/devices/cpu<id>/cache/index<index>/size.

Take care to do bounds checking when using fscanf() to read the
contents of files into a string buffer because by default fscanf() assumes
arbitrarily long strings. If the file contains more bytes than the array
can accommodate then an overflow will occur.

Provide a maximum field width to the conversion specifier to protect
against array overflow. The maximum is one less than the array size because
string input stores a terminating null byte that is not covered by the
maximum field width.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl_val.c | 4 ++--
 tools/testing/selftests/resctrl/resctrlfs.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index e88d5ca30517..c9dd70ce3ea8 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -159,7 +159,7 @@ static int read_from_imc_dir(char *imc_dir, int count)
 
 		return -1;
 	}
-	if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
+	if (fscanf(fp, "%1023s", cas_count_cfg) <= 0) {
 		ksft_perror("Could not get iMC cas count read");
 		fclose(fp);
 
@@ -177,7 +177,7 @@ static int read_from_imc_dir(char *imc_dir, int count)
 
 		return -1;
 	}
-	if  (fscanf(fp, "%s", cas_count_cfg) <= 0) {
+	if  (fscanf(fp, "%1023s", cas_count_cfg) <= 0) {
 		ksft_perror("Could not get iMC cas count write");
 		fclose(fp);
 
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 250c320349a7..a53cd1cb6e0c 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -182,7 +182,7 @@ int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size
 
 		return -1;
 	}
-	if (fscanf(fp, "%s", cache_str) <= 0) {
+	if (fscanf(fp, "%63s", cache_str) <= 0) {
 		ksft_perror("Could not get cache_size");
 		fclose(fp);
 
-- 
cgit v1.2.3


From efffa8c40166af680959c030a815afa8d06af66a Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:43 -0700
Subject: selftests/resctrl: Make wraparound handling obvious
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Within mba_setup() the programmed bandwidth delay value starts
at the maximum (100, or rather ALLOCATION_MAX) and progresses
towards ALLOCATION_MIN by decrementing with ALLOCATION_STEP.

The programmed bandwidth delay should never be negative, so
representing it with an unsigned int is most appropriate. This
may introduce confusion because of the "allocation > ALLOCATION_MAX"
check used to check wraparound of the subtraction.

Modify the mba_setup() flow to start at the minimum, ALLOCATION_MIN,
and incrementally, with ALLOCATION_STEP steps, adjust the
bandwidth delay value. This avoids wraparound while making the purpose
of "allocation > ALLOCATION_MAX" clear and eliminates the
need for the "allocation < ALLOCATION_MIN" check.

Reported-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Closes: https://lore.kernel.org/lkml/1903ac13-5c9c-ef8d-78e0-417ac34a971b@linux.intel.com/
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mba_test.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index ab8496a4925b..da40a8ed4413 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -39,7 +39,8 @@ static int mba_setup(const struct resctrl_test *test,
 		     const struct user_params *uparams,
 		     struct resctrl_val_param *p)
 {
-	static int runs_per_allocation, allocation = 100;
+	static unsigned int allocation = ALLOCATION_MIN;
+	static int runs_per_allocation;
 	char allocation_str[64];
 	int ret;
 
@@ -50,7 +51,7 @@ static int mba_setup(const struct resctrl_test *test,
 	if (runs_per_allocation++ != 0)
 		return 0;
 
-	if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX)
+	if (allocation > ALLOCATION_MAX)
 		return END_OF_TESTS;
 
 	sprintf(allocation_str, "%d", allocation);
@@ -59,7 +60,7 @@ static int mba_setup(const struct resctrl_test *test,
 	if (ret < 0)
 		return ret;
 
-	allocation -= ALLOCATION_STEP;
+	allocation += ALLOCATION_STEP;
 
 	return 0;
 }
@@ -72,8 +73,9 @@ static int mba_measure(const struct user_params *uparams,
 
 static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 {
-	int allocation, runs;
+	unsigned int allocation;
 	bool ret = false;
+	int runs;
 
 	ksft_print_msg("Results are displayed in (MB)\n");
 	/* Memory bandwidth from 100% down to 10% */
@@ -103,7 +105,7 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 			       avg_diff_per > MAX_DIFF_PERCENT ?
 			       "Fail:" : "Pass:",
 			       MAX_DIFF_PERCENT,
-			       ALLOCATION_MAX - ALLOCATION_STEP * allocation);
+			       ALLOCATION_MIN + ALLOCATION_STEP * allocation);
 
 		ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per);
 		ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc);
-- 
cgit v1.2.3


From f3069136c95f15c05d166281cbb55f0509a42ae1 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:44 -0700
Subject: selftests/resctrl: Remove "once" parameter required to be false
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CMT, MBM, and MBA tests rely on a benchmark that runs while
the test makes changes to needed configuration (for example memory
bandwidth allocation) and takes needed measurements. By default
the "fill_buf" benchmark is used and by default (via its
"once = false" setting) "fill_buf" is configured to run until
terminated after the test completes.

An unintended consequence of enabling the user to override the
benchmark is that the user can also change parameters to the
"fill_buf" benchmark. This enables the user to set "fill_buf" to
only cycle through the buffer once (by setting "once = true")
and thus break the CMT, MBA, and MBM tests that expect
workload/interference to be reflected by their measurements.

Prevent user space from changing the "once" parameter and ensure
that it is always false for the CMT, MBA, and MBM tests.

Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c      |  7 ++++---
 tools/testing/selftests/resctrl/resctrl.h       |  2 +-
 tools/testing/selftests/resctrl/resctrl_tests.c |  9 +++++++--
 tools/testing/selftests/resctrl/resctrl_val.c   | 11 +----------
 4 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 34e5df721430..854f0108d8e6 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -151,7 +151,7 @@ unsigned char *alloc_buffer(size_t buf_size, int memflush)
 	return buf;
 }
 
-int run_fill_buf(size_t buf_size, int memflush, int op, bool once)
+int run_fill_buf(size_t buf_size, int memflush, int op)
 {
 	unsigned char *buf;
 
@@ -160,9 +160,10 @@ int run_fill_buf(size_t buf_size, int memflush, int op, bool once)
 		return -1;
 
 	if (op == 0)
-		fill_cache_read(buf, buf_size, once);
+		fill_cache_read(buf, buf_size, false);
 	else
-		fill_cache_write(buf, buf_size, once);
+		fill_cache_write(buf, buf_size, false);
+
 	free(buf);
 
 	return 0;
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 2dda56084588..51f5f4b25e06 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -142,7 +142,7 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 unsigned char *alloc_buffer(size_t buf_size, int memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
-int run_fill_buf(size_t buf_size, int memflush, int op, bool once);
+int run_fill_buf(size_t buf_size, int memflush, int op);
 int initialize_mem_bw_imc(void);
 int measure_mem_bw(const struct user_params *uparams,
 		   struct resctrl_val_param *param, pid_t bm_pid,
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index ecbb7605a981..e7878077883f 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -266,8 +266,13 @@ last_arg:
 		uparams.benchmark_cmd[1] = span_str;
 		uparams.benchmark_cmd[2] = "1";
 		uparams.benchmark_cmd[3] = "0";
-		uparams.benchmark_cmd[4] = "false";
-		uparams.benchmark_cmd[5] = NULL;
+		/*
+		 * Fourth parameter was previously used to indicate
+		 * how long "fill_buf" should run for, with "false"
+		 * ("fill_buf" will keep running until terminated)
+		 * the only option that works.
+		 */
+		uparams.benchmark_cmd[4] = NULL;
 	}
 
 	ksft_set_plan(tests);
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index c9dd70ce3ea8..b0f3c594c4da 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -625,7 +625,6 @@ static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 	int operation, ret, memflush;
 	char **benchmark_cmd;
 	size_t span;
-	bool once;
 	FILE *fp;
 
 	benchmark_cmd = info->si_ptr;
@@ -645,16 +644,8 @@ static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 		span = strtoul(benchmark_cmd[1], NULL, 10);
 		memflush =  atoi(benchmark_cmd[2]);
 		operation = atoi(benchmark_cmd[3]);
-		if (!strcmp(benchmark_cmd[4], "true")) {
-			once = true;
-		} else if (!strcmp(benchmark_cmd[4], "false")) {
-			once = false;
-		} else {
-			ksft_print_msg("Invalid once parameter\n");
-			parent_exit(ppid);
-		}
 
-		if (run_fill_buf(span, memflush, operation, once))
+		if (run_fill_buf(span, memflush, operation))
 			fprintf(stderr, "Error in running fill buffer\n");
 	} else {
 		/* Execute specified benchmark */
-- 
cgit v1.2.3


From 138424170eb06c956bc7e77e8740479161f65463 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:45 -0700
Subject: selftests/resctrl: Only support measured read operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CMT, MBM, and MBA tests rely on a benchmark to generate
memory traffic. By default this is the "fill_buf" benchmark that
can be replaced via the "-b" command line argument.

The original intent of the "-b" command line parameter was
to replace the default "fill_buf" benchmark, but the implementation
also exposes an alternative use case where the "fill_buf" parameters
themselves can be modified. One of the parameters to "fill_buf" is the
"operation" that can be either "read" or "write" and indicates
whether "fill_buf" should use "read" or "write" operations on the
allocated buffer.

While replacing "fill_buf" default parameters is technically possible,
replacing the default "read" parameter with "write" is not supported
because the MBA and MBM tests only measure "read" operations. The
"read" operation is also most appropriate for the CMT test that aims
to use the benchmark to allocate into the cache.

Avoid any potential inconsistencies between test and measurement by
removing code for unsupported "write" operations to the buffer.
Ignore any attempt from user space to enable this unsupported test
configuration, instead always use read operations.

Keep the initialization of the, now unused, "fill_buf" parameters
to reserve these parameter positions since it has been exposed as an API.
Future parameter additions cannot use these parameter positions.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c      | 28 ++-----------------------
 tools/testing/selftests/resctrl/resctrl.h       |  2 +-
 tools/testing/selftests/resctrl/resctrl_tests.c |  5 ++++-
 tools/testing/selftests/resctrl/resctrl_val.c   |  5 ++---
 4 files changed, 9 insertions(+), 31 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 854f0108d8e6..e4f1cea317f1 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -88,18 +88,6 @@ static int fill_one_span_read(unsigned char *buf, size_t buf_size)
 	return sum;
 }
 
-static void fill_one_span_write(unsigned char *buf, size_t buf_size)
-{
-	unsigned char *end_ptr = buf + buf_size;
-	unsigned char *p;
-
-	p = buf;
-	while (p < end_ptr) {
-		*p = '1';
-		p += (CL_SIZE / 2);
-	}
-}
-
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
 {
 	int ret = 0;
@@ -114,15 +102,6 @@ void fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
 	*value_sink = ret;
 }
 
-static void fill_cache_write(unsigned char *buf, size_t buf_size, bool once)
-{
-	while (1) {
-		fill_one_span_write(buf, buf_size);
-		if (once)
-			break;
-	}
-}
-
 unsigned char *alloc_buffer(size_t buf_size, int memflush)
 {
 	void *buf = NULL;
@@ -151,7 +130,7 @@ unsigned char *alloc_buffer(size_t buf_size, int memflush)
 	return buf;
 }
 
-int run_fill_buf(size_t buf_size, int memflush, int op)
+int run_fill_buf(size_t buf_size, int memflush)
 {
 	unsigned char *buf;
 
@@ -159,10 +138,7 @@ int run_fill_buf(size_t buf_size, int memflush, int op)
 	if (!buf)
 		return -1;
 
-	if (op == 0)
-		fill_cache_read(buf, buf_size, false);
-	else
-		fill_cache_write(buf, buf_size, false);
+	fill_cache_read(buf, buf_size, false);
 
 	free(buf);
 
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 51f5f4b25e06..ba1ce1b35699 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -142,7 +142,7 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 unsigned char *alloc_buffer(size_t buf_size, int memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
-int run_fill_buf(size_t buf_size, int memflush, int op);
+int run_fill_buf(size_t buf_size, int memflush);
 int initialize_mem_bw_imc(void);
 int measure_mem_bw(const struct user_params *uparams,
 		   struct resctrl_val_param *param, pid_t bm_pid,
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index e7878077883f..0f91c475b255 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -265,13 +265,16 @@ last_arg:
 			ksft_exit_fail_msg("Out of memory!\n");
 		uparams.benchmark_cmd[1] = span_str;
 		uparams.benchmark_cmd[2] = "1";
-		uparams.benchmark_cmd[3] = "0";
 		/*
+		 * Third parameter was previously used for "operation"
+		 * (read/write) of which only (now default) "read"/"0"
+		 * works.
 		 * Fourth parameter was previously used to indicate
 		 * how long "fill_buf" should run for, with "false"
 		 * ("fill_buf" will keep running until terminated)
 		 * the only option that works.
 		 */
+		uparams.benchmark_cmd[3] = NULL;
 		uparams.benchmark_cmd[4] = NULL;
 	}
 
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index b0f3c594c4da..113ca18d67c1 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -622,8 +622,8 @@ close_fp:
  */
 static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 {
-	int operation, ret, memflush;
 	char **benchmark_cmd;
+	int ret, memflush;
 	size_t span;
 	FILE *fp;
 
@@ -643,9 +643,8 @@ static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 		/* Execute default fill_buf benchmark */
 		span = strtoul(benchmark_cmd[1], NULL, 10);
 		memflush =  atoi(benchmark_cmd[2]);
-		operation = atoi(benchmark_cmd[3]);
 
-		if (run_fill_buf(span, memflush, operation))
+		if (run_fill_buf(span, memflush))
 			fprintf(stderr, "Error in running fill buffer\n");
 	} else {
 		/* Execute specified benchmark */
-- 
cgit v1.2.3


From 76f8f009f6bc89fd08edae69ccc705a9781fe42e Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:46 -0700
Subject: selftests/resctrl: Remove unused measurement code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MBM and MBA resctrl selftests run a benchmark during which
they take measurements of read memory bandwidth via perf.
Code exists to support measurements of write memory bandwidth
but there exists no path with which this code can execute.

While code exists for write memory bandwidth measurement
there has not yet been a use case for it. Remove this unused code.
Rename relevant functions to include "read" so that it is clear
that they relate only to memory bandwidth reads. While renaming
the functions, also add consistency by changing the "membw"
instances to the more prevalent "mem_bw".

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mba_test.c    |   4 +-
 tools/testing/selftests/resctrl/mbm_test.c    |   4 +-
 tools/testing/selftests/resctrl/resctrl.h     |   8 +-
 tools/testing/selftests/resctrl/resctrl_val.c | 234 +++++++++-----------------
 tools/testing/selftests/resctrl/resctrlfs.c   |  17 --
 5 files changed, 85 insertions(+), 182 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index da40a8ed4413..be0ead73e55d 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -21,7 +21,7 @@ static int mba_init(const struct resctrl_val_param *param, int domain_id)
 {
 	int ret;
 
-	ret = initialize_mem_bw_imc();
+	ret = initialize_read_mem_bw_imc();
 	if (ret)
 		return ret;
 
@@ -68,7 +68,7 @@ static int mba_setup(const struct resctrl_test *test,
 static int mba_measure(const struct user_params *uparams,
 		       struct resctrl_val_param *param, pid_t bm_pid)
 {
-	return measure_mem_bw(uparams, param, bm_pid, "reads");
+	return measure_read_mem_bw(uparams, param, bm_pid);
 }
 
 static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index cf08ba5e314e..defa94293915 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -91,7 +91,7 @@ static int mbm_init(const struct resctrl_val_param *param, int domain_id)
 {
 	int ret;
 
-	ret = initialize_mem_bw_imc();
+	ret = initialize_read_mem_bw_imc();
 	if (ret)
 		return ret;
 
@@ -122,7 +122,7 @@ static int mbm_setup(const struct resctrl_test *test,
 static int mbm_measure(const struct user_params *uparams,
 		       struct resctrl_val_param *param, pid_t bm_pid)
 {
-	return measure_mem_bw(uparams, param, bm_pid, "reads");
+	return measure_read_mem_bw(uparams, param, bm_pid);
 }
 
 static void mbm_test_cleanup(void)
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index ba1ce1b35699..82801245e4c1 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -126,7 +126,6 @@ int filter_dmesg(void);
 int get_domain_id(const char *resource, int cpu_no, int *domain_id);
 int mount_resctrlfs(void);
 int umount_resctrlfs(void);
-const char *get_bw_report_type(const char *bw_report);
 bool resctrl_resource_exists(const char *resource);
 bool resctrl_mon_feature_exists(const char *resource, const char *feature);
 bool resource_info_file_exists(const char *resource, const char *file);
@@ -143,10 +142,9 @@ unsigned char *alloc_buffer(size_t buf_size, int memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
 int run_fill_buf(size_t buf_size, int memflush);
-int initialize_mem_bw_imc(void);
-int measure_mem_bw(const struct user_params *uparams,
-		   struct resctrl_val_param *param, pid_t bm_pid,
-		   const char *bw_report);
+int initialize_read_mem_bw_imc(void);
+int measure_read_mem_bw(const struct user_params *uparams,
+			struct resctrl_val_param *param, pid_t bm_pid);
 void initialize_mem_bw_resctrl(const struct resctrl_val_param *param,
 			       int domain_id);
 int resctrl_val(const struct resctrl_test *test,
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 113ca18d67c1..c4ebf70a46ef 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -12,13 +12,10 @@
 
 #define UNCORE_IMC		"uncore_imc"
 #define READ_FILE_NAME		"events/cas_count_read"
-#define WRITE_FILE_NAME		"events/cas_count_write"
 #define DYN_PMU_PATH		"/sys/bus/event_source/devices"
 #define SCALE			0.00006103515625
 #define MAX_IMCS		20
 #define MAX_TOKENS		5
-#define READ			0
-#define WRITE			1
 
 #define CON_MBM_LOCAL_BYTES_PATH		\
 	"%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
@@ -41,44 +38,43 @@ struct imc_counter_config {
 
 static char mbm_total_path[1024];
 static int imcs;
-static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
+static struct imc_counter_config imc_counters_config[MAX_IMCS];
 static const struct resctrl_test *current_test;
 
-static void membw_initialize_perf_event_attr(int i, int j)
+static void read_mem_bw_initialize_perf_event_attr(int i)
 {
-	memset(&imc_counters_config[i][j].pe, 0,
+	memset(&imc_counters_config[i].pe, 0,
 	       sizeof(struct perf_event_attr));
-	imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type;
-	imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr);
-	imc_counters_config[i][j].pe.disabled = 1;
-	imc_counters_config[i][j].pe.inherit = 1;
-	imc_counters_config[i][j].pe.exclude_guest = 0;
-	imc_counters_config[i][j].pe.config =
-		imc_counters_config[i][j].umask << 8 |
-		imc_counters_config[i][j].event;
-	imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER;
-	imc_counters_config[i][j].pe.read_format =
+	imc_counters_config[i].pe.type = imc_counters_config[i].type;
+	imc_counters_config[i].pe.size = sizeof(struct perf_event_attr);
+	imc_counters_config[i].pe.disabled = 1;
+	imc_counters_config[i].pe.inherit = 1;
+	imc_counters_config[i].pe.exclude_guest = 0;
+	imc_counters_config[i].pe.config =
+		imc_counters_config[i].umask << 8 |
+		imc_counters_config[i].event;
+	imc_counters_config[i].pe.sample_type = PERF_SAMPLE_IDENTIFIER;
+	imc_counters_config[i].pe.read_format =
 		PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
 }
 
-static void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
+static void read_mem_bw_ioctl_perf_event_ioc_reset_enable(int i)
 {
-	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0);
-	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_RESET, 0);
+	ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_ENABLE, 0);
 }
 
-static void membw_ioctl_perf_event_ioc_disable(int i, int j)
+static void read_mem_bw_ioctl_perf_event_ioc_disable(int i)
 {
-	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0);
+	ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_DISABLE, 0);
 }
 
 /*
- * get_event_and_umask:	Parse config into event and umask
+ * get_read_event_and_umask:	Parse config into event and umask
  * @cas_count_cfg:	Config
  * @count:		iMC number
- * @op:			Operation (read/write)
  */
-static void get_event_and_umask(char *cas_count_cfg, int count, bool op)
+static void get_read_event_and_umask(char *cas_count_cfg, int count)
 {
 	char *token[MAX_TOKENS];
 	int i = 0;
@@ -91,34 +87,22 @@ static void get_event_and_umask(char *cas_count_cfg, int count, bool op)
 	for (i = 0; i < MAX_TOKENS - 1; i++) {
 		if (!token[i])
 			break;
-		if (strcmp(token[i], "event") == 0) {
-			if (op == READ)
-				imc_counters_config[count][READ].event =
-				strtol(token[i + 1], NULL, 16);
-			else
-				imc_counters_config[count][WRITE].event =
-				strtol(token[i + 1], NULL, 16);
-		}
-		if (strcmp(token[i], "umask") == 0) {
-			if (op == READ)
-				imc_counters_config[count][READ].umask =
-				strtol(token[i + 1], NULL, 16);
-			else
-				imc_counters_config[count][WRITE].umask =
-				strtol(token[i + 1], NULL, 16);
-		}
+		if (strcmp(token[i], "event") == 0)
+			imc_counters_config[count].event = strtol(token[i + 1], NULL, 16);
+		if (strcmp(token[i], "umask") == 0)
+			imc_counters_config[count].umask = strtol(token[i + 1], NULL, 16);
 	}
 }
 
-static int open_perf_event(int i, int cpu_no, int j)
+static int open_perf_read_event(int i, int cpu_no)
 {
-	imc_counters_config[i][j].fd =
-		perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1,
+	imc_counters_config[i].fd =
+		perf_event_open(&imc_counters_config[i].pe, -1, cpu_no, -1,
 				PERF_FLAG_FD_CLOEXEC);
 
-	if (imc_counters_config[i][j].fd == -1) {
+	if (imc_counters_config[i].fd == -1) {
 		fprintf(stderr, "Error opening leader %llx\n",
-			imc_counters_config[i][j].pe.config);
+			imc_counters_config[i].pe.config);
 
 		return -1;
 	}
@@ -126,7 +110,7 @@ static int open_perf_event(int i, int cpu_no, int j)
 	return 0;
 }
 
-/* Get type and config (read and write) of an iMC counter */
+/* Get type and config of an iMC counter's read event. */
 static int read_from_imc_dir(char *imc_dir, int count)
 {
 	char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024];
@@ -140,7 +124,7 @@ static int read_from_imc_dir(char *imc_dir, int count)
 
 		return -1;
 	}
-	if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) {
+	if (fscanf(fp, "%u", &imc_counters_config[count].type) <= 0) {
 		ksft_perror("Could not get iMC type");
 		fclose(fp);
 
@@ -148,9 +132,6 @@ static int read_from_imc_dir(char *imc_dir, int count)
 	}
 	fclose(fp);
 
-	imc_counters_config[count][WRITE].type =
-				imc_counters_config[count][READ].type;
-
 	/* Get read config */
 	sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME);
 	fp = fopen(imc_counter_cfg, "r");
@@ -167,34 +148,19 @@ static int read_from_imc_dir(char *imc_dir, int count)
 	}
 	fclose(fp);
 
-	get_event_and_umask(cas_count_cfg, count, READ);
-
-	/* Get write config */
-	sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME);
-	fp = fopen(imc_counter_cfg, "r");
-	if (!fp) {
-		ksft_perror("Failed to open iMC config file");
-
-		return -1;
-	}
-	if  (fscanf(fp, "%1023s", cas_count_cfg) <= 0) {
-		ksft_perror("Could not get iMC cas count write");
-		fclose(fp);
-
-		return -1;
-	}
-	fclose(fp);
-
-	get_event_and_umask(cas_count_cfg, count, WRITE);
+	get_read_event_and_umask(cas_count_cfg, count);
 
 	return 0;
 }
 
 /*
  * A system can have 'n' number of iMC (Integrated Memory Controller)
- * counters, get that 'n'. For each iMC counter get it's type and config.
- * Also, each counter has two configs, one for read and the other for write.
- * A config again has two parts, event and umask.
+ * counters, get that 'n'. Discover the properties of the available
+ * counters in support of needed performance measurement via perf.
+ * For each iMC counter get it's type and config. Also obtain each
+ * counter's event and umask for the memory read events that will be
+ * measured.
+ *
  * Enumerate all these details into an array of structures.
  *
  * Return: >= 0 on success. < 0 on failure.
@@ -255,55 +221,46 @@ static int num_of_imcs(void)
 	return count;
 }
 
-int initialize_mem_bw_imc(void)
+int initialize_read_mem_bw_imc(void)
 {
-	int imc, j;
+	int imc;
 
 	imcs = num_of_imcs();
 	if (imcs <= 0)
 		return imcs;
 
 	/* Initialize perf_event_attr structures for all iMC's */
-	for (imc = 0; imc < imcs; imc++) {
-		for (j = 0; j < 2; j++)
-			membw_initialize_perf_event_attr(imc, j);
-	}
+	for (imc = 0; imc < imcs; imc++)
+		read_mem_bw_initialize_perf_event_attr(imc);
 
 	return 0;
 }
 
-static void perf_close_imc_mem_bw(void)
+static void perf_close_imc_read_mem_bw(void)
 {
 	int mc;
 
 	for (mc = 0; mc < imcs; mc++) {
-		if (imc_counters_config[mc][READ].fd != -1)
-			close(imc_counters_config[mc][READ].fd);
-		if (imc_counters_config[mc][WRITE].fd != -1)
-			close(imc_counters_config[mc][WRITE].fd);
+		if (imc_counters_config[mc].fd != -1)
+			close(imc_counters_config[mc].fd);
 	}
 }
 
 /*
- * perf_open_imc_mem_bw - Open perf fds for IMCs
+ * perf_open_imc_read_mem_bw - Open perf fds for IMCs
  * @cpu_no: CPU number that the benchmark PID is bound to
  *
  * Return: = 0 on success. < 0 on failure.
  */
-static int perf_open_imc_mem_bw(int cpu_no)
+static int perf_open_imc_read_mem_bw(int cpu_no)
 {
 	int imc, ret;
 
-	for (imc = 0; imc < imcs; imc++) {
-		imc_counters_config[imc][READ].fd = -1;
-		imc_counters_config[imc][WRITE].fd = -1;
-	}
+	for (imc = 0; imc < imcs; imc++)
+		imc_counters_config[imc].fd = -1;
 
 	for (imc = 0; imc < imcs; imc++) {
-		ret = open_perf_event(imc, cpu_no, READ);
-		if (ret)
-			goto close_fds;
-		ret = open_perf_event(imc, cpu_no, WRITE);
+		ret = open_perf_read_event(imc, cpu_no);
 		if (ret)
 			goto close_fds;
 	}
@@ -311,60 +268,52 @@ static int perf_open_imc_mem_bw(int cpu_no)
 	return 0;
 
 close_fds:
-	perf_close_imc_mem_bw();
+	perf_close_imc_read_mem_bw();
 	return -1;
 }
 
 /*
- * do_mem_bw_test - Perform memory bandwidth test
+ * do_imc_read_mem_bw_test - Perform memory bandwidth test
  *
  * Runs memory bandwidth test over one second period. Also, handles starting
  * and stopping of the IMC perf counters around the test.
  */
-static void do_imc_mem_bw_test(void)
+static void do_imc_read_mem_bw_test(void)
 {
 	int imc;
 
-	for (imc = 0; imc < imcs; imc++) {
-		membw_ioctl_perf_event_ioc_reset_enable(imc, READ);
-		membw_ioctl_perf_event_ioc_reset_enable(imc, WRITE);
-	}
+	for (imc = 0; imc < imcs; imc++)
+		read_mem_bw_ioctl_perf_event_ioc_reset_enable(imc);
 
 	sleep(1);
 
-	/* Stop counters after a second to get results (both read and write) */
-	for (imc = 0; imc < imcs; imc++) {
-		membw_ioctl_perf_event_ioc_disable(imc, READ);
-		membw_ioctl_perf_event_ioc_disable(imc, WRITE);
-	}
+	/* Stop counters after a second to get results. */
+	for (imc = 0; imc < imcs; imc++)
+		read_mem_bw_ioctl_perf_event_ioc_disable(imc);
 }
 
 /*
- * get_mem_bw_imc - Memory bandwidth as reported by iMC counters
- * @bw_report: Bandwidth report type (reads, writes)
+ * get_read_mem_bw_imc - Memory read bandwidth as reported by iMC counters
  *
- * Memory bandwidth utilized by a process on a socket can be calculated
- * using iMC counters. Perf events are used to read these counters.
+ * Memory read bandwidth utilized by a process on a socket can be calculated
+ * using iMC counters' read events. Perf events are used to read these
+ * counters.
  *
  * Return: = 0 on success. < 0 on failure.
  */
-static int get_mem_bw_imc(const char *bw_report, float *bw_imc)
+static int get_read_mem_bw_imc(float *bw_imc)
 {
-	float reads, writes, of_mul_read, of_mul_write;
+	float reads = 0, of_mul_read = 1;
 	int imc;
 
-	/* Start all iMC counters to log values (both read and write) */
-	reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1;
-
 	/*
-	 * Get results which are stored in struct type imc_counter_config
+	 * Log read event values from all iMC counters into
+	 * struct imc_counter_config.
 	 * Take overflow into consideration before calculating total bandwidth.
 	 */
 	for (imc = 0; imc < imcs; imc++) {
 		struct imc_counter_config *r =
-			&imc_counters_config[imc][READ];
-		struct imc_counter_config *w =
-			&imc_counters_config[imc][WRITE];
+			&imc_counters_config[imc];
 
 		if (read(r->fd, &r->return_value,
 			 sizeof(struct membw_read_format)) == -1) {
@@ -372,12 +321,6 @@ static int get_mem_bw_imc(const char *bw_report, float *bw_imc)
 			return -1;
 		}
 
-		if (read(w->fd, &w->return_value,
-			 sizeof(struct membw_read_format)) == -1) {
-			ksft_perror("Couldn't get write bandwidth through iMC");
-			return -1;
-		}
-
 		__u64 r_time_enabled = r->return_value.time_enabled;
 		__u64 r_time_running = r->return_value.time_running;
 
@@ -385,27 +328,10 @@ static int get_mem_bw_imc(const char *bw_report, float *bw_imc)
 			of_mul_read = (float)r_time_enabled /
 					(float)r_time_running;
 
-		__u64 w_time_enabled = w->return_value.time_enabled;
-		__u64 w_time_running = w->return_value.time_running;
-
-		if (w_time_enabled != w_time_running)
-			of_mul_write = (float)w_time_enabled /
-					(float)w_time_running;
 		reads += r->return_value.value * of_mul_read * SCALE;
-		writes += w->return_value.value * of_mul_write * SCALE;
-	}
-
-	if (strcmp(bw_report, "reads") == 0) {
-		*bw_imc = reads;
-		return 0;
-	}
-
-	if (strcmp(bw_report, "writes") == 0) {
-		*bw_imc = writes;
-		return 0;
 	}
 
-	*bw_imc = reads + writes;
+	*bw_imc = reads;
 	return 0;
 }
 
@@ -551,35 +477,31 @@ static int print_results_bw(char *filename, pid_t bm_pid, float bw_imc,
 }
 
 /*
- * measure_mem_bw - Measures memory bandwidth numbers while benchmark runs
+ * measure_read_mem_bw - Measures read memory bandwidth numbers while benchmark runs
  * @uparams:		User supplied parameters
  * @param:		Parameters passed to resctrl_val()
  * @bm_pid:		PID that runs the benchmark
- * @bw_report:		Bandwidth report type (reads, writes)
  *
  * Measure memory bandwidth from resctrl and from another source which is
  * perf imc value or could be something else if perf imc event is not
  * available. Compare the two values to validate resctrl value. It takes
  * 1 sec to measure the data.
+ * resctrl does not distinguish between read and write operations so
+ * its data includes all memory operations.
  */
-int measure_mem_bw(const struct user_params *uparams,
-		   struct resctrl_val_param *param, pid_t bm_pid,
-		   const char *bw_report)
+int measure_read_mem_bw(const struct user_params *uparams,
+			struct resctrl_val_param *param, pid_t bm_pid)
 {
 	unsigned long bw_resc, bw_resc_start, bw_resc_end;
 	FILE *mem_bw_fp;
 	float bw_imc;
 	int ret;
 
-	bw_report = get_bw_report_type(bw_report);
-	if (!bw_report)
-		return -1;
-
 	mem_bw_fp = open_mem_bw_resctrl(mbm_total_path);
 	if (!mem_bw_fp)
 		return -1;
 
-	ret = perf_open_imc_mem_bw(uparams->cpu);
+	ret = perf_open_imc_read_mem_bw(uparams->cpu);
 	if (ret < 0)
 		goto close_fp;
 
@@ -589,17 +511,17 @@ int measure_mem_bw(const struct user_params *uparams,
 
 	rewind(mem_bw_fp);
 
-	do_imc_mem_bw_test();
+	do_imc_read_mem_bw_test();
 
 	ret = get_mem_bw_resctrl(mem_bw_fp, &bw_resc_end);
 	if (ret < 0)
 		goto close_imc;
 
-	ret = get_mem_bw_imc(bw_report, &bw_imc);
+	ret = get_read_mem_bw_imc(&bw_imc);
 	if (ret < 0)
 		goto close_imc;
 
-	perf_close_imc_mem_bw();
+	perf_close_imc_read_mem_bw();
 	fclose(mem_bw_fp);
 
 	bw_resc = (bw_resc_end - bw_resc_start) / MB;
@@ -607,7 +529,7 @@ int measure_mem_bw(const struct user_params *uparams,
 	return print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
 
 close_imc:
-	perf_close_imc_mem_bw();
+	perf_close_imc_read_mem_bw();
 close_fp:
 	fclose(mem_bw_fp);
 	return ret;
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index a53cd1cb6e0c..d38d6dd90be4 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -831,23 +831,6 @@ int filter_dmesg(void)
 	return 0;
 }
 
-const char *get_bw_report_type(const char *bw_report)
-{
-	if (strcmp(bw_report, "reads") == 0)
-		return bw_report;
-	if (strcmp(bw_report, "writes") == 0)
-		return bw_report;
-	if (strcmp(bw_report, "nt-writes") == 0) {
-		return "writes";
-	}
-	if (strcmp(bw_report, "total") == 0)
-		return bw_report;
-
-	fprintf(stderr, "Requested iMC bandwidth report type unavailable\n");
-
-	return NULL;
-}
-
 int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 		    int group_fd, unsigned long flags)
 {
-- 
cgit v1.2.3


From e958c21e2edecd17c87c4a2df39d1e47a3f7c039 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:47 -0700
Subject: selftests/resctrl: Make benchmark parameter passing robust
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark used during the CMT, MBM, and MBA tests can be provided by
the user via (-b) parameter, if not provided the default "fill_buf"
benchmark is used. The user is additionally able to override
any of the "fill_buf" default parameters when running the tests with
"-b fill_buf <fill_buf parameters>".

The "fill_buf" parameters are managed as an array of strings. Using an
array of strings is complex because it requires transformations to/from
strings at every producer and consumer. This is made worse for the
individual tests where the default benchmark parameters values may not
be appropriate and additional data wrangling is required. For example,
the CMT test duplicates the entire array of strings in order to replace
one of the parameters.

More issues appear when combining the usage of an array of strings with
the use case of user overriding default parameters by specifying
"-b fill_buf <parameters>". This use case is fragile with opportunities
to trigger a SIGSEGV because of opportunities for NULL pointers to exist
in the array of strings. For example, by running below (thus by specifying
"fill_buf" should be used but all parameters are NULL):
	$ sudo resctrl_tests -t mbm -b fill_buf

Replace the "array of strings" parameters used for "fill_buf" with
new struct fill_buf_param that contains the "fill_buf" parameters that
can be used directly without transformations to/from strings. Two
instances of struct fill_buf_param may exist at any point in time:
	* If the user provides new parameters to "fill_buf", the
	  user parameter structure (struct user_params) will point to a
	  fully initialized and immutable struct fill_buf_param
	  containing the user provided parameters.
	* If "fill_buf" is the benchmark that should be used by a test,
	  then the test parameter structure (struct resctrl_val_param)
	  will point to a fully initialized struct fill_buf_param. The
	  latter may contain (a) the user provided parameters verbatim,
	  (b) user provided parameters adjusted to be appropriate for
	  the test, or (c) the default parameters for "fill_buf" that
	  are appropriate for the test if the user provided neither
	  "fill_buf" parameters nor an alternate benchmark.

The existing behavior of CMT test is to use test defined value for the
buffer size even if the user provides another value via command line.
This behavior is maintained since the test requires that the buffer size
matches the size of the cache allocated, and the amount of cache
allocated can instead be changed by the user with the "-n" command line
parameter.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cmt_test.c      |  32 +++-----
 tools/testing/selftests/resctrl/fill_buf.c      |   4 +-
 tools/testing/selftests/resctrl/mba_test.c      |  13 ++-
 tools/testing/selftests/resctrl/mbm_test.c      |  22 ++---
 tools/testing/selftests/resctrl/resctrl.h       |  59 ++++++++++----
 tools/testing/selftests/resctrl/resctrl_tests.c | 103 ++++++++++++++++++------
 tools/testing/selftests/resctrl/resctrl_val.c   |  41 +++++-----
 7 files changed, 178 insertions(+), 96 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c
index 0c045080d808..4c3cf2c25a38 100644
--- a/tools/testing/selftests/resctrl/cmt_test.c
+++ b/tools/testing/selftests/resctrl/cmt_test.c
@@ -116,15 +116,13 @@ static void cmt_test_cleanup(void)
 
 static int cmt_run_test(const struct resctrl_test *test, const struct user_params *uparams)
 {
-	const char * const *cmd = uparams->benchmark_cmd;
-	const char *new_cmd[BENCHMARK_ARGS];
+	struct fill_buf_param fill_buf = {};
 	unsigned long cache_total_size = 0;
 	int n = uparams->bits ? : 5;
 	unsigned long long_mask;
-	char *span_str = NULL;
 	int count_of_bits;
 	size_t span;
-	int ret, i;
+	int ret;
 
 	ret = get_full_cbm("L3", &long_mask);
 	if (ret)
@@ -155,32 +153,26 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param
 
 	span = cache_portion_size(cache_total_size, param.mask, long_mask);
 
-	if (strcmp(cmd[0], "fill_buf") == 0) {
-		/* Duplicate the command to be able to replace span in it */
-		for (i = 0; uparams->benchmark_cmd[i]; i++)
-			new_cmd[i] = uparams->benchmark_cmd[i];
-		new_cmd[i] = NULL;
-
-		ret = asprintf(&span_str, "%zu", span);
-		if (ret < 0)
-			return -1;
-		new_cmd[1] = span_str;
-		cmd = new_cmd;
+	if (uparams->fill_buf) {
+		fill_buf.buf_size = span;
+		fill_buf.memflush = uparams->fill_buf->memflush;
+		param.fill_buf = &fill_buf;
+	} else if (!uparams->benchmark_cmd[0]) {
+		fill_buf.buf_size = span;
+		fill_buf.memflush = true;
+		param.fill_buf = &fill_buf;
 	}
 
 	remove(RESULT_FILE_NAME);
 
-	ret = resctrl_val(test, uparams, cmd, &param);
+	ret = resctrl_val(test, uparams, &param);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = check_results(&param, span, n);
 	if (ret && (get_vendor() == ARCH_INTEL))
 		ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n");
 
-out:
-	free(span_str);
-
 	return ret;
 }
 
diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index e4f1cea317f1..39545f9369e8 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -102,7 +102,7 @@ void fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
 	*value_sink = ret;
 }
 
-unsigned char *alloc_buffer(size_t buf_size, int memflush)
+unsigned char *alloc_buffer(size_t buf_size, bool memflush)
 {
 	void *buf = NULL;
 	uint64_t *p64;
@@ -130,7 +130,7 @@ unsigned char *alloc_buffer(size_t buf_size, int memflush)
 	return buf;
 }
 
-int run_fill_buf(size_t buf_size, int memflush)
+int run_fill_buf(size_t buf_size, bool memflush)
 {
 	unsigned char *buf;
 
diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index be0ead73e55d..74d95c460bd0 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -172,11 +172,22 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param
 		.setup		= mba_setup,
 		.measure	= mba_measure,
 	};
+	struct fill_buf_param fill_buf = {};
 	int ret;
 
 	remove(RESULT_FILE_NAME);
 
-	ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param);
+	if (uparams->fill_buf) {
+		fill_buf.buf_size = uparams->fill_buf->buf_size;
+		fill_buf.memflush = uparams->fill_buf->memflush;
+		param.fill_buf = &fill_buf;
+	} else if (!uparams->benchmark_cmd[0]) {
+		fill_buf.buf_size = DEFAULT_SPAN;
+		fill_buf.memflush = true;
+		param.fill_buf = &fill_buf;
+	}
+
+	ret = resctrl_val(test, uparams, &param);
 	if (ret)
 		return ret;
 
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index defa94293915..72261413c868 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -139,26 +139,26 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param
 		.setup		= mbm_setup,
 		.measure	= mbm_measure,
 	};
-	char *endptr = NULL;
-	size_t span = 0;
+	struct fill_buf_param fill_buf = {};
 	int ret;
 
 	remove(RESULT_FILE_NAME);
 
-	if (uparams->benchmark_cmd[0] && strcmp(uparams->benchmark_cmd[0], "fill_buf") == 0) {
-		if (uparams->benchmark_cmd[1] && *uparams->benchmark_cmd[1] != '\0') {
-			errno = 0;
-			span = strtoul(uparams->benchmark_cmd[1], &endptr, 10);
-			if (errno || *endptr != '\0')
-				return -EINVAL;
-		}
+	if (uparams->fill_buf) {
+		fill_buf.buf_size = uparams->fill_buf->buf_size;
+		fill_buf.memflush = uparams->fill_buf->memflush;
+		param.fill_buf = &fill_buf;
+	} else if (!uparams->benchmark_cmd[0]) {
+		fill_buf.buf_size = DEFAULT_SPAN;
+		fill_buf.memflush = true;
+		param.fill_buf = &fill_buf;
 	}
 
-	ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param);
+	ret = resctrl_val(test, uparams, &param);
 	if (ret)
 		return ret;
 
-	ret = check_results(span);
+	ret = check_results(param.fill_buf ? param.fill_buf->buf_size : 0);
 	if (ret && (get_vendor() == ARCH_INTEL))
 		ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n");
 
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 82801245e4c1..c9336f9c2cae 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -43,16 +43,36 @@
 
 #define DEFAULT_SPAN		(250 * MB)
 
+/*
+ * fill_buf_param:	"fill_buf" benchmark parameters
+ * @buf_size:		Size (in bytes) of buffer used in benchmark.
+ *			"fill_buf" allocates and initializes buffer of
+ *			@buf_size. User can change value via command line.
+ * @memflush:		If false the buffer will not be flushed after
+ *			allocation and initialization, otherwise the
+ *			buffer will be flushed. User can change value via
+ *			command line (via integers with 0 interpreted as
+ *			false and anything else as true).
+ */
+struct fill_buf_param {
+	size_t		buf_size;
+	bool		memflush;
+};
+
 /*
  * user_params:		User supplied parameters
  * @cpu:		CPU number to which the benchmark will be bound to
  * @bits:		Number of bits used for cache allocation size
  * @benchmark_cmd:	Benchmark command to run during (some of the) tests
+ * @fill_buf:		Pointer to user provided parameters for "fill_buf",
+ *			NULL if user did not provide parameters and test
+ *			specific defaults should be used.
  */
 struct user_params {
 	int cpu;
 	int bits;
 	const char *benchmark_cmd[BENCHMARK_ARGS];
+	const struct fill_buf_param *fill_buf;
 };
 
 /*
@@ -87,21 +107,29 @@ struct resctrl_test {
  * @init:		Callback function to initialize test environment
  * @setup:		Callback function to setup per test run environment
  * @measure:		Callback that performs the measurement (a single test)
+ * @fill_buf:		Parameters for default "fill_buf" benchmark.
+ *			Initialized with user provided parameters, possibly
+ *			adapted to be relevant to the test. If user does
+ *			not provide parameters for "fill_buf" nor a
+ *			replacement benchmark then initialized with defaults
+ *			appropriate for test. NULL if user provided
+ *			benchmark.
  */
 struct resctrl_val_param {
-	const char	*ctrlgrp;
-	const char	*mongrp;
-	char		filename[64];
-	unsigned long	mask;
-	int		num_of_runs;
-	int		(*init)(const struct resctrl_val_param *param,
-				int domain_id);
-	int		(*setup)(const struct resctrl_test *test,
-				 const struct user_params *uparams,
-				 struct resctrl_val_param *param);
-	int		(*measure)(const struct user_params *uparams,
-				   struct resctrl_val_param *param,
-				   pid_t bm_pid);
+	const char		*ctrlgrp;
+	const char		*mongrp;
+	char			filename[64];
+	unsigned long		mask;
+	int			num_of_runs;
+	int			(*init)(const struct resctrl_val_param *param,
+					int domain_id);
+	int			(*setup)(const struct resctrl_test *test,
+					 const struct user_params *uparams,
+					 struct resctrl_val_param *param);
+	int			(*measure)(const struct user_params *uparams,
+					   struct resctrl_val_param *param,
+					   pid_t bm_pid);
+	struct fill_buf_param	*fill_buf;
 };
 
 struct perf_event_read {
@@ -138,10 +166,10 @@ int write_schemata(const char *ctrlgrp, char *schemata, int cpu_no,
 int write_bm_pid_to_resctrl(pid_t bm_pid, const char *ctrlgrp, const char *mongrp);
 int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 		    int group_fd, unsigned long flags);
-unsigned char *alloc_buffer(size_t buf_size, int memflush);
+unsigned char *alloc_buffer(size_t buf_size, bool memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
-int run_fill_buf(size_t buf_size, int memflush);
+int run_fill_buf(size_t buf_size, bool memflush);
 int initialize_read_mem_bw_imc(void);
 int measure_read_mem_bw(const struct user_params *uparams,
 			struct resctrl_val_param *param, pid_t bm_pid);
@@ -149,7 +177,6 @@ void initialize_mem_bw_resctrl(const struct resctrl_val_param *param,
 			       int domain_id);
 int resctrl_val(const struct resctrl_test *test,
 		const struct user_params *uparams,
-		const char * const *benchmark_cmd,
 		struct resctrl_val_param *param);
 unsigned long create_bit_mask(unsigned int start, unsigned int len);
 unsigned int count_contiguous_bits(unsigned long val, unsigned int *start);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 0f91c475b255..24daf76b4039 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -148,6 +148,78 @@ cleanup:
 	test_cleanup(test);
 }
 
+/*
+ * Allocate and initialize a struct fill_buf_param with user provided
+ * (via "-b fill_buf <fill_buf parameters>") parameters.
+ *
+ * Use defaults (that may not be appropriate for all tests) for any
+ * fill_buf parameters omitted by the user.
+ *
+ * Historically it may have been possible for user space to provide
+ * additional parameters, "operation" ("read" vs "write") in
+ * benchmark_cmd[3] and "once" (run "once" or until terminated) in
+ * benchmark_cmd[4]. Changing these parameters have never been
+ * supported with the default of "read" operation and running until
+ * terminated built into the tests. Any unsupported values for
+ * (original) "fill_buf" parameters are treated as failure.
+ *
+ * Return: On failure, forcibly exits the test on any parsing failure,
+ *         returns NULL if no parsing needed (user did not actually provide
+ *         "-b fill_buf").
+ *         On success, returns pointer to newly allocated and fully
+ *         initialized struct fill_buf_param that caller must free.
+ */
+static struct fill_buf_param *alloc_fill_buf_param(struct user_params *uparams)
+{
+	struct fill_buf_param *fill_param = NULL;
+	char *endptr = NULL;
+
+	if (!uparams->benchmark_cmd[0] || strcmp(uparams->benchmark_cmd[0], "fill_buf"))
+		return NULL;
+
+	fill_param = malloc(sizeof(*fill_param));
+	if (!fill_param)
+		ksft_exit_skip("Unable to allocate memory for fill_buf parameters.\n");
+
+	if (uparams->benchmark_cmd[1] && *uparams->benchmark_cmd[1] != '\0') {
+		errno = 0;
+		fill_param->buf_size = strtoul(uparams->benchmark_cmd[1], &endptr, 10);
+		if (errno || *endptr != '\0') {
+			free(fill_param);
+			ksft_exit_skip("Unable to parse benchmark buffer size.\n");
+		}
+	} else {
+		fill_param->buf_size = DEFAULT_SPAN;
+	}
+
+	if (uparams->benchmark_cmd[2] && *uparams->benchmark_cmd[2] != '\0') {
+		errno = 0;
+		fill_param->memflush = strtol(uparams->benchmark_cmd[2], &endptr, 10) != 0;
+		if (errno || *endptr != '\0') {
+			free(fill_param);
+			ksft_exit_skip("Unable to parse benchmark memflush parameter.\n");
+		}
+	} else {
+		fill_param->memflush = true;
+	}
+
+	if (uparams->benchmark_cmd[3] && *uparams->benchmark_cmd[3] != '\0') {
+		if (strcmp(uparams->benchmark_cmd[3], "0")) {
+			free(fill_param);
+			ksft_exit_skip("Only read operations supported.\n");
+		}
+	}
+
+	if (uparams->benchmark_cmd[4] && *uparams->benchmark_cmd[4] != '\0') {
+		if (strcmp(uparams->benchmark_cmd[4], "false")) {
+			free(fill_param);
+			ksft_exit_skip("fill_buf is required to run until termination.\n");
+		}
+	}
+
+	return fill_param;
+}
+
 static void init_user_params(struct user_params *uparams)
 {
 	memset(uparams, 0, sizeof(*uparams));
@@ -158,11 +230,11 @@ static void init_user_params(struct user_params *uparams)
 
 int main(int argc, char **argv)
 {
+	struct fill_buf_param *fill_param = NULL;
 	int tests = ARRAY_SIZE(resctrl_tests);
 	bool test_param_seen = false;
 	struct user_params uparams;
-	char *span_str = NULL;
-	int ret, c, i;
+	int c, i;
 
 	init_user_params(&uparams);
 
@@ -239,6 +311,10 @@ int main(int argc, char **argv)
 	}
 last_arg:
 
+	fill_param = alloc_fill_buf_param(&uparams);
+	if (fill_param)
+		uparams.fill_buf = fill_param;
+
 	ksft_print_header();
 
 	/*
@@ -257,32 +333,11 @@ last_arg:
 
 	filter_dmesg();
 
-	if (!uparams.benchmark_cmd[0]) {
-		/* If no benchmark is given by "-b" argument, use fill_buf. */
-		uparams.benchmark_cmd[0] = "fill_buf";
-		ret = asprintf(&span_str, "%u", DEFAULT_SPAN);
-		if (ret < 0)
-			ksft_exit_fail_msg("Out of memory!\n");
-		uparams.benchmark_cmd[1] = span_str;
-		uparams.benchmark_cmd[2] = "1";
-		/*
-		 * Third parameter was previously used for "operation"
-		 * (read/write) of which only (now default) "read"/"0"
-		 * works.
-		 * Fourth parameter was previously used to indicate
-		 * how long "fill_buf" should run for, with "false"
-		 * ("fill_buf" will keep running until terminated)
-		 * the only option that works.
-		 */
-		uparams.benchmark_cmd[3] = NULL;
-		uparams.benchmark_cmd[4] = NULL;
-	}
-
 	ksft_set_plan(tests);
 
 	for (i = 0; i < ARRAY_SIZE(resctrl_tests); i++)
 		run_single_test(resctrl_tests[i], &uparams);
 
-	free(span_str);
+	free(fill_param);
 	ksft_finished();
 }
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index c4ebf70a46ef..00b3808d3bca 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -535,6 +535,11 @@ close_fp:
 	return ret;
 }
 
+struct benchmark_info {
+	const struct user_params *uparams;
+	struct resctrl_val_param *param;
+};
+
 /*
  * run_benchmark - Run a specified benchmark or fill_buf (default benchmark)
  *		   in specified signal. Direct benchmark stdio to /dev/null.
@@ -544,12 +549,11 @@ close_fp:
  */
 static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 {
-	char **benchmark_cmd;
-	int ret, memflush;
-	size_t span;
+	struct benchmark_info *benchmark_info = info->si_ptr;
+	const struct user_params *uparams = benchmark_info->uparams;
+	struct resctrl_val_param *param = benchmark_info->param;
 	FILE *fp;
-
-	benchmark_cmd = info->si_ptr;
+	int ret;
 
 	/*
 	 * Direct stdio of child to /dev/null, so that only parent writes to
@@ -561,16 +565,13 @@ static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 		parent_exit(ppid);
 	}
 
-	if (strcmp(benchmark_cmd[0], "fill_buf") == 0) {
-		/* Execute default fill_buf benchmark */
-		span = strtoul(benchmark_cmd[1], NULL, 10);
-		memflush =  atoi(benchmark_cmd[2]);
-
-		if (run_fill_buf(span, memflush))
+	if (param->fill_buf) {
+		if (run_fill_buf(param->fill_buf->buf_size,
+				 param->fill_buf->memflush))
 			fprintf(stderr, "Error in running fill buffer\n");
-	} else {
+	} else if (uparams->benchmark_cmd[0]) {
 		/* Execute specified benchmark */
-		ret = execvp(benchmark_cmd[0], benchmark_cmd);
+		ret = execvp(uparams->benchmark_cmd[0], (char **)uparams->benchmark_cmd);
 		if (ret)
 			ksft_perror("execvp");
 	}
@@ -585,16 +586,15 @@ static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
  *			the benchmark
  * @test:		test information structure
  * @uparams:		user supplied parameters
- * @benchmark_cmd:	benchmark command and its arguments
  * @param:		parameters passed to resctrl_val()
  *
  * Return:		0 when the test was run, < 0 on error.
  */
 int resctrl_val(const struct resctrl_test *test,
 		const struct user_params *uparams,
-		const char * const *benchmark_cmd,
 		struct resctrl_val_param *param)
 {
+	struct benchmark_info benchmark_info;
 	struct sigaction sigact;
 	int ret = 0, pipefd[2];
 	char pipe_message = 0;
@@ -610,6 +610,9 @@ int resctrl_val(const struct resctrl_test *test,
 		return ret;
 	}
 
+	benchmark_info.uparams = uparams;
+	benchmark_info.param = param;
+
 	/*
 	 * If benchmark wasn't successfully started by child, then child should
 	 * kill parent, so save parent's pid
@@ -671,13 +674,7 @@ int resctrl_val(const struct resctrl_test *test,
 
 	ksft_print_msg("Benchmark PID: %d\n", (int)bm_pid);
 
-	/*
-	 * The cast removes constness but nothing mutates benchmark_cmd within
-	 * the context of this process. At the receiving process, it becomes
-	 * argv, which is mutable, on exec() but that's after fork() so it
-	 * doesn't matter for the process running the tests.
-	 */
-	value.sival_ptr = (void *)benchmark_cmd;
+	value.sival_ptr = (void *)&benchmark_info;
 
 	/* Taskset benchmark to specified cpu */
 	ret = taskset_benchmark(bm_pid, uparams->cpu, NULL);
-- 
cgit v1.2.3


From 3cb3f0b8755919648281064d44c06319743db343 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:48 -0700
Subject: selftests/resctrl: Ensure measurements skip initialization of default
 benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CMT, MBA, and MBM tests rely on the resctrl_val() wrapper to
start and run a benchmark while providing test specific flows
via callbacks to do test specific configuration and measurements.

At a high level, the resctrl_val() flow is:
	a) Start by fork()ing a child process that installs a signal
	   handler for SIGUSR1 that, on receipt of SIGUSR1, will
	   start running a benchmark.
	b) Assign the child process created in (a) to the resctrl
	   control and monitoring group that dictates the memory and
	   cache allocations with which the process can run and will
	   contain all resctrl monitoring data of that process.
	c) Once parent and child are considered "ready" (determined via
	   a message over a pipe) the parent signals the child (via
	   SIGUSR1) to start the benchmark, waits one second for the
	   benchmark to run, and then starts collecting monitoring data
	   for the tests, potentially also changing allocation
	   configuration depending on the various test callbacks.

A problem with the above flow is the "black box" view of the
benchmark that is combined with an arbitrarily chosen
"wait one second" before measurements start. No matter what
the benchmark does, it is given one second to initialize before
measurements start.

The default benchmark "fill_buf" consists of two parts:
first it prepares a buffer (allocate, initialize, then flush), then it
reads from the buffer (in unpredictable ways) until terminated.
Depending on the system and the size of the buffer, the first "prepare"
part may not be complete by the time the one second delay expires. Test
measurements may thus start before the work needing to be measured runs.

Split the default benchmark into its "prepare" and "runtime" parts and
simplify the resctrl_val() wrapper while doing so. This same split
cannot be done for the user provided benchmark (without a user
interface change), so the current behavior is maintained for the user
provided benchmark.

Assign the test itself to the control and monitoring group and run the
"prepare" part of the benchmark in this context, ensuring it runs with
the required cache and memory bandwidth allocations. With the benchmark
preparation complete, only the "runtime" part of the benchmark (or the
entire user provided benchmark) needs to be fork()ed.

Keep the "wait one second" delay before measurements start. For the
default "fill_buf" benchmark this time now covers only the "runtime"
portion that needs to be measured. For the user provided benchmark this
delay maintains current behavior.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c    |  15 --
 tools/testing/selftests/resctrl/resctrl.h     |   1 -
 tools/testing/selftests/resctrl/resctrl_val.c | 195 +++++++-------------------
 3 files changed, 50 insertions(+), 161 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 39545f9369e8..380cc35f10c6 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -129,18 +129,3 @@ unsigned char *alloc_buffer(size_t buf_size, bool memflush)
 
 	return buf;
 }
-
-int run_fill_buf(size_t buf_size, bool memflush)
-{
-	unsigned char *buf;
-
-	buf = alloc_buffer(buf_size, memflush);
-	if (!buf)
-		return -1;
-
-	fill_cache_read(buf, buf_size, false);
-
-	free(buf);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index c9336f9c2cae..032cd9ebd761 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -169,7 +169,6 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 unsigned char *alloc_buffer(size_t buf_size, bool memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
-int run_fill_buf(size_t buf_size, bool memflush);
 int initialize_read_mem_bw_imc(void);
 int measure_read_mem_bw(const struct user_params *uparams,
 			struct resctrl_val_param *param, pid_t bm_pid);
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 00b3808d3bca..7c08e936572d 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -373,7 +373,7 @@ static int get_mem_bw_resctrl(FILE *fp, unsigned long *mbm_total)
 	return 0;
 }
 
-static pid_t bm_pid, ppid;
+static pid_t bm_pid;
 
 void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
 {
@@ -431,13 +431,6 @@ void signal_handler_unregister(void)
 	}
 }
 
-static void parent_exit(pid_t ppid)
-{
-	kill(ppid, SIGKILL);
-	umount_resctrlfs();
-	exit(EXIT_FAILURE);
-}
-
 /*
  * print_results_bw:	the memory bandwidth results are stored in a file
  * @filename:		file that stores the results
@@ -535,52 +528,6 @@ close_fp:
 	return ret;
 }
 
-struct benchmark_info {
-	const struct user_params *uparams;
-	struct resctrl_val_param *param;
-};
-
-/*
- * run_benchmark - Run a specified benchmark or fill_buf (default benchmark)
- *		   in specified signal. Direct benchmark stdio to /dev/null.
- * @signum:	signal number
- * @info:	signal info
- * @ucontext:	user context in signal handling
- */
-static void run_benchmark(int signum, siginfo_t *info, void *ucontext)
-{
-	struct benchmark_info *benchmark_info = info->si_ptr;
-	const struct user_params *uparams = benchmark_info->uparams;
-	struct resctrl_val_param *param = benchmark_info->param;
-	FILE *fp;
-	int ret;
-
-	/*
-	 * Direct stdio of child to /dev/null, so that only parent writes to
-	 * stdio (console)
-	 */
-	fp = freopen("/dev/null", "w", stdout);
-	if (!fp) {
-		ksft_perror("Unable to direct benchmark status to /dev/null");
-		parent_exit(ppid);
-	}
-
-	if (param->fill_buf) {
-		if (run_fill_buf(param->fill_buf->buf_size,
-				 param->fill_buf->memflush))
-			fprintf(stderr, "Error in running fill buffer\n");
-	} else if (uparams->benchmark_cmd[0]) {
-		/* Execute specified benchmark */
-		ret = execvp(uparams->benchmark_cmd[0], (char **)uparams->benchmark_cmd);
-		if (ret)
-			ksft_perror("execvp");
-	}
-
-	fclose(stdout);
-	ksft_print_msg("Unable to run specified benchmark\n");
-	parent_exit(ppid);
-}
-
 /*
  * resctrl_val:	execute benchmark and measure memory bandwidth on
  *			the benchmark
@@ -594,12 +541,11 @@ int resctrl_val(const struct resctrl_test *test,
 		const struct user_params *uparams,
 		struct resctrl_val_param *param)
 {
-	struct benchmark_info benchmark_info;
-	struct sigaction sigact;
-	int ret = 0, pipefd[2];
-	char pipe_message = 0;
-	union sigval value;
+	unsigned char *buf = NULL;
+	cpu_set_t old_affinity;
 	int domain_id;
+	int ret = 0;
+	pid_t ppid;
 
 	if (strcmp(param->filename, "") == 0)
 		sprintf(param->filename, "stdio");
@@ -610,108 +556,65 @@ int resctrl_val(const struct resctrl_test *test,
 		return ret;
 	}
 
-	benchmark_info.uparams = uparams;
-	benchmark_info.param = param;
-
-	/*
-	 * If benchmark wasn't successfully started by child, then child should
-	 * kill parent, so save parent's pid
-	 */
 	ppid = getpid();
 
-	if (pipe(pipefd)) {
-		ksft_perror("Unable to create pipe");
+	/* Taskset test to specified CPU. */
+	ret = taskset_benchmark(ppid, uparams->cpu, &old_affinity);
+	if (ret)
+		return ret;
+
+	/* Write test to specified control & monitoring group in resctrl FS. */
+	ret = write_bm_pid_to_resctrl(ppid, param->ctrlgrp, param->mongrp);
+	if (ret)
+		goto reset_affinity;
 
-		return -1;
+	if (param->init) {
+		ret = param->init(param, domain_id);
+		if (ret)
+			goto reset_affinity;
 	}
 
 	/*
-	 * Fork to start benchmark, save child's pid so that it can be killed
-	 * when needed
+	 * If not running user provided benchmark, run the default
+	 * "fill_buf". First phase of "fill_buf" is to prepare the
+	 * buffer that the benchmark will operate on. No measurements
+	 * are needed during this phase and prepared memory will be
+	 * passed to next part of benchmark via copy-on-write thus
+	 * no impact on the benchmark that relies on reading from
+	 * memory only.
 	 */
+	if (param->fill_buf) {
+		buf = alloc_buffer(param->fill_buf->buf_size,
+				   param->fill_buf->memflush);
+		if (!buf) {
+			ret = -ENOMEM;
+			goto reset_affinity;
+		}
+	}
+
 	fflush(stdout);
 	bm_pid = fork();
 	if (bm_pid == -1) {
+		ret = -errno;
 		ksft_perror("Unable to fork");
-
-		return -1;
+		goto free_buf;
 	}
 
+	/*
+	 * What needs to be measured runs in separate process until
+	 * terminated.
+	 */
 	if (bm_pid == 0) {
-		/*
-		 * Mask all signals except SIGUSR1, parent uses SIGUSR1 to
-		 * start benchmark
-		 */
-		sigfillset(&sigact.sa_mask);
-		sigdelset(&sigact.sa_mask, SIGUSR1);
-
-		sigact.sa_sigaction = run_benchmark;
-		sigact.sa_flags = SA_SIGINFO;
-
-		/* Register for "SIGUSR1" signal from parent */
-		if (sigaction(SIGUSR1, &sigact, NULL)) {
-			ksft_perror("Can't register child for signal");
-			parent_exit(ppid);
-		}
-
-		/* Tell parent that child is ready */
-		close(pipefd[0]);
-		pipe_message = 1;
-		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
-		    sizeof(pipe_message)) {
-			ksft_perror("Failed signaling parent process");
-			close(pipefd[1]);
-			return -1;
-		}
-		close(pipefd[1]);
-
-		/* Suspend child until delivery of "SIGUSR1" from parent */
-		sigsuspend(&sigact.sa_mask);
-
-		ksft_perror("Child is done");
-		parent_exit(ppid);
+		if (param->fill_buf)
+			fill_cache_read(buf, param->fill_buf->buf_size, false);
+		else if (uparams->benchmark_cmd[0])
+			execvp(uparams->benchmark_cmd[0], (char **)uparams->benchmark_cmd);
+		exit(EXIT_SUCCESS);
 	}
 
 	ksft_print_msg("Benchmark PID: %d\n", (int)bm_pid);
 
-	value.sival_ptr = (void *)&benchmark_info;
-
-	/* Taskset benchmark to specified cpu */
-	ret = taskset_benchmark(bm_pid, uparams->cpu, NULL);
-	if (ret)
-		goto out;
-
-	/* Write benchmark to specified control&monitoring grp in resctrl FS */
-	ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp);
-	if (ret)
-		goto out;
-
-	if (param->init) {
-		ret = param->init(param, domain_id);
-		if (ret)
-			goto out;
-	}
-
-	/* Parent waits for child to be ready. */
-	close(pipefd[1]);
-	while (pipe_message != 1) {
-		if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) <
-		    sizeof(pipe_message)) {
-			ksft_perror("Failed reading message from child process");
-			close(pipefd[0]);
-			goto out;
-		}
-	}
-	close(pipefd[0]);
-
-	/* Signal child to start benchmark */
-	if (sigqueue(bm_pid, SIGUSR1, value) == -1) {
-		ksft_perror("sigqueue SIGUSR1 to child");
-		ret = -1;
-		goto out;
-	}
-
-	/* Give benchmark enough time to fully run */
+	/* Give benchmark enough time to fully run. */
 	sleep(1);
 
 	/* Test runs until the callback setup() tells the test to stop. */
@@ -729,8 +632,10 @@ int resctrl_val(const struct resctrl_test *test,
 			break;
 	}
 
-out:
 	kill(bm_pid, SIGKILL);
-
+free_buf:
+	free(buf);
+reset_affinity:
+	taskset_restore(ppid, &old_affinity);
 	return ret;
 }
-- 
cgit v1.2.3


From f77b9672536e581c945b2623b521a284fdbf75ff Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:49 -0700
Subject: selftests/resctrl: Use cache size to determine "fill_buf" buffer size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By default the MBM and MBA tests use the "fill_buf" benchmark to
read from a buffer with the goal of measuring the memory bandwidth
generated by this buffer access.

Care should be taken when sizing the buffer used by the "fill_buf"
benchmark. If the buffer is small enough to fit in the cache then
it cannot be expected that the benchmark will generate much memory
bandwidth. For example, on a system with 320MB L3 cache the existing
hardcoded default of 250MB is insufficient.

Use the measured cache size to determine a buffer size that can be
expected to trigger memory access, while keeping the existing default,
which has been appropriate for testing so far, as the minimum (now
renamed to MINIMUM_SPAN).

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c      | 13 +++++++++++++
 tools/testing/selftests/resctrl/mba_test.c      |  7 ++++++-
 tools/testing/selftests/resctrl/mbm_test.c      |  7 ++++++-
 tools/testing/selftests/resctrl/resctrl.h       |  3 ++-
 tools/testing/selftests/resctrl/resctrl_tests.c |  2 +-
 5 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 380cc35f10c6..19a01a52dc1a 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -129,3 +129,16 @@ unsigned char *alloc_buffer(size_t buf_size, bool memflush)
 
 	return buf;
 }
+
+ssize_t get_fill_buf_size(int cpu_no, const char *cache_type)
+{
+	unsigned long cache_total_size = 0;
+	int ret;
+
+	ret = get_cache_size(cpu_no, cache_type, &cache_total_size);
+	if (ret)
+		return ret;
+
+	return cache_total_size * 2 > MINIMUM_SPAN ?
+			cache_total_size * 2 : MINIMUM_SPAN;
+}
diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index 74d95c460bd0..bf37f3555660 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -182,7 +182,12 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param
 		fill_buf.memflush = uparams->fill_buf->memflush;
 		param.fill_buf = &fill_buf;
 	} else if (!uparams->benchmark_cmd[0]) {
-		fill_buf.buf_size = DEFAULT_SPAN;
+		ssize_t buf_size;
+
+		buf_size = get_fill_buf_size(uparams->cpu, "L3");
+		if (buf_size < 0)
+			return buf_size;
+		fill_buf.buf_size = buf_size;
 		fill_buf.memflush = true;
 		param.fill_buf = &fill_buf;
 	}
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index 72261413c868..4224f8ce3538 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -149,7 +149,12 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param
 		fill_buf.memflush = uparams->fill_buf->memflush;
 		param.fill_buf = &fill_buf;
 	} else if (!uparams->benchmark_cmd[0]) {
-		fill_buf.buf_size = DEFAULT_SPAN;
+		ssize_t buf_size;
+
+		buf_size = get_fill_buf_size(uparams->cpu, "L3");
+		if (buf_size < 0)
+			return buf_size;
+		fill_buf.buf_size = buf_size;
 		fill_buf.memflush = true;
 		param.fill_buf = &fill_buf;
 	}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 032cd9ebd761..a553fe975938 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -41,7 +41,7 @@
 
 #define BENCHMARK_ARGS		64
 
-#define DEFAULT_SPAN		(250 * MB)
+#define MINIMUM_SPAN		(250 * MB)
 
 /*
  * fill_buf_param:	"fill_buf" benchmark parameters
@@ -169,6 +169,7 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 unsigned char *alloc_buffer(size_t buf_size, bool memflush);
 void mem_flush(unsigned char *buf, size_t buf_size);
 void fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
+ssize_t get_fill_buf_size(int cpu_no, const char *cache_type);
 int initialize_read_mem_bw_imc(void);
 int measure_read_mem_bw(const struct user_params *uparams,
 			struct resctrl_val_param *param, pid_t bm_pid);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 24daf76b4039..3335af815b21 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -189,7 +189,7 @@ static struct fill_buf_param *alloc_fill_buf_param(struct user_params *uparams)
 			ksft_exit_skip("Unable to parse benchmark buffer size.\n");
 		}
 	} else {
-		fill_param->buf_size = DEFAULT_SPAN;
+		fill_param->buf_size = MINIMUM_SPAN;
 	}
 
 	if (uparams->benchmark_cmd[2] && *uparams->benchmark_cmd[2] != '\0') {
-- 
cgit v1.2.3


From 501cfdba0a400e70ea220a4b02f1805e0e9de6a1 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:50 -0700
Subject: selftests/resctrl: Do not compare performance counters and resctrl at
 low bandwidth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MBA test incrementally throttles memory bandwidth, each time
followed by a comparison between the memory bandwidth observed
by the performance counters and resctrl respectively.

While a comparison between performance counters and resctrl is
generally appropriate, they do not have an identical view of
memory bandwidth. For example RAS features or memory performance
features that generate memory traffic may drive accesses that are
counted differently by performance counters and MBM respectively,
for instance generating "overhead" traffic which is not counted
against any specific RMID. As a ratio, this different view of memory
bandwidth becomes more apparent at low memory bandwidths.

It is not practical to enable/disable the various features that
may generate memory bandwidth to give performance counters and
resctrl an identical view. Instead, do not compare performance
counters and resctrl view of memory bandwidth when the memory
bandwidth is low.

Bandwidth throttling behaves differently across platforms
so it is not appropriate to drop measurement data simply based
on the throttling level. Instead, use a threshold of 750MiB
that has been observed to support adequate comparison between
performance counters and resctrl.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mba_test.c |  7 +++++++
 tools/testing/selftests/resctrl/resctrl.h  | 10 ++++++++++
 2 files changed, 17 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index bf37f3555660..5b4f0aa7a3a4 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -98,6 +98,13 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 
 		avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1);
 		avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1);
+		if (avg_bw_imc < THROTTLE_THRESHOLD || avg_bw_resc < THROTTLE_THRESHOLD) {
+			ksft_print_msg("Bandwidth below threshold (%d MiB). Dropping results from MBA schemata %u.\n",
+				       THROTTLE_THRESHOLD,
+				       ALLOCATION_MIN + ALLOCATION_STEP * allocation);
+			continue;
+		}
+
 		avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc;
 		avg_diff_per = (int)(avg_diff * 100);
 
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index a553fe975938..dab1953fc7a0 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -43,6 +43,16 @@
 
 #define MINIMUM_SPAN		(250 * MB)
 
+/*
+ * Memory bandwidth (in MiB) below which the bandwidth comparisons
+ * between iMC and resctrl are considered unreliable. For example RAS
+ * features or memory performance features that generate memory traffic
+ * may drive accesses that are counted differently by performance counters
+ * and MBM respectively, for instance generating "overhead" traffic which
+ * is not counted against any specific RMID.
+ */
+#define THROTTLE_THRESHOLD	750
+
 /*
  * fill_buf_param:	"fill_buf" benchmark parameters
  * @buf_size:		Size (in bytes) of buffer used in benchmark.
-- 
cgit v1.2.3


From 295b898426d8fb5b79672d1dae358ca8070f2196 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:51 -0700
Subject: selftests/resctrl: Keep results from first test run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The resctrl selftests drop the results from every first test run
to avoid (per comment) "inaccurate due to monitoring setup transition
phase" data. Previously, inaccurate data resulted from workloads needing
some time to "settle" and from the measurements themselves needing to
account for earlier measurements to measure across the needed timeframe.

commit da50de0a92f3 ("selftests/resctrl: Calculate resctrl FS derived mem
bw over sleep(1) only")

ensured that measurements accurately measure just the time frame of
interest. The default "fill_buf" benchmark has since separated the buffer
prepare phase from the benchmark run phase, reducing the need for the
tests themselves to accommodate the benchmark's "settle" time.

With these enhancements there are no remaining portions needing
to "settle" and the first test run can contribute to measurements.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cmt_test.c |  5 ++---
 tools/testing/selftests/resctrl/mba_test.c | 10 +++-------
 tools/testing/selftests/resctrl/mbm_test.c | 10 +++-------
 3 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c
index 4c3cf2c25a38..3bbf3042fb06 100644
--- a/tools/testing/selftests/resctrl/cmt_test.c
+++ b/tools/testing/selftests/resctrl/cmt_test.c
@@ -99,14 +99,13 @@ static int check_results(struct resctrl_val_param *param, size_t span, int no_of
 		}
 
 		/* Field 3 is llc occ resc value */
-		if (runs > 0)
-			sum_llc_occu_resc += strtoul(token_array[3], NULL, 0);
+		sum_llc_occu_resc += strtoul(token_array[3], NULL, 0);
 		runs++;
 	}
 	fclose(fp);
 
 	return show_results_info(sum_llc_occu_resc, no_of_bits, span,
-				 MAX_DIFF, MAX_DIFF_PERCENT, runs - 1, true);
+				 MAX_DIFF, MAX_DIFF_PERCENT, runs, true);
 }
 
 static void cmt_test_cleanup(void)
diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index 5b4f0aa7a3a4..4e6645b172e3 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -86,18 +86,14 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 		int avg_diff_per;
 		float avg_diff;
 
-		/*
-		 * The first run is discarded due to inaccurate value from
-		 * phase transition.
-		 */
-		for (runs = NUM_OF_RUNS * allocation + 1;
+		for (runs = NUM_OF_RUNS * allocation;
 		     runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) {
 			sum_bw_imc += bw_imc[runs];
 			sum_bw_resc += bw_resc[runs];
 		}
 
-		avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1);
-		avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1);
+		avg_bw_imc = sum_bw_imc / NUM_OF_RUNS;
+		avg_bw_resc = sum_bw_resc / NUM_OF_RUNS;
 		if (avg_bw_imc < THROTTLE_THRESHOLD || avg_bw_resc < THROTTLE_THRESHOLD) {
 			ksft_print_msg("Bandwidth below threshold (%d MiB). Dropping results from MBA schemata %u.\n",
 				       THROTTLE_THRESHOLD,
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index 4224f8ce3538..315b2ef3b3bc 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -22,17 +22,13 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span)
 	int runs, ret, avg_diff_per;
 	float avg_diff = 0;
 
-	/*
-	 * Discard the first value which is inaccurate due to monitoring setup
-	 * transition phase.
-	 */
-	for (runs = 1; runs < NUM_OF_RUNS ; runs++) {
+	for (runs = 0; runs < NUM_OF_RUNS; runs++) {
 		sum_bw_imc += bw_imc[runs];
 		sum_bw_resc += bw_resc[runs];
 	}
 
-	avg_bw_imc = sum_bw_imc / 4;
-	avg_bw_resc = sum_bw_resc / 4;
+	avg_bw_imc = sum_bw_imc / NUM_OF_RUNS;
+	avg_bw_resc = sum_bw_resc / NUM_OF_RUNS;
 	avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc;
 	avg_diff_per = (int)(avg_diff * 100);
 
-- 
cgit v1.2.3


From a44c26d7fa74a5f4d2795a5c55a2d6ec1ebf1e38 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Thu, 24 Oct 2024 14:18:52 -0700
Subject: selftests/resctrl: Replace magic constants used as array size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Memory Bandwidth Allocation (MBA) test iterates through all possible
MBA allocations, from 10% (ALLOCATION_MIN) to 100% (ALLOCATION_MAX) with
increments of 10% (ALLOCATION_STEP) at each iteration. During each
iteration the test measures the actual memory bandwidth NUM_OF_RUNS times
to determine the impact of MBA on actual memory bandwidth.

After the MBA test completes all the memory bandwidth measurements are
parsed into an array. One array for resctrl Memory Bandwidth Monitoring
(MBM) measurements and one array for the Integrated Memory Controller
(iMC) measurements. Each array has a hardcoded size of 1024 that is
large enough to hold the current test data, but this hardcoded value makes
the implementation difficult to understand. It will not be clear that this
array needs to be reconsidered if any of the test parameters are changed.

Replace the magic constant as array size with the test parameters the
array size depends on.

Reported-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Closes: https://lore.kernel.org/all/45af2a8c-517d-8f0d-137d-ad0f3f6a3c68@linux.intel.com/
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mba_test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index 4e6645b172e3..536d9089d2f6 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -127,8 +127,9 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 
 static int check_results(void)
 {
+	unsigned long bw_resc[NUM_OF_RUNS * ALLOCATION_MAX / ALLOCATION_STEP];
+	unsigned long bw_imc[NUM_OF_RUNS * ALLOCATION_MAX / ALLOCATION_STEP];
 	char *token_array[8], output[] = RESULT_FILE_NAME, temp[512];
-	unsigned long bw_imc[1024], bw_resc[1024];
 	int runs;
 	FILE *fp;
 
-- 
cgit v1.2.3


From 2d0f2a648147d6bbf0655e03500586a6712a7281 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Fri, 4 Oct 2024 18:01:53 -0400
Subject: KVM: selftests: memslot_perf_test: increase guest sync timeout

When memslot_perf_test is run nested, the first iteration of the
test_memslot_rw_loop testcase sometimes takes more than 2 seconds due to
the building of shadow page tables.

Following iterations are fast.

To be on the safe side, bump the timeout to 10 seconds.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Tested-by: Liam Merwick <liam.merwick@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Link: https://lore.kernel.org/r/20241004220153.287459-1-mlevitsk@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/memslot_perf_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 989ffe0d047f..e3711beff7f3 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -417,7 +417,7 @@ static bool _guest_should_exit(void)
  */
 static noinline void host_perform_sync(struct sync_area *sync)
 {
-	alarm(2);
+	alarm(10);
 
 	atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
 	while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))
-- 
cgit v1.2.3


From 945bdae20be5a13f1fcdcb14ec356dcbeee35839 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Thu, 24 Oct 2024 10:59:53 +0100
Subject: KVM: selftests: fix unintentional noop test in guest_memfd_test.c

The loop in test_create_guest_memfd_invalid() that is supposed to test
that nothing is accepted as a valid flag to KVM_CREATE_GUEST_MEMFD was
initializing `flag` as 0 instead of BIT(0). This caused the loop to
immediately exit instead of iterating over BIT(0), BIT(1), ... .

Fixes: 8a89efd43423 ("KVM: selftests: Add basic selftest for guest_memfd()")
Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
Reviewed-by: James Gowans <jgowans@amazon.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Link: https://lore.kernel.org/r/20241024095956.3668818-1-roypat@amazon.co.uk
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/guest_memfd_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index ba0c8e996035..ce687f8d248f 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -134,7 +134,7 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
 			    size);
 	}
 
-	for (flag = 0; flag; flag <<= 1) {
+	for (flag = BIT(0); flag; flag <<= 1) {
 		fd = __vm_create_guest_memfd(vm, page_size, flag);
 		TEST_ASSERT(fd == -1 && errno == EINVAL,
 			    "guest_memfd() with flag '0x%lx' should fail with EINVAL",
-- 
cgit v1.2.3


From 5b188cc4866aaf712e896f92ac42c7802135e507 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 9 Oct 2024 08:49:41 -0700
Subject: KVM: selftests: Disable strict aliasing

Disable strict aliasing, as has been done in the kernel proper for decades
(literally since before git history) to fix issues where gcc will optimize
away loads in code that looks 100% correct, but is _technically_ undefined
behavior, and thus can be thrown away by the compiler.

E.g. arm64's vPMU counter access test casts a uint64_t (unsigned long)
pointer to a u64 (unsigned long long) pointer when setting PMCR.N via
u64p_replace_bits(), which gcc-13 detects and optimizes away, i.e. ignores
the result and uses the original PMCR.

The issue is most easily observed by making set_pmcr_n() noinline and
wrapping the call with printf(), e.g. sans comments, for this code:

  printf("orig = %lx, next = %lx, want = %lu\n", pmcr_orig, pmcr, pmcr_n);
  set_pmcr_n(&pmcr, pmcr_n);
  printf("orig = %lx, next = %lx, want = %lu\n", pmcr_orig, pmcr, pmcr_n);

gcc-13 generates:

 0000000000401c90 <set_pmcr_n>:
  401c90:       f9400002        ldr     x2, [x0]
  401c94:       b3751022        bfi     x2, x1, #11, #5
  401c98:       f9000002        str     x2, [x0]
  401c9c:       d65f03c0        ret

 0000000000402660 <test_create_vpmu_vm_with_pmcr_n>:
  402724:       aa1403e3        mov     x3, x20
  402728:       aa1503e2        mov     x2, x21
  40272c:       aa1603e0        mov     x0, x22
  402730:       aa1503e1        mov     x1, x21
  402734:       940060ff        bl      41ab30 <_IO_printf>
  402738:       aa1403e1        mov     x1, x20
  40273c:       910183e0        add     x0, sp, #0x60
  402740:       97fffd54        bl      401c90 <set_pmcr_n>
  402744:       aa1403e3        mov     x3, x20
  402748:       aa1503e2        mov     x2, x21
  40274c:       aa1503e1        mov     x1, x21
  402750:       aa1603e0        mov     x0, x22
  402754:       940060f7        bl      41ab30 <_IO_printf>

with the value stored in [sp + 0x60] ignored by both printf() above and
in the test proper, resulting in a false failure due to vcpu_set_reg()
simply storing the original value, not the intended value.

  $ ./vpmu_counter_access
  Random seed: 0x6b8b4567
  orig = 3040, next = 3040, want = 0
  orig = 3040, next = 3040, want = 0
  ==== Test Assertion Failure ====
    aarch64/vpmu_counter_access.c:505: pmcr_n == get_pmcr_n(pmcr)
    pid=71578 tid=71578 errno=9 - Bad file descriptor
       1        0x400673: run_access_test at vpmu_counter_access.c:522
       2         (inlined by) main at vpmu_counter_access.c:643
       3        0x4132d7: __libc_start_call_main at libc-start.o:0
       4        0x413653: __libc_start_main at ??:0
       5        0x40106f: _start at ??:0
    Failed to update PMCR.N to 0 (received: 6)

Somewhat bizarrely, gcc-11 also exhibits the same behavior, but only if
set_pmcr_n() is marked noinline, whereas gcc-13 fails even if set_pmcr_n()
is inlined in its sole caller.

Cc: stable@vger.kernel.org
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116912
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 156fbfae940f..1896ef383cae 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -241,10 +241,10 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
 	-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
 	-fno-builtin-memcmp -fno-builtin-memcpy \
 	-fno-builtin-memset -fno-builtin-strnlen \
-	-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
-	-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
-	-I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \
-	$(KHDR_INCLUDES)
+	-fno-stack-protector -fno-PIE -fno-strict-aliasing \
+	-I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \
+	-I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(ARCH_DIR) \
+	-I ../rseq -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
 ifeq ($(ARCH),s390)
 	CFLAGS += -march=z10
 endif
-- 
cgit v1.2.3


From 979956bc681105f34642971448c4cda048954a07 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 30 Oct 2024 21:53:33 -0700
Subject: KVM: selftests: Don't force -march=x86-64-v2 if it's unsupported

Force -march=x86-64-v2 to avoid SSE/AVX instructions if and only if the
uarch definition is supported by the compiler, e.g. gcc 7.5 only supports
x86-64.

Fixes: 9a400068a158 ("KVM: selftests: x86: Avoid using SSE/AVX instructions")
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-and-tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20241031045333.1209195-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1896ef383cae..48645a2e29da 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -249,8 +249,10 @@ ifeq ($(ARCH),s390)
 	CFLAGS += -march=z10
 endif
 ifeq ($(ARCH),x86)
+ifeq ($(shell echo "void foo(void) { }" | $(CC) -march=x86-64-v2 -x c - -c -o /dev/null 2>/dev/null; echo "$$?"),0)
 	CFLAGS += -march=x86-64-v2
 endif
+endif
 ifeq ($(ARCH),arm64)
 tools_dir := $(top_srcdir)/tools
 arm64_tools_dir := $(tools_dir)/arch/arm64/tools/
-- 
cgit v1.2.3


From 6801cf7890f2ed8fcc14859b47501f8ee7a58ec7 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 5 Nov 2024 12:30:57 +0800
Subject: selftests/bpf: Use -4095 as the bad address for bits iterator

As reported by Byeonguk, the bad_words test in verifier_bits_iter.c
occasionally fails on s390 host. Quoting Ilya's explanation:

  s390 kernel runs in a completely separate address space, there is no
  user/kernel split at TASK_SIZE. The same address may be valid in both
  the kernel and the user address spaces, there is no way to tell by
  looking at it. The config option related to this property is
  ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.

  Also, unfortunately, 0 is a valid address in the s390 kernel address
  space.

Fix the issue by using -4095 as the bad address for bits iterator, as
suggested by Ilya. Verify that bpf_iter_bits_new() returns -EINVAL for
NULL address and -EFAULT for bad address.

Fixes: ebafc1e535db ("selftests/bpf: Add three test cases for bits_iter")
Reported-by: Byeonguk Jeong <jungbu2855@gmail.com>
Closes: https://lore.kernel.org/bpf/ZycSXwjH4UTvx-Cn@ub22/
Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20241105043057.3371482-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/progs/verifier_bits_iter.c       | 32 +++++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
index 156cc278e2fc..7c881bca9af5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
@@ -57,9 +57,15 @@ __description("null pointer")
 __success __retval(0)
 int null_pointer(void)
 {
-	int nr = 0;
+	struct bpf_iter_bits iter;
+	int err, nr = 0;
 	int *bit;
 
+	err = bpf_iter_bits_new(&iter, NULL, 1);
+	bpf_iter_bits_destroy(&iter);
+	if (err != -EINVAL)
+		return 1;
+
 	bpf_for_each(bits, bit, NULL, 1)
 		nr++;
 	return nr;
@@ -194,15 +200,33 @@ __description("bad words")
 __success __retval(0)
 int bad_words(void)
 {
-	void *bad_addr = (void *)(3UL << 30);
-	int nr = 0;
+	void *bad_addr = (void *)-4095;
+	struct bpf_iter_bits iter;
+	volatile int nr;
 	int *bit;
+	int err;
+
+	err = bpf_iter_bits_new(&iter, bad_addr, 1);
+	bpf_iter_bits_destroy(&iter);
+	if (err != -EFAULT)
+		return 1;
 
+	nr = 0;
 	bpf_for_each(bits, bit, bad_addr, 1)
 		nr++;
+	if (nr != 0)
+		return 2;
 
+	err = bpf_iter_bits_new(&iter, bad_addr, 4);
+	bpf_iter_bits_destroy(&iter);
+	if (err != -EFAULT)
+		return 3;
+
+	nr = 0;
 	bpf_for_each(bits, bit, bad_addr, 4)
 		nr++;
+	if (nr != 0)
+		return 4;
 
-	return nr;
+	return 0;
 }
-- 
cgit v1.2.3


From ac1bd50164b7995d0b8337b25d52ae79eefb9487 Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Thu, 31 Oct 2024 09:40:46 +0800
Subject: selftests/bpf: Drop netns helpers in mptcp

New netns selftest helpers netns_new() and netns_free() have been added
in network_helpers.c; let's use them in mptcp selftests too instead of
using MPTCP's own helpers create_netns() and cleanup_netns().

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/c02fda3177b34f9e74a044833fda9761627f4d07.1730338692.git.tanggeliang@kylinos.cn
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/mptcp.c | 42 ++++++++------------------
 1 file changed, 12 insertions(+), 30 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index be3cad2aff77..f8eb7f9d4fd2 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -69,24 +69,6 @@ struct mptcp_storage {
 	char ca_name[TCP_CA_NAME_MAX];
 };
 
-static struct nstoken *create_netns(void)
-{
-	SYS(fail, "ip netns add %s", NS_TEST);
-	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
-
-	return open_netns(NS_TEST);
-fail:
-	return NULL;
-}
-
-static void cleanup_netns(struct nstoken *nstoken)
-{
-	if (nstoken)
-		close_netns(nstoken);
-
-	SYS_NOFAIL("ip netns del %s", NS_TEST);
-}
-
 static int start_mptcp_server(int family, const char *addr_str, __u16 port,
 			      int timeout_ms)
 {
@@ -206,15 +188,15 @@ out:
 
 static void test_base(void)
 {
-	struct nstoken *nstoken = NULL;
+	struct netns_obj *netns = NULL;
 	int server_fd, cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/mptcp");
 	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
 		return;
 
-	nstoken = create_netns();
-	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
+	netns = netns_new(NS_TEST, true);
+	if (!ASSERT_OK_PTR(netns, "netns_new"))
 		goto fail;
 
 	/* without MPTCP */
@@ -237,7 +219,7 @@ with_mptcp:
 	close(server_fd);
 
 fail:
-	cleanup_netns(nstoken);
+	netns_free(netns);
 	close(cgroup_fd);
 }
 
@@ -322,21 +304,21 @@ out:
 
 static void test_mptcpify(void)
 {
-	struct nstoken *nstoken = NULL;
+	struct netns_obj *netns = NULL;
 	int cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/mptcpify");
 	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
 		return;
 
-	nstoken = create_netns();
-	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
+	netns = netns_new(NS_TEST, true);
+	if (!ASSERT_OK_PTR(netns, "netns_new"))
 		goto fail;
 
 	ASSERT_OK(run_mptcpify(cgroup_fd), "run_mptcpify");
 
 fail:
-	cleanup_netns(nstoken);
+	netns_free(netns);
 	close(cgroup_fd);
 }
 
@@ -414,7 +396,7 @@ close_server:
 static void test_subflow(void)
 {
 	struct mptcp_subflow *skel;
-	struct nstoken *nstoken;
+	struct netns_obj *netns;
 	int cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/mptcp_subflow");
@@ -437,8 +419,8 @@ static void test_subflow(void)
 	if (!ASSERT_OK_PTR(skel->links._getsockopt_subflow, "attach _getsockopt_subflow"))
 		goto skel_destroy;
 
-	nstoken = create_netns();
-	if (!ASSERT_OK_PTR(nstoken, "create_netns: mptcp_subflow"))
+	netns = netns_new(NS_TEST, true);
+	if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_subflow"))
 		goto skel_destroy;
 
 	if (endpoint_init("subflow") < 0)
@@ -447,7 +429,7 @@ static void test_subflow(void)
 	run_subflow();
 
 close_netns:
-	cleanup_netns(nstoken);
+	netns_free(netns);
 skel_destroy:
 	mptcp_subflow__destroy(skel);
 close_cgroup:
-- 
cgit v1.2.3


From f72aa1b276281b4e4f75261af8425bc99d903f3e Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Mon, 4 Nov 2024 12:34:26 +0100
Subject: selftests: net: include lib/sh/*.sh with lib.sh

Recently, the net/lib.sh file has been modified to include defer.sh from
net/lib/sh/ directory. The Makefile from net/lib has been modified
accordingly, but not the ones from the sub-targets using net/lib.sh.

Because of that, the new file is not installed as expected when
installing the Forwarding, MPTCP, and Netfilter targets, e.g.

  # make -C tools/testing/selftests TARGETS=net/mptcp install \
        INSTALL_PATH=/tmp/kself
  # cd /tmp/kself/
  # ./run_kselftest.sh -c net/mptcp
    TAP version 13
    1..7
    # timeout set to 1800
    # selftests: net/mptcp: mptcp_connect.sh
    # ./../lib.sh: line 5: /tmp/kself/net/lib/sh/defer.sh: No such file
      or directory
    # (...)

This can be fixed simply by adding all the .sh files from the net/lib/sh
directory to the TEST_INCLUDES variable in the different Makefiles.

Fixes: a6e263f125cd ("selftests: net: lib: Introduce deferred commands")
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20241104-net-next-selftests-lib-sh-deps-v1-1-7c9f7d939fc2@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/Makefile | 3 ++-
 tools/testing/selftests/net/mptcp/Makefile      | 2 +-
 tools/testing/selftests/net/netfilter/Makefile  | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 224346426ef2..7d885cff8d79 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -126,6 +126,7 @@ TEST_FILES := devlink_lib.sh \
 	tc_common.sh
 
 TEST_INCLUDES := \
-	../lib.sh
+	../lib.sh \
+	$(wildcard ../lib/sh/*.sh)
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 5d796622e730..8e3fc05a5397 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -11,7 +11,7 @@ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq
 
 TEST_FILES := mptcp_lib.sh settings
 
-TEST_INCLUDES := ../lib.sh ../net_helper.sh
+TEST_INCLUDES := ../lib.sh $(wildcard ../lib/sh/*.sh) ../net_helper.sh
 
 EXTRA_CLEAN := *.pcap
 
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index 542f7886a0bc..9d009f74cfc2 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -55,4 +55,5 @@ TEST_FILES := lib.sh
 TEST_FILES += packetdrill
 
 TEST_INCLUDES := \
-	../lib.sh
+	../lib.sh \
+	$(wildcard ../lib/sh/*.sh)
-- 
cgit v1.2.3


From f8f55e9ec73f0a07e55fd91ce82fdca0796ad66a Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 24 Sep 2024 19:59:11 +0100
Subject: selftests/mm: add pkey_sighandler_xx, hugetlb_dio to .gitignore

Commit 6998a73efbb8 ("selftests/mm: Add new testcases for pkeys") and
commit 3a103b5315b7 ("selftest: mm: Test if hugepage does not get leaked
during __bio_release_pages()") generate test binaries hugetlb_dio,
pkey_sighandler_tests_32 and pkey_sighandler_tests_64 but did not add
these to .gitignore.  Correct this.

Link: https://lkml.kernel.org/r/20240924185911.117937-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Keith Lucas <keith.lucas@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/.gitignore | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index da030b43e43b..689bbd520296 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -51,3 +51,6 @@ hugetlb_madv_vs_map
 mseal_test
 seal_elf
 droppable
+hugetlb_dio
+pkey_sighandler_tests_32
+pkey_sighandler_tests_64
-- 
cgit v1.2.3


From 3b2faed068b9e736402f0b6f98fd68a177f619ec Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 26 Sep 2024 17:20:43 +0200
Subject: selftests/mm: hugetlb_fault_after_madv: use default hugetlb page size

Patch series "selftests/mm: hugetlb_fault_after_madv improvements".

Mario brought to my attention that the hugetlb_fault_after_madv test is
currently always skipped on s390x.  Let's adjust the test to be
independent of the default hugetlb page size and while at it, also improve
the test output.


This patch (of 2):

We currently assume that the hugetlb page size is 2 MiB, which is why we
mmap() a 2 MiB range.

If the default hugetlb size is larger, mmap() will fail because the range
is not suitable.  If the default hugetlb size is smaller (e.g., s390x),
mmap() will fail because we would need more than one hugetlb page, but
just asserted that we have exactly one.

So let's simply use the default hugetlb page size instead of hard-coded 2
MiB, so the test isn't unconditionally skipped on architectures like
s390x.

Before this patch on s390x:
	$ ./hugetlb_fault_after_madv
	1..0 # SKIP Failed to allocated huge page

With this change on s390x:
	$ ./hugetlb_fault_after_madv

While at it, make "huge_ptr" static.

Link: https://lkml.kernel.org/r/20240926152044.2205129-1-david@redhat.com
Link: https://lkml.kernel.org/r/20240926152044.2205129-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: Mario Casquero <mcasquer@redhat.com>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Breno Leitao <leitao@debian.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb_fault_after_madv.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
index 73b81c632366..ff3ba675278d 100644
--- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
+++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
@@ -9,10 +9,10 @@
 #include "vm_util.h"
 #include "../kselftest.h"
 
-#define MMAP_SIZE (1 << 21)
 #define INLOOP_ITER 100
 
-char *huge_ptr;
+static char *huge_ptr;
+static size_t huge_page_size;
 
 /* Touch the memory while it is being madvised() */
 void *touch(void *unused)
@@ -30,7 +30,7 @@ void *madv(void *unused)
 	usleep(rand() % 10);
 
 	for (int i = 0; i < INLOOP_ITER; i++)
-		madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED);
+		madvise(huge_ptr, huge_page_size, MADV_DONTNEED);
 
 	return NULL;
 }
@@ -47,6 +47,10 @@ int main(void)
 
 	srand(getpid());
 
+	huge_page_size = default_huge_page_size();
+	if (!huge_page_size)
+		ksft_exit_skip("Could not detect default hugetlb page size.");
+
 	free_hugepages = get_free_hugepages();
 	if (free_hugepages != 1) {
 		ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
@@ -54,7 +58,7 @@ int main(void)
 	}
 
 	while (max--) {
-		huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+		huge_ptr = mmap(NULL, huge_page_size, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
 				-1, 0);
 
@@ -66,7 +70,7 @@ int main(void)
 
 		pthread_join(thread1, NULL);
 		pthread_join(thread2, NULL);
-		munmap(huge_ptr, MMAP_SIZE);
+		munmap(huge_ptr, huge_page_size);
 	}
 
 	return KSFT_PASS;
-- 
cgit v1.2.3


From f33cea94e37ce10e27b192e3c5e80ff685ac7b1f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 26 Sep 2024 17:20:44 +0200
Subject: selftests/mm: hugetlb_fault_after_madv: improve test output

Let's improve the test output.  For example, print the proper test result.
Install a SIGBUS handler to catch any SIGBUS instead of crashing the test
on failure.

With unsuitable hugetlb page count:
  $ ./hugetlb_fault_after_madv
  TAP version 13
  1..1
  # [INFO] detected default hugetlb page size: 2048 KiB
  ok 2 # SKIP This test needs one and only one page to execute. Got 0
  # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0

On a failure:
  $ ./hugetlb_fault_after_madv
  TAP version 13
  1..1
  not ok 1 SIGBUS behavior
  Bail out! 1 out of 1 tests failed

On success:
  $ ./hugetlb_fault_after_madv
  TAP version 13
  1..1
  # [INFO] detected default hugetlb page size: 2048 KiB
  ok 1 SIGBUS behavior
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0

Link: https://lkml.kernel.org/r/20240926152044.2205129-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/mm/hugetlb_fault_after_madv.c        | 34 +++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
index ff3ba675278d..e2640529dbb2 100644
--- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
+++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
@@ -5,6 +5,8 @@
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <setjmp.h>
+#include <signal.h>
 
 #include "vm_util.h"
 #include "../kselftest.h"
@@ -14,11 +16,25 @@
 static char *huge_ptr;
 static size_t huge_page_size;
 
+static sigjmp_buf sigbuf;
+static bool sigbus_triggered;
+
+static void signal_handler(int signal)
+{
+	if (signal == SIGBUS) {
+		sigbus_triggered = true;
+		siglongjmp(sigbuf, 1);
+	}
+}
+
 /* Touch the memory while it is being madvised() */
 void *touch(void *unused)
 {
 	char *ptr = (char *)huge_ptr;
 
+	if (sigsetjmp(sigbuf, 1))
+		return NULL;
+
 	for (int i = 0; i < INLOOP_ITER; i++)
 		ptr[0] = '.';
 
@@ -44,13 +60,23 @@ int main(void)
 	 * interactions
 	 */
 	int max = 10000;
+	int err;
+
+	ksft_print_header();
+	ksft_set_plan(1);
 
 	srand(getpid());
 
+	if (signal(SIGBUS, signal_handler) == SIG_ERR)
+		ksft_exit_skip("Could not register signal handler.");
+
 	huge_page_size = default_huge_page_size();
 	if (!huge_page_size)
 		ksft_exit_skip("Could not detect default hugetlb page size.");
 
+	ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n",
+		       huge_page_size / 1024);
+
 	free_hugepages = get_free_hugepages();
 	if (free_hugepages != 1) {
 		ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
@@ -73,5 +99,11 @@ int main(void)
 		munmap(huge_ptr, huge_page_size);
 	}
 
-	return KSFT_PASS;
+	ksft_test_result(!sigbus_triggered, "SIGBUS behavior\n");
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	ksft_exit_pass();
 }
-- 
cgit v1.2.3


From 12833a732346dcf4e3bde55d6556fedf90743656 Mon Sep 17 00:00:00 2001
From: Ba Jing <bajing@cmss.chinamobile.com>
Date: Tue, 24 Sep 2024 10:14:26 +0800
Subject: selftests/damon/access_memory_even: remove unused variables

By reading the code, I found these variables are never referenced in the
code.  Just remove them.

Link: https://lkml.kernel.org/r/20240924021426.1980-1-bajing@cmss.chinamobile.com
Signed-off-by: Ba Jing <bajing@cmss.chinamobile.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/access_memory_even.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c
index 3be121487432..a9f4e9aaf3a9 100644
--- a/tools/testing/selftests/damon/access_memory_even.c
+++ b/tools/testing/selftests/damon/access_memory_even.c
@@ -14,10 +14,8 @@
 int main(int argc, char *argv[])
 {
 	char **regions;
-	clock_t start_clock;
 	int nr_regions;
 	int sz_region;
-	int access_time_ms;
 	int i;
 
 	if (argc != 3) {
-- 
cgit v1.2.3


From 8801c35c3672c8492824f5d3c4d3b37f43ed63c3 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Fri, 11 Oct 2024 16:51:55 -0600
Subject: tools: fix -Wunused-result in linux.c

Fix the following -Wunused-result warnings on posix_memalign()
return values and add error handling.

./shared/linux.c:100:25: warning: ignoring return value of `posix_memalign' declared with attribute `warn_unused_result' [-Wunused-result]
  100 |          posix_memalign(&p, cachep->align, cachep->size);
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../shared/linux.c: In function `kmem_cache_alloc_bulk':
../shared/linux.c:198:33: warning: ignoring return value of `posix_memalign' declared with attribute `warn_unused_result' [-Wunused-result]
  198 |          posix_memalign(&p[i], cachep->align,
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  199 |                                cachep->size);
      |                                ~~~~~~~~~~~~~

Link: https://lkml.kernel.org/r/20241011225155.27607-1-skhan@linuxfoundation.org
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/shared/linux.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c
index 17263696b5d8..66dbb362385f 100644
--- a/tools/testing/shared/linux.c
+++ b/tools/testing/shared/linux.c
@@ -96,10 +96,13 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
 		p = node;
 	} else {
 		pthread_mutex_unlock(&cachep->lock);
-		if (cachep->align)
-			posix_memalign(&p, cachep->align, cachep->size);
-		else
+		if (cachep->align) {
+			if (posix_memalign(&p, cachep->align, cachep->size) < 0)
+				return NULL;
+		} else {
 			p = malloc(cachep->size);
+		}
+
 		if (cachep->ctor)
 			cachep->ctor(p);
 		else if (gfp & __GFP_ZERO)
@@ -195,8 +198,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size,
 			}
 
 			if (cachep->align) {
-				posix_memalign(&p[i], cachep->align,
-					       cachep->size);
+				if (posix_memalign(&p[i], cachep->align,
+					       cachep->size) < 0)
+					break;
 			} else {
 				p[i] = malloc(cachep->size);
 				if (!p[i])
-- 
cgit v1.2.3


From fc49b804967e5b1cc1665efd4de112945e1ab4c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Nov 2024 15:25:24 +0100
Subject: selftests: netfilter: run conntrack_dump_flush in netns

This test will fail if the initial namespace has conntrack
active due to unexpected number of flows returned on dump:

  conntrack_dump_flush.c:451:test_flush_by_zone:Expected ret (7) == 2 (2)
  test_flush_by_zone: Test failed
  FAIL  conntrack_dump_flush.test_flush_by_zone
  not ok 2 conntrack_dump_flush.test_flush_by_zone

Add a wrapper that unshares this program to avoid this problem.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://patch.msgid.link/20241104142529.2352-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/netfilter/Makefile                | 4 ++--
 tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index 9d009f74cfc2..ffe161fac8b5 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -8,6 +8,7 @@ MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
 
 TEST_PROGS := br_netfilter.sh bridge_brouter.sh
 TEST_PROGS += br_netfilter_queue.sh
+TEST_PROGS += conntrack_dump_flush.sh
 TEST_PROGS += conntrack_icmp_related.sh
 TEST_PROGS += conntrack_ipip_mtu.sh
 TEST_PROGS += conntrack_tcp_unreplied.sh
@@ -36,10 +37,9 @@ TEST_PROGS += xt_string.sh
 
 TEST_PROGS_EXTENDED = nft_concat_range_perf.sh
 
-TEST_GEN_PROGS = conntrack_dump_flush
-
 TEST_GEN_FILES = audit_logread
 TEST_GEN_FILES += connect_close nf_queue
+TEST_GEN_FILES += conntrack_dump_flush
 TEST_GEN_FILES += conntrack_reverse_clash
 TEST_GEN_FILES += sctp_collision
 
diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh
new file mode 100755
index 000000000000..8b0935385849
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+exec unshare -n ./conntrack_dump_flush
-- 
cgit v1.2.3


From a84e8c05f58305dfa808bc5465c5175c29d7c9b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 4 Nov 2024 15:28:18 +0100
Subject: selftests: netfilter: nft_queue.sh: fix warnings with socat 1.8.0.0

Updated to a more recent socat release and saw this:
 socat E xioopen_ipdgram_listen(): unknown address family 0
 socat W address is opened in read-write mode but only supports read-only

First error is avoided via pf=ipv4 option, second one via -u
(unidirectional) mode.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://patch.msgid.link/20241104142821.2608-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/netfilter/nft_queue.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
index a9d109fcc15c..785e3875a6da 100755
--- a/tools/testing/selftests/net/netfilter/nft_queue.sh
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -512,10 +512,10 @@ EOF
 	:> "$TMPFILE1"
 	:> "$TMPFILE2"
 
-	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE1",trunc &
+	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
 	local rpid1=$!
 
-	timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE2",trunc &
+	timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork,pf=ipv4 OPEN:"$TMPFILE2",trunc &
 	local rpid2=$!
 
 	ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
@@ -528,8 +528,8 @@ EOF
 	# Send two packets, one should end up in ns1, other in ns2.
 	# This is because nfqueue will delay packet for long enough so that
 	# second packet will not find existing conntrack entry.
-	echo "Packet 1" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
-	echo "Packet 2" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+	echo "Packet 1" | ip netns exec "$ns1" socat -u STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+	echo "Packet 2" | ip netns exec "$ns1" socat -u STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
 
 	busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
 
-- 
cgit v1.2.3


From 66c54c20408d994be34be2c070fba08472f69eee Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Wed, 6 Nov 2024 22:25:13 +0000
Subject: selftests/bpf: Add txmsg_pass to pull/push/pop in test_sockmap

Add txmsg_pass to test_txmsg_pull/push/pop. If txmsg_pass is missing,
tx_prog will be NULL, and no program will be attached to the sockmap.
As a result, pull/push/pop are never invoked.

Fixes: 328aa08a081b ("bpf: Selftests, break down test_sockmap into subtests")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20241106222520.527076-2-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 075c93ed143e..0f065273fde3 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1596,11 +1596,13 @@ static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt)
 static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
 {
 	/* Test basic start/end */
+	txmsg_pass = 1;
 	txmsg_start = 1;
 	txmsg_end = 2;
 	test_send(opt, cgrp);
 
 	/* Test >4k pull */
+	txmsg_pass = 1;
 	txmsg_start = 4096;
 	txmsg_end = 9182;
 	test_send_large(opt, cgrp);
@@ -1629,11 +1631,13 @@ static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
 static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
 {
 	/* Test basic pop */
+	txmsg_pass = 1;
 	txmsg_start_pop = 1;
 	txmsg_pop = 2;
 	test_send_many(opt, cgrp);
 
 	/* Test pop with >4k */
+	txmsg_pass = 1;
 	txmsg_start_pop = 4096;
 	txmsg_pop = 4096;
 	test_send_large(opt, cgrp);
@@ -1662,11 +1666,13 @@ static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
 static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 {
 	/* Test basic push */
+	txmsg_pass = 1;
 	txmsg_start_push = 1;
 	txmsg_end_push = 1;
 	test_send(opt, cgrp);
 
 	/* Test push 4kB >4k */
+	txmsg_pass = 1;
 	txmsg_start_push = 4096;
 	txmsg_end_push = 4096;
 	test_send_large(opt, cgrp);
@@ -1687,6 +1693,7 @@ static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 
 static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
 {
+	txmsg_pass = 1;
 	txmsg_start_push = 1;
 	txmsg_end_push = 10;
 	txmsg_start_pop = 5;
-- 
cgit v1.2.3


From 4095031463d4e99b534d2cd82035a417295764ae Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Wed, 6 Nov 2024 22:25:14 +0000
Subject: selftests/bpf: Fix SENDPAGE data logic in test_sockmap

In the SENDPAGE test, "opt->iov_length * cnt" size of data will be sent
cnt times by sendfile.
1. In push/pop tests, push/pop will be invoked cnt times. For the
simplicity of msg_verify_data, change chunk_sz to iov_length.
2. Change iov_length in test_send_large from 256 to 8192 (and iov_count
from 1024 to 32). We have a pop test where txmsg_start_pop is 4096; since
4096 is larger than the old iov_length, an error would be returned.

Fixes: 328aa08a081b ("bpf: Selftests, break down test_sockmap into subtests")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20241106222520.527076-3-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 0f065273fde3..1d59bed90d80 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -420,16 +420,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
 {
 	bool drop = opt->drop_expected;
 	unsigned char k = 0;
+	int i, j, fp;
 	FILE *file;
-	int i, fp;
 
 	file = tmpfile();
 	if (!file) {
 		perror("create file for sendpage");
 		return 1;
 	}
-	for (i = 0; i < iov_length * cnt; i++, k++)
-		fwrite(&k, sizeof(char), 1, file);
+	for (i = 0; i < cnt; i++, k = 0) {
+		for (j = 0; j < iov_length; j++, k++)
+			fwrite(&k, sizeof(char), 1, file);
+	}
 	fflush(file);
 	fseek(file, 0, SEEK_SET);
 
@@ -623,7 +625,9 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		 * This is really only useful for testing edge cases in code
 		 * paths.
 		 */
-		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		total_bytes = (float)iov_length * (float)cnt;
+		if (!opt->sendpage)
+			total_bytes *= (float)iov_count;
 		if (txmsg_apply)
 			txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
 		else
@@ -701,7 +705,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 
 			if (data) {
 				int chunk_sz = opt->sendpage ?
-						iov_length * cnt :
+						iov_length :
 						iov_length * iov_count;
 
 				errno = msg_verify_data(&msg, recv, chunk_sz, &k, &bytes_cnt);
@@ -1466,8 +1470,8 @@ static void test_send_many(struct sockmap_options *opt, int cgrp)
 
 static void test_send_large(struct sockmap_options *opt, int cgrp)
 {
-	opt->iov_length = 256;
-	opt->iov_count = 1024;
+	opt->iov_length = 8192;
+	opt->iov_count = 32;
 	opt->rate = 2;
 	test_exec(cgrp, opt);
 }
-- 
cgit v1.2.3


From 523dffccbadea0cfd65f1ff04944b864c558c4a8 Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Wed, 6 Nov 2024 22:25:15 +0000
Subject: selftests/bpf: Fix total_bytes in msg_loop_rx in test_sockmap

total_bytes in msg_loop_rx should also take push into account, otherwise
total_bytes will be a smaller value, which makes the msg_loop_rx end early.

Besides, total_bytes has already taken pop into account, so we don't need
to subtract some bytes from iov_buf in sendmsg_test. The additional
subtraction may make total_bytes a negative number, and msg_loop_rx will
just end without checking anything.

Fixes: 18d4e900a450 ("bpf: Selftests, improve test_sockmap total bytes counter")
Fixes: d69672147faa ("selftests, bpf: Add one test for sockmap with strparser")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20241106222520.527076-4-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 1d59bed90d80..5f4558f1f004 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -606,8 +606,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
+		float total_bytes, txmsg_pop_total, txmsg_push_total;
 		int slct, recvp = 0, recv, max_fd = fd;
-		float total_bytes, txmsg_pop_total;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		unsigned char k = 0;
@@ -628,10 +628,14 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		total_bytes = (float)iov_length * (float)cnt;
 		if (!opt->sendpage)
 			total_bytes *= (float)iov_count;
-		if (txmsg_apply)
+		if (txmsg_apply) {
+			txmsg_push_total = txmsg_end_push * (total_bytes / txmsg_apply);
 			txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
-		else
+		} else {
+			txmsg_push_total = txmsg_end_push * cnt;
 			txmsg_pop_total = txmsg_pop * cnt;
+		}
+		total_bytes += txmsg_push_total;
 		total_bytes -= txmsg_pop_total;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
 		if (err < 0)
@@ -800,8 +804,6 @@ static int sendmsg_test(struct sockmap_options *opt)
 
 	rxpid = fork();
 	if (rxpid == 0) {
-		if (txmsg_pop || txmsg_start_pop)
-			iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
 		if (opt->drop_expected || txmsg_ktls_skb_drop)
 			_exit(0);
 
-- 
cgit v1.2.3


From 862087c3d36219ed44569666eb263efc97f00c9a Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Wed, 6 Nov 2024 22:25:16 +0000
Subject: selftests/bpf: Add push/pop checking for msg_verify_data in
 test_sockmap

Add push/pop checking for msg_verify_data in test_sockmap, except for
the pop/push with cork tests, where the logic is different:
1. With corking, pop/push might not be invoked in each sendmsg, which makes
the layout of the received data difficult to predict.
2. It makes it hard to calculate the total_bytes in the recvmsg.
Temporarily skip the data integrity test for these cases for now, and add
a TODO.

Fixes: ee9b352ce465 ("selftests/bpf: Fix msg_verify_data in test_sockmap")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20241106222520.527076-5-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 106 +++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 5f4558f1f004..61a747afcd05 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -88,6 +88,10 @@ int ktls;
 int peek_flag;
 int skb_use_parser;
 int txmsg_omit_skb_parser;
+int verify_push_start;
+int verify_push_len;
+int verify_pop_start;
+int verify_pop_len;
 
 static const struct option long_options[] = {
 	{"help",	no_argument,		NULL, 'h' },
@@ -514,12 +518,41 @@ unwind_iov:
 	return -ENOMEM;
 }
 
-/* TODO: Add verification logic for push, pull and pop data */
+/* In push or pop test, we need to do some calculations for msg_verify_data */
+static void msg_verify_date_prep(void)
+{
+	int push_range_end = txmsg_start_push + txmsg_end_push - 1;
+	int pop_range_end = txmsg_start_pop + txmsg_pop - 1;
+
+	if (txmsg_end_push && txmsg_pop &&
+	    txmsg_start_push <= pop_range_end && txmsg_start_pop <= push_range_end) {
+		/* The push range and the pop range overlap */
+		int overlap_len;
+
+		verify_push_start = txmsg_start_push;
+		verify_pop_start = txmsg_start_pop;
+		if (txmsg_start_push < txmsg_start_pop)
+			overlap_len = min(push_range_end - txmsg_start_pop + 1, txmsg_pop);
+		else
+			overlap_len = min(pop_range_end - txmsg_start_push + 1, txmsg_end_push);
+		verify_push_len = max(txmsg_end_push - overlap_len, 0);
+		verify_pop_len = max(txmsg_pop - overlap_len, 0);
+	} else {
+		/* Otherwise */
+		verify_push_start = txmsg_start_push;
+		verify_pop_start = txmsg_start_pop;
+		verify_push_len = txmsg_end_push;
+		verify_pop_len = txmsg_pop;
+	}
+}
+
 static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz,
-				 unsigned char *k_p, int *bytes_cnt_p)
+			   unsigned char *k_p, int *bytes_cnt_p,
+			   int *check_cnt_p, int *push_p)
 {
-	int i, j, bytes_cnt = *bytes_cnt_p;
+	int bytes_cnt = *bytes_cnt_p, check_cnt = *check_cnt_p, push = *push_p;
 	unsigned char k = *k_p;
+	int i, j;
 
 	for (i = 0, j = 0; i < msg->msg_iovlen && size; i++, j = 0) {
 		unsigned char *d = msg->msg_iov[i].iov_base;
@@ -538,6 +571,37 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz,
 		}
 
 		for (; j < msg->msg_iov[i].iov_len && size; j++) {
+			if (push > 0 &&
+			    check_cnt == verify_push_start + verify_push_len - push) {
+				int skipped;
+revisit_push:
+				skipped = push;
+				if (j + push >= msg->msg_iov[i].iov_len)
+					skipped = msg->msg_iov[i].iov_len - j;
+				push -= skipped;
+				size -= skipped;
+				j += skipped - 1;
+				check_cnt += skipped;
+				continue;
+			}
+
+			if (verify_pop_len > 0 && check_cnt == verify_pop_start) {
+				bytes_cnt += verify_pop_len;
+				check_cnt += verify_pop_len;
+				k += verify_pop_len;
+
+				if (bytes_cnt == chunk_sz) {
+					k = 0;
+					bytes_cnt = 0;
+					check_cnt = 0;
+					push = verify_push_len;
+				}
+
+				if (push > 0 &&
+				    check_cnt == verify_push_start + verify_push_len - push)
+					goto revisit_push;
+			}
+
 			if (d[j] != k++) {
 				fprintf(stderr,
 					"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
@@ -545,15 +609,20 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz,
 				return -EDATAINTEGRITY;
 			}
 			bytes_cnt++;
+			check_cnt++;
 			if (bytes_cnt == chunk_sz) {
 				k = 0;
 				bytes_cnt = 0;
+				check_cnt = 0;
+				push = verify_push_len;
 			}
 			size--;
 		}
 	}
 	*k_p = k;
 	*bytes_cnt_p = bytes_cnt;
+	*check_cnt_p = check_cnt;
+	*push_p = push;
 	return 0;
 }
 
@@ -612,6 +681,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		struct timeval timeout;
 		unsigned char k = 0;
 		int bytes_cnt = 0;
+		int check_cnt = 0;
+		int push = 0;
 		fd_set w;
 
 		fcntl(fd, fd_flags);
@@ -637,6 +708,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 		total_bytes += txmsg_push_total;
 		total_bytes -= txmsg_pop_total;
+		if (data) {
+			msg_verify_date_prep();
+			push = verify_push_len;
+		}
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
 		if (err < 0)
 			perror("recv start time");
@@ -712,7 +787,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 						iov_length :
 						iov_length * iov_count;
 
-				errno = msg_verify_data(&msg, recv, chunk_sz, &k, &bytes_cnt);
+				errno = msg_verify_data(&msg, recv, chunk_sz, &k, &bytes_cnt,
+							&check_cnt, &push);
 				if (errno) {
 					perror("data verify msg failed");
 					goto out_errno;
@@ -722,7 +798,9 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 								recvp,
 								chunk_sz,
 								&k,
-								&bytes_cnt);
+								&bytes_cnt,
+								&check_cnt,
+								&push);
 					if (errno) {
 						perror("data verify msg_peek failed");
 						goto out_errno;
@@ -1636,6 +1714,8 @@ static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
 
 static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
 {
+	bool data = opt->data_test;
+
 	/* Test basic pop */
 	txmsg_pass = 1;
 	txmsg_start_pop = 1;
@@ -1654,6 +1734,12 @@ static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
 	txmsg_pop = 2;
 	test_send_many(opt, cgrp);
 
+	/* TODO: Test for pop + cork should be different,
+	 * - It makes the layout of the received data difficult
+	 * - It makes it hard to calculate the total_bytes in the recvmsg
+	 * Temporarily skip the data integrity test for this case now.
+	 */
+	opt->data_test = false;
 	/* Test pop + cork */
 	txmsg_redir = 0;
 	txmsg_cork = 512;
@@ -1667,10 +1753,13 @@ static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
 	txmsg_start_pop = 1;
 	txmsg_pop = 2;
 	test_send_many(opt, cgrp);
+	opt->data_test = data;
 }
 
 static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 {
+	bool data = opt->data_test;
+
 	/* Test basic push */
 	txmsg_pass = 1;
 	txmsg_start_push = 1;
@@ -1689,12 +1778,19 @@ static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 	txmsg_end_push = 2;
 	test_send_many(opt, cgrp);
 
+	/* TODO: Test for push + cork should be different,
+	 * - It makes the layout of the received data difficult
+	 * - It makes it hard to calculate the total_bytes in the recvmsg
+	 * Temporarily skip the data integrity test for this case now.
+	 */
+	opt->data_test = false;
 	/* Test push + cork */
 	txmsg_redir = 0;
 	txmsg_cork = 512;
 	txmsg_start_push = 1;
 	txmsg_end_push = 2;
 	test_send_many(opt, cgrp);
+	opt->data_test = data;
 }
 
 static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
-- 
cgit v1.2.3


From 47eae080410b1de1d7f6c79b511aaa6be865c61e Mon Sep 17 00:00:00 2001
From: Zijian Zhang <zijianzhang@bytedance.com>
Date: Wed, 6 Nov 2024 22:25:17 +0000
Subject: selftests/bpf: Add more tests for test_txmsg_push_pop in test_sockmap

Add more tests for test_txmsg_push_pop in test_sockmap for better coverage

Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20241106222520.527076-6-zijianzhang@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_sockmap.c | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 61a747afcd05..e5c7ecbe57e3 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1795,12 +1795,49 @@ static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 
 static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
 {
+	/* Test push/pop range overlapping */
 	txmsg_pass = 1;
 	txmsg_start_push = 1;
 	txmsg_end_push = 10;
 	txmsg_start_pop = 5;
 	txmsg_pop = 4;
 	test_send_large(opt, cgrp);
+
+	txmsg_pass = 1;
+	txmsg_start_push = 1;
+	txmsg_end_push = 10;
+	txmsg_start_pop = 5;
+	txmsg_pop = 16;
+	test_send_large(opt, cgrp);
+
+	txmsg_pass = 1;
+	txmsg_start_push = 5;
+	txmsg_end_push = 4;
+	txmsg_start_pop = 1;
+	txmsg_pop = 10;
+	test_send_large(opt, cgrp);
+
+	txmsg_pass = 1;
+	txmsg_start_push = 5;
+	txmsg_end_push = 16;
+	txmsg_start_pop = 1;
+	txmsg_pop = 10;
+	test_send_large(opt, cgrp);
+
+	/* Test push/pop range non-overlapping */
+	txmsg_pass = 1;
+	txmsg_start_push = 1;
+	txmsg_end_push = 10;
+	txmsg_start_pop = 16;
+	txmsg_pop = 4;
+	test_send_large(opt, cgrp);
+
+	txmsg_pass = 1;
+	txmsg_start_push = 16;
+	txmsg_end_push = 10;
+	txmsg_start_pop = 5;
+	txmsg_pop = 4;
+	test_send_large(opt, cgrp);
 }
 
 static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
-- 
cgit v1.2.3


From 52ed077aa6336dbef83a2d6d21c52d1706fb7f16 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 5 Nov 2024 19:23:51 +0100
Subject: selftests: net: really check for bg process completion

A recent refactor transformed the check for process completion
in a true statement, due to a typo.

As a result, the relevant test-case is unable to catch the
regression it was supposed to detect.

Restore the correct condition.

Fixes: 691bb4e49c98 ("selftests: net: avoid just another constant wait")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/0e6f213811f8e93a235307e683af8225cc6277ae.1730828007.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/pmtu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 569bce8b6383..6c651c880fe8 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -2056,7 +2056,7 @@ check_running() {
 	pid=${1}
 	cmd=${2}
 
-	[ "$(cat /proc/${pid}/cmdline 2>/dev/null | tr -d '\0')" = "{cmd}" ]
+	[ "$(cat /proc/${pid}/cmdline 2>/dev/null | tr -d '\0')" = "${cmd}" ]
 }
 
 test_cleanup_vxlanX_exception() {
-- 
cgit v1.2.3


From 0f85eb3395c74d7cc823169bbacc670c6645ae80 Mon Sep 17 00:00:00 2001
From: Jiazi Li <jqqlijiazi@gmail.com>
Date: Wed, 26 Jun 2024 12:06:31 -0400
Subject: maple_tree: add some alloc node test case

Add some maple_tree alloc node test cases.

Link: https://lkml.kernel.org/r/20240626160631.3636515-2-Liam.Howlett@oracle.com
Signed-off-by: Jiazi Li <jqqlijiazi@gmail.com>
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/radix-tree/maple.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 551ae6898c1d..bc30050227fd 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -462,6 +462,28 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1);
 	mas_destroy(&mas);
 
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	mas.status = ma_start;
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2);
+	mas_destroy(&mas);
+
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1);
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	mas.status = ma_start;
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2);
+	mas_destroy(&mas);
+
 	mtree_unlock(mt);
 }
 
-- 
cgit v1.2.3


From 7146de5ff504003ed6f61c39c379b5777e7bed29 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 17 Oct 2024 17:56:38 +0100
Subject: tools: testing: fix phys_addr_t size on 64-bit systems

The phys_addr_t size is predicated on whether CONFIG_PHYS_ADDR_T_64BIT is
set or not.

In the VMA tests, virt_to_phys() from tools/include/linux casts a volatile
void * pointer to phys_addr_t.  If CONFIG_PHYS_ADDR_T_64BIT is not set,
phys_addr_t will be 32-bit and the cast will trigger a warning.

Obviously this might also lead to truncation, which we would rather avoid.

Fix this by adjusting the generation of generated/bit-length.h to generate
a CONFIG_PHYS_ADDR_T_{bits}BIT define.

This does result in the generation of the useless CONFIG_PHYS_ADDR_T_32BIT
define for 32-bit systems, but this should have no effect, and makes
implementation of this easier.

This resolves the issue and the warning.

[lorenzo.stoakes@oracle.com: VMA tests not properly importing bit-length.h]
  Link: https://lkml.kernel.org/r/a6183df9-3108-4d59-8128-4fc6c14e22a5@lucifer.local
Link: https://lkml.kernel.org/r/20241017165638.95602-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/shared/shared.mk | 1 +
 tools/testing/vma/vma.c        | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk
index a6bc51d0b0bf..923ee2492256 100644
--- a/tools/testing/shared/shared.mk
+++ b/tools/testing/shared/shared.mk
@@ -69,6 +69,7 @@ generated/bit-length.h: FORCE
 	@if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then   \
 		echo "Generating $@";                                        \
 		echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@;                 \
+		echo "#define CONFIG_PHYS_ADDR_T_$(LONG_BIT)BIT 1" >> $@;    \
 	fi
 
 FORCE: ;
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index b33b47342d41..8fab5e13c7c3 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -4,6 +4,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "generated/bit-length.h"
+
 #include "maple-shared.h"
 #include "vma_internal.h"
 
-- 
cgit v1.2.3


From c14f8046cd7c353176c53d2721d52a2bd6a648ec Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 25 Oct 2024 13:26:23 +0100
Subject: tools: testing: add additional vma_internal.h stubs

Patch series "fix error handling in mmap_region() and refactor", v3.

The mmap_region() function is somewhat terrifying, with spaghetti-like
control flow and numerous means by which issues can arise and incomplete
state, memory leaks and other unpleasantness can occur.

This series goes to great lengths to simplify how mmap_region() works and
to avoid unwinding errors late on in the process of setting up the VMA for
the new mapping, and equally avoids such operations occurring while the
VMA is in an inconsistent state.

This series builds on the previously submitted hotfix patches (see link to
v2 below) which addresses the most critical issues around mmap_region(),
and further works to improve mmap_region() complexity, stability, and
testability.

This series moves the code to mm/vma.c to render it userland testable,
refactors and simplifies it into smaller functions that are significantly
more readable.

It additionally avoids performing an attempt at a second merge mid-way
through allocating a new VMA, a dubious proposition at best and one that
is highly subject to subtle bugs.

Rather than do this, we simply note that we ought to retry the merge and
do this as a final step.


This patch (of 3):

Add some additional vma_internal.h stubs in preparation for
__mmap_region() being moved to mm/vma.c.  Without these the move would
result in the tests no longer compiling.

Link: https://lkml.kernel.org/r/cover.1729858176.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/74b27e159e261d2ac1fe66a130edad1d61fdc176.1729858176.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/vma_internal.h | 115 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index c5b9da034511..e76ff579e1fd 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -44,7 +44,9 @@
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000
 #define VM_DONTEXPAND	0x00040000
+#define VM_LOCKONFAULT	0x00080000
 #define VM_ACCOUNT	0x00100000
+#define VM_NORESERVE	0x00200000
 #define VM_MIXEDMAP	0x10000000
 #define VM_STACK	VM_GROWSDOWN
 #define VM_SHADOW_STACK	VM_NONE
@@ -53,6 +55,14 @@
 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
+
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED	_BITUL(63)
+#endif
+
 #define FIRST_USER_ADDRESS	0UL
 #define USER_PGTABLES_CEILING	0UL
 
@@ -698,8 +708,9 @@ static inline void tlb_finish_mmu(struct mmu_gather *)
 {
 }
 
-static inline void get_file(struct file *)
+static inline struct file *get_file(struct file *f)
 {
+	return f;
 }
 
 static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *)
@@ -920,4 +931,106 @@ static inline bool signal_pending(void *)
 	return false;
 }
 
+static inline bool is_file_hugepages(struct file *)
+{
+	return false;
+}
+
+static inline int security_vm_enough_memory_mm(struct mm_struct *, long)
+{
+	return true;
+}
+
+static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long)
+{
+	return true;
+}
+
+static inline void vm_flags_init(struct vm_area_struct *vma,
+				 vm_flags_t flags)
+{
+	vma->__vm_flags = flags;
+}
+
+static inline void vm_flags_set(struct vm_area_struct *vma,
+				vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma->__vm_flags |= flags;
+}
+
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+				  vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma->__vm_flags &= ~flags;
+}
+
+static inline int call_mmap(struct file *, struct vm_area_struct *)
+{
+	return 0;
+}
+
+static inline int shmem_zero_setup(struct vm_area_struct *)
+{
+	return 0;
+}
+
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+	vma->vm_ops = NULL;
+}
+
+static inline void ksm_add_vma(struct vm_area_struct *)
+{
+}
+
+static inline void perf_event_mmap(struct vm_area_struct *)
+{
+}
+
+static inline bool vma_is_dax(struct vm_area_struct *)
+{
+	return false;
+}
+
+static inline struct vm_area_struct *get_gate_vma(struct mm_struct *)
+{
+	return NULL;
+}
+
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+static inline void vma_set_page_prot(struct vm_area_struct *vma)
+{
+	unsigned long vm_flags = vma->vm_flags;
+	pgprot_t vm_page_prot;
+
+	/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+	vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
+
+	if (vma_wants_writenotify(vma, vm_page_prot)) {
+		vm_flags &= ~VM_SHARED;
+		/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+		vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
+	}
+	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+static inline bool arch_validate_flags(unsigned long)
+{
+	return true;
+}
+
+static inline void vma_close(struct vm_area_struct *)
+{
+}
+
+static inline int mmap_file(struct file *, struct vm_area_struct *)
+{
+	return 0;
+}
+
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From 3b9bde403aafa55dcbe7dc250b95af917610f139 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Nov 2024 00:04:49 +1100
Subject: selftests/powerpc: Lower run time of count_stcx_fail test

The count_stcx_fail test runs for close to or just over 2 minutes, which
means it sometimes times out.

That's overkill for a test that just demonstrates some PMU counters
are working. Drop the 64 billion instruction case, to lower the runtime
to ~30s.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241106130453.1741013-1-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/pmu/count_stcx_fail.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
index 2070a1e2b3a5..d8dd9a9c6c1b 100644
--- a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
+++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
@@ -144,9 +144,6 @@ static int test_body(void)
 	/* Run for 16Bi instructions */
 	FAIL_IF(do_count_loop(events, 16000000000, overhead, true));
 
-	/* Run for 64Bi instructions */
-	FAIL_IF(do_count_loop(events, 64000000000, overhead, true));
-
 	event_close(&events[0]);
 	event_close(&events[1]);
 
-- 
cgit v1.2.3


From 5543d595954eefb3a6faa18a6dc7b1b3d6022052 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Nov 2024 00:04:50 +1100
Subject: selftests/powerpc: Give all tests 2 minutes timeout

Each of the powerpc selftests runs with a timeout of 2 minutes by
default (see tools/testing/selftests/powerpc/harness.c).

But when tests are run with run_kselftest.sh it uses a timeout of 45
seconds, meaning some tests run OK standalone but fail when run with the
test runner.

So tell run_kselftest.sh to give each test 130 seconds, that should
allow the tests to complete, or be killed by the powerpc test harness
after 2 minutes. If for some reason the harness fails, or for the few
tests that don't use the harness, the 130 second timeout should catch
them if they get stuck.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241106130453.1741013-2-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/alignment/settings       | 1 +
 tools/testing/selftests/powerpc/cache_shape/settings     | 1 +
 tools/testing/selftests/powerpc/copyloops/settings       | 1 +
 tools/testing/selftests/powerpc/dexcr/settings           | 1 +
 tools/testing/selftests/powerpc/dscr/settings            | 1 +
 tools/testing/selftests/powerpc/lib/settings             | 1 +
 tools/testing/selftests/powerpc/math/settings            | 1 +
 tools/testing/selftests/powerpc/mce/settings             | 1 +
 tools/testing/selftests/powerpc/mm/settings              | 1 +
 tools/testing/selftests/powerpc/nx-gzip/settings         | 1 +
 tools/testing/selftests/powerpc/papr_attributes/settings | 1 +
 tools/testing/selftests/powerpc/papr_sysparm/settings    | 1 +
 tools/testing/selftests/powerpc/papr_vpd/settings        | 1 +
 tools/testing/selftests/powerpc/pmu/settings             | 1 +
 tools/testing/selftests/powerpc/primitives/settings      | 1 +
 tools/testing/selftests/powerpc/ptrace/settings          | 1 +
 tools/testing/selftests/powerpc/scripts/settings         | 1 +
 tools/testing/selftests/powerpc/security/settings        | 1 +
 tools/testing/selftests/powerpc/stringloops/settings     | 1 +
 tools/testing/selftests/powerpc/switch_endian/settings   | 1 +
 tools/testing/selftests/powerpc/syscalls/settings        | 1 +
 tools/testing/selftests/powerpc/vphn/settings            | 1 +
 22 files changed, 22 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/alignment/settings
 create mode 100644 tools/testing/selftests/powerpc/cache_shape/settings
 create mode 100644 tools/testing/selftests/powerpc/copyloops/settings
 create mode 100644 tools/testing/selftests/powerpc/dexcr/settings
 create mode 100644 tools/testing/selftests/powerpc/dscr/settings
 create mode 100644 tools/testing/selftests/powerpc/lib/settings
 create mode 100644 tools/testing/selftests/powerpc/math/settings
 create mode 100644 tools/testing/selftests/powerpc/mce/settings
 create mode 100644 tools/testing/selftests/powerpc/mm/settings
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/settings
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/settings
 create mode 100644 tools/testing/selftests/powerpc/papr_sysparm/settings
 create mode 100644 tools/testing/selftests/powerpc/papr_vpd/settings
 create mode 100644 tools/testing/selftests/powerpc/pmu/settings
 create mode 100644 tools/testing/selftests/powerpc/primitives/settings
 create mode 100644 tools/testing/selftests/powerpc/ptrace/settings
 create mode 100644 tools/testing/selftests/powerpc/scripts/settings
 create mode 100644 tools/testing/selftests/powerpc/security/settings
 create mode 100644 tools/testing/selftests/powerpc/stringloops/settings
 create mode 100644 tools/testing/selftests/powerpc/switch_endian/settings
 create mode 100644 tools/testing/selftests/powerpc/syscalls/settings
 create mode 100644 tools/testing/selftests/powerpc/vphn/settings

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/alignment/settings b/tools/testing/selftests/powerpc/alignment/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/alignment/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/cache_shape/settings b/tools/testing/selftests/powerpc/cache_shape/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/copyloops/settings b/tools/testing/selftests/powerpc/copyloops/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/dexcr/settings b/tools/testing/selftests/powerpc/dexcr/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dexcr/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/dscr/settings b/tools/testing/selftests/powerpc/dscr/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/lib/settings b/tools/testing/selftests/powerpc/lib/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/lib/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/math/settings b/tools/testing/selftests/powerpc/math/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/mce/settings b/tools/testing/selftests/powerpc/mce/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/mm/settings b/tools/testing/selftests/powerpc/mm/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/nx-gzip/settings b/tools/testing/selftests/powerpc/nx-gzip/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/papr_attributes/settings b/tools/testing/selftests/powerpc/papr_attributes/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_attributes/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/papr_sysparm/settings b/tools/testing/selftests/powerpc/papr_sysparm/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_sysparm/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/papr_vpd/settings b/tools/testing/selftests/powerpc/papr_vpd/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/pmu/settings b/tools/testing/selftests/powerpc/pmu/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/primitives/settings b/tools/testing/selftests/powerpc/primitives/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/ptrace/settings b/tools/testing/selftests/powerpc/ptrace/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/scripts/settings b/tools/testing/selftests/powerpc/scripts/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/scripts/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/security/settings b/tools/testing/selftests/powerpc/security/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/stringloops/settings b/tools/testing/selftests/powerpc/stringloops/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/switch_endian/settings b/tools/testing/selftests/powerpc/switch_endian/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/syscalls/settings b/tools/testing/selftests/powerpc/syscalls/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/settings
@@ -0,0 +1 @@
+timeout=130
diff --git a/tools/testing/selftests/powerpc/vphn/settings b/tools/testing/selftests/powerpc/vphn/settings
new file mode 100644
index 000000000000..2e8566183318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/settings
@@ -0,0 +1 @@
+timeout=130
-- 
cgit v1.2.3


From d5f578f90a34d85f1cabd4c27af1b2d9fbffe64b Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Nov 2024 00:04:51 +1100
Subject: selftests/powerpc: Fix 32-bit BE build errors on Ubuntu 24.04

Starting with Ubuntu 24.04, building the selftests with the big endian
compiler (which defaults to 32-bit) fails with errors:

  stack_expansion_ldst.c:178:37: error: format '%lx' expects argument
  of type 'long unsigned int', but argument 2 has type 'rlim_t' {aka 'long long unsigned int'}
  subpage_prot.c:214:38: error: format '%lx' expects argument of type
  'long unsigned int', but argument 3 has type 'off_t' {aka 'long long int'}

Prior to 24.04 rlim_t was long unsigned int, and off_t was long int.

Cast to unsigned long long and long long before passing to printf to
avoid the errors.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241106130453.1741013-3-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c | 2 +-
 tools/testing/selftests/powerpc/mm/subpage_prot.c         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
index ed9143990888..9c0d343d7137 100644
--- a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -175,7 +175,7 @@ static int test(void)
 
 	page_size = getpagesize();
 	getrlimit(RLIMIT_STACK, &rlimit);
-	printf("Stack rlimit is 0x%lx\n", rlimit.rlim_cur);
+	printf("Stack rlimit is 0x%llx\n", (unsigned long long)rlimit.rlim_cur);
 
 	printf("Testing loads ...\n");
 	test_one_type(LOAD, page_size, rlimit.rlim_cur);
diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c
index 3ae77ba93208..8cf9fd5fed1c 100644
--- a/tools/testing/selftests/powerpc/mm/subpage_prot.c
+++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c
@@ -211,8 +211,8 @@ int test_file(void)
 		perror("failed to map file");
 		return 1;
 	}
-	printf("allocated %s for 0x%lx bytes at %p\n",
-	       file_name, filesize, fileblock);
+	printf("allocated %s for 0x%llx bytes at %p\n",
+	       file_name, (long long)filesize, fileblock);
 
 	printf("testing file map...\n");
 
-- 
cgit v1.2.3


From c6a75555b4b2643365a007b7162a670d69aa28fe Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Nov 2024 00:04:52 +1100
Subject: selftests/powerpc: Return errors from all tests

Fix some tests which weren't returning an error code from main.

Although these tests only ever return success, they can still fail if
they time out and the harness kills them. If that happens they still
return success to the shell, which is incorrect and confuses the higher
level error reporting.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241106130453.1741013-4-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/signal/sigfuz.c                 | 2 +-
 tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c | 2 +-
 tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c     | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/signal/sigfuz.c b/tools/testing/selftests/powerpc/signal/sigfuz.c
index 08f9afe3b95c..c101b1391696 100644
--- a/tools/testing/selftests/powerpc/signal/sigfuz.c
+++ b/tools/testing/selftests/powerpc/signal/sigfuz.c
@@ -321,5 +321,5 @@ int main(int argc, char **argv)
 	if (!args)
 		args = ARG_COMPLETE;
 
-	test_harness(signal_fuzzer, "signal_fuzzer");
+	return test_harness(signal_fuzzer, "signal_fuzzer");
 }
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
index 421cb082f6be..0a4bc479ae39 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
@@ -176,5 +176,5 @@ int tm_signal_context_force_tm(void)
 
 int main(int argc, char **argv)
 {
-	test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm");
+	return test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm");
 }
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
index 06b801906f27..968864b052ec 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
@@ -46,6 +46,5 @@ int tm_signal_sigreturn_nt(void)
 
 int main(int argc, char **argv)
 {
-	test_harness(tm_signal_sigreturn_nt, "tm_signal_sigreturn_nt");
+	return test_harness(tm_signal_sigreturn_nt, "tm_signal_sigreturn_nt");
 }
-
-- 
cgit v1.2.3


From a8a54a65cac4f8202df36f925b6746328802d05f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 7 Nov 2024 00:04:53 +1100
Subject: selftests/powerpc: Detect taint change in mitigation patching test

Currently the mitigation patching test errors out if the kernel is
tainted prior to the test running.

That causes the test to fail unnecessarily if some other test has caused
the kernel to be tainted, or if, for example, a proprietary or
force-loaded module is loaded.

Instead just warn if the kernel is tainted to begin with, and only
report a change in the taint state as an error in the test.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241106130453.1741013-5-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/security/mitigation-patching.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/security/mitigation-patching.sh b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
index f43aa4b77fba..9a4612e2e953 100755
--- a/tools/testing/selftests/powerpc/security/mitigation-patching.sh
+++ b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
@@ -36,8 +36,7 @@ fi
 
 tainted=$(cat /proc/sys/kernel/tainted)
 if [[ "$tainted" -ne 0 ]]; then
-    echo "Error: kernel already tainted!" >&2
-    exit 1
+    echo "Warning: kernel already tainted! ($tainted)" >&2
 fi
 
 mitigations="barrier_nospec stf_barrier count_cache_flush rfi_flush entry_flush uaccess_flush"
@@ -68,9 +67,10 @@ fi
 echo "Waiting for timeout ..."
 wait
 
+orig_tainted=$tainted
 tainted=$(cat /proc/sys/kernel/tainted)
-if [[ "$tainted" -ne 0 ]]; then
-    echo "Error: kernel became tainted!" >&2
+if [[ "$tainted" != "$orig_tainted" ]]; then
+    echo "Error: kernel newly tainted, before ($orig_tainted) after ($tainted)" >&2
     exit 1
 fi
 
-- 
cgit v1.2.3


From a3590d71a1acef850864f19bff2f37f56b2d4f00 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 30 Oct 2024 00:02:02 +0000
Subject: kselftest/arm64: Increase frequency of signal delivery in fp-stress

Currently we only deliver signals to the processes being tested about once
a second, meaning that the signal code paths are subject to relatively
little stress. Increase this frequency substantially to 25ms intervals,
along with some minor refactoring to make this more readily tuneable and
maintain the 1s logging interval. This interval was chosen based on some
experimentation with emulated platforms to avoid causing so much extra load
that the test starts to run into the 45s limit for selftests or generally
completely disconnect the timeout numbers from the

We could increase this if we moved the signal generation out of the main
supervisor thread, though we should also consider that the percentage of
time that we spend interacting with the floating point state is also a
consideration.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-1-bd3cef48c22c@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index 13958e645afc..b81bc0842f17 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -28,6 +28,9 @@
 
 #define MAX_VLS 16
 
+#define SIGNAL_INTERVAL_MS 25
+#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS)
+
 struct child_data {
 	char *name, *output;
 	pid_t pid;
@@ -448,7 +451,7 @@ static const struct option options[] = {
 int main(int argc, char **argv)
 {
 	int ret;
-	int timeout = 10;
+	int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS);
 	int cpus, i, j, c;
 	int sve_vl_count, sme_vl_count;
 	bool all_children_started = false;
@@ -504,7 +507,7 @@ int main(int argc, char **argv)
 		       have_sme2 ? "present" : "absent");
 
 	if (timeout > 0)
-		ksft_print_msg("Will run for %ds\n", timeout);
+		ksft_print_msg("Will run for %d\n", timeout);
 	else
 		ksft_print_msg("Will run until terminated\n");
 
@@ -577,14 +580,14 @@ int main(int argc, char **argv)
 			break;
 
 		/*
-		 * Timeout is counted in seconds with no output, the
-		 * tests print during startup then are silent when
-		 * running so this should ensure they all ran enough
-		 * to install the signal handler, this is especially
-		 * useful in emulation where we will both be slow and
-		 * likely to have a large set of VLs.
+		 * Timeout is counted in poll intervals with no
+		 * output, the tests print during startup then are
+		 * silent when running so this should ensure they all
+		 * ran enough to install the signal handler, this is
+		 * especially useful in emulation where we will both
+		 * be slow and likely to have a large set of VLs.
 		 */
-		ret = epoll_wait(epoll_fd, evs, tests, 1000);
+		ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS);
 		if (ret < 0) {
 			if (errno == EINTR)
 				continue;
@@ -624,8 +627,9 @@ int main(int argc, char **argv)
 			all_children_started = true;
 		}
 
-		ksft_print_msg("Sending signals, timeout remaining: %d\n",
-			       timeout);
+		if ((timeout % LOG_INTERVALS) == 0)
+			ksft_print_msg("Sending signals, timeout remaining: %d\n",
+				       timeout);
 
 		for (i = 0; i < num_children; i++)
 			child_tickle(&children[i]);
-- 
cgit v1.2.3


From 161e9925053cafa83a1eb265a001b6917dfafa29 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 30 Oct 2024 00:02:03 +0000
Subject: kselftest/arm64: Poll less often while waiting for fp-stress children

While fp-stress is waiting for children to start it doesn't send any
signals to them so there is no need for it to have as short an epoll()
timeout as it does when the children are all running. We do still want to
have some timeout so that we can log diagnostics about missing children but
this can be relatively large. On emulated platforms the overhead of running
the supervisor process is quite high, especially during the process of
execing the test binaries.

Implement a longer epoll() timeout during the setup phase, using a 5s
timeout while waiting for children and switching to the signal raise
interval when all the children are started and we start sending signals.

Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-2-bd3cef48c22c@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index b81bc0842f17..ad867ff9687a 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -452,6 +452,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 	int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS);
+	int poll_interval = 5000;
 	int cpus, i, j, c;
 	int sve_vl_count, sme_vl_count;
 	bool all_children_started = false;
@@ -587,7 +588,7 @@ int main(int argc, char **argv)
 		 * especially useful in emulation where we will both
 		 * be slow and likely to have a large set of VLs.
 		 */
-		ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS);
+		ret = epoll_wait(epoll_fd, evs, tests, poll_interval);
 		if (ret < 0) {
 			if (errno == EINTR)
 				continue;
@@ -625,6 +626,7 @@ int main(int argc, char **argv)
 			}
 
 			all_children_started = true;
+			poll_interval = SIGNAL_INTERVAL_MS;
 		}
 
 		if ((timeout % LOG_INTERVALS) == 0)
-- 
cgit v1.2.3


From 94de486e4215b83de8a5e908b4b8f7d3979c3b0e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:20 +0000
Subject: kselftest/arm64: Correct misleading comments on fp-stress irritators

The comments in the handlers for the irritator signal in the test threads
for fp-stress suggest that the irritator will corrupt the register state
observed by the main thread but this is not the case, instead the FPSIMD
and SVE irritators (which are the only ones that are implemented) modify
the current register state which is expected to be overwritten on return
from the handler by the saved register state. Update the comment to reflect
what the handler is actually doing.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-1-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fpsimd-test.S | 3 +--
 tools/testing/selftests/arm64/fp/sve-test.S    | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index 8b960d01ed2e..bdfb7cf2e4ec 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -134,8 +134,7 @@ function check_vreg
 	b	memcmp
 endfunction
 
-// Any SVE register modified here can cause corruption in the main
-// thread -- but *only* the registers modified here.
+// Modify live register state, the signal return will undo our changes
 function irritator_handler
 	// Increment the irritation signal count (x23):
 	ldr	x0, [x2, #ucontext_regs + 8 * 23]
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index fff60e2a25ad..e3c0d585684d 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -291,8 +291,7 @@ function check_ffr
 #endif
 endfunction
 
-// Any SVE register modified here can cause corruption in the main
-// thread -- but *only* the registers modified here.
+// Modify live register state, the signal return will undo our changes
 function irritator_handler
 	// Increment the irritation signal count (x23):
 	ldr	x0, [x2, #ucontext_regs + 8 * 23]
-- 
cgit v1.2.3


From ffca567fef9c4661c792ca48a2cdd038b2df4887 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:21 +0000
Subject: kselftest/arm64: Remove unused ADRs from irritator handlers

The irritator handlers for the fp-stress test programs all use ADR to load
an address into x0 which is then not referenced. Remove these ADRs as they
just cause confusion.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-2-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fpsimd-test.S | 1 -
 tools/testing/selftests/arm64/fp/sve-test.S    | 1 -
 tools/testing/selftests/arm64/fp/za-test.S     | 1 -
 tools/testing/selftests/arm64/fp/zt-test.S     | 1 -
 4 files changed, 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index bdfb7cf2e4ec..9977ffdd758a 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -142,7 +142,6 @@ function irritator_handler
 	str	x0, [x2, #ucontext_regs + 8 * 23]
 
 	// Corrupt some random V-regs
-	adr	x0, .text + (irritator_handler - .text) / 16 * 16
 	movi	v0.8b, #7
 	movi	v9.16b, #9
 	movi	v31.8b, #31
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index e3c0d585684d..f1fb9745c681 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -299,7 +299,6 @@ function irritator_handler
 	str	x0, [x2, #ucontext_regs + 8 * 23]
 
 	// Corrupt some random Z-regs
-	adr	x0, .text + (irritator_handler - .text) / 16 * 16
 	movi	v0.8b, #1
 	movi	v9.16b, #2
 	movi	v31.8b, #3
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index 095b45531640..1ee0ec36766d 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -158,7 +158,6 @@ function irritator_handler
 
 	// Corrupt some random ZA data
 #if 0
-	adr	x0, .text + (irritator_handler - .text) / 16 * 16
 	movi	v0.8b, #1
 	movi	v9.16b, #2
 	movi	v31.8b, #3
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index b5c81e81a379..ade9c98abcda 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -127,7 +127,6 @@ function irritator_handler
 
 	// Corrupt some random ZT data
 #if 0
-	adr	x0, .text + (irritator_handler - .text) / 16 * 16
 	movi	v0.8b, #1
 	movi	v9.16b, #2
 	movi	v31.8b, #3
-- 
cgit v1.2.3


From d65f27d240bb897ac5a4a3fb7557724b3717e022 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:23 +0000
Subject: kselftest/arm64: Implement irritators for ZA and ZT

Currently we don't use the irritator signal in our floating point stress
tests so when we added ZA and ZT stress tests we didn't actually bother
implementing any actual action in the handlers, we just counted the signal
deliveries. In preparation for using the irritators let's implement them,
just trivially SMSTOP and SMSTART to reset all bits in the register to 0.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-4-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/za-test.S | 12 ++++--------
 tools/testing/selftests/arm64/fp/zt-test.S | 12 ++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index 1ee0ec36766d..f902e6ef9077 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -148,20 +148,16 @@ function check_za
 	b	memcmp
 endfunction
 
-// Any SME register modified here can cause corruption in the main
-// thread -- but *only* the locations modified here.
+// Modify the live SME register state, signal return will undo our changes
 function irritator_handler
 	// Increment the irritation signal count (x23):
 	ldr	x0, [x2, #ucontext_regs + 8 * 23]
 	add	x0, x0, #1
 	str	x0, [x2, #ucontext_regs + 8 * 23]
 
-	// Corrupt some random ZA data
-#if 0
-	movi	v0.8b, #1
-	movi	v9.16b, #2
-	movi	v31.8b, #3
-#endif
+	// This will reset ZA to all bits 0
+	smstop
+	smstart
 
 	ret
 endfunction
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index ade9c98abcda..c96cb7c2ad4b 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -117,20 +117,16 @@ function check_zt
 	b	memcmp
 endfunction
 
-// Any SME register modified here can cause corruption in the main
-// thread -- but *only* the locations modified here.
+// Modify the live SME register state, signal return will undo our changes
 function irritator_handler
 	// Increment the irritation signal count (x23):
 	ldr	x0, [x2, #ucontext_regs + 8 * 23]
 	add	x0, x0, #1
 	str	x0, [x2, #ucontext_regs + 8 * 23]
 
-	// Corrupt some random ZT data
-#if 0
-	movi	v0.8b, #1
-	movi	v9.16b, #2
-	movi	v31.8b, #3
-#endif
+	// This will reset ZT to all bits 0
+	smstop
+	smstart
 
 	ret
 endfunction
-- 
cgit v1.2.3


From 7368debf275aa2419365ce47da00922ca51c6094 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:24 +0000
Subject: kselftest/arm64: Provide a SIGUSR1 handler in the kernel mode FP
 stress test

The other stress test programs provide a SIGUSR1 handler which modifies the
live register state in order to validate that signal context is being
restored during signal return. While we can't usefully do this when testing
kernel mode FP usage provide a handler for SIGUSR1 which just counts the
number of signals like we do for SIGUSR2, allowing fp-stress to treat all
the test programs uniformly.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-5-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/kernel-test.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c
index e8da3b4cbd23..859345379044 100644
--- a/tools/testing/selftests/arm64/fp/kernel-test.c
+++ b/tools/testing/selftests/arm64/fp/kernel-test.c
@@ -267,6 +267,10 @@ int main(void)
 		       strerror(errno), errno);
 
 	sa.sa_sigaction = handle_kick_signal;
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret < 0)
+		printf("Failed to install SIGUSR1 handler: %s (%d)\n",
+		       strerror(errno), errno);
 	ret = sigaction(SIGUSR2, &sa, NULL);
 	if (ret < 0)
 		printf("Failed to install SIGUSR2 handler: %s (%d)\n",
-- 
cgit v1.2.3


From ead1c35ce3b3c766443a82b56ee343cfe7ee8305 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:25 +0000
Subject: kselftest/arm64: Test signal handler state modification in fp-stress

Currently in fp-stress we test signal delivery to the test threads by
sending SIGUSR2 which simply counts how many signals are delivered. The
test programs now also all have a SIGUSR1 handler which for the threads
doing userspace testing additionally modifies the floating point register
state in the signal handler, verifying that when we return the saved
register state is restored from the signal context as expected. Switch over
to triggering that to validate that we are restoring as expected.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-6-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index ad867ff9687a..74e23208b94c 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -223,7 +223,7 @@ static void child_output(struct child_data *child, uint32_t events,
 static void child_tickle(struct child_data *child)
 {
 	if (child->output_seen && !child->exited)
-		kill(child->pid, SIGUSR2);
+		kill(child->pid, SIGUSR1);
 }
 
 static void child_stop(struct child_data *child)
-- 
cgit v1.2.3


From 55d42a0c3f9ccd07c199e0ddbe1ba87572d30074 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 5 Nov 2024 17:52:35 -0800
Subject: selftests: net: add a test for closing a netlink socket with dump in
 progress

Close a socket with dump in progress. We need a dump which generates
enough info not to fit into a single skb. Policy dump fits the bill.

Use the trick discovered by syzbot for keeping a ref on the socket
longer than just close, with mqueue.

  TAP version 13
  1..3
  # Starting 3 tests from 1 test cases.
  #  RUN           global.test_sanity ...
  #            OK  global.test_sanity
  ok 1 global.test_sanity
  #  RUN           global.close_in_progress ...
  #            OK  global.close_in_progress
  ok 2 global.close_in_progress
  #  RUN           global.close_with_ref ...
  #            OK  global.close_with_ref
  ok 3 global.close_with_ref
  # PASSED: 3 / 3 tests passed.
  # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0

Note that this test is not expected to fail but rather crash
the kernel if we get the cleanup wrong.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20241106015235.2458807-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile        |   1 +
 tools/testing/selftests/net/netlink-dumps.c | 110 ++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 tools/testing/selftests/net/netlink-dumps.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 649f1fe0dc46..5e86f7a51b43 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -78,6 +78,7 @@ TEST_PROGS += test_vxlan_vnifiltering.sh
 TEST_GEN_FILES += io_uring_zerocopy_tx
 TEST_PROGS += io_uring_zerocopy_tx.sh
 TEST_GEN_FILES += bind_bhash
+TEST_GEN_PROGS += netlink-dumps
 TEST_GEN_PROGS += sk_bind_sendto_listen
 TEST_GEN_PROGS += sk_connect_zero_addr
 TEST_GEN_PROGS += sk_so_peek_off
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
new file mode 100644
index 000000000000..7ee6dcd334df
--- /dev/null
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+#include <linux/mqueue.h>
+
+#include "../kselftest_harness.h"
+
+static const struct {
+	struct nlmsghdr nlhdr;
+	struct genlmsghdr genlhdr;
+	struct nlattr ahdr;
+	__u16 val;
+	__u16 pad;
+} dump_policies = {
+	.nlhdr = {
+		.nlmsg_len	= sizeof(dump_policies),
+		.nlmsg_type	= GENL_ID_CTRL,
+		.nlmsg_flags	= NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		.nlmsg_seq	= 1,
+	},
+	.genlhdr = {
+		.cmd		= CTRL_CMD_GETPOLICY,
+		.version	= 2,
+	},
+	.ahdr = {
+		.nla_len	= 6,
+		.nla_type	= CTRL_ATTR_FAMILY_ID,
+	},
+	.val = GENL_ID_CTRL,
+	.pad = 0,
+};
+
+// Sanity check for the test itself, make sure the dump doesn't fit in one msg
+TEST(test_sanity)
+{
+	int netlink_sock;
+	char buf[8192];
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	ASSERT_GE(n, sizeof(struct nlmsghdr));
+
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	ASSERT_GE(n, sizeof(struct nlmsghdr));
+
+	close(netlink_sock);
+}
+
+TEST(close_in_progress)
+{
+	int netlink_sock;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	close(netlink_sock);
+}
+
+TEST(close_with_ref)
+{
+	char cookie[NOTIFY_COOKIE_LEN] = {};
+	int netlink_sock, mq_fd;
+	struct sigevent sigev;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	mq_fd = syscall(__NR_mq_open, "sed", O_CREAT | O_WRONLY, 0600, 0);
+	ASSERT_GE(mq_fd, 0);
+
+	memset(&sigev, 0, sizeof(sigev));
+	sigev.sigev_notify		= SIGEV_THREAD;
+	sigev.sigev_value.sival_ptr	= cookie;
+	sigev.sigev_signo		= netlink_sock;
+
+	syscall(__NR_mq_notify, mq_fd, &sigev);
+
+	close(netlink_sock);
+
+	// give mqueue time to fire
+	usleep(100 * 1000);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From db64dfffcad2992d6bfc680822bdf715335c43f1 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 7 Nov 2024 13:16:40 +0000
Subject: selftests/mm: Define PKEY_UNRESTRICTED for pkey_sighandler_tests

Commit 6e182dc9f268 ("selftests/mm: Use generic pkey register
manipulation") makes use of PKEY_UNRESTRICTED in
pkey_sighandler_tests. The macro has been proposed for addition to
uapi headers [1], but the patch hasn't landed yet.

Define PKEY_UNRESTRICTED in pkey-helpers.h for the time being to fix
the build.

[1] https://lore.kernel.org/all/20241028090715.509527-2-yury.khrustalev@arm.com/

Fixes: 6e182dc9f268 ("selftests/mm: Use generic pkey register manipulation")
Reported-by: Aishwarya TCV <aishwarya.tcv@arm.com>
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Link: https://lore.kernel.org/r/20241107131640.650703-1-kevin.brodsky@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/mm/pkey-helpers.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
index 9ab6a3ee153b..f7cfe163b0ff 100644
--- a/tools/testing/selftests/mm/pkey-helpers.h
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -112,6 +112,13 @@ void record_pkey_malloc(void *ptr, long size, int prot);
 #define PKEY_MASK	(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
 #endif
 
+/*
+ * FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged.
+ */
+#ifndef PKEY_UNRESTRICTED
+#define PKEY_UNRESTRICTED 0x0
+#endif
+
 #ifndef set_pkey_bits
 static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
 {
-- 
cgit v1.2.3


From 0268d4579901821ff17259213c2d8c9679995d48 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 1 Nov 2024 19:15:57 +0500
Subject: selftests: hugetlb_dio: check for initial conditions to skip in the
 start

The test should be skipped if initial conditions aren't fulfilled in the
start instead of failing and outputting non-compliant TAP logs.  This kind
of failure pollutes the results.  The initial conditions are:

- The test should only execute if /tmp file can be allocated.
- The test should only execute if huge pages are free.

Before:
TAP version 13
1..4
Bail out! Error opening file
: Read-only file system (30)
 # Planned tests != run tests (4 != 0)
 # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0

After:
TAP version 13
1..0 # SKIP Unable to allocate file: Read-only file system

Link: https://lkml.kernel.org/r/20241101141557.3159432-1-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Fixes: 3a103b5315b7 ("selftest: mm: Test if hugepage does not get leaked during __bio_release_pages()")
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb_dio.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index f9ac20c657ec..60001c142ce9 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -44,13 +44,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
 	if (fd < 0)
 		ksft_exit_fail_perror("Error opening file\n");
 
-	/* Get the free huge pages before allocation */
-	free_hpage_b = get_free_hugepages();
-	if (free_hpage_b == 0) {
-		close(fd);
-		ksft_exit_skip("No free hugepage, exiting!\n");
-	}
-
 	/* Allocate a hugetlb page */
 	orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
 	if (orig_buffer == MAP_FAILED) {
@@ -94,8 +87,20 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
 int main(void)
 {
 	size_t pagesize = 0;
+	int fd;
 
 	ksft_print_header();
+
+	/* Open the file to DIO */
+	fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+	if (fd < 0)
+		ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
+	close(fd);
+
+	/* Check if huge pages are free */
+	if (!get_free_hugepages())
+		ksft_exit_skip("No free hugepage, exiting\n");
+
 	ksft_set_plan(4);
 
 	/* Get base page size */
-- 
cgit v1.2.3


From 45488345d4b60f5c3a0a5d78fee76f0f3be896b4 Mon Sep 17 00:00:00 2001
From: Andrew Paniakin <apanyaki@amazon.com>
Date: Mon, 28 Oct 2024 16:30:53 -0700
Subject: selftests/damon/huge_count_read_write: provide sufficiently large
 buffer for DEPRECATED file read

Patch series "damon/{self,kunit}tests: minor fixups for DAMON debugfs
interface tests".

Fixup small broken window panes in DAMON selftests and kunit tests.

First four patches clean up DAMON debugfs interface selftests output, by
fixing segmentation fault of a test program (patch 1), removing
unnecessary debugging messages (patch 2), and hiding error messages from
expected failures (patches 3 and 4).

Following two patches fix copy-paste mistakes in DAMON Kconfig help
message that was copied from the debugfs kunit test (patch 5) and a comment on the
debugfs kunit test code (patch 6).


This patch (of 6):

'huge_count_read_write' crashes with segmentation fault when reading
DEPRECATED file of DAMON debugfs interface.  This is not causing any
problem for users or other tests because the purpose of the test is just
ensuring the read is not causing kernel warning messages.  Nonetheless, it
makes the output unnecessarily noisy, and the DEPRECATED file is not
properly being tested.

It happens because the size of the content of the file is larger than the
size of the buffer for the read.  The file contains about 170 characters.
Increase the buffer size to 256 characters.

Link: https://lkml.kernel.org/r/20241028233058.283381-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20241028233058.283381-2-sj@kernel.org
Fixes: b4a002889d24 ("selftests/damon: test debugfs file reads/writes with huge count")
Signed-off-by: Andrew Paniakin <apanyaki@amazon.com>
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Andrew Panyakin <apanyaki@amazon.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/huge_count_read_write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c
index a6fe0689f88d..f3c199dc8eba 100644
--- a/tools/testing/selftests/damon/huge_count_read_write.c
+++ b/tools/testing/selftests/damon/huge_count_read_write.c
@@ -18,7 +18,7 @@
 void write_read_with_huge_count(char *file)
 {
 	int filedesc = open(file, O_RDWR);
-	char buf[25];
+	char buf[256];
 	int ret;
 
 	printf("%s %s\n", __func__, file);
-- 
cgit v1.2.3


From e06a6b55ed3db832cb8fbbc2df38b367dbab51ed Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 28 Oct 2024 16:30:54 -0700
Subject: selftests/damon/huge_count_read_write: remove unnecessary debugging
 message

The program prints expected errors from write/read of the files with
invalid huge count, only for debugging purposes.  It is only making the
output noisy.  Remove those.

Link: https://lkml.kernel.org/r/20241028233058.283381-3-sj@kernel.org
Fixes: b4a002889d24 ("selftests/damon: test debugfs file reads/writes with huge count")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Andrew Paniakin <apanyaki@amazon.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/huge_count_read_write.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c
index f3c199dc8eba..53e69a669668 100644
--- a/tools/testing/selftests/damon/huge_count_read_write.c
+++ b/tools/testing/selftests/damon/huge_count_read_write.c
@@ -28,9 +28,7 @@ void write_read_with_huge_count(char *file)
 	}
 
 	write(filedesc, "", 0xfffffffful);
-	perror("after write: ");
 	ret = read(filedesc, buf, 0xfffffffful);
-	perror("after read: ");
 	close(filedesc);
 }
 
-- 
cgit v1.2.3


From 82475d111de73b5688389d2736509bf30cb338d8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 28 Oct 2024 16:30:55 -0700
Subject: selftests/damon/_debugfs_common: hide expected error message from
 test_write_result()

DAMON debugfs interface selftests use test_write_result() to check if
valid or invalid writes to files of the interface succeed or fail as
expected.  File write error messages from expected failures are only
making the output noisy.  Hide such expected error messages.

Link: https://lkml.kernel.org/r/20241028233058.283381-4-sj@kernel.org
Fixes: b348eb7abd09 ("mm/damon: add user space selftests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Andrew Paniakin <apanyaki@amazon.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/_debugfs_common.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
index aa995516870b..54d45791b0d9 100644
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -8,7 +8,12 @@ test_write_result() {
 	expect_reason=$4
 	expected=$5
 
-	echo "$content" > "$file"
+	if [ "$expected" = "0" ]
+	then
+		echo "$content" > "$file"
+	else
+		echo "$content" > "$file" 2> /dev/null
+	fi
 	if [ $? -ne "$expected" ]
 	then
 		echo "writing $content to $file doesn't return $expected"
-- 
cgit v1.2.3


From 9b1266ee08c2e45684d58a53a48866a230c76bff Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 28 Oct 2024 16:30:56 -0700
Subject: selftests/damon/debugfs_duplicate_context_creation: hide errors from
 expected file write failures

debugfs_duplicate_context_creation.sh does an invalid file write to ensure
it fails.  Check of the failure is sufficient, so the error message from
the failure only makes the output unnecessarily noisy.  Hide it.

Link: https://lkml.kernel.org/r/20241028233058.283381-5-sj@kernel.org
Fixes: ade38b8ca5ce ("selftest/damon: add a test for duplicate context dirs creation")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Andrew Paniakin <apanyaki@amazon.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
index 4a76e37ef16b..bd6c22d96ead 100755
--- a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
+++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
@@ -12,7 +12,7 @@ then
 	exit 1
 fi
 
-if echo foo > "$DBGFS/mk_contexts"
+if echo foo > "$DBGFS/mk_contexts" 2> /dev/null
 then
 	echo "duplicate context creation success"
 	exit 1
-- 
cgit v1.2.3


From 10299cdde869abab7a42fb5ab905a47a4e2cd24e Mon Sep 17 00:00:00 2001
From: John Sperbeck <jsperbeck@google.com>
Date: Tue, 5 Nov 2024 19:40:31 -0800
Subject: KVM: selftests: use X86_MEMTYPE_WB instead of VMX_BASIC_MEM_TYPE_WB

In 08a7d2525511 ("tools arch x86: Sync the msr-index.h copy with the
kernel sources"), VMX_BASIC_MEM_TYPE_WB was removed.  Use X86_MEMTYPE_WB
instead.

Fixes: 08a7d2525511 ("tools arch x86: Sync the msr-index.h copy with the
kernel sources")
Signed-off-by: John Sperbeck <jsperbeck@google.com>
Message-ID: <20241106034031.503291-1-jsperbeck@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/x86_64/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index 089b8925b6b2..d7ac122820bf 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -200,7 +200,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
 	if (vmx->eptp_gpa) {
 		uint64_t ept_paddr;
 		struct eptPageTablePointer eptp = {
-			.memory_type = VMX_BASIC_MEM_TYPE_WB,
+			.memory_type = X86_MEMTYPE_WB,
 			.page_walk_length = 3, /* + 1 */
 			.ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
 			.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
-- 
cgit v1.2.3


From 252e01e68241d33bfe0ed1fc333220d9bd8b06df Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Nov 2024 16:47:31 -0800
Subject: selftests: net: add netlink-dumps to .gitignore

Commit 55d42a0c3f9c ("selftests: net: add a test for closing
a netlink socket ith dump in progress") added a new test
but did not add it to gitignore.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Link: https://patch.msgid.link/20241108004731.2979878-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 217d8b7a7365..59fe07ee2df9 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -19,6 +19,7 @@ log.txt
 msg_oob
 msg_zerocopy
 ncdevmem
+netlink-dumps
 nettest
 psock_fanout
 psock_snd
-- 
cgit v1.2.3


From 774ca6d3bf24287ff60b7d6dd4171ebb6e47760a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 5 Nov 2024 15:39:54 +0200
Subject: bridge: Allow deleting FDB entries with non-existent VLAN

It is currently impossible to delete individual FDB entries (as opposed
to flushing) that were added with a VLAN that no longer exists:

 # ip link add name dummy1 up type dummy
 # ip link add name br1 up type bridge vlan_filtering 1
 # ip link set dev dummy1 master br1
 # bridge fdb add 00:11:22:33:44:55 dev dummy1 master static vlan 1
 # bridge vlan del vid 1 dev dummy1
 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1
 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static
 # bridge fdb del 00:11:22:33:44:55 dev dummy1 master vlan 1
 RTNETLINK answers: Invalid argument
 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1
 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static

This is in contrast to MDB entries that can be deleted after the VLAN
was deleted:

 # bridge vlan add vid 10 dev dummy1
 # bridge mdb add dev br1 port dummy1 grp 239.1.1.1 permanent vid 10
 # bridge vlan del vid 10 dev dummy1
 # bridge mdb get dev br1 grp 239.1.1.1 vid 10
 dev br1 port dummy1 grp 239.1.1.1 permanent vid 10
 # bridge mdb del dev br1 port dummy1 grp 239.1.1.1 permanent vid 10
 # bridge mdb get dev br1 grp 239.1.1.1 vid 10
 Error: bridge: MDB entry not found.

Align the two interfaces and allow user space to delete FDB entries that
were added with a VLAN that no longer exists:

 # ip link add name dummy1 up type dummy
 # ip link add name br1 up type bridge vlan_filtering 1
 # ip link set dev dummy1 master br1
 # bridge fdb add 00:11:22:33:44:55 dev dummy1 master static vlan 1
 # bridge vlan del vid 1 dev dummy1
 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1
 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static
 # bridge fdb del 00:11:22:33:44:55 dev dummy1 master vlan 1
 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1
 Error: Fdb entry not found.

Add a selftest to make sure this behavior does not regress:

 # ./rtnetlink.sh -t kci_test_fdb_del
 PASS: bridge fdb del

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Andy Roulin <aroulin@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20241105133954.350479-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/bridge/br_fdb.c                      |  9 ++-----
 tools/testing/selftests/net/rtnetlink.sh | 40 ++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 1cd7bade9b3b..77f110035df1 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1319,7 +1319,6 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 {
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_port *p = NULL;
-	struct net_bridge_vlan *v;
 	struct net_bridge *br;
 	int err;
 
@@ -1338,14 +1337,10 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	if (vid) {
-		v = br_vlan_find(vg, vid);
-		if (!v) {
-			pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name);
-			return -EINVAL;
-		}
-
 		err = __br_fdb_delete(br, p, addr, vid);
 	} else {
+		struct net_bridge_vlan *v;
+
 		err = -ENOENT;
 		err &= __br_fdb_delete(br, p, addr, 0);
 		if (!vg || !vg->num_vlans)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 87dce3efe31e..6e216d7a8e2f 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -25,6 +25,7 @@ ALL_TESTS="
 	kci_test_ipsec
 	kci_test_ipsec_offload
 	kci_test_fdb_get
+	kci_test_fdb_del
 	kci_test_neigh_get
 	kci_test_bridge_parent_id
 	kci_test_address_proto
@@ -1065,6 +1066,45 @@ kci_test_fdb_get()
 	end_test "PASS: bridge fdb get"
 }
 
+kci_test_fdb_del()
+{
+	local test_mac=de:ad:be:ef:13:37
+	local dummydev="dummy1"
+	local brdev="test-br0"
+	local ret=0
+
+	run_cmd_grep 'bridge fdb get' bridge fdb help
+	if [ $? -ne 0 ]; then
+		end_test "SKIP: fdb del tests: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	setup_ns testns
+	if [ $? -ne 0 ]; then
+		end_test "SKIP fdb del tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+	IP="ip -netns $testns"
+	BRIDGE="bridge -netns $testns"
+	run_cmd $IP link add $dummydev type dummy
+	run_cmd $IP link add name $brdev type bridge vlan_filtering 1
+	run_cmd $IP link set dev $dummydev master $brdev
+	run_cmd $BRIDGE fdb add $test_mac dev $dummydev master static vlan 1
+	run_cmd $BRIDGE vlan del vid 1 dev $dummydev
+	run_cmd $BRIDGE fdb get $test_mac br $brdev vlan 1
+	run_cmd $BRIDGE fdb del $test_mac dev $dummydev master vlan 1
+	run_cmd_fail $BRIDGE fdb get $test_mac br $brdev vlan 1
+
+	ip netns del $testns &>/dev/null
+
+	if [ $ret -ne 0 ]; then
+		end_test "FAIL: bridge fdb del"
+		return 1
+	fi
+
+	end_test "PASS: bridge fdb del"
+}
+
 kci_test_neigh_get()
 {
 	dstmac=de:ad:be:ef:13:37
-- 
cgit v1.2.3


From 876320d71f515407b81eb08a1d019f19f34907d7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 28 Oct 2024 14:13:31 +0000
Subject: selftests/mm: add self tests for guard page feature

Utilise the kselftest harness to implement tests for the guard page
implementation.

We start by implementing basic tests asserting that guard pages can be
installed, removed and that touching guard pages results in SIGSEGV.  We
also assert that, in removing guard pages from a range, non-guard pages
remain intact.

We then examine how different operations on regions containing guard markers
behave, to ensure correct behaviour:

* Operations over multiple VMAs operate as expected.
* Invoking MADV_GUARD_INSTALL / MADV_GUARD_REMOVE via process_madvise() in
  batches works correctly.
* Ensuring that munmap() correctly tears down guard markers.
* Using mprotect() to adjust protection bits does not in any way override
  or cause issues with guard markers.
* Ensuring that splitting and merging VMAs around guard markers causes no
  issue - i.e. that a marker which 'belongs' to one VMA can function just
  as well 'belonging' to another.
* Ensuring that madvise(..., MADV_DONTNEED) and madvise(..., MADV_FREE)
  do not remove guard markers.
* Ensuring that mlock()'ing a range containing guard markers does not
  cause issues.
* Ensuring that mremap() can move a guard range and retain guard markers.
* Ensuring that mremap() can expand a guard range and retain guard
  markers (perhaps moving the range).
* Ensuring that mremap() can shrink a guard range and retain guard markers.
* Ensuring that forking a process correctly retains guard markers.
* Ensuring that forking a VMA with VM_WIPEONFORK set behaves sanely.
* Ensuring that lazyfree simply clears guard markers.
* Ensuring that userfaultfd can co-exist with guard pages.
* Ensuring that madvise(..., MADV_POPULATE_READ) and
  madvise(..., MADV_POPULATE_WRITE) error out when encountering
  guard markers.
* Ensuring that madvise(..., MADV_COLD) and madvise(..., MADV_PAGEOUT) do
  not remove guard markers.

If any test is unable to be run due to lack of permissions, that test is
skipped.

Link: https://lkml.kernel.org/r/c3dcca76b736bac0aeaf1dc085927536a253ac94.1730123433.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Jann Horn <jannh@google.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/.gitignore    |    1 +
 tools/testing/selftests/mm/Makefile      |    1 +
 tools/testing/selftests/mm/guard-pages.c | 1243 ++++++++++++++++++++++++++++++
 3 files changed, 1245 insertions(+)
 create mode 100644 tools/testing/selftests/mm/guard-pages.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 689bbd520296..8f01f4da1c0d 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -54,3 +54,4 @@ droppable
 hugetlb_dio
 pkey_sighandler_tests_32
 pkey_sighandler_tests_64
+guard-pages
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 02e1204971b0..15c734d6cfec 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -79,6 +79,7 @@ TEST_GEN_FILES += hugetlb_fault_after_madv
 TEST_GEN_FILES += hugetlb_madv_vs_map
 TEST_GEN_FILES += hugetlb_dio
 TEST_GEN_FILES += droppable
+TEST_GEN_FILES += guard-pages
 
 ifneq ($(ARCH),arm64)
 TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c
new file mode 100644
index 000000000000..7cdf815d0d63
--- /dev/null
+++ b/tools/testing/selftests/mm/guard-pages.c
@@ -0,0 +1,1243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <asm-generic/mman.h> /* Force the import of the tools version. */
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+/*
+ * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
+ *
+ * "If the signal occurs other than as the result of calling the abort or raise
+ *  function, the behavior is undefined if the signal handler refers to any
+ *  object with static storage duration other than by assigning a value to an
+ *  object declared as volatile sig_atomic_t"
+ */
+static volatile sig_atomic_t signal_jump_set;
+static sigjmp_buf signal_jmp_buf;
+
+/*
+ * Ignore the checkpatch warning, we must read from x but don't want to do
+ * anything with it in order to trigger a read page fault. We therefore must use
+ * volatile to stop the compiler from optimising this away.
+ */
+#define FORCE_READ(x) (*(volatile typeof(x) *)x)
+
+static int userfaultfd(int flags)
+{
+	return syscall(SYS_userfaultfd, flags);
+}
+
+static void handle_fatal(int c)
+{
+	if (!signal_jump_set)
+		return;
+
+	siglongjmp(signal_jmp_buf, c);
+}
+
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(SYS_pidfd_open, pid, flags);
+}
+
+/*
+ * Enable our signal catcher and try to read/write the specified buffer. The
+ * return value indicates whether the read/write succeeds without a fatal
+ * signal.
+ */
+static bool try_access_buf(char *ptr, bool write)
+{
+	bool failed;
+
+	/* Tell signal handler to jump back here on fatal signal. */
+	signal_jump_set = true;
+	/* If a fatal signal arose, we will jump back here and failed is set. */
+	failed = sigsetjmp(signal_jmp_buf, 0) != 0;
+
+	if (!failed) {
+		if (write)
+			*ptr = 'x';
+		else
+			FORCE_READ(ptr);
+	}
+
+	signal_jump_set = false;
+	return !failed;
+}
+
+/* Try and read from a buffer, return true if no fatal signal. */
+static bool try_read_buf(char *ptr)
+{
+	return try_access_buf(ptr, false);
+}
+
+/* Try and write to a buffer, return true if no fatal signal. */
+static bool try_write_buf(char *ptr)
+{
+	return try_access_buf(ptr, true);
+}
+
+/*
+ * Try and BOTH read from AND write to a buffer, return true if BOTH operations
+ * succeed.
+ */
+static bool try_read_write_buf(char *ptr)
+{
+	return try_read_buf(ptr) && try_write_buf(ptr);
+}
+
+FIXTURE(guard_pages)
+{
+	unsigned long page_size;
+};
+
+FIXTURE_SETUP(guard_pages)
+{
+	struct sigaction act = {
+		.sa_handler = &handle_fatal,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL))
+		ksft_exit_fail_perror("sigaction");
+
+	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+};
+
+FIXTURE_TEARDOWN(guard_pages)
+{
+	struct sigaction act = {
+		.sa_handler = SIG_DFL,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	sigaction(SIGSEGV, &act, NULL);
+}
+
+TEST_F(guard_pages, basic)
+{
+	const unsigned long NUM_PAGES = 10;
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	ptr = mmap(NULL, NUM_PAGES * page_size, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANON, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Trivially assert we can touch the first page. */
+	ASSERT_TRUE(try_read_write_buf(ptr));
+
+	ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Establish that 1st page SIGSEGV's. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+
+	/* Ensure we can touch everything else.*/
+	for (i = 1; i < NUM_PAGES; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Establish a guard page at the end of the mapping. */
+	ASSERT_EQ(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+			  MADV_GUARD_INSTALL), 0);
+
+	/* Check that both guard pages result in SIGSEGV. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+	/* Remove the first guard page. */
+	ASSERT_FALSE(madvise(ptr, page_size, MADV_GUARD_REMOVE));
+
+	/* Make sure we can touch it. */
+	ASSERT_TRUE(try_read_write_buf(ptr));
+
+	/* Remove the last guard page. */
+	ASSERT_FALSE(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+			     MADV_GUARD_REMOVE));
+
+	/* Make sure we can touch it. */
+	ASSERT_TRUE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+	/*
+	 *  Test setting a _range_ of pages, namely the first 3. The first of
+	 *  these has been faulted in, so this also tests that we can install
+	 *  guard pages over backed pages.
+	 */
+	ASSERT_EQ(madvise(ptr, 3 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure they are all guard pages. */
+	for (i = 0; i < 3; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Make sure the rest are not. */
+	for (i = 3; i < NUM_PAGES; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Remove guard pages. */
+	ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+	/* Now make sure we can touch everything. */
+	for (i = 0; i < NUM_PAGES; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/*
+	 * Now remove all guard pages, make sure we don't remove existing
+	 * entries.
+	 */
+	ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+	for (i = 0; i < NUM_PAGES * page_size; i += page_size) {
+		char chr = ptr[i];
+
+		ASSERT_EQ(chr, 'x');
+	}
+
+	ASSERT_EQ(munmap(ptr, NUM_PAGES * page_size), 0);
+}
+
+/* Assert that operations applied across multiple VMAs work as expected. */
+TEST_F(guard_pages, multi_vma)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr_region, *ptr, *ptr1, *ptr2, *ptr3;
+	int i;
+
+	/* Reserve a 100 page region over which we can install VMAs. */
+	ptr_region = mmap(NULL, 100 * page_size, PROT_NONE,
+			  MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_region, MAP_FAILED);
+
+	/* Place a VMA of 10 pages size at the start of the region. */
+	ptr1 = mmap(ptr_region, 10 * page_size, PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr1, MAP_FAILED);
+
+	/* Place a VMA of 5 pages size 50 pages into the region. */
+	ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size,
+		    PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr2, MAP_FAILED);
+
+	/* Place a VMA of 20 pages size at the end of the region. */
+	ptr3 = mmap(&ptr_region[80 * page_size], 20 * page_size,
+		    PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr3, MAP_FAILED);
+
+	/* Unmap gaps. */
+	ASSERT_EQ(munmap(&ptr_region[10 * page_size], 40 * page_size), 0);
+	ASSERT_EQ(munmap(&ptr_region[55 * page_size], 25 * page_size), 0);
+
+	/*
+	 * We end up with VMAs like this:
+	 *
+	 * 0    10 .. 50   55 .. 80   100
+	 * [---]      [---]      [---]
+	 */
+
+	/*
+	 * Now mark the whole range as guard pages and make sure all VMAs are as
+	 * such.
+	 */
+
+	/*
+	 * madvise() is certifiable and lets you perform operations over gaps,
+	 * everything works, but it indicates an error and errno is set to
+	 * ENOMEM. Also if anything runs out of memory it is set to
+	 * ENOMEM. You are meant to guess which is which.
+	 */
+	ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), -1);
+	ASSERT_EQ(errno, ENOMEM);
+
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr1[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	for (i = 0; i < 5; i++) {
+		char *curr = &ptr2[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	for (i = 0; i < 20; i++) {
+		char *curr = &ptr3[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Now remove guard pages over range and assert the opposite. */
+
+	ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), -1);
+	ASSERT_EQ(errno, ENOMEM);
+
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr1[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	for (i = 0; i < 5; i++) {
+		char *curr = &ptr2[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	for (i = 0; i < 20; i++) {
+		char *curr = &ptr3[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Now map incompatible VMAs in the gaps. */
+	ptr = mmap(&ptr_region[10 * page_size], 40 * page_size,
+		   PROT_READ | PROT_WRITE | PROT_EXEC,
+		   MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+	ptr = mmap(&ptr_region[55 * page_size], 25 * page_size,
+		   PROT_READ | PROT_WRITE | PROT_EXEC,
+		   MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/*
+	 * We end up with VMAs like this:
+	 *
+	 * 0    10 .. 50   55 .. 80   100
+	 * [---][xxxx][---][xxxx][---]
+	 *
+	 * Where 'x' signifies VMAs that cannot be merged with those adjacent to
+	 * them.
+	 */
+
+	/* Multiple VMAs adjacent to one another should result in no error. */
+	ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), 0);
+	for (i = 0; i < 100; i++) {
+		char *curr = &ptr_region[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+	ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), 0);
+	for (i = 0; i < 100; i++) {
+		char *curr = &ptr_region[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr_region, 100 * page_size), 0);
+}
+
+/*
+ * Assert that batched operations performed using process_madvise() work as
+ * expected.
+ */
+TEST_F(guard_pages, process_madvise)
+{
+	const unsigned long page_size = self->page_size;
+	pid_t pid = getpid();
+	int pidfd = pidfd_open(pid, 0);
+	char *ptr_region, *ptr1, *ptr2, *ptr3;
+	ssize_t count;
+	struct iovec vec[6];
+
+	ASSERT_NE(pidfd, -1);
+
+	/* Reserve region to map over. */
+	ptr_region = mmap(NULL, 100 * page_size, PROT_NONE,
+			  MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_region, MAP_FAILED);
+
+	/*
+	 * 10 pages offset 1 page into reserve region. We MAP_POPULATE so we
+	 * overwrite existing entries and test this code path against
+	 * overwriting existing entries.
+	 */
+	ptr1 = mmap(&ptr_region[page_size], 10 * page_size,
+		    PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE | MAP_POPULATE, -1, 0);
+	ASSERT_NE(ptr1, MAP_FAILED);
+	/* We want guard markers at start/end of each VMA. */
+	vec[0].iov_base = ptr1;
+	vec[0].iov_len = page_size;
+	vec[1].iov_base = &ptr1[9 * page_size];
+	vec[1].iov_len = page_size;
+
+	/* 5 pages offset 50 pages into reserve region. */
+	ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size,
+		    PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr2, MAP_FAILED);
+	vec[2].iov_base = ptr2;
+	vec[2].iov_len = page_size;
+	vec[3].iov_base = &ptr2[4 * page_size];
+	vec[3].iov_len = page_size;
+
+	/* 20 pages offset 79 pages into reserve region. */
+	ptr3 = mmap(&ptr_region[79 * page_size], 20 * page_size,
+		    PROT_READ | PROT_WRITE,
+		    MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr3, MAP_FAILED);
+	vec[4].iov_base = ptr3;
+	vec[4].iov_len = page_size;
+	vec[5].iov_base = &ptr3[19 * page_size];
+	vec[5].iov_len = page_size;
+
+	/* Free surrounding VMAs. */
+	ASSERT_EQ(munmap(ptr_region, page_size), 0);
+	ASSERT_EQ(munmap(&ptr_region[11 * page_size], 39 * page_size), 0);
+	ASSERT_EQ(munmap(&ptr_region[55 * page_size], 24 * page_size), 0);
+	ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0);
+
+	/* Now guard in one step. */
+	count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0);
+
+	/* OK we don't have permission to do this, skip. */
+	if (count == -1 && errno == EPERM)
+		ksft_exit_skip("No process_madvise() permissions, try running as root.\n");
+
+	/* Returns the number of bytes advised. */
+	ASSERT_EQ(count, 6 * page_size);
+
+	/* Now make sure the guarding was applied. */
+
+	ASSERT_FALSE(try_read_write_buf(ptr1));
+	ASSERT_FALSE(try_read_write_buf(&ptr1[9 * page_size]));
+
+	ASSERT_FALSE(try_read_write_buf(ptr2));
+	ASSERT_FALSE(try_read_write_buf(&ptr2[4 * page_size]));
+
+	ASSERT_FALSE(try_read_write_buf(ptr3));
+	ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size]));
+
+	/* Now do the same with unguard... */
+	count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0);
+
+	/* ...and everything should now succeed. */
+
+	ASSERT_TRUE(try_read_write_buf(ptr1));
+	ASSERT_TRUE(try_read_write_buf(&ptr1[9 * page_size]));
+
+	ASSERT_TRUE(try_read_write_buf(ptr2));
+	ASSERT_TRUE(try_read_write_buf(&ptr2[4 * page_size]));
+
+	ASSERT_TRUE(try_read_write_buf(ptr3));
+	ASSERT_TRUE(try_read_write_buf(&ptr3[19 * page_size]));
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr1, 10 * page_size), 0);
+	ASSERT_EQ(munmap(ptr2, 5 * page_size), 0);
+	ASSERT_EQ(munmap(ptr3, 20 * page_size), 0);
+	close(pidfd);
+}
+
+/* Assert that unmapping ranges does not leave guard markers behind. */
+TEST_F(guard_pages, munmap)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr, *ptr_new1, *ptr_new2;
+
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard first and last pages. */
+	ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+	ASSERT_EQ(madvise(&ptr[9 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Assert that they are guarded. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[9 * page_size]));
+
+	/* Unmap them. */
+	ASSERT_EQ(munmap(ptr, page_size), 0);
+	ASSERT_EQ(munmap(&ptr[9 * page_size], page_size), 0);
+
+	/* Map over them.*/
+	ptr_new1 = mmap(ptr, page_size, PROT_READ | PROT_WRITE,
+			MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new1, MAP_FAILED);
+	ptr_new2 = mmap(&ptr[9 * page_size], page_size, PROT_READ | PROT_WRITE,
+			MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new2, MAP_FAILED);
+
+	/* Assert that they are now not guarded. */
+	ASSERT_TRUE(try_read_write_buf(ptr_new1));
+	ASSERT_TRUE(try_read_write_buf(ptr_new2));
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mprotect() operations have no bearing on guard markers. */
+TEST_F(guard_pages, mprotect)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard the middle of the range. */
+	ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+			  MADV_GUARD_INSTALL), 0);
+
+	/* Assert that it is indeed guarded. */
+	ASSERT_FALSE(try_read_write_buf(&ptr[5 * page_size]));
+	ASSERT_FALSE(try_read_write_buf(&ptr[6 * page_size]));
+
+	/* Now make these pages read-only. */
+	ASSERT_EQ(mprotect(&ptr[5 * page_size], 2 * page_size, PROT_READ), 0);
+
+	/* Make sure the range is still guarded. */
+	ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+	ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+	/* Make sure we can guard again without issue.*/
+	ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+			  MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the range is, yet again, still guarded. */
+	ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+	ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+	/* Now unguard the whole range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+	/* Make sure the whole range is readable. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Split and merge VMAs and make sure guard pages still behave. */
+TEST_F(guard_pages, split_merge)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr, *ptr_new;
+	int i;
+
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard the whole range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the whole range is guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Now unmap some pages in the range so we split. */
+	ASSERT_EQ(munmap(&ptr[2 * page_size], page_size), 0);
+	ASSERT_EQ(munmap(&ptr[5 * page_size], page_size), 0);
+	ASSERT_EQ(munmap(&ptr[8 * page_size], page_size), 0);
+
+	/* Make sure the still-mapped ranges (not 2, 5, 8) remain guarded. */
+	for (i = 0; i < 2; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+	for (i = 3; i < 5; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+	for (i = 6; i < 8; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+	for (i = 9; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Now map them again - the unmap will have cleared the guards. */
+	ptr_new = mmap(&ptr[2 * page_size], page_size, PROT_READ | PROT_WRITE,
+		       MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new, MAP_FAILED);
+	ptr_new = mmap(&ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE,
+		       MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new, MAP_FAILED);
+	ptr_new = mmap(&ptr[8 * page_size], page_size, PROT_READ | PROT_WRITE,
+		       MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new, MAP_FAILED);
+
+	/* Remapped pages must be accessible, all others still guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+		bool result = try_read_write_buf(curr);
+		bool expect_true = i == 2 || i == 5 || i == 8;
+
+		ASSERT_TRUE(expect_true ? result : !result);
+	}
+
+	/* Now guard everything again. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the whole range is guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Now split the range into three. */
+	ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+	ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+	/* Make sure the whole range is guarded for read. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_buf(curr));
+	}
+
+	/* Now reset protection bits so we merge the whole thing. */
+	ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+	ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+			   PROT_READ | PROT_WRITE), 0);
+
+	/* Make sure the whole range is still guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Split range into 3 again... */
+	ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+	ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+	/* ...and unguard the whole range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+	/* Make sure the whole range is remedied for read. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_buf(curr));
+	}
+
+	/* Merge them again. */
+	ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+	ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+			   PROT_READ | PROT_WRITE), 0);
+
+	/* Now ensure the merged range is remedied for read/write. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that MADV_DONTNEED does not remove guard markers. */
+TEST_F(guard_pages, dontneed)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Back the whole range. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		*curr = 'y';
+	}
+
+	/* Guard every other page. */
+	for (i = 0; i < 10; i += 2) {
+		char *curr = &ptr[i * page_size];
+		int res = madvise(curr, page_size, MADV_GUARD_INSTALL);
+
+		ASSERT_EQ(res, 0);
+	}
+
+	/* Indicate that we don't need any of the range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_DONTNEED), 0);
+
+	/* Check to ensure guard markers are still in place. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+		bool result = try_read_buf(curr);
+
+		if (i % 2 == 0) {
+			ASSERT_FALSE(result);
+		} else {
+			ASSERT_TRUE(result);
+			/* Make sure we really did get reset to zero page. */
+			ASSERT_EQ(*curr, '\0');
+		}
+
+		/* Now write... */
+		result = try_write_buf(&ptr[i * page_size]);
+
+		/* ...and make sure same result. */
+		ASSERT_TRUE(i % 2 != 0 ? result : !result);
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mlock()'ed pages work correctly with guard markers. */
+TEST_F(guard_pages, mlock)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Populate. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		*curr = 'y';
+	}
+
+	/* Lock. */
+	ASSERT_EQ(mlock(ptr, 10 * page_size), 0);
+
+	/* Now try to guard, should fail with EINVAL. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* OK unlock. */
+	ASSERT_EQ(munlock(ptr, 10 * page_size), 0);
+
+	/* Guard first half of range, should now succeed. */
+	ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure guard works. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+		bool result = try_read_write_buf(curr);
+
+		if (i < 5) {
+			ASSERT_FALSE(result);
+		} else {
+			ASSERT_TRUE(result);
+			ASSERT_EQ(*curr, 'x');
+		}
+	}
+
+	/*
+	 * Now lock the latter part of the range. We can't lock the guard pages,
+	 * as this would result in the pages being populated and the guarding
+	 * would cause this to error out.
+	 */
+	ASSERT_EQ(mlock(&ptr[5 * page_size], 5 * page_size), 0);
+
+	/*
+	 * Now remove guard pages, we permit mlock()'d ranges to have guard
+	 * pages removed as it is a non-destructive operation.
+	 */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+	/* Now check that no guard pages remain. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * - Moving a mapping alone should retain markers as they are.
+ */
+TEST_F(guard_pages, mremap_move)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr, *ptr_new;
+
+	/* Map 5 pages. */
+	ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Place guard markers at both ends of the 5 page span. */
+	ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+	ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the guard pages are in effect. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/* Map a new region we will move this range into. Doing this ensures
+	 * that we have reserved a range to map into.
+	 */
+	ptr_new = mmap(NULL, 5 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE,
+		       -1, 0);
+	ASSERT_NE(ptr_new, MAP_FAILED);
+
+	ASSERT_EQ(mremap(ptr, 5 * page_size, 5 * page_size,
+			 MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new), ptr_new);
+
+	/* Make sure the guard markers are retained. */
+	ASSERT_FALSE(try_read_write_buf(ptr_new));
+	ASSERT_FALSE(try_read_write_buf(&ptr_new[4 * page_size]));
+
+	/*
+	 * Clean up - we only need to reference the new pointer as we overwrote the
+	 * PROT_NONE range and moved the existing one.
+	 */
+	munmap(ptr_new, 5 * page_size);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Expanding should retain guard pages, only now in different position. The user
+ * will have to remove guard pages manually to fix up (they'd have to do the
+ * same if it were a PROT_NONE mapping).
+ */
+TEST_F(guard_pages, mremap_expand)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr, *ptr_new;
+
+	/* Map 10 pages... */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+	/* ...But unmap the last 5 so we can ensure we can expand into them. */
+	ASSERT_EQ(munmap(&ptr[5 * page_size], 5 * page_size), 0);
+
+	/* Place guard markers at both ends of the 5 page span. */
+	ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+	ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the guarding is in effect. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/* Now expand to 10 pages. */
+	ptr = mremap(ptr, 5 * page_size, 10 * page_size, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/*
+	 * Make sure the guard markers are retained in their original positions.
+	 */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/* Reserve a region which we can move to and expand into. */
+	ptr_new = mmap(NULL, 20 * page_size, PROT_NONE,
+		       MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr_new, MAP_FAILED);
+
+	/* Now move and expand into it. */
+	ptr = mremap(ptr, 10 * page_size, 20 * page_size,
+		     MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new);
+	ASSERT_EQ(ptr, ptr_new);
+
+	/*
+	 * Again, make sure the guard markers are retained in their original positions.
+	 */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/*
+	 * A real user would have to remove guard markers, but would reasonably
+	 * expect all characteristics of the mapping to be retained, including
+	 * guard markers.
+	 */
+
+	/* Cleanup. */
+	munmap(ptr, 20 * page_size);
+}
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Shrinking will result in markers that are shrunk over being removed. Again,
+ * if the user were using a PROT_NONE mapping they'd have to manually fix this
+ * up also so this is OK.
+ */
+TEST_F(guard_pages, mremap_shrink)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	/* Map 5 pages. */
+	ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Place guard markers at both ends of the 5 page span. */
+	ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+	ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Make sure the guarding is in effect. */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+	ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/* Now shrink to 3 pages. */
+	ptr = mremap(ptr, 5 * page_size, 3 * page_size, MREMAP_MAYMOVE);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* We expect the guard marker at the start to be retained... */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+
+	/* ...But remaining pages will not have guard markers. */
+	for (i = 1; i < 3; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/*
+	 * As with expansion, a real user would have to remove guard pages and
+	 * fixup. But you'd have to do similar manual things with PROT_NONE
+	 * mappings too.
+	 */
+
+	/*
+	 * If we expand back to the original size, the end marker will, of
+	 * course, no longer be present.
+	 */
+	ptr = mremap(ptr, 3 * page_size, 5 * page_size, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Again, we expect the guard marker at the start to be retained... */
+	ASSERT_FALSE(try_read_write_buf(ptr));
+
+	/* ...But remaining pages will not have guard markers. */
+	for (i = 1; i < 5; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_TRUE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	munmap(ptr, 5 * page_size);
+}
+
+/*
+ * Assert that forking a process with VMAs that do not have VM_WIPEONFORK set
+ * retains guard pages.
+ */
+TEST_F(guard_pages, fork)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	pid_t pid;
+	int i;
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Establish guard pages in the first 5 pages. */
+	ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+	pid = fork();
+	ASSERT_NE(pid, -1);
+	if (!pid) {
+		/* This is the child process now. */
+
+		/* Assert that the guarding is in effect. */
+		for (i = 0; i < 10; i++) {
+			char *curr = &ptr[i * page_size];
+			bool result = try_read_write_buf(curr);
+
+			ASSERT_TRUE(i >= 5 ? result : !result);
+		}
+
+		/* Now unguard the range. */
+		ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+		exit(0);
+	}
+
+	/* Parent process. */
+
+	/* Parent simply waits on child. */
+	waitpid(pid, NULL, 0);
+
+	/* Child unguard does not impact parent page table state. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+		bool result = try_read_write_buf(curr);
+
+		ASSERT_TRUE(i >= 5 ? result : !result);
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that forking a process with VMAs that do have VM_WIPEONFORK set
+ * behaves as expected.
+ */
+TEST_F(guard_pages, fork_wipeonfork)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	pid_t pid;
+	int i;
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Mark wipe on fork. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_WIPEONFORK), 0);
+
+	/* Guard the first 5 pages. */
+	ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+	pid = fork();
+	ASSERT_NE(pid, -1);
+	if (!pid) {
+		/* This is the child process now. */
+
+		/* Guard will have been wiped. */
+		for (i = 0; i < 10; i++) {
+			char *curr = &ptr[i * page_size];
+
+			ASSERT_TRUE(try_read_write_buf(curr));
+		}
+
+		exit(0);
+	}
+
+	/* Parent process. */
+
+	waitpid(pid, NULL, 0);
+
+	/* Guard markers should be in effect. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+		bool result = try_read_write_buf(curr);
+
+		ASSERT_TRUE(i >= 5 ? result : !result);
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_FREE retains guard entries as expected. */
+TEST_F(guard_pages, lazyfree)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Ensure guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Lazyfree range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_FREE), 0);
+
+	/* This should leave the guard markers in place. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_POPULATE_READ, MADV_POPULATE_WRITE behave as expected. */
+TEST_F(guard_pages, populate)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Populate read should error out... */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_READ), -1);
+	ASSERT_EQ(errno, EFAULT);
+
+	/* ...as should populate write. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_WRITE), -1);
+	ASSERT_EQ(errno, EFAULT);
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_COLD, MADV_PAGEOUT do not remove guard markers. */
+TEST_F(guard_pages, cold_pageout)
+{
+	const unsigned long page_size = self->page_size;
+	char *ptr;
+	int i;
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Guard range. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Ensure guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Now mark cold. This should have no impact on guard markers. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_COLD), 0);
+
+	/* Should remain guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* OK, now page out. This should, equally, have no effect on markers. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0);
+
+	/* Should remain guarded. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that guard pages do not break userfaultfd. */
+TEST_F(guard_pages, uffd)
+{
+	const unsigned long page_size = self->page_size;
+	int uffd;
+	char *ptr;
+	int i;
+	struct uffdio_api api = {
+		.api = UFFD_API,
+		.features = 0,
+	};
+	struct uffdio_register reg;
+	struct uffdio_range range;
+
+	/* Set up uffd. */
+	uffd = userfaultfd(0);
+	if (uffd == -1 && errno == EPERM)
+		ksft_exit_skip("No userfaultfd permissions, try running as root.\n");
+	ASSERT_NE(uffd, -1);
+
+	ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0);
+
+	/* Map 10 pages. */
+	ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Register the range with uffd. */
+	range.start = (unsigned long)ptr;
+	range.len = 10 * page_size;
+	reg.range = range;
+	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+	ASSERT_EQ(ioctl(uffd, UFFDIO_REGISTER, &reg), 0);
+
+	/* Guard the range. This should not trigger the uffd. */
+	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* The guarding should behave as usual with no uffd intervention. */
+	for (i = 0; i < 10; i++) {
+		char *curr = &ptr[i * page_size];
+
+		ASSERT_FALSE(try_read_write_buf(curr));
+	}
+
+	/* Cleanup. */
+	ASSERT_EQ(ioctl(uffd, UFFDIO_UNREGISTER, &range), 0);
+	close(uffd);
+	ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From ae16b0ab3baeb5e969dd8192a185297b96cd56a9 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <schlameuss@linux.ibm.com>
Date: Thu, 7 Nov 2024 15:10:20 +0100
Subject: KVM: s390: selftests: Add uc_map_unmap VM test case

Add a test case verifying basic running and interaction of ucontrol VMs.
Fill the segment and page tables for allocated memory and map memory on
first access.

* uc_map_unmap
  Store and load data to mapped and unmapped memory and use pic segment
  translation handling to map memory on access.

Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20241107141024.238916-2-schlameuss@linux.ibm.com
[frankja@linux.ibm.com: Fixed patch prefix]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241107141024.238916-2-schlameuss@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/ucontrol_test.c | 145 +++++++++++++++++++++-
 1 file changed, 144 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
index f257beec1430..3e649b12a0b9 100644
--- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -16,7 +16,11 @@
 #include <linux/capability.h>
 #include <linux/sizes.h>
 
+#define PGM_SEGMENT_TRANSLATION 0x10
+
 #define VM_MEM_SIZE (4 * SZ_1M)
+#define VM_MEM_EXT_SIZE (2 * SZ_1M)
+#define VM_MEM_MAX_M ((VM_MEM_SIZE + VM_MEM_EXT_SIZE) / SZ_1M)
 
 /* so directly declare capget to check caps without libcap */
 int capget(cap_user_header_t header, cap_user_data_t data);
@@ -58,6 +62,23 @@ asm("test_gprs_asm:\n"
 	"	j	0b\n"
 );
 
+/* Test program manipulating memory */
+extern char test_mem_asm[];
+asm("test_mem_asm:\n"
+	"xgr	%r0, %r0\n"
+
+	"0:\n"
+	"	ahi	%r0,1\n"
+	"	st	%r1,0(%r5,%r6)\n"
+
+	"	xgr	%r1,%r1\n"
+	"	l	%r1,0(%r5,%r6)\n"
+	"	ahi	%r0,1\n"
+	"	diag	0,0,0x44\n"
+
+	"	j	0b\n"
+);
+
 FIXTURE(uc_kvm)
 {
 	struct kvm_s390_sie_block *sie_block;
@@ -67,6 +88,7 @@ FIXTURE(uc_kvm)
 	uintptr_t base_hva;
 	uintptr_t code_hva;
 	int kvm_run_size;
+	vm_paddr_t pgd;
 	void *vm_mem;
 	int vcpu_fd;
 	int kvm_fd;
@@ -116,7 +138,7 @@ FIXTURE_SETUP(uc_kvm)
 	self->base_gpa = 0;
 	self->code_gpa = self->base_gpa + (3 * SZ_1M);
 
-	self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE);
+	self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_MAX_M * SZ_1M);
 	ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno);
 	self->base_hva = (uintptr_t)self->vm_mem;
 	self->code_hva = self->base_hva - self->base_gpa + self->code_gpa;
@@ -222,6 +244,60 @@ TEST(uc_cap_hpage)
 	close(kvm_fd);
 }
 
+/* calculate host virtual addr from guest physical addr */
+static void *gpa2hva(FIXTURE_DATA(uc_kvm) *self, u64 gpa)
+{
+	return (void *)(self->base_hva - self->base_gpa + gpa);
+}
+
+/* map / make additional memory available */
+static int uc_map_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length)
+{
+	struct kvm_s390_ucas_mapping map = {
+		.user_addr = (u64)gpa2hva(self, vcpu_addr),
+		.vcpu_addr = vcpu_addr,
+		.length = length,
+	};
+	pr_info("ucas map %p %p 0x%llx",
+		(void *)map.user_addr, (void *)map.vcpu_addr, map.length);
+	return ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map);
+}
+
+/* unmap previously mapped memory */
+static int uc_unmap_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length)
+{
+	struct kvm_s390_ucas_mapping map = {
+		.user_addr = (u64)gpa2hva(self, vcpu_addr),
+		.vcpu_addr = vcpu_addr,
+		.length = length,
+	};
+	pr_info("ucas unmap %p %p 0x%llx",
+		(void *)map.user_addr, (void *)map.vcpu_addr, map.length);
+	return ioctl(self->vcpu_fd, KVM_S390_UCAS_UNMAP, &map);
+}
+
+/* handle ucontrol exit by mapping the accessed segment */
+static void uc_handle_exit_ucontrol(FIXTURE_DATA(uc_kvm) *self)
+{
+	struct kvm_run *run = self->run;
+	u64 seg_addr;
+	int rc;
+
+	TEST_ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason);
+	switch (run->s390_ucontrol.pgm_code) {
+	case PGM_SEGMENT_TRANSLATION:
+		seg_addr = run->s390_ucontrol.trans_exc_code & ~(SZ_1M - 1);
+		pr_info("ucontrol pic segment translation 0x%llx, mapping segment 0x%lx\n",
+			run->s390_ucontrol.trans_exc_code, seg_addr);
+		/* map / make additional memory available */
+		rc = uc_map_ext(self, seg_addr, SZ_1M);
+		TEST_ASSERT_EQ(0, rc);
+		break;
+	default:
+		TEST_FAIL("UNEXPECTED PGM CODE %d", run->s390_ucontrol.pgm_code);
+	}
+}
+
 /* verify SIEIC exit
  * * fail on codes not expected in the test cases
  */
@@ -255,6 +331,12 @@ static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self)
 	struct kvm_run *run = self->run;
 
 	switch (run->exit_reason) {
+	case KVM_EXIT_S390_UCONTROL:
+		/** check program interruption code
+		 * handle page fault --> ucas map
+		 */
+		uc_handle_exit_ucontrol(self);
+		break;
 	case KVM_EXIT_S390_SIEIC:
 		return uc_handle_sieic(self);
 	default:
@@ -286,6 +368,67 @@ static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self)
 	TEST_ASSERT_EQ(0x440000, sie_block->ipb);
 }
 
+TEST_F(uc_kvm, uc_map_unmap)
+{
+	struct kvm_sync_regs *sync_regs = &self->run->s.regs;
+	struct kvm_run *run = self->run;
+	const u64 disp = 1;
+	int rc;
+
+	/* copy test_mem_asm to code_hva / code_gpa */
+	TH_LOG("copy code %p to vm mapped memory %p / %p",
+	       &test_mem_asm, (void *)self->code_hva, (void *)self->code_gpa);
+	memcpy((void *)self->code_hva, &test_mem_asm, PAGE_SIZE);
+
+	/* DAT disabled + 64 bit mode */
+	run->psw_mask = 0x0000000180000000ULL;
+	run->psw_addr = self->code_gpa;
+
+	/* set register content for test_mem_asm to access unmapped memory */
+	sync_regs->gprs[1] = 0x55;
+	sync_regs->gprs[5] = self->base_gpa;
+	sync_regs->gprs[6] = VM_MEM_SIZE + disp;
+	run->kvm_dirty_regs |= KVM_SYNC_GPRS;
+
+	/* run and expect to fail with ucontrol pic segment translation */
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(1, sync_regs->gprs[0]);
+	ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason);
+
+	ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code);
+	ASSERT_EQ(self->base_gpa + VM_MEM_SIZE, run->s390_ucontrol.trans_exc_code);
+
+	/* fail to map memory with a non-segment-aligned address */
+	rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE + disp, VM_MEM_EXT_SIZE);
+	ASSERT_GT(0, rc)
+		TH_LOG("ucas map for non segment address should fail but didn't; "
+		       "result %d not expected, %s", rc, strerror(errno));
+
+	/* map / make additional memory available */
+	rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE);
+	ASSERT_EQ(0, rc)
+		TH_LOG("ucas map result %d not expected, %s", rc, strerror(errno));
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(false, uc_handle_exit(self));
+	uc_assert_diag44(self);
+
+	/* assert registers and memory are in expected state */
+	ASSERT_EQ(2, sync_regs->gprs[0]);
+	ASSERT_EQ(0x55, sync_regs->gprs[1]);
+	ASSERT_EQ(0x55, *(u32 *)gpa2hva(self, self->base_gpa + VM_MEM_SIZE + disp));
+
+	/* unmap and run loop again */
+	rc = uc_unmap_ext(self, self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE);
+	ASSERT_EQ(0, rc)
+		TH_LOG("ucas unmap result %d not expected, %s", rc, strerror(errno));
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(3, sync_regs->gprs[0]);
+	ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason);
+	ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code);
+	/* handle ucontrol exit and remap memory after previous map and unmap */
+	ASSERT_EQ(true, uc_handle_exit(self));
+}
+
 TEST_F(uc_kvm, uc_gprs)
 {
 	struct kvm_sync_regs *sync_regs = &self->run->s.regs;
-- 
cgit v1.2.3


From 0185fbc6a2d3cf3cc346d53d91ce6fc5e58c7187 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <schlameuss@linux.ibm.com>
Date: Fri, 8 Nov 2024 10:16:20 +0100
Subject: KVM: s390: selftests: Add uc_skey VM test case

Add a test case manipulating s390 storage keys from within the ucontrol
VM.

Storage key instruction (ISKE, SSKE and RRBE) intercepts and
Keyless-subset facility are disabled on first use, where the skeys are
set up by KVM in non-ucontrol VMs.

Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20241108091620.289406-1-schlameuss@linux.ibm.com
Acked-by: Janosch Frank <frankja@linux.ibm.com>
[frankja@linux.ibm.com: Fixed patch prefix]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241108091620.289406-1-schlameuss@linux.ibm.com>
---
 .../selftests/kvm/include/s390x/processor.h        |   6 +
 tools/testing/selftests/kvm/s390x/ucontrol_test.c  | 145 ++++++++++++++++++++-
 2 files changed, 149 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h
index 481bd2fd6a32..33fef6fd9617 100644
--- a/tools/testing/selftests/kvm/include/s390x/processor.h
+++ b/tools/testing/selftests/kvm/include/s390x/processor.h
@@ -32,4 +32,10 @@ static inline void cpu_relax(void)
 	barrier();
 }
 
+/* Get the instruction length */
+static inline int insn_length(unsigned char code)
+{
+	return ((((int)code + 64) >> 7) + 1) << 1;
+}
+
 #endif
diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
index 3e649b12a0b9..ad95087cc74c 100644
--- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -79,6 +79,33 @@ asm("test_mem_asm:\n"
 	"	j	0b\n"
 );
 
+/* Test program manipulating storage keys */
+extern char test_skey_asm[];
+asm("test_skey_asm:\n"
+	"xgr	%r0, %r0\n"
+
+	"0:\n"
+	"	ahi	%r0,1\n"
+	"	st	%r1,0(%r5,%r6)\n"
+
+	"	iske	%r1,%r6\n"
+	"	ahi	%r0,1\n"
+	"	diag	0,0,0x44\n"
+
+	"	sske	%r1,%r6\n"
+	"	xgr	%r1,%r1\n"
+	"	iske	%r1,%r6\n"
+	"	ahi	%r0,1\n"
+	"	diag	0,0,0x44\n"
+
+	"	rrbe	%r1,%r6\n"
+	"	iske	%r1,%r6\n"
+	"	ahi	%r0,1\n"
+	"	diag	0,0,0x44\n"
+
+	"	j	0b\n"
+);
+
 FIXTURE(uc_kvm)
 {
 	struct kvm_s390_sie_block *sie_block;
@@ -298,8 +325,50 @@ static void uc_handle_exit_ucontrol(FIXTURE_DATA(uc_kvm) *self)
 	}
 }
 
-/* verify SIEIC exit
+/*
+ * Handle the SIEIC exit
  * * fail on codes not expected in the test cases
+ * Returns if interception is handled / execution can be continued
+ */
+static void uc_skey_enable(FIXTURE_DATA(uc_kvm) *self)
+{
+	struct kvm_s390_sie_block *sie_block = self->sie_block;
+
+	/* disable KSS */
+	sie_block->cpuflags &= ~CPUSTAT_KSS;
+	/* disable skey inst interception */
+	sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+}
+
+/*
+ * Handle the instruction intercept
+ * Returns if interception is handled / execution can be continued
+ */
+static bool uc_handle_insn_ic(FIXTURE_DATA(uc_kvm) *self)
+{
+	struct kvm_s390_sie_block *sie_block = self->sie_block;
+	int ilen = insn_length(sie_block->ipa >> 8);
+	struct kvm_run *run = self->run;
+
+	switch (run->s390_sieic.ipa) {
+	case 0xB229: /* ISKE */
+	case 0xB22b: /* SSKE */
+	case 0xB22a: /* RRBE */
+		uc_skey_enable(self);
+
+		/* rewind to reexecute intercepted instruction */
+		run->psw_addr = run->psw_addr - ilen;
+		pr_info("rewind guest addr to 0x%.16llx\n", run->psw_addr);
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Handle the SIEIC exit
+ * * fail on codes not expected in the test cases
+ * Returns if interception is handled / execution can be continued
  */
 static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self)
 {
@@ -315,7 +384,10 @@ static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self)
 	case ICPT_INST:
 		/* end execution in caller on intercepted instruction */
 		pr_info("sie instruction interception\n");
-		return false;
+		return uc_handle_insn_ic(self);
+	case ICPT_KSS:
+		uc_skey_enable(self);
+		return true;
 	case ICPT_OPEREXC:
 		/* operation exception */
 		TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb);
@@ -472,4 +544,73 @@ TEST_F(uc_kvm, uc_gprs)
 	ASSERT_EQ(1, sync_regs->gprs[0]);
 }
 
+TEST_F(uc_kvm, uc_skey)
+{
+	struct kvm_s390_sie_block *sie_block = self->sie_block;
+	struct kvm_sync_regs *sync_regs = &self->run->s.regs;
+	u64 test_vaddr = VM_MEM_SIZE - (SZ_1M / 2);
+	struct kvm_run *run = self->run;
+	const u8 skeyvalue = 0x34;
+
+	/* copy test_skey_asm to code_hva / code_gpa */
+	TH_LOG("copy code %p to vm mapped memory %p / %p",
+	       &test_skey_asm, (void *)self->code_hva, (void *)self->code_gpa);
+	memcpy((void *)self->code_hva, &test_skey_asm, PAGE_SIZE);
+
+	/* set register content for test_skey_asm to access not mapped memory */
+	sync_regs->gprs[1] = skeyvalue;
+	sync_regs->gprs[5] = self->base_gpa;
+	sync_regs->gprs[6] = test_vaddr;
+	run->kvm_dirty_regs |= KVM_SYNC_GPRS;
+
+	/* DAT disabled + 64 bit mode */
+	run->psw_mask = 0x0000000180000000ULL;
+	run->psw_addr = self->code_gpa;
+
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(true, uc_handle_exit(self));
+	ASSERT_EQ(1, sync_regs->gprs[0]);
+
+	/* ISKE */
+	ASSERT_EQ(0, uc_run_once(self));
+
+	/*
+	 * Bail out and skip the test after uc_skey_enable was executed but iske
+	 * is still intercepted. Instructions are not handled by the kernel.
+	 * Thus there is no need to test this here.
+	 */
+	TEST_ASSERT_EQ(0, sie_block->cpuflags & CPUSTAT_KSS);
+	TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE));
+	TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason);
+	TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode);
+	TEST_REQUIRE(sie_block->ipa != 0xb229);
+
+	/* ISKE contd. */
+	ASSERT_EQ(false, uc_handle_exit(self));
+	ASSERT_EQ(2, sync_regs->gprs[0]);
+	/* assert initial skey (ACC = 0, R & C = 1) */
+	ASSERT_EQ(0x06, sync_regs->gprs[1]);
+	uc_assert_diag44(self);
+
+	/* SSKE + ISKE */
+	sync_regs->gprs[1] = skeyvalue;
+	run->kvm_dirty_regs |= KVM_SYNC_GPRS;
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(false, uc_handle_exit(self));
+	ASSERT_EQ(3, sync_regs->gprs[0]);
+	ASSERT_EQ(skeyvalue, sync_regs->gprs[1]);
+	uc_assert_diag44(self);
+
+	/* RRBE + ISKE */
+	sync_regs->gprs[1] = skeyvalue;
+	run->kvm_dirty_regs |= KVM_SYNC_GPRS;
+	ASSERT_EQ(0, uc_run_once(self));
+	ASSERT_EQ(false, uc_handle_exit(self));
+	ASSERT_EQ(4, sync_regs->gprs[0]);
+	/* assert R reset but rest of skey unchanged */
+	ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]);
+	ASSERT_EQ(0, sync_regs->gprs[1] & 0x04);
+	uc_assert_diag44(self);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 89be2544579932a7d5cdb5e534dfd00624c5f39f Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <schlameuss@linux.ibm.com>
Date: Thu, 7 Nov 2024 15:10:22 +0100
Subject: KVM: s390: selftests: Verify reject memory region operations for
 ucontrol VMs

Add a test case verifying KVM_SET_USER_MEMORY_REGION and
KVM_SET_USER_MEMORY_REGION2 cannot be executed on ucontrol VMs.

Executing this test case on unpatched kernels will cause a null
pointer dereference in the host kernel.
This is fixed with commit:
commit 7816e58967d0 ("kvm: s390: Reject memory region operations for ucontrol VMs")

Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20241107141024.238916-4-schlameuss@linux.ibm.com
[frankja@linux.ibm.com: Fixed patch prefix]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241107141024.238916-4-schlameuss@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/ucontrol_test.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
index ad95087cc74c..adc72ae80e8f 100644
--- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -440,6 +440,28 @@ static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self)
 	TEST_ASSERT_EQ(0x440000, sie_block->ipb);
 }
 
+TEST_F(uc_kvm, uc_no_user_region)
+{
+	struct kvm_userspace_memory_region region = {
+		.slot = 1,
+		.guest_phys_addr = self->code_gpa,
+		.memory_size = VM_MEM_EXT_SIZE,
+		.userspace_addr = (uintptr_t)self->code_hva,
+	};
+	struct kvm_userspace_memory_region2 region2 = {
+		.slot = 1,
+		.guest_phys_addr = self->code_gpa,
+		.memory_size = VM_MEM_EXT_SIZE,
+		.userspace_addr = (uintptr_t)self->code_hva,
+	};
+
+	ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, &region));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, &region2));
+	ASSERT_EQ(EINVAL, errno);
+}
+
 TEST_F(uc_kvm, uc_map_unmap)
 {
 	struct kvm_sync_regs *sync_regs = &self->run->s.regs;
-- 
cgit v1.2.3


From 59f82bf467c8fd42b015db2dda1ca33c520633bb Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <schlameuss@linux.ibm.com>
Date: Thu, 7 Nov 2024 15:10:23 +0100
Subject: KVM: s390: selftests: Fix whitespace confusion in ucontrol test

Checkpatch thinks that we're doing a multiplication but we're obviously
not. Fix 4 instances where we adhered to wrong checkpatch advice.

Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20241107141024.238916-5-schlameuss@linux.ibm.com
[frankja@linux.ibm.com: Fixed patch prefix]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241107141024.238916-5-schlameuss@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/ucontrol_test.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
index adc72ae80e8f..690077f2c41d 100644
--- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -370,7 +370,7 @@ static bool uc_handle_insn_ic(FIXTURE_DATA(uc_kvm) *self)
  * * fail on codes not expected in the test cases
  * Returns if interception is handled / execution can be continued
  */
-static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self)
+static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) *self)
 {
 	struct kvm_s390_sie_block *sie_block = self->sie_block;
 	struct kvm_run *run = self->run;
@@ -398,7 +398,7 @@ static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self)
 }
 
 /* verify VM state on exit */
-static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self)
+static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) *self)
 {
 	struct kvm_run *run = self->run;
 
@@ -418,7 +418,7 @@ static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self)
 }
 
 /* run the VM until interrupted */
-static int uc_run_once(FIXTURE_DATA(uc_kvm) * self)
+static int uc_run_once(FIXTURE_DATA(uc_kvm) *self)
 {
 	int rc;
 
@@ -429,7 +429,7 @@ static int uc_run_once(FIXTURE_DATA(uc_kvm) * self)
 	return rc;
 }
 
-static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self)
+static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) *self)
 {
 	struct kvm_s390_sie_block *sie_block = self->sie_block;
 
-- 
cgit v1.2.3


From b6380944401fa4d9d48e51c963826d1137c0e5cf Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <schlameuss@linux.ibm.com>
Date: Thu, 7 Nov 2024 15:10:24 +0100
Subject: KVM: s390: selftests: correct IP.b length in uc_handle_sieic debug
 output

The lengths of the interrupt parameters (IP) are:
a: 2 bytes
b: 4 bytes

Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Link: https://lore.kernel.org/r/20241107141024.238916-6-schlameuss@linux.ibm.com
[frankja@linux.ibm.com: Fixed patch prefix]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241107141024.238916-6-schlameuss@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/ucontrol_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
index 690077f2c41d..0c112319dab1 100644
--- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -376,7 +376,7 @@ static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) *self)
 	struct kvm_run *run = self->run;
 
 	/* check SIE interception code */
-	pr_info("sieic: 0x%.2x 0x%.4x 0x%.4x\n",
+	pr_info("sieic: 0x%.2x 0x%.4x 0x%.8x\n",
 		run->s390_sieic.icptcode,
 		run->s390_sieic.ipa,
 		run->s390_sieic.ipb);
-- 
cgit v1.2.3


From 7a1f3143377adb655a3912b8dea714949f819fa3 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.ibm.com>
Date: Thu, 7 Nov 2024 16:23:19 +0100
Subject: KVM: s390: selftests: Add regression tests for PFCR subfunctions

Check if the PFCR query reported in userspace coincides with the
kernel reported function list. Right now we don't mask the functions
in the kernel so they have to be the same.

Signed-off-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Reviewed-by: Hariharan Mari <hari55@linux.ibm.com>
Link: https://lore.kernel.org/r/20241107152319.77816-5-brueckner@linux.ibm.com
[frankja@linux.ibm.com: Added commit description]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20241107152319.77816-5-brueckner@linux.ibm.com>
---
 tools/arch/s390/include/uapi/asm/kvm.h                    |  3 ++-
 .../testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c  | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h
index 05eaf6db3ad4..60345dd2cba2 100644
--- a/tools/arch/s390/include/uapi/asm/kvm.h
+++ b/tools/arch/s390/include/uapi/asm/kvm.h
@@ -469,7 +469,8 @@ struct kvm_s390_vm_cpu_subfunc {
 	__u8 kdsa[16];		/* with MSA9 */
 	__u8 sortl[32];		/* with STFLE.150 */
 	__u8 dfltcc[32];	/* with STFLE.151 */
-	__u8 reserved[1728];
+	__u8 pfcr[16];		/* with STFLE.201 */
+	__u8 reserved[1712];
 };
 
 #define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST	6
diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
index 222ba1cc3cac..27255880dabd 100644
--- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
+++ b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
@@ -214,6 +214,19 @@ static void test_dfltcc_asm_block(u8 (*query)[32])
 			: "cc", "0", "1");
 }
 
+/*
+ * Testing Perform Function with Concurrent Results (PFCR)
+ * CPU subfunctions's ASM block
+ */
+static void test_pfcr_asm_block(u8 (*query)[16])
+{
+	asm volatile("	lghi	0,0\n"
+			"	.insn   rsy,0xeb0000000016,0,0,%[query]\n"
+			: [query] "=QS" (*query)
+			:
+			: "cc", "0");
+}
+
 typedef void (*testfunc_t)(u8 (*array)[]);
 
 struct testdef {
@@ -249,6 +262,8 @@ struct testdef {
 	{ "SORTL", cpu_subfunc.sortl, sizeof(cpu_subfunc.sortl), test_sortl_asm_block, 150 },
 	/* DFLTCC - Facility bit 151 */
 	{ "DFLTCC", cpu_subfunc.dfltcc, sizeof(cpu_subfunc.dfltcc), test_dfltcc_asm_block, 151 },
+	/* Concurrent-function facility - Facility bit 201 */
+	{ "PFCR", cpu_subfunc.pfcr, sizeof(cpu_subfunc.pfcr), test_pfcr_asm_block, 201 },
 };
 
 int main(int argc, char *argv[])
-- 
cgit v1.2.3


From ec8d3b5c2adc356d3b34c0fe44a5be74022be037 Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 1 Nov 2024 09:27:11 +0100
Subject: selftests/bpf: Allow building with extra flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to specify extra compilation or linking flags to BPF selftests,
it is possible to set EXTRA_CFLAGS and EXTRA_LDFLAGS from the command
line. The problem is that they are not propagated to sub-make calls
(runqslower, bpftool, libbpf) and in the best case are not applied, in
the worst case cause the entire build to fail.

Propagate EXTRA_CFLAGS and EXTRA_LDFLAGS to the sub-makes.

This, for instance, allows building selftests as PIE with

    $ make EXTRA_CFLAGS='-fPIE' EXTRA_LDFLAGS='-pie'

Without this change, the command would fail because libbpf.a would not
be built with -fPIE and other PIE binaries would not link against it.

The only problem is that we have to explicitly provide empty
EXTRA_CFLAGS='' and EXTRA_LDFLAGS='' to the builds of kernel modules as
we don't want to build modules with flags used for userspace (the above
example would fail as kernel doesn't support PIE).

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d5aaa674dab5..edef5df08cb2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -297,25 +297,33 @@ $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c
 $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
 	$(call msg,MOD,,$@)
 	$(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation
-	$(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_testmod
+	$(Q)$(MAKE) $(submake_extras) -C bpf_testmod \
+		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)     \
+		EXTRA_CFLAGS='' EXTRA_LDFLAGS=''
 	$(Q)cp bpf_testmod/bpf_testmod.ko $@
 
 $(OUTPUT)/bpf_test_no_cfi.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_no_cfi/Makefile bpf_test_no_cfi/*.[ch])
 	$(call msg,MOD,,$@)
 	$(Q)$(RM) bpf_test_no_cfi/bpf_test_no_cfi.ko # force re-compilation
-	$(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_test_no_cfi
+	$(Q)$(MAKE) $(submake_extras) -C bpf_test_no_cfi \
+		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)	 \
+		EXTRA_CFLAGS='' EXTRA_LDFLAGS=''
 	$(Q)cp bpf_test_no_cfi/bpf_test_no_cfi.ko $@
 
 $(OUTPUT)/bpf_test_modorder_x.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_modorder_x/Makefile bpf_test_modorder_x/*.[ch])
 	$(call msg,MOD,,$@)
 	$(Q)$(RM) bpf_test_modorder_x/bpf_test_modorder_x.ko # force re-compilation
-	$(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_test_modorder_x
+	$(Q)$(MAKE) $(submake_extras) -C bpf_test_modorder_x \
+		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)	     \
+		EXTRA_CFLAGS='' EXTRA_LDFLAGS=''
 	$(Q)cp bpf_test_modorder_x/bpf_test_modorder_x.ko $@
 
 $(OUTPUT)/bpf_test_modorder_y.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_modorder_y/Makefile bpf_test_modorder_y/*.[ch])
 	$(call msg,MOD,,$@)
 	$(Q)$(RM) bpf_test_modorder_y/bpf_test_modorder_y.ko # force re-compilation
-	$(Q)$(MAKE) $(submake_extras) RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) -C bpf_test_modorder_y
+	$(Q)$(MAKE) $(submake_extras) -C bpf_test_modorder_y \
+		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)	     \
+		EXTRA_CFLAGS='' EXTRA_LDFLAGS=''
 	$(Q)cp bpf_test_modorder_y/bpf_test_modorder_y.ko $@
 
 
@@ -335,8 +343,8 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 		    BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/		       \
 		    BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/			       \
 		    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR)		       \
-		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)'	       \
-		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)' &&			       \
+		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \
+		    EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' &&	       \
 		    cp $(RUNQSLOWER_OUTPUT)runqslower $@
 
 TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL)
@@ -369,7 +377,8 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
 		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
 	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)			       \
 		    ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" 	       \
-		    EXTRA_CFLAGS='-g $(OPT_FLAGS)'			       \
+		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)'	       \
+		    EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)'			       \
 		    OUTPUT=$(HOST_BUILD_DIR)/bpftool/			       \
 		    LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/		       \
 		    LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/			       \
@@ -380,7 +389,8 @@ $(CROSS_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)	\
 		    $(BPFOBJ) | $(BUILD_DIR)/bpftool
 	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)				\
 		    ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE)			\
-		    EXTRA_CFLAGS='-g $(OPT_FLAGS)'				\
+		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)'		\
+		    EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)'				\
 		    OUTPUT=$(BUILD_DIR)/bpftool/				\
 		    LIBBPF_OUTPUT=$(BUILD_DIR)/libbpf/				\
 		    LIBBPF_DESTDIR=$(SCRATCH_DIR)/				\
@@ -403,8 +413,8 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 	   $(APIDIR)/linux/bpf.h					       \
 	   | $(BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
-		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)'	       \
-		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)'			       \
+		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \
+		    EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)'	       \
 		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
 
 ifneq ($(BPFOBJ),$(HOST_BPFOBJ))
@@ -412,7 +422,9 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 		$(APIDIR)/linux/bpf.h					       \
 		| $(HOST_BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
-		    EXTRA_CFLAGS='-g $(OPT_FLAGS)' ARCH= CROSS_COMPILE=	       \
+		    ARCH= CROSS_COMPILE=				       \
+		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)'	       \
+		    EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)'			       \
 		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/			       \
 		    CC="$(HOSTCC)" LD="$(HOSTLD)"			       \
 		    DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers
-- 
cgit v1.2.3


From dcf04676f347133a0c5944152e8d5110aa28d2dd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 7 Nov 2024 10:43:37 +0100
Subject: selftests/bpf: Fix uprobe consumer test (again)

The new uprobe changes bring some new behaviour that we need to reflect
in the consumer test. Now pending uprobe instance in the kernel can
survive longer and thus might call uretprobe consumer callbacks in
some situations in which, previously, such callback would be omitted.
We now need to take that into account in uprobe-multi consumer tests.

The idea being that uretprobe under test either stayed from before to
after (uret_stays + test_bit) or uretprobe instance survived and we
have uretprobe active in after (uret_survives + test_bit).

uret_survives just states that uretprobe survives if there are *any*
uretprobes both before and after (overlapping or not, doesn't matter)
and uprobe was attached before.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241107094337.3848210-1-jolsa@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 619b31cd24a1..616441fdd7f2 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -869,15 +869,17 @@ static int consumer_test(struct uprobe_multi_consumers *skel,
 			fmt = "prog 0/1: uprobe";
 		} else {
 			/*
-			 * to trigger uretprobe consumer, the uretprobe needs to be installed,
-			 * which means one of the 'return' uprobes was alive when probe was hit:
-			 *
-			 *   idxs: 2/3 uprobe return in 'installed' mask
+			 * To trigger uretprobe consumer, the uretprobe under test either stayed from
+			 * before to after (uret_stays + test_bit) or uretprobe instance survived and
+			 * we have uretprobe active in after (uret_survives + test_bit)
 			 */
-			unsigned long had_uretprobes  = before & 0b1100; /* is uretprobe installed */
 
-			if (had_uretprobes && test_bit(idx, after))
+			bool uret_stays = before & after & 0b1100;
+			bool uret_survives = (before & 0b1100) && (after & 0b1100) && (before & 0b0011);
+
+			if ((uret_stays || uret_survives) && test_bit(idx, after))
 				val++;
+
 			fmt = "idx 2/3: uretprobe";
 		}
 
-- 
cgit v1.2.3


From 4856ecb11524c96bfedbd7dc44d60f394d32bc9f Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:37 +0100
Subject: selftests/bpf: Add uprobe session test

Adding uprobe session test and testing that the entry program
return value controls execution of the return probe program.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-7-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 47 ++++++++++++++
 .../selftests/bpf/progs/uprobe_multi_session.c     | 71 ++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_session.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 616441fdd7f2..d9c20ca7a833 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -8,6 +8,7 @@
 #include "uprobe_multi_usdt.skel.h"
 #include "uprobe_multi_consumers.skel.h"
 #include "uprobe_multi_pid_filter.skel.h"
+#include "uprobe_multi_session.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -1017,6 +1018,50 @@ static void test_pid_filter_process(bool clone_vm)
 	uprobe_multi_pid_filter__destroy(skel);
 }
 
+static void test_session_skel_api(void)
+{
+	struct uprobe_multi_session *skel = NULL;
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	struct bpf_link *link = NULL;
+	int err;
+
+	skel = uprobe_multi_session__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_session__open_and_load"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+	skel->bss->user_ptr = test_data;
+
+	err = uprobe_multi_session__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_session__attach"))
+		goto cleanup;
+
+	/* trigger all probes */
+	skel->bss->uprobe_multi_func_1_addr = (__u64) uprobe_multi_func_1;
+	skel->bss->uprobe_multi_func_2_addr = (__u64) uprobe_multi_func_2;
+	skel->bss->uprobe_multi_func_3_addr = (__u64) uprobe_multi_func_3;
+
+	uprobe_multi_func_1();
+	uprobe_multi_func_2();
+	uprobe_multi_func_3();
+
+	/*
+	 * We expect 2 for uprobe_multi_func_2 because it runs both entry/return probe,
+	 * uprobe_multi_func_[13] run just the entry probe. All expected numbers are
+	 * doubled, because we run extra test for sleepable session.
+	 */
+	ASSERT_EQ(skel->bss->uprobe_session_result[0], 2, "uprobe_multi_func_1_result");
+	ASSERT_EQ(skel->bss->uprobe_session_result[1], 4, "uprobe_multi_func_2_result");
+	ASSERT_EQ(skel->bss->uprobe_session_result[2], 2, "uprobe_multi_func_3_result");
+
+	/* We expect increase in 3 entry and 1 return session calls -> 4 */
+	ASSERT_EQ(skel->bss->uprobe_multi_sleep_result, 4, "uprobe_multi_sleep_result");
+
+cleanup:
+	bpf_link__destroy(link);
+	uprobe_multi_session__destroy(skel);
+}
+
 static void test_bench_attach_uprobe(void)
 {
 	long attach_start_ns = 0, attach_end_ns = 0;
@@ -1113,4 +1158,6 @@ void test_uprobe_multi_test(void)
 		test_pid_filter_process(false);
 	if (test__start_subtest("filter_clone_vm"))
 		test_pid_filter_process(true);
+	if (test__start_subtest("session"))
+		test_session_skel_api();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
new file mode 100644
index 000000000000..30bff90b68dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+__u64 uprobe_multi_func_1_addr = 0;
+__u64 uprobe_multi_func_2_addr = 0;
+__u64 uprobe_multi_func_3_addr = 0;
+
+__u64 uprobe_session_result[3] = {};
+__u64 uprobe_multi_sleep_result = 0;
+
+void *user_ptr = 0;
+int pid = 0;
+
+static int uprobe_multi_check(void *ctx, bool is_return)
+{
+	const __u64 funcs[] = {
+		uprobe_multi_func_1_addr,
+		uprobe_multi_func_2_addr,
+		uprobe_multi_func_3_addr,
+	};
+	unsigned int i;
+	__u64 addr;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	addr = bpf_get_func_ip(ctx);
+
+	for (i = 0; i < ARRAY_SIZE(funcs); i++) {
+		if (funcs[i] == addr) {
+			uprobe_session_result[i]++;
+			break;
+		}
+	}
+
+	/* only uprobe_multi_func_2 executes return probe */
+	if ((addr == uprobe_multi_func_1_addr) ||
+	    (addr == uprobe_multi_func_3_addr))
+		return 1;
+
+	return 0;
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*")
+int uprobe(struct pt_regs *ctx)
+{
+	return uprobe_multi_check(ctx, bpf_session_is_return());
+}
+
+static __always_inline bool verify_sleepable_user_copy(void)
+{
+	char data[9];
+
+	bpf_copy_from_user(data, sizeof(data), user_ptr);
+	return bpf_strncmp(data, sizeof(data), "test_data") == 0;
+}
+
+SEC("uprobe.session.s//proc/self/exe:uprobe_multi_func_*")
+int uprobe_sleepable(struct pt_regs *ctx)
+{
+	if (verify_sleepable_user_copy())
+		uprobe_multi_sleep_result++;
+	return uprobe_multi_check(ctx, bpf_session_is_return());
+}
-- 
cgit v1.2.3


From f6b45e352f0f822bc0bb01b14829ac8f3158d056 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:38 +0100
Subject: selftests/bpf: Add uprobe session cookie test

Adding uprobe session test that verifies the cookie value
gets properly propagated from entry to return program.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-8-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 31 ++++++++++++++
 .../bpf/progs/uprobe_multi_session_cookie.c        | 48 ++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index d9c20ca7a833..986852ec26f5 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -9,6 +9,7 @@
 #include "uprobe_multi_consumers.skel.h"
 #include "uprobe_multi_pid_filter.skel.h"
 #include "uprobe_multi_session.skel.h"
+#include "uprobe_multi_session_cookie.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -1062,6 +1063,34 @@ cleanup:
 	uprobe_multi_session__destroy(skel);
 }
 
+static void test_session_cookie_skel_api(void)
+{
+	struct uprobe_multi_session_cookie *skel = NULL;
+	int err;
+
+	skel = uprobe_multi_session_cookie__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_session_cookie__open_and_load"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+
+	err = uprobe_multi_session_cookie__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_session_cookie__attach"))
+		goto cleanup;
+
+	/* trigger all probes */
+	uprobe_multi_func_1();
+	uprobe_multi_func_2();
+	uprobe_multi_func_3();
+
+	ASSERT_EQ(skel->bss->test_uprobe_1_result, 1, "test_uprobe_1_result");
+	ASSERT_EQ(skel->bss->test_uprobe_2_result, 2, "test_uprobe_2_result");
+	ASSERT_EQ(skel->bss->test_uprobe_3_result, 3, "test_uprobe_3_result");
+
+cleanup:
+	uprobe_multi_session_cookie__destroy(skel);
+}
+
 static void test_bench_attach_uprobe(void)
 {
 	long attach_start_ns = 0, attach_end_ns = 0;
@@ -1160,4 +1189,6 @@ void test_uprobe_multi_test(void)
 		test_pid_filter_process(true);
 	if (test__start_subtest("session"))
 		test_session_skel_api();
+	if (test__start_subtest("session_cookie"))
+		test_session_cookie_skel_api();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
new file mode 100644
index 000000000000..5befdf944dc6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+__u64 test_uprobe_1_result = 0;
+__u64 test_uprobe_2_result = 0;
+__u64 test_uprobe_3_result = 0;
+
+static int check_cookie(__u64 val, __u64 *result)
+{
+	__u64 *cookie;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	cookie = bpf_session_cookie();
+
+	if (bpf_session_is_return())
+		*result = *cookie == val ? val : 0;
+	else
+		*cookie = val;
+	return 0;
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
+int uprobe_1(struct pt_regs *ctx)
+{
+	return check_cookie(1, &test_uprobe_1_result);
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2")
+int uprobe_2(struct pt_regs *ctx)
+{
+	return check_cookie(2, &test_uprobe_2_result);
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3")
+int uprobe_3(struct pt_regs *ctx)
+{
+	return check_cookie(3, &test_uprobe_3_result);
+}
-- 
cgit v1.2.3


From 8bcb9c62f0689402e90886d3b65fc649d7c600d7 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:39 +0100
Subject: selftests/bpf: Add uprobe session recursive test

Adding uprobe session test that verifies the cookie value is stored
properly when single uprobe-ed function is executed recursively.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-9-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 57 ++++++++++++++++++++++
 .../bpf/progs/uprobe_multi_session_recursive.c     | 44 +++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 986852ec26f5..b9448fb63a19 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -10,6 +10,7 @@
 #include "uprobe_multi_pid_filter.skel.h"
 #include "uprobe_multi_session.skel.h"
 #include "uprobe_multi_session_cookie.skel.h"
+#include "uprobe_multi_session_recursive.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -36,6 +37,12 @@ noinline void usdt_trigger(void)
 	STAP_PROBE(test, pid_filter_usdt);
 }
 
+noinline void uprobe_session_recursive(int i)
+{
+	if (i)
+		uprobe_session_recursive(i - 1);
+}
+
 struct child {
 	int go[2];
 	int c2p[2]; /* child -> parent channel */
@@ -1091,6 +1098,54 @@ cleanup:
 	uprobe_multi_session_cookie__destroy(skel);
 }
 
+static void test_session_recursive_skel_api(void)
+{
+	struct uprobe_multi_session_recursive *skel = NULL;
+	int i, err;
+
+	skel = uprobe_multi_session_recursive__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_session_recursive__open_and_load"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+
+	err = uprobe_multi_session_recursive__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_session_recursive__attach"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(skel->bss->test_uprobe_cookie_entry); i++)
+		skel->bss->test_uprobe_cookie_entry[i] = i + 1;
+
+	uprobe_session_recursive(5);
+
+	/*
+	 *                                         entry uprobe:
+	 * uprobe_session_recursive(5) {             *cookie = 1, return 0
+	 *   uprobe_session_recursive(4) {           *cookie = 2, return 1
+	 *     uprobe_session_recursive(3) {         *cookie = 3, return 0
+	 *       uprobe_session_recursive(2) {       *cookie = 4, return 1
+	 *         uprobe_session_recursive(1) {     *cookie = 5, return 0
+	 *           uprobe_session_recursive(0) {   *cookie = 6, return 1
+	 *                                          return uprobe:
+	 *           } i = 0                          not executed
+	 *         } i = 1                            test_uprobe_cookie_return[0] = 5
+	 *       } i = 2                              not executed
+	 *     } i = 3                                test_uprobe_cookie_return[1] = 3
+	 *   } i = 4                                  not executed
+	 * } i = 5                                    test_uprobe_cookie_return[2] = 1
+	 */
+
+	ASSERT_EQ(skel->bss->idx_entry, 6, "idx_entry");
+	ASSERT_EQ(skel->bss->idx_return, 3, "idx_return");
+
+	ASSERT_EQ(skel->bss->test_uprobe_cookie_return[0], 5, "test_uprobe_cookie_return[0]");
+	ASSERT_EQ(skel->bss->test_uprobe_cookie_return[1], 3, "test_uprobe_cookie_return[1]");
+	ASSERT_EQ(skel->bss->test_uprobe_cookie_return[2], 1, "test_uprobe_cookie_return[2]");
+
+cleanup:
+	uprobe_multi_session_recursive__destroy(skel);
+}
+
 static void test_bench_attach_uprobe(void)
 {
 	long attach_start_ns = 0, attach_end_ns = 0;
@@ -1191,4 +1246,6 @@ void test_uprobe_multi_test(void)
 		test_session_skel_api();
 	if (test__start_subtest("session_cookie"))
 		test_session_cookie_skel_api();
+	if (test__start_subtest("session_cookie_recursive"))
+		test_session_recursive_skel_api();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
new file mode 100644
index 000000000000..8fbcd69fae22
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+int idx_entry = 0;
+int idx_return = 0;
+
+__u64 test_uprobe_cookie_entry[6];
+__u64 test_uprobe_cookie_return[3];
+
+static int check_cookie(void)
+{
+	__u64 *cookie = bpf_session_cookie();
+
+	if (bpf_session_is_return()) {
+		if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return))
+			return 1;
+		test_uprobe_cookie_return[idx_return++] = *cookie;
+		return 0;
+	}
+
+	if (idx_entry >= ARRAY_SIZE(test_uprobe_cookie_entry))
+		return 1;
+	*cookie = test_uprobe_cookie_entry[idx_entry];
+	return idx_entry++ % 2;
+}
+
+
+SEC("uprobe.session//proc/self/exe:uprobe_session_recursive")
+int uprobe_recursive(struct pt_regs *ctx)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	return check_cookie();
+}
-- 
cgit v1.2.3


From 8c3a48b0d9b41d8c3903a88d35b8f32c260e1a57 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:40 +0100
Subject: selftests/bpf: Add uprobe session verifier test for return value

Making sure uprobe.session program can return only [0,1] values.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-10-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   |  2 ++
 .../selftests/bpf/progs/uprobe_multi_verifier.c    | 31 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_verifier.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index b9448fb63a19..5dad31d1b606 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -11,6 +11,7 @@
 #include "uprobe_multi_session.skel.h"
 #include "uprobe_multi_session_cookie.skel.h"
 #include "uprobe_multi_session_recursive.skel.h"
+#include "uprobe_multi_verifier.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -1248,4 +1249,5 @@ void test_uprobe_multi_test(void)
 		test_session_cookie_skel_api();
 	if (test__start_subtest("session_cookie_recursive"))
 		test_session_recursive_skel_api();
+	RUN_TESTS(uprobe_multi_verifier);
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_verifier.c b/tools/testing/selftests/bpf/progs/uprobe_multi_verifier.c
new file mode 100644
index 000000000000..fe49f2cb5360
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_verifier.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+
+SEC("uprobe.session")
+__success
+int uprobe_sesison_return_0(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("uprobe.session")
+__success
+int uprobe_sesison_return_1(struct pt_regs *ctx)
+{
+	return 1;
+}
+
+SEC("uprobe.session")
+__failure
+__msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int uprobe_sesison_return_2(struct pt_regs *ctx)
+{
+	return 2;
+}
-- 
cgit v1.2.3


From 504d21d905002f2b3e2a8703a3d4630a680362e2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:41 +0100
Subject: selftests/bpf: Add kprobe session verifier test for return value

Making sure kprobe.session program can return only [0,1] values.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-11-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/kprobe_multi_test.c   |  2 ++
 .../selftests/bpf/progs/kprobe_multi_verifier.c    | 31 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_verifier.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index 960c9323d1e0..66ab1cae923e 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -6,6 +6,7 @@
 #include "kprobe_multi_override.skel.h"
 #include "kprobe_multi_session.skel.h"
 #include "kprobe_multi_session_cookie.skel.h"
+#include "kprobe_multi_verifier.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "bpf/hashmap.h"
 
@@ -764,4 +765,5 @@ void test_kprobe_multi_test(void)
 		test_session_skel_api();
 	if (test__start_subtest("session_cookie"))
 		test_session_cookie_skel_api();
+	RUN_TESTS(kprobe_multi_verifier);
 }
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_verifier.c b/tools/testing/selftests/bpf/progs/kprobe_multi_verifier.c
new file mode 100644
index 000000000000..288577e81deb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi_verifier.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+
+SEC("kprobe.session")
+__success
+int kprobe_session_return_0(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("kprobe.session")
+__success
+int kprobe_session_return_1(struct pt_regs *ctx)
+{
+	return 1;
+}
+
+SEC("kprobe.session")
+__failure
+__msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int kprobe_session_return_2(struct pt_regs *ctx)
+{
+	return 2;
+}
-- 
cgit v1.2.3


From c574bcd6229333c211dbf4ecba2988c3581b0f92 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:42 +0100
Subject: selftests/bpf: Add uprobe session single consumer test

Testing that the session ret_handler bypass works on single
uprobe with multiple consumers, each with different session
ignore return value.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-12-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 33 ++++++++++++++++
 .../bpf/progs/uprobe_multi_session_single.c        | 44 ++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_session_single.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 5dad31d1b606..93f5cabd6d01 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -9,6 +9,7 @@
 #include "uprobe_multi_consumers.skel.h"
 #include "uprobe_multi_pid_filter.skel.h"
 #include "uprobe_multi_session.skel.h"
+#include "uprobe_multi_session_single.skel.h"
 #include "uprobe_multi_session_cookie.skel.h"
 #include "uprobe_multi_session_recursive.skel.h"
 #include "uprobe_multi_verifier.skel.h"
@@ -1071,6 +1072,36 @@ cleanup:
 	uprobe_multi_session__destroy(skel);
 }
 
+static void test_session_single_skel_api(void)
+{
+	struct uprobe_multi_session_single *skel = NULL;
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	int err;
+
+	skel = uprobe_multi_session_single__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_session_single__open_and_load"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+
+	err = uprobe_multi_session_single__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_session_single__attach"))
+		goto cleanup;
+
+	uprobe_multi_func_1();
+
+	/*
+	 * We expect consumer 0 and 2 to trigger just entry handler (value 1)
+	 * and consumer 1 to hit both (value 2).
+	 */
+	ASSERT_EQ(skel->bss->uprobe_session_result[0], 1, "uprobe_session_result_0");
+	ASSERT_EQ(skel->bss->uprobe_session_result[1], 2, "uprobe_session_result_1");
+	ASSERT_EQ(skel->bss->uprobe_session_result[2], 1, "uprobe_session_result_2");
+
+cleanup:
+	uprobe_multi_session_single__destroy(skel);
+}
+
 static void test_session_cookie_skel_api(void)
 {
 	struct uprobe_multi_session_cookie *skel = NULL;
@@ -1245,6 +1276,8 @@ void test_uprobe_multi_test(void)
 		test_pid_filter_process(true);
 	if (test__start_subtest("session"))
 		test_session_skel_api();
+	if (test__start_subtest("session_single"))
+		test_session_single_skel_api();
 	if (test__start_subtest("session_cookie"))
 		test_session_cookie_skel_api();
 	if (test__start_subtest("session_cookie_recursive"))
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_single.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_single.c
new file mode 100644
index 000000000000..7c960376ae97
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_single.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+__u64 uprobe_session_result[3] = {};
+int pid = 0;
+
+static int uprobe_multi_check(void *ctx, int idx)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	uprobe_session_result[idx]++;
+
+	/* only consumer 1 executes return probe */
+	if (idx == 0 || idx == 2)
+		return 1;
+
+	return 0;
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
+int uprobe_0(struct pt_regs *ctx)
+{
+	return uprobe_multi_check(ctx, 0);
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
+int uprobe_1(struct pt_regs *ctx)
+{
+	return uprobe_multi_check(ctx, 1);
+}
+
+SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
+int uprobe_2(struct pt_regs *ctx)
+{
+	return uprobe_multi_check(ctx, 2);
+}
-- 
cgit v1.2.3


From b1c570adc7a6f6cbb42926d5313036ed1543f00e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:43 +0100
Subject: selftests/bpf: Add uprobe sessions to consumer test

Adding uprobe session consumers to the consumer test,
so we get the session into the test mix.

In addition scaling down the test to have just 1 uprobe
and 1 uretprobe, otherwise the test time grows and is
unsuitable for CI even with threads.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-13-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 70 +++++++++++++++-------
 .../selftests/bpf/progs/uprobe_multi_consumers.c   |  6 +-
 2 files changed, 52 insertions(+), 24 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 93f5cabd6d01..0a31ba2d6fb2 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -799,10 +799,13 @@ static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx)
 		return -1;
 
 	/*
-	 * bit/prog: 0,1 uprobe entry
-	 * bit/prog: 2,3 uprobe return
+	 * bit/prog: 0 uprobe entry
+	 * bit/prog: 1 uprobe return
+	 * bit/prog: 2 uprobe session with return
+	 * bit/prog: 3 uprobe session without return
 	 */
-	opts.retprobe = idx == 2 || idx == 3;
+	opts.retprobe = idx == 1;
+	opts.session  = idx == 2 || idx == 3;
 
 	*link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe",
 						"uprobe_consumer_test",
@@ -867,31 +870,55 @@ static int consumer_test(struct uprobe_multi_consumers *skel,
 		goto cleanup;
 
 	for (idx = 0; idx < 4; idx++) {
+		bool uret_stays, uret_survives;
 		const char *fmt = "BUG";
 		__u64 val = 0;
 
-		if (idx < 2) {
+		switch (idx) {
+		case 0:
 			/*
 			 * uprobe entry
 			 *   +1 if define in 'before'
 			 */
 			if (test_bit(idx, before))
 				val++;
-			fmt = "prog 0/1: uprobe";
-		} else {
+			fmt = "prog 0: uprobe";
+			break;
+		case 1:
 			/*
 			 * To trigger uretprobe consumer, the uretprobe under test either stayed from
 			 * before to after (uret_stays + test_bit) or uretprobe instance survived and
 			 * we have uretprobe active in after (uret_survives + test_bit)
 			 */
-
-			bool uret_stays = before & after & 0b1100;
-			bool uret_survives = (before & 0b1100) && (after & 0b1100) && (before & 0b0011);
+			uret_stays = before & after & 0b0110;
+			uret_survives = ((before & 0b0110) && (after & 0b0110) && (before & 0b1001));
 
 			if ((uret_stays || uret_survives) && test_bit(idx, after))
 				val++;
-
-			fmt = "idx 2/3: uretprobe";
+			fmt = "prog 1: uretprobe";
+			break;
+		case 2:
+			/*
+			 * session with return
+			 *  +1 if defined in 'before'
+			 *  +1 if defined in 'after'
+			 */
+			if (test_bit(idx, before)) {
+				val++;
+				if (test_bit(idx, after))
+					val++;
+			}
+			fmt = "prog 2: session with return";
+			break;
+		case 3:
+			/*
+			 * session without return
+			 *   +1 if defined in 'before'
+			 */
+			if (test_bit(idx, before))
+				val++;
+			fmt = "prog 3: session with NO return";
+			break;
 		}
 
 		if (!ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt))
@@ -920,8 +947,10 @@ static void test_consumers(void)
 	 * The idea of this test is to try all possible combinations of
 	 * uprobes consumers attached on single function.
 	 *
-	 *  - 2 uprobe entry consumer
-	 *  - 2 uprobe exit consumers
+	 *  - 1 uprobe entry consumer
+	 *  - 1 uprobe exit consumer
+	 *  - 1 uprobe session with return
+	 *  - 1 uprobe session without return
 	 *
 	 * The test uses 4 uprobes attached on single function, but that
 	 * translates into single uprobe with 4 consumers in kernel.
@@ -929,25 +958,24 @@ static void test_consumers(void)
 	 * The before/after values present the state of attached consumers
 	 * before and after the probed function:
 	 *
-	 *  bit/prog 0,1 : uprobe entry
-	 *  bit/prog 2,3 : uprobe return
+	 *  bit/prog 0 : uprobe entry
+	 *  bit/prog 1 : uprobe return
 	 *
 	 * For example for:
 	 *
-	 *   before = 0b0101
-	 *   after  = 0b0110
+	 *   before = 0b01
+	 *   after  = 0b10
 	 *
 	 * it means that before we call 'uprobe_consumer_test' we attach
 	 * uprobes defined in 'before' value:
 	 *
-	 *   - bit/prog 0: uprobe entry
-	 *   - bit/prog 2: uprobe return
+	 *   - bit/prog 0: uprobe entry
 	 *
 	 * uprobe_consumer_test is called and inside it we attach and detach
 	 * uprobes based on 'after' value:
 	 *
-	 *   - bit/prog 0: stays untouched
-	 *   - bit/prog 2: uprobe return is detached
+	 *   - bit/prog 0: is detached
+	 *   - bit/prog 1: is attached
 	 *
 	 * uprobe_consumer_test returns and we check counters values increased
 	 * by bpf programs on each uprobe to match the expected count based on
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c
index 7e0fdcbbd242..93752bb5690b 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c
@@ -24,16 +24,16 @@ int uprobe_1(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("uprobe.multi")
+SEC("uprobe.session")
 int uprobe_2(struct pt_regs *ctx)
 {
 	uprobe_result[2]++;
 	return 0;
 }
 
-SEC("uprobe.multi")
+SEC("uprobe.session")
 int uprobe_3(struct pt_regs *ctx)
 {
 	uprobe_result[3]++;
-	return 0;
+	return 1;
 }
-- 
cgit v1.2.3


From abaec8341a86e556dff739d093aa30babc498ec5 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:44 +0100
Subject: selftests/bpf: Add threads to consumer test

With recent uprobe fix [1] the sync time after unregistering uprobe is
much longer and prolongs the consumer test which creates and destroys
hundreds of uprobes.

This change adds 16 threads (which fits the test logic) and speeds up
the test.

Before the change:

  # perf stat --null ./test_progs -t uprobe_multi_test/consumers
  #421/9   uprobe_multi_test/consumers:OK
  #421     uprobe_multi_test:OK
  Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED

   Performance counter stats for './test_progs -t uprobe_multi_test/consumers':

        28.818778973 seconds time elapsed

         0.745518000 seconds user
         0.919186000 seconds sys

After the change:

  # perf stat --null ./test_progs -t uprobe_multi_test/consumers 2>&1
  #421/9   uprobe_multi_test/consumers:OK
  #421     uprobe_multi_test:OK
  Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED

   Performance counter stats for './test_progs -t uprobe_multi_test/consumers':

         3.504790814 seconds time elapsed

         0.012141000 seconds user
         0.751760000 seconds sys

[1] commit 87195a1ee332 ("uprobes: switch to RCU Tasks Trace flavor for better performance")

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-14-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_multi_test.c   | 98 ++++++++++++++++++----
 1 file changed, 80 insertions(+), 18 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 0a31ba2d6fb2..2ee17ef1dae2 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -789,7 +789,7 @@ get_link(struct uprobe_multi_consumers *skel, int link)
 	}
 }
 
-static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx)
+static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx, unsigned long offset)
 {
 	struct bpf_program *prog = get_program(skel, idx);
 	struct bpf_link **link = get_link(skel, idx);
@@ -798,6 +798,9 @@ static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx)
 	if (!prog || !link)
 		return -1;
 
+	opts.offsets = &offset;
+	opts.cnt = 1;
+
 	/*
 	 * bit/prog: 0 uprobe entry
 	 * bit/prog: 1 uprobe return
@@ -807,9 +810,7 @@ static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx)
 	opts.retprobe = idx == 1;
 	opts.session  = idx == 2 || idx == 3;
 
-	*link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe",
-						"uprobe_consumer_test",
-						&opts);
+	*link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe", NULL, &opts);
 	if (!ASSERT_OK_PTR(*link, "bpf_program__attach_uprobe_multi"))
 		return -1;
 	return 0;
@@ -830,7 +831,8 @@ static bool test_bit(int bit, unsigned long val)
 
 noinline int
 uprobe_consumer_test(struct uprobe_multi_consumers *skel,
-		     unsigned long before, unsigned long after)
+		     unsigned long before, unsigned long after,
+		     unsigned long offset)
 {
 	int idx;
 
@@ -843,15 +845,43 @@ uprobe_consumer_test(struct uprobe_multi_consumers *skel,
 	/* ... and attach all new programs in 'after' state */
 	for (idx = 0; idx < 4; idx++) {
 		if (!test_bit(idx, before) && test_bit(idx, after)) {
-			if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_after"))
+			if (!ASSERT_OK(uprobe_attach(skel, idx, offset), "uprobe_attach_after"))
 				return -1;
 		}
 	}
 	return 0;
 }
 
+/*
+ * We generate 16 consumer_testX functions that will each have a uprobe installed
+ * and will be called in separate threads. All function pointers are stored in the
+ * "consumers" section and each thread will pick one function based on its index.
+ */
+
+extern const void *__start_consumers;
+
+#define __CONSUMER_TEST(func) 							\
+noinline int func(struct uprobe_multi_consumers *skel, unsigned long before,	\
+		  unsigned long after, unsigned long offset)			\
+{										\
+	return uprobe_consumer_test(skel, before, after, offset);		\
+}										\
+void *__ ## func __used __attribute__((section("consumers"))) = (void *) func;
+
+#define CONSUMER_TEST(func) __CONSUMER_TEST(func)
+
+#define C1  CONSUMER_TEST(__PASTE(consumer_test, __COUNTER__))
+#define C4  C1 C1 C1 C1
+#define C16 C4 C4 C4 C4
+
+C16
+
+typedef int (*test_t)(struct uprobe_multi_consumers *, unsigned long,
+		      unsigned long, unsigned long);
+
 static int consumer_test(struct uprobe_multi_consumers *skel,
-			 unsigned long before, unsigned long after)
+			 unsigned long before, unsigned long after,
+			 test_t test, unsigned long offset)
 {
 	int err, idx, ret = -1;
 
@@ -860,12 +890,12 @@ static int consumer_test(struct uprobe_multi_consumers *skel,
 	/* 'before' is each, we attach uprobe for every set idx */
 	for (idx = 0; idx < 4; idx++) {
 		if (test_bit(idx, before)) {
-			if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_before"))
+			if (!ASSERT_OK(uprobe_attach(skel, idx, offset), "uprobe_attach_before"))
 				goto cleanup;
 		}
 	}
 
-	err = uprobe_consumer_test(skel, before, after);
+	err = test(skel, before, after, offset);
 	if (!ASSERT_EQ(err, 0, "uprobe_consumer_test"))
 		goto cleanup;
 
@@ -934,14 +964,46 @@ cleanup:
 	return ret;
 }
 
-static void test_consumers(void)
+#define CONSUMER_MAX 16
+
+/*
+ * Each thread runs 1/16 of the load by running the test for a
+ * single 'before' number (based on thread index) and the full
+ * range of 'after' numbers.
+ */
+static void *consumer_thread(void *arg)
 {
+	unsigned long idx = (unsigned long) arg;
 	struct uprobe_multi_consumers *skel;
-	int before, after;
+	unsigned long offset;
+	const void *func;
+	int after;
 
 	skel = uprobe_multi_consumers__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "uprobe_multi_consumers__open_and_load"))
-		return;
+		return NULL;
+
+	func = *((&__start_consumers) + idx);
+
+	offset = get_uprobe_offset(func);
+	if (!ASSERT_GE(offset, 0, "uprobe_offset"))
+		goto out;
+
+	for (after = 0; after < CONSUMER_MAX; after++)
+		if (consumer_test(skel, idx, after, func, offset))
+			goto out;
+
+out:
+	uprobe_multi_consumers__destroy(skel);
+	return NULL;
+}
+
+
+static void test_consumers(void)
+{
+	pthread_t pt[CONSUMER_MAX];
+	unsigned long idx;
+	int err;
 
 	/*
 	 * The idea of this test is to try all possible combinations of
@@ -982,14 +1044,14 @@ static void test_consumers(void)
 	 * before/after bits.
 	 */
 
-	for (before = 0; before < 16; before++) {
-		for (after = 0; after < 16; after++)
-			if (consumer_test(skel, before, after))
-				goto out;
+	for (idx = 0; idx < CONSUMER_MAX; idx++) {
+		err = pthread_create(&pt[idx], NULL, consumer_thread, (void *) idx);
+		if (!ASSERT_OK(err, "pthread_create"))
+			break;
 	}
 
-out:
-	uprobe_multi_consumers__destroy(skel);
+	while (idx)
+		pthread_join(pt[--idx], NULL);
 }
 
 static struct bpf_program *uprobe_multi_program(struct uprobe_multi_pid_filter *skel, int idx)
-- 
cgit v1.2.3


From 503cfb103c8d6ca4f23a9abbf36672c9cfe6f745 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 6 Nov 2024 14:35:41 +0800
Subject: selftests/bpf: Move ENOTSUPP to bpf_util.h

Moving the definition of ENOTSUPP into bpf_util.h to remove the
duplicated definitions in multiple files.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20241106063542.357743-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/bpf_util.h              | 3 +++
 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 4 ----
 tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c | 4 ----
 tools/testing/selftests/bpf/prog_tests/sock_addr.c  | 4 ----
 tools/testing/selftests/bpf/test_maps.c             | 4 ----
 tools/testing/selftests/bpf/test_verifier.c         | 4 ----
 6 files changed, 3 insertions(+), 20 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index feff92219e21..5f6963a320d7 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -67,5 +67,8 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz)
 #define sys_gettid() syscall(SYS_gettid)
 #endif
 
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
 
 #endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index 409a06975823..b7d1b52309d0 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -16,10 +16,6 @@
 #include "tcp_ca_kfunc.skel.h"
 #include "bpf_cc_cubic.skel.h"
 
-#ifndef ENOTSUPP
-#define ENOTSUPP 524
-#endif
-
 static const unsigned int total_bytes = 10 * 1024 * 1024;
 static int expected_stg = 0xeB9F;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
index 130a3b21e467..6df25de8f080 100644
--- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
+++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
@@ -10,10 +10,6 @@
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
 
-#ifndef ENOTSUPP
-#define ENOTSUPP 524
-#endif
-
 static struct btf *btf;
 
 static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func)
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c
index a6ee7f8d4f79..b2efabbed220 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c
@@ -23,10 +23,6 @@
 #include "getpeername_unix_prog.skel.h"
 #include "network_helpers.h"
 
-#ifndef ENOTSUPP
-# define ENOTSUPP 524
-#endif
-
 #define TEST_NS                 "sock_addr"
 #define TEST_IF_PREFIX          "test_sock_addr"
 #define TEST_IPV4               "127.0.0.4"
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 905d5981ace1..8b40e9496af1 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -26,10 +26,6 @@
 #include "test_maps.h"
 #include "testing_helpers.h"
 
-#ifndef ENOTSUPP
-#define ENOTSUPP 524
-#endif
-
 int skips;
 
 static struct bpf_map_create_opts map_opts = { .sz = sizeof(map_opts) };
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 610392dfc4fb..447b68509d76 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -42,10 +42,6 @@
 #include "../../../include/linux/filter.h"
 #include "testing_helpers.h"
 
-#ifndef ENOTSUPP
-#define ENOTSUPP 524
-#endif
-
 #define MAX_INSNS	BPF_MAXINSNS
 #define MAX_EXPECTED_INSNS	32
 #define MAX_UNEXPECTED_INSNS	32
-- 
cgit v1.2.3


From cb55657c7fc800b722f2ef0afaf4d9c3c8902e6d Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 6 Nov 2024 14:35:42 +0800
Subject: selftests/bpf: Test the update operations for htab of maps

Add test cases to verify the following four update operations on htab of
maps don't trigger lockdep warning:

(1) add then delete
(2) add, overwrite, then delete
(3) add, then lookup_and_delete
(4) add two elements, then lookup_and_delete_batch

Test cases are added for pre-allocated and non-preallocated htab of maps
respectively.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20241106063542.357743-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/map_in_map.c  | 132 ++++++++++++++++++++-
 .../selftests/bpf/progs/update_map_in_htab.c       |  30 +++++
 2 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/update_map_in_htab.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/map_in_map.c b/tools/testing/selftests/bpf/prog_tests/map_in_map.c
index d2a10eb4e5b5..286a9fb469e2 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_in_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_in_map.c
@@ -5,7 +5,9 @@
 #include <sys/syscall.h>
 #include <test_progs.h>
 #include <bpf/btf.h>
+
 #include "access_map_in_map.skel.h"
+#include "update_map_in_htab.skel.h"
 
 struct thread_ctx {
 	pthread_barrier_t barrier;
@@ -127,6 +129,131 @@ out:
 	access_map_in_map__destroy(skel);
 }
 
+static void add_del_fd_htab(int outer_fd)
+{
+	int inner_fd, err;
+	int key = 1;
+
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr1", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner1"))
+		return;
+	err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_NOEXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "add"))
+		return;
+
+	/* Delete */
+	err = bpf_map_delete_elem(outer_fd, &key);
+	ASSERT_OK(err, "del");
+}
+
+static void overwrite_fd_htab(int outer_fd)
+{
+	int inner_fd, err;
+	int key = 1;
+
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr1", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner1"))
+		return;
+	err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_NOEXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "add"))
+		return;
+
+	/* Overwrite */
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr2", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner2"))
+		goto out;
+	err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_EXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "overwrite"))
+		goto out;
+
+	err = bpf_map_delete_elem(outer_fd, &key);
+	ASSERT_OK(err, "del");
+	return;
+out:
+	bpf_map_delete_elem(outer_fd, &key);
+}
+
+static void lookup_delete_fd_htab(int outer_fd)
+{
+	int key = 1, value;
+	int inner_fd, err;
+
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr1", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner1"))
+		return;
+	err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_NOEXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "add"))
+		return;
+
+	/* lookup_and_delete is not supported for htab of maps */
+	err = bpf_map_lookup_and_delete_elem(outer_fd, &key, &value);
+	ASSERT_EQ(err, -ENOTSUPP, "lookup_del");
+
+	err = bpf_map_delete_elem(outer_fd, &key);
+	ASSERT_OK(err, "del");
+}
+
+static void batched_lookup_delete_fd_htab(int outer_fd)
+{
+	int keys[2] = {1, 2}, values[2];
+	unsigned int cnt, batch;
+	int inner_fd, err;
+
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr1", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner1"))
+		return;
+
+	err = bpf_map_update_elem(outer_fd, &keys[0], &inner_fd, BPF_NOEXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "add1"))
+		return;
+
+	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr2", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(inner_fd, "inner2"))
+		goto out;
+	err = bpf_map_update_elem(outer_fd, &keys[1], &inner_fd, BPF_NOEXIST);
+	close(inner_fd);
+	if (!ASSERT_OK(err, "add2"))
+		goto out;
+
+	/* batched lookup_and_delete */
+	cnt = ARRAY_SIZE(keys);
+	err = bpf_map_lookup_and_delete_batch(outer_fd, NULL, &batch, keys, values, &cnt, NULL);
+	ASSERT_TRUE((!err || err == -ENOENT), "delete_batch ret");
+	ASSERT_EQ(cnt, ARRAY_SIZE(keys), "delete_batch cnt");
+
+out:
+	bpf_map_delete_elem(outer_fd, &keys[0]);
+}
+
+static void test_update_map_in_htab(bool preallocate)
+{
+	struct update_map_in_htab *skel;
+	int err, fd;
+
+	skel = update_map_in_htab__open();
+	if (!ASSERT_OK_PTR(skel, "open"))
+		return;
+
+	err = update_map_in_htab__load(skel);
+	if (!ASSERT_OK(err, "load"))
+		goto out;
+
+	fd = preallocate ? bpf_map__fd(skel->maps.outer_htab_map) :
+			   bpf_map__fd(skel->maps.outer_alloc_htab_map);
+
+	add_del_fd_htab(fd);
+	overwrite_fd_htab(fd);
+	lookup_delete_fd_htab(fd);
+	batched_lookup_delete_fd_htab(fd);
+out:
+	update_map_in_htab__destroy(skel);
+}
+
 void test_map_in_map(void)
 {
 	if (test__start_subtest("acc_map_in_array"))
@@ -137,5 +264,8 @@ void test_map_in_map(void)
 		test_map_in_map_access("access_map_in_htab", "outer_htab_map");
 	if (test__start_subtest("sleepable_acc_map_in_htab"))
 		test_map_in_map_access("sleepable_access_map_in_htab", "outer_htab_map");
+	if (test__start_subtest("update_map_in_htab"))
+		test_update_map_in_htab(true);
+	if (test__start_subtest("update_map_in_alloc_htab"))
+		test_update_map_in_htab(false);
 }
-
diff --git a/tools/testing/selftests/bpf/progs/update_map_in_htab.c b/tools/testing/selftests/bpf/progs/update_map_in_htab.c
new file mode 100644
index 000000000000..c2066247cd9c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/update_map_in_htab.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2024. Huawei Technologies Co., Ltd */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct inner_map_type {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, 4);
+	__uint(value_size, 4);
+	__uint(max_entries, 1);
+} inner_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 2);
+	__array(values, struct inner_map_type);
+} outer_htab_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 2);
+	__array(values, struct inner_map_type);
+} outer_alloc_htab_map SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 937a1c29a287e8f48c4cea714c76a13e14d989ac Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Thu, 7 Nov 2024 12:52:31 +0100
Subject: selftests/bpf: skip the timer_lockup test for single-CPU nodes

The timer_lockup test needs 2 CPUs to work; on single-CPU nodes it fails
to set thread affinity to CPU 1 since that CPU doesn't exist:

    # ./test_progs -t timer_lockup
    test_timer_lockup:PASS:timer_lockup__open_and_load 0 nsec
    test_timer_lockup:PASS:pthread_create thread1 0 nsec
    test_timer_lockup:PASS:pthread_create thread2 0 nsec
    timer_lockup_thread:PASS:cpu affinity 0 nsec
    timer_lockup_thread:FAIL:cpu affinity unexpected error: 22 (errno 0)
    test_timer_lockup:PASS: 0 nsec
    #406     timer_lockup:FAIL

Skip the test if only 1 CPU is available.

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Fixes: 50bd5a0c658d1 ("selftests/bpf: Add timer lockup selftest")
Tested-by: Philo Lu <lulie@linux.alibaba.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241107115231.75200-1-vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/timer_lockup.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
index 871d16cb95cf..1a2f99596916 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
@@ -5,6 +5,7 @@
 #include <test_progs.h>
 #include <pthread.h>
 #include <network_helpers.h>
+#include <sys/sysinfo.h>
 
 #include "timer_lockup.skel.h"
 
@@ -52,6 +53,11 @@ void test_timer_lockup(void)
 	pthread_t thrds[2];
 	void *ret;
 
+	if (get_nprocs() < 2) {
+		test__skip();
+		return;
+	}
+
 	skel = timer_lockup__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
 		return;
-- 
cgit v1.2.3


From ae6e3a273f590a2b64f14a9fab3546c3a8f44ed4 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 9 Nov 2024 15:14:30 -0800
Subject: bpf: Drop special callback reference handling

Logic to prevent callbacks from acquiring new references for the program
(i.e. leaving acquired references), and releasing caller references
(i.e. those acquired in parent frames) was introduced in commit
9d9d00ac29d0 ("bpf: Fix reference state management for synchronous callbacks").

This was necessary because back then, the verifier simulated each
callback once (that could potentially be executed N times, where N can
be zero). This meant that callbacks that left lingering resources or
cleared caller resources could do it more than once, operating on
undefined state or leaking memory.

With the fixes to callback verification in commit
ab5cfac139ab ("bpf: verify callbacks as if they are called unknown number of times"),
all of this extra logic is no longer necessary. Hence, drop it as part
of this commit.

Cc: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20241109231430.2475236-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf_verifier.h                     | 21 ++++----------------
 kernel/bpf/verifier.c                            | 25 +++++-------------------
 tools/testing/selftests/bpf/prog_tests/cb_refs.c |  4 ++--
 3 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'tools/testing')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d84beed92ae4..3a74033d49c4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -265,23 +265,10 @@ struct bpf_reference_state {
 	 * is used purely to inform the user of a reference leak.
 	 */
 	int insn_idx;
-	union {
-		/* There can be a case like:
-		 * main (frame 0)
-		 *  cb (frame 1)
-		 *   func (frame 3)
-		 *    cb (frame 4)
-		 * Hence for frame 4, if callback_ref just stored boolean, it would be
-		 * impossible to distinguish nested callback refs. Hence store the
-		 * frameno and compare that to callback_ref in check_reference_leak when
-		 * exiting a callback function.
-		 */
-		int callback_ref;
-		/* Use to keep track of the source object of a lock, to ensure
-		 * it matches on unlock.
-		 */
-		void *ptr;
-	};
+	/* Use to keep track of the source object of a lock, to ensure
+	 * it matches on unlock.
+	 */
+	void *ptr;
 };
 
 struct bpf_retval_range {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d55ca27dc031..9f5de8d4fbd0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1358,7 +1358,6 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
 	state->refs[new_ofs].type = REF_TYPE_PTR;
 	state->refs[new_ofs].id = id;
 	state->refs[new_ofs].insn_idx = insn_idx;
-	state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
 
 	return id;
 }
@@ -1392,9 +1391,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
 		if (state->refs[i].id == ptr_id) {
-			/* Cannot release caller references in callbacks */
-			if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
-				return -EINVAL;
 			if (last_idx && i != last_idx)
 				memcpy(&state->refs[i], &state->refs[last_idx],
 				       sizeof(*state->refs));
@@ -10267,17 +10263,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		caller->regs[BPF_REG_0] = *r0;
 	}
 
-	/* callback_fn frame should have released its own additions to parent's
-	 * reference state at this point, or check_reference_leak would
-	 * complain, hence it must be the same as the caller. There is no need
-	 * to copy it back.
-	 */
-	if (!callee->in_callback_fn) {
-		/* Transfer references to the caller */
-		err = copy_reference_state(caller, callee);
-		if (err)
-			return err;
-	}
+	/* Transfer references to the caller */
+	err = copy_reference_state(caller, callee);
+	if (err)
+		return err;
 
 	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
 	 * there function call logic would reschedule callback visit. If iteration
@@ -10447,14 +10436,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 	bool refs_lingering = false;
 	int i;
 
-	if (!exception_exit && state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
 		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
-		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
-			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
 		refs_lingering = true;
@@ -17707,8 +17694,6 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
 			return false;
 		switch (old->refs[i].type) {
 		case REF_TYPE_PTR:
-			if (old->refs[i].callback_ref != cur->refs[i].callback_ref)
-				return false;
 			break;
 		case REF_TYPE_LOCK:
 			if (old->refs[i].ptr != cur->refs[i].ptr)
diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
index 3bff680de16c..c40df623a8f7 100644
--- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c
+++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
@@ -11,8 +11,8 @@ struct {
 	const char *prog_name;
 	const char *err_msg;
 } cb_refs_tests[] = {
-	{ "underflow_prog", "reference has not been acquired before" },
-	{ "leak_prog", "Unreleased reference" },
+	{ "underflow_prog", "must point to scalar, or struct with scalar" },
+	{ "leak_prog", "Possibly NULL pointer passed to helper arg2" },
 	{ "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */
 	{ "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */
 };
-- 
cgit v1.2.3


From 5afe18dfa47daead88517b095b6e0ce012f031f8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 7 Nov 2024 11:39:59 -0800
Subject: KVM: selftests: Don't bother deleting memslots in KVM when freeing
 VMs

When freeing a VM, don't call into KVM to manually remove each memslot,
simply clean up and free any userspace assets associated with the memory
region.  KVM is ultimately responsible for ensuring kernel resources are
freed when the VM is destroyed, deleting memslots one-by-one is
unnecessarily slow, and unless a test is already leaking the VM fd, the
VM will be destroyed when kvm_vm_release() is called.

Not deleting KVM's memslot also allows cleaning up dead VMs without having
to care whether or not the to-be-freed VM is dead or alive.

Reported-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reported-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/kvmarm/Zy0bcM0m-N18gAZz@google.com/
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index a2b7df5f1d39..480e3a40d197 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -720,9 +720,6 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
 	rb_erase(&region->hva_node, &vm->regions.hva_tree);
 	hash_del(&region->slot_node);
 
-	region->region.memory_size = 0;
-	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
-
 	sparsebit_free(&region->unused_phy_pages);
 	sparsebit_free(&region->protected_phy_pages);
 	ret = munmap(region->mmap_start, region->mmap_size);
@@ -1197,7 +1194,12 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
  */
 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
 {
-	__vm_mem_region_delete(vm, memslot2region(vm, slot));
+	struct userspace_mem_region *region = memslot2region(vm, slot);
+
+	region->region.memory_size = 0;
+	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
+
+	__vm_mem_region_delete(vm, region);
 }
 
 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
-- 
cgit v1.2.3


From 7fef0dec415c08c16c31dd2c2501a8c734a4b3b8 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:36 +0800
Subject: mm: page_frag: add a test module for page_frag

The testing is done by ensuring that the fragment allocated
from a page_frag_cache instance is pushed into a ptr_ring
instance in a kthread bound to a specified cpu, and a kthread
bound to a specified cpu will pop the fragment from the
ptr_ring and free the fragment.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/20241028115343.3405838-2-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/mm/Makefile                |   3 +
 tools/testing/selftests/mm/page_frag/Makefile      |  18 ++
 .../selftests/mm/page_frag/page_frag_test.c        | 198 +++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh          |   8 +
 tools/testing/selftests/mm/test_page_frag.sh       | 175 ++++++++++++++++++
 5 files changed, 402 insertions(+)
 create mode 100644 tools/testing/selftests/mm/page_frag/Makefile
 create mode 100644 tools/testing/selftests/mm/page_frag/page_frag_test.c
 create mode 100755 tools/testing/selftests/mm/test_page_frag.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 02e1204971b0..acec529baaca 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -36,6 +36,8 @@ MAKEFLAGS += --no-builtin-rules
 CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS = -lrt -lpthread -lm
 
+TEST_GEN_MODS_DIR := page_frag
+
 TEST_GEN_FILES = cow
 TEST_GEN_FILES += compaction_test
 TEST_GEN_FILES += gup_longterm
@@ -126,6 +128,7 @@ TEST_FILES += test_hmm.sh
 TEST_FILES += va_high_addr_switch.sh
 TEST_FILES += charge_reserved_hugetlb.sh
 TEST_FILES += hugetlb_reparenting_test.sh
+TEST_FILES += test_page_frag.sh
 
 # required by charge_reserved_hugetlb.sh
 TEST_FILES += write_hugetlb_memory.sh
diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile
new file mode 100644
index 000000000000..58dda74d50a3
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/Makefile
@@ -0,0 +1,18 @@
+PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
+KDIR ?= $(abspath $(PAGE_FRAG_TEST_DIR)/../../../../..)
+
+ifeq ($(V),1)
+Q =
+else
+Q = @
+endif
+
+MODULES = page_frag_test.ko
+
+obj-m += page_frag_test.o
+
+all:
+	+$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) modules
+
+clean:
+	+$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) clean
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
new file mode 100644
index 000000000000..912d97b99107
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/completion.h>
+#include <linux/ptr_ring.h>
+#include <linux/kthread.h>
+
+#define TEST_FAILED_PREFIX	"page_frag_test failed: "
+
+static struct ptr_ring ptr_ring;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_nc;
+static int test_popped;
+static int test_pushed;
+static bool force_exit;
+
+static int nr_test = 2000000;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+	struct ptr_ring *ring = arg;
+
+	pr_info("page_frag pop test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (test_popped < nr_test) {
+		void *obj = __ptr_ring_consume(ring);
+
+		if (obj) {
+			test_popped++;
+			page_frag_free(obj);
+		} else {
+			if (force_exit)
+				break;
+
+			cond_resched();
+		}
+	}
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	pr_info("page_frag pop test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+	struct ptr_ring *ring = arg;
+
+	pr_info("page_frag push test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (test_pushed < nr_test && !force_exit) {
+		void *va;
+		int ret;
+
+		if (test_align) {
+			va = page_frag_alloc_align(&test_nc, test_alloc_len,
+						   GFP_KERNEL, SMP_CACHE_BYTES);
+
+			if ((unsigned long)va & (SMP_CACHE_BYTES - 1)) {
+				force_exit = true;
+				WARN_ONCE(true, TEST_FAILED_PREFIX "unaligned va returned\n");
+			}
+		} else {
+			va = page_frag_alloc(&test_nc, test_alloc_len, GFP_KERNEL);
+		}
+
+		if (!va)
+			continue;
+
+		ret = __ptr_ring_produce(ring, va);
+		if (ret) {
+			page_frag_free(va);
+			cond_resched();
+		} else {
+			test_pushed++;
+		}
+	}
+
+	pr_info("page_frag push test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	return 0;
+}
+
+static int __init page_frag_test_init(void)
+{
+	struct task_struct *tsk_push, *tsk_pop;
+	int last_pushed = 0, last_popped = 0;
+	ktime_t start;
+	u64 duration;
+	int ret;
+
+	test_nc.va = NULL;
+	atomic_set(&nthreads, 2);
+	init_completion(&wait);
+
+	if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0 ||
+	    !cpu_active(test_push_cpu) || !cpu_active(test_pop_cpu))
+		return -EINVAL;
+
+	ret = ptr_ring_init(&ptr_ring, nr_objs, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_ring,
+					 test_push_cpu, "page_frag_push");
+	if (IS_ERR(tsk_push))
+		return PTR_ERR(tsk_push);
+
+	tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_ring,
+					test_pop_cpu, "page_frag_pop");
+	if (IS_ERR(tsk_pop)) {
+		kthread_stop(tsk_push);
+		return PTR_ERR(tsk_pop);
+	}
+
+	start = ktime_get();
+	wake_up_process(tsk_push);
+	wake_up_process(tsk_pop);
+
+	pr_info("waiting for test to complete\n");
+
+	while (!wait_for_completion_timeout(&wait, msecs_to_jiffies(10000))) {
+		/* exit if either the push or the pop side made no progress */
+		if (last_pushed == test_pushed || last_popped == test_popped) {
+			WARN_ONCE(true, TEST_FAILED_PREFIX "no progress\n");
+			force_exit = true;
+			continue;
+		}
+
+		last_pushed = test_pushed;
+		last_popped = test_popped;
+		pr_info("page_frag_test progress: pushed = %d, popped = %d\n",
+			test_pushed, test_popped);
+	}
+
+	if (force_exit) {
+		pr_err(TEST_FAILED_PREFIX "exit with error\n");
+		goto out;
+	}
+
+	duration = (u64)ktime_us_delta(ktime_get(), start);
+	pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
+		test_align ? "aligned" : "non-aligned", duration);
+
+out:
+	ptr_ring_cleanup(&ptr_ring, NULL);
+	page_frag_cache_drain(&test_nc);
+
+	return -EAGAIN;
+}
+
+static void __exit page_frag_test_exit(void)
+{
+}
+
+module_init(page_frag_test_init);
+module_exit(page_frag_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
+MODULE_DESCRIPTION("Test module for page_frag");
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index c5797ad1d37b..2c5394584af4 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -75,6 +75,8 @@ separated by spaces:
 	read-only VMAs
 - mdwe
 	test prctl(PR_SET_MDWE, ...)
+- page_frag
+	test handling of page fragment allocation and freeing
 
 example: ./run_vmtests.sh -t "hmm mmap ksm"
 EOF
@@ -456,6 +458,12 @@ CATEGORY="mkdirty" run_test ./mkdirty
 
 CATEGORY="mdwe" run_test ./mdwe_test
 
+CATEGORY="page_frag" run_test ./test_page_frag.sh smoke
+
+CATEGORY="page_frag" run_test ./test_page_frag.sh aligned
+
+CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
+
 echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix
 echo "1..${count_total}" | tap_output
 
diff --git a/tools/testing/selftests/mm/test_page_frag.sh b/tools/testing/selftests/mm/test_page_frag.sh
new file mode 100755
index 000000000000..f55b105084cf
--- /dev/null
+++ b/tools/testing/selftests/mm/test_page_frag.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to test the
+# correctness and performance of page_frag's implementation.
+# Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse performance of page fragment allocations;
+#     b) stressing and stability check of page_frag subsystem.
+
+DRIVER="./page_frag/page_frag_test.ko"
+CPU_LIST=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2)
+TEST_CPU_0=$(echo $CPU_LIST | awk '{print $1}')
+
+if [ $(echo $CPU_LIST | wc -w) -gt 1 ]; then
+	TEST_CPU_1=$(echo $CPU_LIST | awk '{print $2}')
+	NR_TEST=100000000
+else
+	TEST_CPU_1=$TEST_CPU_0
+	NR_TEST=1000000
+fi
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_failed_prefix() {
+	if dmesg | grep -q 'page_frag_test failed:';then
+		echo "page_frag_test failed, please check dmesg"
+		exit $exitcode
+	fi
+}
+
+#
+# Static templates for testing of page_frag APIs.
+# Also it is possible to pass any supported parameters manually.
+#
+SMOKE_PARAM="test_push_cpu=$TEST_CPU_0 test_pop_cpu=$TEST_CPU_1"
+NONALIGNED_PARAM="$SMOKE_PARAM test_alloc_len=75 nr_test=$NR_TEST"
+ALIGNED_PARAM="$NONALIGNED_PARAM test_align=1"
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which insmod > /dev/null 2>&1; then
+		echo "$0: You need insmod installed"
+		exit $ksft_skip
+	fi
+
+	if [ ! -f $DRIVER ]; then
+		echo "$0: You need to compile page_frag_test module"
+		exit $ksft_skip
+	fi
+}
+
+run_nonaligned_check()
+{
+	echo "Run performance tests to evaluate how fast nonaligned alloc API is."
+
+	insmod $DRIVER $NONALIGNED_PARAM > /dev/null 2>&1
+}
+
+run_aligned_check()
+{
+	echo "Run performance tests to evaluate how fast aligned alloc API is."
+
+	insmod $DRIVER $ALIGNED_PARAM > /dev/null 2>&1
+}
+
+run_smoke_check()
+{
+	echo "Run smoke test."
+
+	insmod $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+}
+
+usage()
+{
+	echo -n "Usage: $0 [ aligned ] | [ nonaligned ] | | [ smoke ] | "
+	echo "manual parameters"
+	echo
+	echo "Valid tests and parameters:"
+	echo
+	modinfo $DRIVER
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "$0"
+	echo
+	echo "# Smoke testing"
+	echo "$0 smoke"
+	echo
+	echo "# Performance testing for nonaligned alloc API"
+	echo "$0 nonaligned"
+	echo
+	echo "# Performance testing for aligned alloc API"
+	echo "$0 aligned"
+	echo
+	exit 0
+}
+
+function validate_passed_args()
+{
+	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+	#
+	# Something has been passed, check it.
+	#
+	for passed_arg in $@; do
+		key=${passed_arg//=*/}
+		valid=0
+
+		for valid_arg in $VALID_ARGS; do
+			if [[ $key = $valid_arg ]]; then
+				valid=1
+				break
+			fi
+		done
+
+		if [[ $valid -ne 1 ]]; then
+			echo "Error: key is not correct: ${key}"
+			exit $exitcode
+		fi
+	done
+}
+
+function run_manual_check()
+{
+	#
+	# Validate passed parameters. If any of them is invalid,
+	# the script exits and does not execute further.
+	#
+	validate_passed_args $@
+
+	echo "Run the test with following parameters: $@"
+	insmod $DRIVER $@ > /dev/null 2>&1
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [[ "$1" = "smoke" ]]; then
+			run_smoke_check
+		elif [[ "$1" = "nonaligned" ]]; then
+			run_nonaligned_check
+		elif [[ "$1" = "aligned" ]]; then
+			run_aligned_check
+		else
+			run_manual_check $@
+		fi
+	fi
+
+	check_test_failed_prefix
+
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
-- 
cgit v1.2.3


From 65941f10caf2c04781a7defa4ec0ab119dbd235a Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:37 +0800
Subject: mm: move the page fragment allocator from page_alloc into its own
 file

Inspired by [1], move the page fragment allocator from page_alloc
into its own c file and header file, as we are about to make more
changes to it so that it can replace another page_frag implementation
in sock.c.

As this patchset is going to replace 'struct page_frag' with
'struct page_frag_cache' in sched.h, including page_frag_cache.h
in sched.h causes a compiler error due to the interdependence between
mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler
error by moving 'struct page_frag_cache' to mm_types_task.h as
suggested by Alexander, see [3].

1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956920@gmail.com/
3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=YM_aiA4OomD-2wXykO42bizaWMt_HOA@mail.gmail.com/
CC: David Howells <dhowells@redhat.com>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/20241028115343.3405838-3-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/gfp.h                                |  22 ----
 include/linux/mm_types.h                           |  18 ---
 include/linux/mm_types_task.h                      |  18 +++
 include/linux/page_frag_cache.h                    |  31 +++++
 include/linux/skbuff.h                             |   1 +
 mm/Makefile                                        |   1 +
 mm/page_alloc.c                                    | 136 -------------------
 mm/page_frag_cache.c                               | 145 +++++++++++++++++++++
 .../selftests/mm/page_frag/page_frag_test.c        |   2 +-
 9 files changed, 197 insertions(+), 177 deletions(-)
 create mode 100644 include/linux/page_frag_cache.h
 create mode 100644 mm/page_frag_cache.c

(limited to 'tools/testing')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a951de920e20..a0a6d25f883f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -371,28 +371,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
-struct page_frag_cache;
-void page_frag_cache_drain(struct page_frag_cache *nc);
-extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
-
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask)
-{
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
-}
-
-extern void page_frag_free(void *addr);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..92314ef2d978 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -521,9 +521,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
  */
 #define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 /*
  * page_private can be used on tail pages.  However, PagePrivate is only
  * checked by the VM on the head page.  So page_private on the tail pages
@@ -542,21 +539,6 @@ static inline void *folio_get_private(struct folio *folio)
 	return folio->private;
 }
 
-struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
-};
-
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index bff5706b76e1..0ac6daebdd5c 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -8,6 +8,7 @@
  * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
  */
 
+#include <linux/align.h>
 #include <linux/types.h>
 
 #include <asm/page.h>
@@ -43,6 +44,23 @@ struct page_frag {
 #endif
 };
 
+#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+struct page_frag_cache {
+	void *va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};
+
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
new file mode 100644
index 000000000000..67ac8626ed9b
--- /dev/null
+++ b/include/linux/page_frag_cache.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_PAGE_FRAG_CACHE_H
+#define _LINUX_PAGE_FRAG_CACHE_H
+
+#include <linux/log2.h>
+#include <linux/mm_types_task.h>
+#include <linux/types.h>
+
+void page_frag_cache_drain(struct page_frag_cache *nc);
+void __page_frag_cache_drain(struct page *page, unsigned int count);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
+
+static inline void *page_frag_alloc(struct page_frag_cache *nc,
+				    unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+}
+
+void page_frag_free(void *addr);
+
+#endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e5095d75abba..60535c706851 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/if_packet.h>
 #include <linux/llist.h>
+#include <linux/page_frag_cache.h>
 #include <net/flow.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_common.h>
diff --git a/mm/Makefile b/mm/Makefile
index d5639b036166..dba52bb0da8a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -65,6 +65,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 
 obj-y += page-alloc.o
+obj-y += page_frag_cache.o
 obj-y += init-mm.o
 obj-y += memblock.o
 obj-y += $(memory-hotplug-y)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94a2ffe28008..34b3eb74630a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4836,142 +4836,6 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
-/*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
- */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
-{
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
-		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
-	if (!nc->va)
-		return;
-
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
-{
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
-	if (page_ref_sub_and_test(page, count))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
-{
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
-
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			return NULL;
-		}
-	}
-
-	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
-
-	return nc->va + offset;
-}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
-	struct page *page = virt_to_head_page(addr);
-
-	if (unlikely(put_page_testzero(page)))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
-
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		size_t size)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
new file mode 100644
index 000000000000..609a485cd02a
--- /dev/null
+++ b/mm/page_frag_cache.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Page fragment allocator
+ *
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides within a
+ *  0 or higher order page.  Multiple fragments within that page are
+ *  individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions provide a simple allocation framework for page
+ * fragments.  This is used by the network stack and network device drivers to
+ * provide a backing region of memory for use as either an sk_buff->head, or to
+ * be used in the "frags" portion of skb_shared_info.
+ */
+
+#include <linux/export.h>
+#include <linux/gfp_types.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/page_frag_cache.h>
+#include "internal.h"
+
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
+{
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				PAGE_FRAG_CACHE_MAX_ORDER);
+	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->va = page ? page_address(page) : NULL;
+
+	return page;
+}
+
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+	if (page_ref_sub_and_test(page, count))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
+{
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
+	int offset;
+
+	if (unlikely(!nc->va)) {
+refill:
+		page = __page_frag_cache_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pfmemalloc = page_is_pfmemalloc(page);
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->offset = size;
+	}
+
+	offset = nc->offset - fragsz;
+	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		offset = size - fragsz;
+		if (unlikely(offset < 0)) {
+			/*
+			 * The caller is trying to allocate a fragment
+			 * with fragsz > PAGE_SIZE but the cache isn't big
+			 * enough to satisfy the request, this may
+			 * happen in low memory conditions.
+			 * We don't release the cache page because
+			 * it could make memory pressure worse
+			 * so we simply return NULL here.
+			 */
+			return NULL;
+		}
+	}
+
+	nc->pagecnt_bias--;
+	offset &= align_mask;
+	nc->offset = offset;
+
+	return nc->va + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_align);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+	struct page *page = virt_to_head_page(addr);
+
+	if (unlikely(put_page_testzero(page)))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
index 912d97b99107..13c44133e009 100644
--- a/tools/testing/selftests/mm/page_frag/page_frag_test.c
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -6,12 +6,12 @@
  * Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
  */
 
-#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
 #include <linux/completion.h>
 #include <linux/ptr_ring.h>
 #include <linux/kthread.h>
+#include <linux/page_frag_cache.h>
 
 #define TEST_FAILED_PREFIX	"page_frag_test failed: "
 
-- 
cgit v1.2.3


From 3d18dfe69ce46f106af327736d2261d7e3ee81c0 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 28 Oct 2024 19:53:39 +0800
Subject: mm: page_frag: avoid caller accessing 'page_frag_cache' directly

Use the appropriate page_frag API instead of having callers access
'page_frag_cache' directly.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linux-MM <linux-mm@kvack.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20241028115343.3405838-5-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/vhost/net.c                                   |  2 +-
 include/linux/page_frag_cache.h                       | 10 ++++++++++
 net/core/skbuff.c                                     |  6 +++---
 net/rxrpc/conn_object.c                               |  4 +---
 net/rxrpc/local_object.c                              |  4 +---
 net/sunrpc/svcsock.c                                  |  6 ++----
 tools/testing/selftests/mm/page_frag/page_frag_test.c |  2 +-
 7 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'tools/testing')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..9ad37c012189 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 			vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
-	n->pf_cache.va = NULL;
+	page_frag_cache_init(&n->pf_cache);
 
 	return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 67ac8626ed9b..0a52f7a179c8 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,16 @@
 #include <linux/mm_types_task.h>
 #include <linux/types.h>
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+	nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+	return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00afeb90c23a..6841e61a6bd0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -753,14 +753,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 		local_bh_enable();
@@ -850,7 +850,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		len = SKB_HEAD_ALIGN(len);
 
 		data = page_frag_alloc(&nc->page, len, gfp_mask);
-		pfmemalloc = nc->page.pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 	 */
 	rxrpc_purge_queue(&conn->rx_queue);
 
-	if (conn->tx_data_alloc.va)
-		__page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-					conn->tx_data_alloc.pagecnt_bias);
+	page_frag_cache_drain(&conn->tx_data_alloc);
 	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index f9623ace2201..2792d2304605 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
 	rxrpc_purge_queue(&local->rx_queue);
 	rxrpc_purge_client_connections(local);
-	if (local->tx_alloc.va)
-		__page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-					local->tx_alloc.pagecnt_bias);
+	page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 825ec5357691..b785425c3315 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1608,7 +1608,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-	struct page_frag_cache *pfc = &svsk->sk_frag_cache;
 	struct socket *sock = svsk->sk_sock;
 
 	trace_svcsock_free(svsk, sock);
@@ -1618,8 +1617,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sockfd_put(sock);
 	else
 		sock_release(sock);
-	if (pfc->va)
-		__page_frag_cache_drain(virt_to_head_page(pfc->va),
-					pfc->pagecnt_bias);
+
+	page_frag_cache_drain(&svsk->sk_frag_cache);
 	kfree(svsk);
 }
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
index 13c44133e009..e806c1866e36 100644
--- a/tools/testing/selftests/mm/page_frag/page_frag_test.c
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -126,7 +126,7 @@ static int __init page_frag_test_init(void)
 	u64 duration;
 	int ret;
 
-	test_nc.va = NULL;
+	page_frag_cache_init(&test_nc);
 	atomic_set(&nthreads, 2);
 	init_completion(&wait);
 
-- 
cgit v1.2.3


From e847f8cd96ae808516c1615697b464e6f68c02a4 Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Date: Tue, 8 Oct 2024 17:41:40 +0800
Subject: selftest/mm: fix typo in virtual_address_range

The function name should be *hint* address, so correct it.

Link: https://lkml.kernel.org/r/20241008094141.549248-4-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/virtual_address_range.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index 4e4c1e311247..2a2b69e91950 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -64,7 +64,7 @@
 #define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
 #endif
 
-static char *hind_addr(void)
+static char *hint_addr(void)
 {
 	int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
 
@@ -185,7 +185,7 @@ int main(int argc, char *argv[])
 	}
 
 	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
-		hint = hind_addr();
+		hint = hint_addr();
 		hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
 			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 
-- 
cgit v1.2.3


From 4175eff0e007b3b781f45742551393736346755d Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Date: Tue, 8 Oct 2024 17:41:41 +0800
Subject: selftests/mm: skip virtual_address_range tests on riscv

RISC-V doesn't currently have the behavior of restricting the virtual
address space which the virtual_address_range tests check, so the tests
fail there. So let's disable the whole test suite for riscv64 for now:
don't build it, and run_vmtests.sh will skip it if it is not present.

Link: https://lkml.kernel.org/r/20241008094141.549248-5-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile       |  2 ++
 tools/testing/selftests/mm/run_vmtests.sh | 10 ++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 15c734d6cfec..00c6fc694633 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -116,7 +116,9 @@ endif
 
 ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
 TEST_GEN_FILES += va_high_addr_switch
+ifneq ($(ARCH),riscv64)
 TEST_GEN_FILES += virtual_address_range
+endif
 TEST_GEN_FILES += write_to_hugetlbfs
 endif
 
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index c5797ad1d37b..4493bfd1911c 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -347,10 +347,12 @@ if [ $VADDR64 -ne 0 ]; then
 	# allows high virtual address allocation requests independent
 	# of platform's physical memory.
 
-	prev_policy=$(cat /proc/sys/vm/overcommit_memory)
-	echo 1 > /proc/sys/vm/overcommit_memory
-	CATEGORY="hugevm" run_test ./virtual_address_range
-	echo $prev_policy > /proc/sys/vm/overcommit_memory
+	if [ -x ./virtual_address_range ]; then
+		prev_policy=$(cat /proc/sys/vm/overcommit_memory)
+		echo 1 > /proc/sys/vm/overcommit_memory
+		CATEGORY="hugevm" run_test ./virtual_address_range
+		echo $prev_policy > /proc/sys/vm/overcommit_memory
+	fi
 
 	# va high address boundary switch test
 	ARCH_ARM64="arm64"
-- 
cgit v1.2.3


From 0189270117c3a3b43d226ed9da5d1ee4dc58b45c Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 7 Nov 2024 00:13:28 +0100
Subject: selftests: netdevsim: add a test checking ethtool features

Add a test checking that some features are active by default and
changeable.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/fff58fa70f8a300440958b5020f6a4eb2e9dad61.1730929545.git.sd@queasysnail.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/netdevsim/Makefile       |  1 +
 .../drivers/net/netdevsim/ethtool-features.sh      | 31 ++++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index cc08b220323f..df167c637af9 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -4,6 +4,7 @@ TEST_PROGS = devlink.sh \
 	devlink_in_netns.sh \
 	devlink_trap.sh \
 	ethtool-coalesce.sh \
+	ethtool-features.sh \
 	ethtool-fec.sh \
 	ethtool-pause.sh \
 	ethtool-ring.sh \
diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
new file mode 100644
index 000000000000..bc210dc6ad2d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+source ethtool-common.sh
+
+NSIM_NETDEV=$(make_netdev)
+
+set -o pipefail
+
+FEATS="
+  tx-checksum-ip-generic
+  tx-scatter-gather
+  tx-tcp-segmentation
+  generic-segmentation-offload
+  generic-receive-offload"
+
+for feat in $FEATS ; do
+    s=$(ethtool --json -k $NSIM_NETDEV | jq ".[].\"$feat\".active" 2>/dev/null)
+    check $? "$s" true
+
+    s=$(ethtool --json -k $NSIM_NETDEV | jq ".[].\"$feat\".fixed" 2>/dev/null)
+    check $? "$s" false
+done
+
+if [ $num_errors -eq 0 ]; then
+    echo "PASSED all $((num_passes)) checks"
+    exit 0
+else
+    echo "FAILED $num_errors/$((num_errors+num_passes)) checks"
+    exit 1
+fi
-- 
cgit v1.2.3


From 415b7cef1c73590bb897fc3f6dd9a0fa4a79acd8 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 7 Nov 2024 00:13:32 +0100
Subject: selftests: move macsec offload tests from net/rtnetlink to
 drivers/net/netdevsim

We're going to expand this test, and macsec offload is only lightly
related to rtnetlink.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/a1f92c250cc129b4bb111a206c4b560bab4e24a5.1730929545.git.sd@queasysnail.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/netdevsim/Makefile       |  1 +
 .../testing/selftests/drivers/net/netdevsim/config |  1 +
 .../drivers/net/netdevsim/macsec-offload.sh        | 63 ++++++++++++++++++++
 tools/testing/selftests/net/rtnetlink.sh           | 68 ----------------------
 4 files changed, 65 insertions(+), 68 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index df167c637af9..07b7c46d3311 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -11,6 +11,7 @@ TEST_PROGS = devlink.sh \
 	fib.sh \
 	fib_notifications.sh \
 	hw_stats_l3.sh \
+	macsec-offload.sh \
 	nexthop.sh \
 	peer.sh \
 	psample.sh \
diff --git a/tools/testing/selftests/drivers/net/netdevsim/config b/tools/testing/selftests/drivers/net/netdevsim/config
index adf45a3a78b4..5117c78ddf0a 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/config
+++ b/tools/testing/selftests/drivers/net/netdevsim/config
@@ -1,6 +1,7 @@
 CONFIG_DUMMY=y
 CONFIG_GENEVE=m
 CONFIG_IPV6=y
+CONFIG_MACSEC=m
 CONFIG_NETDEVSIM=m
 CONFIG_NET_SCH_MQPRIO=y
 CONFIG_NET_SCH_MULTIQ=y
diff --git a/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
new file mode 100755
index 000000000000..7babcfd76b22
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+source ethtool-common.sh
+
+NSIM_NETDEV=$(make_netdev)
+MACSEC_NETDEV=macsec_nsim
+
+set -o pipefail
+
+if ! ethtool -k $NSIM_NETDEV | grep -q 'macsec-hw-offload: on'; then
+    echo "SKIP: netdevsim doesn't support MACsec offload"
+    exit 4
+fi
+
+if ! ip link add link $NSIM_NETDEV $MACSEC_NETDEV type macsec offload mac 2>/dev/null; then
+    echo "SKIP: couldn't create macsec device"
+    exit 4
+fi
+ip link del $MACSEC_NETDEV
+
+#
+# test macsec offload API
+#
+
+ip link add link $NSIM_NETDEV "${MACSEC_NETDEV}" type macsec port 4 offload mac
+check $?
+
+ip link add link $NSIM_NETDEV "${MACSEC_NETDEV}2" type macsec address "aa:bb:cc:dd:ee:ff" port 5 offload mac
+check $?
+
+ip link add link $NSIM_NETDEV "${MACSEC_NETDEV}3" type macsec sci abbacdde01020304 offload mac
+check $?
+
+ip link add link $NSIM_NETDEV "${MACSEC_NETDEV}4" type macsec port 8 offload mac 2> /dev/null
+check $? '' '' 1
+
+ip macsec add "${MACSEC_NETDEV}" tx sa 0 pn 1024 on key 01 12345678901234567890123456789012
+check $?
+
+ip macsec add "${MACSEC_NETDEV}" rx port 1234 address "1c:ed:de:ad:be:ef"
+check $?
+
+ip macsec add "${MACSEC_NETDEV}" rx port 1234 address "1c:ed:de:ad:be:ef" sa 0 pn 1 on \
+    key 00 0123456789abcdef0123456789abcdef
+check $?
+
+ip macsec add "${MACSEC_NETDEV}" rx port 1235 address "1c:ed:de:ad:be:ef" 2> /dev/null
+check $? '' '' 1
+
+for dev in ${MACSEC_NETDEV}{,2,3} ; do
+    ip link del $dev
+    check $?
+done
+
+
+if [ $num_errors -eq 0 ]; then
+    echo "PASSED all $((num_passes)) checks"
+    exit 0
+else
+    echo "FAILED $num_errors/$((num_errors+num_passes)) checks"
+    exit 1
+fi
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 6e216d7a8e2f..7f05b5f9b76f 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -21,7 +21,6 @@ ALL_TESTS="
 	kci_test_vrf
 	kci_test_encap
 	kci_test_macsec
-	kci_test_macsec_offload
 	kci_test_ipsec
 	kci_test_ipsec_offload
 	kci_test_fdb_get
@@ -560,73 +559,6 @@ kci_test_macsec()
 	end_test "PASS: macsec"
 }
 
-kci_test_macsec_offload()
-{
-	sysfsd=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/
-	sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
-	probed=false
-	local ret=0
-	run_cmd_grep "^Usage: ip macsec" ip macsec help
-	if [ $? -ne 0 ]; then
-		end_test "SKIP: macsec: iproute2 too old"
-		return $ksft_skip
-	fi
-
-	if ! mount | grep -q debugfs; then
-		mount -t debugfs none /sys/kernel/debug/ &> /dev/null
-	fi
-
-	# setup netdevsim since dummydev doesn't have offload support
-	if [ ! -w /sys/bus/netdevsim/new_device ] ; then
-		run_cmd modprobe -q netdevsim
-
-		if [ $ret -ne 0 ]; then
-			end_test "SKIP: macsec_offload can't load netdevsim"
-			return $ksft_skip
-		fi
-		probed=true
-	fi
-
-	echo "0" > /sys/bus/netdevsim/new_device
-	while [ ! -d $sysfsnet ] ; do :; done
-	udevadm settle
-	dev=`ls $sysfsnet`
-
-	ip link set $dev up
-	if [ ! -d $sysfsd ] ; then
-		end_test "FAIL: macsec_offload can't create device $dev"
-		return 1
-	fi
-	run_cmd_grep 'macsec-hw-offload: on' ethtool -k $dev
-	if [ $? -eq 1 ] ; then
-		end_test "FAIL: macsec_offload netdevsim doesn't support MACsec offload"
-		return 1
-	fi
-	run_cmd ip link add link $dev kci_macsec1 type macsec port 4 offload mac
-	run_cmd ip link add link $dev kci_macsec2 type macsec address "aa:bb:cc:dd:ee:ff" port 5 offload mac
-	run_cmd ip link add link $dev kci_macsec3 type macsec sci abbacdde01020304 offload mac
-	run_cmd_fail ip link add link $dev kci_macsec4 type macsec port 8 offload mac
-
-	msname=kci_macsec1
-	run_cmd ip macsec add "$msname" tx sa 0 pn 1024 on key 01 12345678901234567890123456789012
-	run_cmd ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef"
-	run_cmd ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" sa 0 pn 1 on \
-		key 00 0123456789abcdef0123456789abcdef
-	run_cmd_fail ip macsec add "$msname" rx port 1235 address "1c:ed:de:ad:be:ef"
-	# clean up any leftovers
-	for msdev in kci_macsec{1,2,3,4} ; do
-	    ip link del $msdev 2> /dev/null
-	done
-	echo 0 > /sys/bus/netdevsim/del_device
-	$probed && rmmod netdevsim
-
-	if [ $ret -ne 0 ]; then
-		end_test "FAIL: macsec_offload"
-		return 1
-	fi
-	end_test "PASS: macsec_offload"
-}
-
 #-------------------------------------------------------------------
 # Example commands
 #   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
-- 
cgit v1.2.3


From 29084ea5d0e806abb02a69e18bae3d562a9202a5 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 7 Nov 2024 00:13:33 +0100
Subject: selftests: netdevsim: add test toggling macsec offload

The test verifies that toggling offload works (both via rtnetlink and
macsec's genetlink APIs). This is only possible when no SA is
configured.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/bf8e27ee0d921caa4eb35f1e830eca6d4080ddb2.1730929545.git.sd@queasysnail.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../drivers/net/netdevsim/macsec-offload.sh         | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
index 7babcfd76b22..1f2775846ea0 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
@@ -48,6 +48,27 @@ check $?
 ip macsec add "${MACSEC_NETDEV}" rx port 1235 address "1c:ed:de:ad:be:ef" 2> /dev/null
 check $? '' '' 1
 
+# can't disable macsec offload when SAs are configured
+ip link set "${MACSEC_NETDEV}" type macsec offload off 2> /dev/null
+check $? '' '' 1
+
+ip macsec offload "${MACSEC_NETDEV}" off 2> /dev/null
+check $? '' '' 1
+
+# toggle macsec offload via rtnetlink
+ip link set "${MACSEC_NETDEV}2" type macsec offload off
+check $?
+
+ip link set "${MACSEC_NETDEV}2" type macsec offload mac
+check $?
+
+# toggle macsec offload via genetlink
+ip macsec offload "${MACSEC_NETDEV}2" off
+check $?
+
+ip macsec offload "${MACSEC_NETDEV}2" mac
+check $?
+
 for dev in ${MACSEC_NETDEV}{,2,3} ; do
     ip link del $dev
     check $?
-- 
cgit v1.2.3


From 0f8800eb67ae9160d144d803f4f8d26ba6385213 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 7 Nov 2024 00:13:34 +0100
Subject: selftests: netdevsim: add ethtool features to macsec offload tests

The test verifies that available features aren't changed by toggling
offload on the device. Creating a device with offload off and then
enabling it later should result in the same features as creating the
device with offload enabled directly.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/ba801bd0a75b02de2dddbfc77f9efceb8b3d8a2e.1730929545.git.sd@queasysnail.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../drivers/net/netdevsim/macsec-offload.sh        | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
index 1f2775846ea0..98033e6667d2 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
@@ -75,6 +75,39 @@ for dev in ${MACSEC_NETDEV}{,2,3} ; do
 done
 
 
+#
+# test ethtool features when toggling offload
+#
+
+ip link add link $NSIM_NETDEV $MACSEC_NETDEV type macsec offload mac
+TMP_FEATS_ON_1="$(ethtool -k $MACSEC_NETDEV)"
+
+ip link set $MACSEC_NETDEV type macsec offload off
+TMP_FEATS_OFF_1="$(ethtool -k $MACSEC_NETDEV)"
+
+ip link set $MACSEC_NETDEV type macsec offload mac
+TMP_FEATS_ON_2="$(ethtool -k $MACSEC_NETDEV)"
+
+[ "$TMP_FEATS_ON_1" = "$TMP_FEATS_ON_2" ]
+check $?
+
+ip link del $MACSEC_NETDEV
+
+ip link add link $NSIM_NETDEV $MACSEC_NETDEV type macsec
+check $?
+
+TMP_FEATS_OFF_2="$(ethtool -k $MACSEC_NETDEV)"
+[ "$TMP_FEATS_OFF_1" = "$TMP_FEATS_OFF_2" ]
+check $?
+
+ip link set $MACSEC_NETDEV type macsec offload mac
+check $?
+
+TMP_FEATS_ON_3="$(ethtool -k $MACSEC_NETDEV)"
+[ "$TMP_FEATS_ON_1" = "$TMP_FEATS_ON_3" ]
+check $?
+
+
 if [ $num_errors -eq 0 ]; then
     echo "PASSED all $((num_passes)) checks"
     exit 0
-- 
cgit v1.2.3


From ae465d9ca192f582cf4932e628a25f9625a8bf83 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 8 Nov 2024 15:20:46 +0000
Subject: kselftest/arm64: Fix build with stricter assemblers

While some assemblers (including the LLVM assembler I mostly use) will
happily accept SMSTART as an instruction by default, others (specifically
gas) require that any architecture extensions be explicitly enabled.
The assembler SME test programs use manually encoded helpers for the new
instructions but no SMSTART helper is defined, only SM and ZA specific
variants.  Unfortunately the irritators that were just added use plain
SMSTART so on stricter assemblers these fail to build:

za-test.S:160: Error: selected processor does not support `smstart'

Switch to using SMSTART ZA via the manually encoded smstart_za macro we
already have defined.

Fixes: d65f27d240bb ("kselftest/arm64: Implement irritators for ZA and ZT")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241108-arm64-selftest-asm-error-v1-1-7ce27b42a677@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/za-test.S | 2 +-
 tools/testing/selftests/arm64/fp/zt-test.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index f902e6ef9077..5abcb8f009f8 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -157,7 +157,7 @@ function irritator_handler
 
 	// This will reset ZA to all bits 0
 	smstop
-	smstart
+	smstart_za
 
 	ret
 endfunction
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index c96cb7c2ad4b..7b9de8d2a873 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -126,7 +126,7 @@ function irritator_handler
 
 	// This will reset ZT to all bits 0
 	smstop
-	smstart
+	smstart_za
 
 	ret
 endfunction
-- 
cgit v1.2.3


From b6bd50dd3b564d50b8cd748de6bae58804ecb768 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 8 Nov 2024 13:49:17 +0000
Subject: kselftest/arm64: Fix printf() compiler warnings in the arm64 fp tests

Lots of incorrect length modifiers, missing arguments or conversion
specifiers. Fix them.

Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Mark Brown <broonie@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241108134920.1233992-2-catalin.marinas@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 16 +++++++++-------
 tools/testing/selftests/arm64/fp/za-ptrace.c  |  8 +++++---
 tools/testing/selftests/arm64/fp/zt-ptrace.c  |  8 +++++---
 3 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index 6d61992fe8a0..577b6e05e860 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -82,10 +82,12 @@ static void fill_buf(char *buf, size_t size)
 static int do_child(void)
 {
 	if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
-		ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+		ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	if (raise(SIGSTOP))
-		ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+		ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	return EXIT_SUCCESS;
 }
@@ -340,7 +342,7 @@ static void ptrace_set_sve_get_sve_data(pid_t child,
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+		ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n",
 				      data_size, type->name, vl);
 		return;
 	}
@@ -441,7 +443,7 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child,
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+		ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n",
 				      data_size, type->name, vl);
 		return;
 	}
@@ -545,7 +547,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
 	read_sve = read_buf;
 
 	if (read_sve->vl != vl) {
-		ksft_test_result_fail("Child VL != expected VL %d\n",
+		ksft_test_result_fail("Child VL != expected VL: %u != %u\n",
 				      read_sve->vl, vl);
 		goto out;
 	}
@@ -555,7 +557,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
 	case SVE_PT_REGS_FPSIMD:
 		expected_size = SVE_PT_FPSIMD_SIZE(vq, SVE_PT_REGS_FPSIMD);
 		if (read_sve_size < expected_size) {
-			ksft_test_result_fail("Read %d bytes, expected %d\n",
+			ksft_test_result_fail("Read %ld bytes, expected %ld\n",
 					      read_sve_size, expected_size);
 			goto out;
 		}
@@ -571,7 +573,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
 	case SVE_PT_REGS_SVE:
 		expected_size = SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 		if (read_sve_size < expected_size) {
-			ksft_test_result_fail("Read %d bytes, expected %d\n",
+			ksft_test_result_fail("Read %ld bytes, expected %ld\n",
 					      read_sve_size, expected_size);
 			goto out;
 		}
diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c
index ac27d87396fc..08c777f87ea2 100644
--- a/tools/testing/selftests/arm64/fp/za-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/za-ptrace.c
@@ -48,10 +48,12 @@ static void fill_buf(char *buf, size_t size)
 static int do_child(void)
 {
 	if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
-		ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+		ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	if (raise(SIGSTOP))
-		ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+		ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	return EXIT_SUCCESS;
 }
@@ -201,7 +203,7 @@ static void ptrace_set_get_data(pid_t child, unsigned int vl)
 	data_size = ZA_PT_SIZE(vq);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
+		ksft_test_result_fail("Error allocating %ld byte buffer for VL %u\n",
 				      data_size, vl);
 		return;
 	}
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c
index 996d9614a131..584b8d59b7ea 100644
--- a/tools/testing/selftests/arm64/fp/zt-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c
@@ -43,10 +43,12 @@ static void fill_buf(char *buf, size_t size)
 static int do_child(void)
 {
 	if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
-		ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+		ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	if (raise(SIGSTOP))
-		ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+		ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+				   strerror(errno), errno);
 
 	return EXIT_SUCCESS;
 }
@@ -231,7 +233,7 @@ static void ptrace_enable_za_via_zt(pid_t child)
 		/* Should have register data */
 		if (za_out->size < ZA_PT_SIZE(vq)) {
 			ksft_print_msg("ZA data less than expected: %u < %u\n",
-				       za_out->size, ZA_PT_SIZE(vq));
+				       za_out->size, (unsigned int)ZA_PT_SIZE(vq));
 			fail = true;
 			vq = 0;
 		}
-- 
cgit v1.2.3


From 0cc6b94a445c53ab152554b4cf60575e1396adf6 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 8 Nov 2024 13:49:18 +0000
Subject: kselftest/arm64: Fix printf() warning in the arm64 MTE prctl() test

While prctl() returns an 'int', the PR_MTE_TCF_MASK is defined as
unsigned long which results in the larger type following a bitwise 'and'
operation. Cast the printf() argument to 'int'.

Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241108134920.1233992-3-catalin.marinas@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/check_prctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/mte/check_prctl.c b/tools/testing/selftests/arm64/mte/check_prctl.c
index f139a33a43ef..4c89e9538ca0 100644
--- a/tools/testing/selftests/arm64/mte/check_prctl.c
+++ b/tools/testing/selftests/arm64/mte/check_prctl.c
@@ -85,7 +85,7 @@ void set_mode_test(const char *name, int hwcap2, int mask)
 		ksft_test_result_pass("%s\n", name);
 	} else {
 		ksft_print_msg("Got %x, expected %x\n",
-			       (ret & PR_MTE_TCF_MASK), mask);
+			       (ret & (int)PR_MTE_TCF_MASK), mask);
 		ksft_test_result_fail("%s\n", name);
 	}
 }
-- 
cgit v1.2.3


From 694e2803fece8d066bd85ce8607c630ce2b69859 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 8 Nov 2024 13:49:19 +0000
Subject: kselftest/arm64: Fix printf() compiler warnings in the arm64
 syscall-abi.c tests

Fix the incorrect length modifiers in arm64/abi/syscall-abi.c.

Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Mark Brown <broonie@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241108134920.1233992-4-catalin.marinas@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index d704511a0955..5ec9a18ec802 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -81,7 +81,7 @@ static int check_gpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, uint64_t s
 	 */
 	for (i = 9; i < ARRAY_SIZE(gpr_in); i++) {
 		if (gpr_in[i] != gpr_out[i]) {
-			ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n",
+			ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %lx != %lx\n",
 				       cfg->name, sve_vl, i,
 				       gpr_in[i], gpr_out[i]);
 			errors++;
@@ -112,7 +112,7 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 	if (!sve_vl && !(svcr & SVCR_SM_MASK)) {
 		for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
 			if (fpr_in[i] != fpr_out[i]) {
-				ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
+				ksft_print_msg("%s Q%d/%d mismatch %lx != %lx\n",
 					       cfg->name,
 					       i / 2, i % 2,
 					       fpr_in[i], fpr_out[i]);
@@ -294,13 +294,13 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 	int errors = 0;
 
 	if (svcr_out & SVCR_SM_MASK) {
-		ksft_print_msg("%s Still in SM, SVCR %llx\n",
+		ksft_print_msg("%s Still in SM, SVCR %lx\n",
 			       cfg->name, svcr_out);
 		errors++;
 	}
 
 	if ((svcr_in & SVCR_ZA_MASK) != (svcr_out & SVCR_ZA_MASK)) {
-		ksft_print_msg("%s PSTATE.ZA changed, SVCR %llx != %llx\n",
+		ksft_print_msg("%s PSTATE.ZA changed, SVCR %lx != %lx\n",
 			       cfg->name, svcr_in, svcr_out);
 		errors++;
 	}
-- 
cgit v1.2.3


From 929bbc16abfb0144db7ac619c77f60b188e555ab Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 8 Nov 2024 11:05:49 +0000
Subject: selftests/mm: Fix unused function warning for
 aarch64_write_signal_pkey()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 49f59573e9e0 ("selftests/mm: Enable pkey_sighandler_tests
on arm64"), pkey_sighandler_tests.c (which includes pkey-arm64.h via
pkey-helpers.h) ends up compiled for arm64. Since it doesn't use
aarch64_write_signal_pkey(), the compiler warns:

In file included from pkey-helpers.h:106,
                 from pkey_sighandler_tests.c:31:
pkey-arm64.h:130:13: warning: ‘aarch64_write_signal_pkey’ defined but not used [-Wunused-function]
  130 | static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~

Make the aarch64_write_signal_pkey() a 'static inline void' function to
avoid the compiler warning.

Fixes: f5b5ea51f78f ("selftests: mm: make protection_keys test work on arm64")
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Link: https://lore.kernel.org/r/20241108110549.1185923-1-catalin.marinas@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/mm/pkey-arm64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h
index d57fbeace38f..d9d2100eafc0 100644
--- a/tools/testing/selftests/mm/pkey-arm64.h
+++ b/tools/testing/selftests/mm/pkey-arm64.h
@@ -127,7 +127,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey)
 	return 0;
 }
 
-static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
+static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
 {
 	struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt);
 	struct poe_context *poe_ctx =
-- 
cgit v1.2.3


From d9ccb18f83ea2bb654289b6ecf014fd267cc988b Mon Sep 17 00:00:00 2001
From: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com>
Date: Tue, 5 Nov 2024 17:02:36 -0800
Subject: ipv6: Fix soft lockups in fib6_select_path under high next hop churn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Soft lockups have been observed on a cluster of Linux-based edge routers
located in a highly dynamic environment. Using the `bird` service, these
routers continuously update BGP-advertised routes due to frequently
changing nexthop destinations, while also managing significant IPv6
traffic. The lockups occur during the traversal of the multipath
circular linked-list in the `fib6_select_path` function, particularly
while iterating through the siblings in the list. The issue typically
arises when the nodes of the linked list are unexpectedly deleted
concurrently on a different core—indicated by their 'next' and
'previous' elements pointing back to the node itself and their reference
count dropping to zero. This results in an infinite loop, leading to a
soft lockup that triggers a system panic via the watchdog timer.

Apply RCU primitives in the problematic code sections to resolve the
issue. Where necessary, update the references to fib6_siblings to
annotate or use the RCU APIs.

Include a test script that reproduces the issue. The script
periodically updates the routing table while generating a heavy load
of outgoing IPv6 traffic through multiple iperf3 clients. It
consistently induces infinite soft lockups within a couple of minutes.

Kernel log:

 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb
 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3
 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4
 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03
 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f
 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756
 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af
 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d
-- <IRQ stack> --
 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb
    [exception RIP: fib6_select_path+299]
    RIP: ffffffff8ddafe7b  RSP: ffffbd13003d37b8  RFLAGS: 00000287
    RAX: ffff975850b43600  RBX: ffff975850b40200  RCX: 0000000000000000
    RDX: 000000003fffffff  RSI: 0000000051d383e4  RDI: ffff975850b43618
    RBP: ffffbd13003d3800   R8: 0000000000000000   R9: ffff975850b40200
    R10: 0000000000000000  R11: 0000000000000000  R12: ffffbd13003d3830
    R13: ffff975850b436a8  R14: ffff975850b43600  R15: 0000000000000007
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c
10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c
11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5
12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47
13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0
14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274
15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474
16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615
17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec
18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3
19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9
20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice]
21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice]
22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice]
23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581
25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9
26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47
27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30
28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f
29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64
30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb

Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Reported-by: Adrian Oliver <kernel@aoliver.ca>
Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Ido Schimmel <idosch@idosch.org>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Simon Horman <horms@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menlosecurity.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/ip6_fib.c                                 |   8 +-
 net/ipv6/route.c                                   |  45 ++--
 tools/testing/selftests/net/Makefile               |   1 +
 .../selftests/net/ipv6_route_update_soft_lockup.sh | 262 +++++++++++++++++++++
 4 files changed, 297 insertions(+), 19 deletions(-)
 create mode 100755 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh

(limited to 'tools/testing')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6383263bfd04..c134ba202c4c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1183,8 +1183,8 @@ next_iter:
 		while (sibling) {
 			if (sibling->fib6_metric == rt->fib6_metric &&
 			    rt6_qualify_for_ecmp(sibling)) {
-				list_add_tail(&rt->fib6_siblings,
-					      &sibling->fib6_siblings);
+				list_add_tail_rcu(&rt->fib6_siblings,
+						  &sibling->fib6_siblings);
 				break;
 			}
 			sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1245,7 +1245,7 @@ add:
 							 fib6_siblings)
 					sibling->fib6_nsiblings--;
 				rt->fib6_nsiblings = 0;
-				list_del_init(&rt->fib6_siblings);
+				list_del_rcu(&rt->fib6_siblings);
 				rt6_multipath_rebalance(next_sibling);
 				return err;
 			}
@@ -1963,7 +1963,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 					 &rt->fib6_siblings, fib6_siblings)
 			sibling->fib6_nsiblings--;
 		rt->fib6_nsiblings = 0;
-		list_del_init(&rt->fib6_siblings);
+		list_del_rcu(&rt->fib6_siblings);
 		rt6_multipath_rebalance(next_sibling);
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 038c1eeef0be..63d7681c929f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -416,8 +416,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 		      struct flowi6 *fl6, int oif, bool have_oif_match,
 		      const struct sk_buff *skb, int strict)
 {
-	struct fib6_info *sibling, *next_sibling;
 	struct fib6_info *match = res->f6i;
+	struct fib6_info *sibling;
 
 	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
 		goto out;
@@ -443,8 +443,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 		goto out;
 
-	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
-				 fib6_siblings) {
+	list_for_each_entry_rcu(sibling, &match->fib6_siblings,
+				fib6_siblings) {
 		const struct fib6_nh *nh = sibling->fib6_nh;
 		int nh_upper_bound;
 
@@ -5195,14 +5195,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
 	 * nexthop. Since sibling routes are always added at the end of
 	 * the list, find the first sibling of the last route appended
 	 */
+	rcu_read_lock();
+
 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
-		rt = list_first_entry(&rt_last->fib6_siblings,
-				      struct fib6_info,
-				      fib6_siblings);
+		rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+					    struct fib6_info,
+					    fib6_siblings);
 	}
 
 	if (rt)
 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+	rcu_read_unlock();
 }
 
 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5547,17 +5551,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
 		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
 					 &nexthop_len);
 	} else {
-		struct fib6_info *sibling, *next_sibling;
 		struct fib6_nh *nh = f6i->fib6_nh;
+		struct fib6_info *sibling;
 
 		nexthop_len = 0;
 		if (f6i->fib6_nsiblings) {
 			rt6_nh_nlmsg_size(nh, &nexthop_len);
 
-			list_for_each_entry_safe(sibling, next_sibling,
-						 &f6i->fib6_siblings, fib6_siblings) {
+			rcu_read_lock();
+
+			list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+						fib6_siblings) {
 				rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
 			}
+
+			rcu_read_unlock();
 		}
 		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
 	}
@@ -5721,7 +5729,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
 			goto nla_put_failure;
 	} else if (rt->fib6_nsiblings) {
-		struct fib6_info *sibling, *next_sibling;
+		struct fib6_info *sibling;
 		struct nlattr *mp;
 
 		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5733,14 +5741,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 				    0) < 0)
 			goto nla_put_failure;
 
-		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->fib6_siblings, fib6_siblings) {
+		rcu_read_lock();
+
+		list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+					fib6_siblings) {
 			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
 					    sibling->fib6_nh->fib_nh_weight,
-					    AF_INET6, 0) < 0)
+					    AF_INET6, 0) < 0) {
+				rcu_read_unlock();
+
 				goto nla_put_failure;
+			}
 		}
 
+		rcu_read_unlock();
+
 		nla_nest_end(skb, mp);
 	} else if (rt->nh) {
 		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6177,7 +6192,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 	err = -ENOBUFS;
 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 
-	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+	skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
@@ -6190,7 +6205,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		goto errout;
 	}
 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
-		    info->nlh, gfp_any());
+		    info->nlh, GFP_ATOMIC);
 	return;
 errout:
 	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 26a4883a65c9..8c4db5199a42 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -96,6 +96,7 @@ TEST_PROGS += fdb_flush.sh
 TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
+TEST_PROGS += ipv6_route_update_soft_lockup.sh
 
 # YNL files, must be before "include ..lib.mk"
 YNL_GEN_FILES := ncdevmem
diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
new file mode 100755
index 000000000000..a6b2b1f9c641
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing for potential kernel soft lockup during IPv6 routing table
+# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup
+# occurs, a kernel panic will be triggered to prevent associated issues.
+#
+#
+#                            Test Environment Layout
+#
+# ┌----------------┐                                         ┌----------------┐
+# |     SOURCE_NS  |                                         |     SINK_NS    |
+# |    NAMESPACE   |                                         |    NAMESPACE   |
+# |(iperf3 clients)|                                         |(iperf3 servers)|
+# |                |                                         |                |
+# |                |                                         |                |
+# |    ┌-----------|                             nexthops    |---------┐      |
+# |    |veth_source|<--------------------------------------->|veth_sink|<┐    |
+# |    └-----------|2001:0DB8:1::0:1/96  2001:0DB8:1::1:1/96 |---------┘ |    |
+# |                |         ^           2001:0DB8:1::1:2/96 |           |    |
+# |                |         .                   .           |       fwd |    |
+# |  ┌---------┐   |         .                   .           |           |    |
+# |  |   IPv6  |   |         .                   .           |           V    |
+# |  | routing |   |         .           2001:0DB8:1::1:80/96|        ┌-----┐ |
+# |  |  table  |   |         .                               |        | lo  | |
+# |  | nexthop |   |         .                               └--------┴-----┴-┘
+# |  | update  |   |         ............................> 2001:0DB8:2::1:1/128
+# |  └-------- ┘   |
+# └----------------┘
+#
+# The test script sets up two network namespaces, source_ns and sink_ns,
+# connected via a veth link. Within source_ns, it continuously updates the
+# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop
+# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a
+# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds.
+#
+# Simultaneously, multiple iperf3 clients within source_ns generate heavy
+# outgoing IPv6 traffic. Each client is assigned a unique port number starting
+# at 5001 and incrementing sequentially. Each client targets a unique iperf3
+# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface
+# using the same port number.
+#
+# The number of iperf3 servers and clients is set to half of the total
+# available cores on each machine.
+#
+# NOTE: We have tested this script on machines with various CPU specifications,
+# ranging from lower to higher performance as listed below. The test script
+# effectively triggered a kernel soft lockup on machines running an unpatched
+# kernel in under a minute:
+#
+# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz
+# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz
+# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz
+# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz
+# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz
+# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz
+# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz
+#
+# On less performant machines, you may need to increase the TEST_DURATION
+# parameter to enhance the likelihood of encountering a race condition leading
+# to a kernel soft lockup and avoid a false negative result.
+#
+# NOTE: The test may not produce the expected result in virtualized
+# environments (e.g., qemu) due to differences in timing and CPU handling,
+# which can affect the conditions needed to trigger a soft lockup.
+
+source lib.sh
+source net_helper.sh
+
+TEST_DURATION=300
+ROUTING_TABLE_REFRESH_PERIOD=0.01
+
+IPERF3_BITRATE="300m"
+
+
+IPV6_NEXTHOP_ADDR_COUNT="128"
+IPV6_NEXTHOP_ADDR_MASK="96"
+IPV6_NEXTHOP_PREFIX="2001:0DB8:1"
+
+
+SOURCE_TEST_IFACE="veth_source"
+SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96"
+
+SINK_TEST_IFACE="veth_sink"
+# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses:
+# 2001:0DB8:1::1:1  to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT}
+SINK_LOOPBACK_IFACE="lo"
+SINK_LOOPBACK_IP_MASK="128"
+SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1"
+
+nexthop_ip_list=""
+termination_signal=""
+kernel_softlokup_panic_prev_val=""
+
+terminate_ns_processes_by_pattern() {
+	local ns=$1
+	local pattern=$2
+
+	for pid in $(ip netns pids ${ns}); do
+		[ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid
+	done
+}
+
+cleanup() {
+	echo "info: cleaning up namespaces and terminating all processes within them..."
+
+
+	# Terminate iperf3 instances running in the source_ns. To avoid race
+	# conditions, first iterate over the PIDs and terminate those
+	# associated with the bash shells running the
+	# `while true; do iperf3 -c ...; done` loops. In a second iteration,
+	# terminate the individual `iperf3 -c ...` instances.
+	terminate_ns_processes_by_pattern ${source_ns} while
+	terminate_ns_processes_by_pattern ${source_ns} iperf3
+
+	# Repeat the same process for sink_ns
+	terminate_ns_processes_by_pattern ${sink_ns} while
+	terminate_ns_processes_by_pattern ${sink_ns} iperf3
+
+	# Check if any iperf3 instances are still running. This could happen
+	# if a core has entered an infinite loop and the timeout for detecting
+	# the soft lockup has not expired, but either the test interval has
+	# already elapsed or the test was terminated manually (e.g., with ^C)
+	for pid in $(ip netns pids ${source_ns}); do
+		if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then
+			echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!"
+			exit ${ksft_fail}
+		fi
+	done
+
+	if [ "$termination_signal" == "SIGINT" ]; then
+		echo "SKIP: Termination due to ^C (SIGINT)"
+	elif [ "$termination_signal" == "SIGALRM" ]; then
+		echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test"
+	fi
+
+	cleanup_ns ${source_ns} ${sink_ns}
+
+	sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val}
+}
+
+setup_prepare() {
+	setup_ns source_ns sink_ns
+
+	ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns}
+
+	# Setting up the Source namespace
+	ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE}
+	ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000
+	ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up
+	ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1
+
+	# Setting up the Sink namespace
+	ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE}
+	ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up
+	ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1
+
+	ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up
+	ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1
+
+
+	# Populate nexthop IPv6 addresses on the test interface in the sink_ns
+	echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..."
+	for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+		ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE};
+	done
+
+	# Preparing list of nexthops
+	for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+		nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1"
+	done
+}
+
+
+test_soft_lockup_during_routing_table_refresh() {
+	# Start num_of_iperf_servers iperf3 servers in the sink_ns namespace,
+	# each listening on ports starting at 5001 and incrementing
+	# sequentially. Since iperf3 instances may terminate unexpectedly, a
+	# while loop is used to automatically restart them in such cases.
+	echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..."
+	for i in $(seq 1 ${num_of_iperf_servers}); do
+		cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null"
+		ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+	done
+
+	# Wait for the iperf3 servers to be ready
+	for i in $(seq ${num_of_iperf_servers}); do
+		port=$(printf '5%03d' ${i});
+		wait_local_port_listen ${sink_ns} ${port} tcp
+	done
+
+	# Continuously refresh the routing table in the background within
+	# the source_ns namespace
+	ip netns exec ${source_ns} bash -c "
+		while \$(ip netns list | grep -q ${source_ns}); do
+			ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list};
+			sleep ${ROUTING_TABLE_REFRESH_PERIOD};
+			ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK};
+		done &"
+
+	# Start num_of_iperf_servers iperf3 clients in the source_ns namespace,
+	# each sending TCP traffic on sequential ports starting at 5001.
+	# Since iperf3 instances may terminate unexpectedly (e.g., if the route
+	# to the server is deleted in the background during a route refresh), a
+	# while loop is used to automatically restart them in such cases.
+	echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..."
+	for i in $(seq 1 ${num_of_iperf_servers}); do
+		cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null"
+		ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+	done
+
+	echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..."
+	echo "info: A kernel soft lockup, if detected, results in a kernel panic!"
+
+	wait
+}
+
+# Make sure 'iperf3' is installed, skip the test otherwise
+if [ ! -x "$(command -v "iperf3")" ]; then
+	echo "SKIP: 'iperf3' is not installed. Skipping the test."
+	exit ${ksft_skip}
+fi
+
+# Determine the number of cores on the machine
+num_of_iperf_servers=$(( $(nproc)/2 ))
+
+# Check if we are running on a multi-core machine, skip the test otherwise
+if [ "${num_of_iperf_servers}" -eq 0 ]; then
+	echo "SKIP: This test is not valid on a single core machine!"
+	exit ${ksft_skip}
+fi
+
+# Since the kernel soft lockup we're testing causes at least one core to enter
+# an infinite loop, destabilizing the host and likely affecting subsequent
+# tests, we trigger a kernel panic instead of reporting a failure and
+# continuing
+kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic)
+sysctl -qw kernel.softlockup_panic=1
+
+handle_sigint() {
+	termination_signal="SIGINT"
+	cleanup
+	exit ${ksft_skip}
+}
+
+handle_sigalrm() {
+	termination_signal="SIGALRM"
+	cleanup
+	exit ${ksft_pass}
+}
+
+trap handle_sigint SIGINT
+trap handle_sigalrm SIGALRM
+
+(sleep ${TEST_DURATION} && kill -s SIGALRM $$)&
+
+setup_prepare
+test_soft_lockup_during_routing_table_refresh
-- 
cgit v1.2.3


From 75e3f12fa51b758f6b8d7684b65b4684386e7706 Mon Sep 17 00:00:00 2001
From: MD Danish Anwar <danishanwar@ti.com>
Date: Wed, 6 Nov 2024 14:47:10 +0530
Subject: selftests: hsr: Add test for VLAN

Add test for VLAN ping for HSR. The test adds vlan interfaces to the hsr
interface and then verifies if ping to them works.

Signed-off-by: MD Danish Anwar <danishanwar@ti.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20241106091710.3308519-5-danishanwar@ti.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/hsr/config      |  1 +
 tools/testing/selftests/net/hsr/hsr_ping.sh | 98 +++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/hsr/config b/tools/testing/selftests/net/hsr/config
index 241542441c51..555a868743f0 100644
--- a/tools/testing/selftests/net/hsr/config
+++ b/tools/testing/selftests/net/hsr/config
@@ -3,3 +3,4 @@ CONFIG_NET_SCH_NETEM=m
 CONFIG_HSR=y
 CONFIG_VETH=y
 CONFIG_BRIDGE=y
+CONFIG_VLAN_8021Q=m
diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index f5d207fc770a..5a65f4f836be 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -175,6 +175,100 @@ setup_hsr_interfaces()
 	ip -net "$ns3" link set hsr3 up
 }
 
+setup_vlan_interfaces() {
+	ip -net "$ns1" link add link hsr1 name hsr1.2 type vlan id 2
+	ip -net "$ns1" link add link hsr1 name hsr1.3 type vlan id 3
+	ip -net "$ns1" link add link hsr1 name hsr1.4 type vlan id 4
+	ip -net "$ns1" link add link hsr1 name hsr1.5 type vlan id 5
+
+	ip -net "$ns2" link add link hsr2 name hsr2.2 type vlan id 2
+	ip -net "$ns2" link add link hsr2 name hsr2.3 type vlan id 3
+	ip -net "$ns2" link add link hsr2 name hsr2.4 type vlan id 4
+	ip -net "$ns2" link add link hsr2 name hsr2.5 type vlan id 5
+
+	ip -net "$ns3" link add link hsr3 name hsr3.2 type vlan id 2
+	ip -net "$ns3" link add link hsr3 name hsr3.3 type vlan id 3
+	ip -net "$ns3" link add link hsr3 name hsr3.4 type vlan id 4
+	ip -net "$ns3" link add link hsr3 name hsr3.5 type vlan id 5
+
+	ip -net "$ns1" addr add 100.64.2.1/24 dev hsr1.2
+	ip -net "$ns1" addr add 100.64.3.1/24 dev hsr1.3
+	ip -net "$ns1" addr add 100.64.4.1/24 dev hsr1.4
+	ip -net "$ns1" addr add 100.64.5.1/24 dev hsr1.5
+
+	ip -net "$ns2" addr add 100.64.2.2/24 dev hsr2.2
+	ip -net "$ns2" addr add 100.64.3.2/24 dev hsr2.3
+	ip -net "$ns2" addr add 100.64.4.2/24 dev hsr2.4
+	ip -net "$ns2" addr add 100.64.5.2/24 dev hsr2.5
+
+	ip -net "$ns3" addr add 100.64.2.3/24 dev hsr3.2
+	ip -net "$ns3" addr add 100.64.3.3/24 dev hsr3.3
+	ip -net "$ns3" addr add 100.64.4.3/24 dev hsr3.4
+	ip -net "$ns3" addr add 100.64.5.3/24 dev hsr3.5
+
+	ip -net "$ns1" link set dev hsr1.2 up
+	ip -net "$ns1" link set dev hsr1.3 up
+	ip -net "$ns1" link set dev hsr1.4 up
+	ip -net "$ns1" link set dev hsr1.5 up
+
+	ip -net "$ns2" link set dev hsr2.2 up
+	ip -net "$ns2" link set dev hsr2.3 up
+	ip -net "$ns2" link set dev hsr2.4 up
+	ip -net "$ns2" link set dev hsr2.5 up
+
+	ip -net "$ns3" link set dev hsr3.2 up
+	ip -net "$ns3" link set dev hsr3.3 up
+	ip -net "$ns3" link set dev hsr3.4 up
+	ip -net "$ns3" link set dev hsr3.5 up
+
+}
+
+hsr_vlan_ping() {
+	do_ping "$ns1" 100.64.2.2
+	do_ping "$ns1" 100.64.3.2
+	do_ping "$ns1" 100.64.4.2
+	do_ping "$ns1" 100.64.5.2
+
+	do_ping "$ns1" 100.64.2.3
+	do_ping "$ns1" 100.64.3.3
+	do_ping "$ns1" 100.64.4.3
+	do_ping "$ns1" 100.64.5.3
+
+	do_ping "$ns2" 100.64.2.1
+	do_ping "$ns2" 100.64.3.1
+	do_ping "$ns2" 100.64.4.1
+	do_ping "$ns2" 100.64.5.1
+
+	do_ping "$ns2" 100.64.2.3
+	do_ping "$ns2" 100.64.3.3
+	do_ping "$ns2" 100.64.4.3
+	do_ping "$ns2" 100.64.5.3
+
+	do_ping "$ns3" 100.64.2.1
+	do_ping "$ns3" 100.64.3.1
+	do_ping "$ns3" 100.64.4.1
+	do_ping "$ns3" 100.64.5.1
+
+	do_ping "$ns3" 100.64.2.2
+	do_ping "$ns3" 100.64.3.2
+	do_ping "$ns3" 100.64.4.2
+	do_ping "$ns3" 100.64.5.2
+}
+
+run_vlan_tests() {
+	vlan_challenged_hsr1=$(ip net exec "$ns1" ethtool -k hsr1 | grep "vlan-challenged" | awk '{print $2}')
+	vlan_challenged_hsr2=$(ip net exec "$ns2" ethtool -k hsr2 | grep "vlan-challenged" | awk '{print $2}')
+	vlan_challenged_hsr3=$(ip net exec "$ns3" ethtool -k hsr3 | grep "vlan-challenged" | awk '{print $2}')
+
+	if [[ "$vlan_challenged_hsr1" = "off" || "$vlan_challenged_hsr2" = "off" || "$vlan_challenged_hsr3" = "off" ]]; then
+		echo "INFO: Running VLAN tests"
+		setup_vlan_interfaces
+		hsr_vlan_ping
+	else
+		echo "INFO: Not Running VLAN tests as the device does not support VLAN"
+	fi
+}
+
 check_prerequisites
 setup_ns ns1 ns2 ns3
 
@@ -183,9 +277,13 @@ trap cleanup_all_ns EXIT
 setup_hsr_interfaces 0
 do_complete_ping_test
 
+run_vlan_tests
+
 setup_ns ns1 ns2 ns3
 
 setup_hsr_interfaces 1
 do_complete_ping_test
 
+run_vlan_tests
+
 exit $ret
-- 
cgit v1.2.3


From 6891f0b523e1ef452523ba43d67ca2a654760e78 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:00 -0800
Subject: selftests: ncdevmem: Redirect all non-payload output to stderr

That should make it possible to do expected payload validation on
the caller side.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-2-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 61 +++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 31 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 64d6805381c5..9245d3f158dd 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -88,7 +88,6 @@ void print_nonzero_bytes(void *ptr, size_t size)
 
 	for (i = 0; i < size; i++)
 		putchar(p[i]);
-	printf("\n");
 }
 
 void validate_buffer(void *line, size_t size)
@@ -120,7 +119,7 @@ void validate_buffer(void *line, size_t size)
 		char command[256];                                      \
 		memset(command, 0, sizeof(command));                    \
 		snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \
-		printf("Running: %s\n", command);                       \
+		fprintf(stderr, "Running: %s\n", command);                       \
 		system(command);                                        \
 	})
 
@@ -128,22 +127,22 @@ static int reset_flow_steering(void)
 {
 	int ret = 0;
 
-	ret = run_command("sudo ethtool -K %s ntuple off", ifname);
+	ret = run_command("sudo ethtool -K %s ntuple off >&2", ifname);
 	if (ret)
 		return ret;
 
-	return run_command("sudo ethtool -K %s ntuple on", ifname);
+	return run_command("sudo ethtool -K %s ntuple on >&2", ifname);
 }
 
 static int configure_headersplit(bool on)
 {
-	return run_command("sudo ethtool -G %s tcp-data-split %s", ifname,
+	return run_command("sudo ethtool -G %s tcp-data-split %s >&2", ifname,
 			   on ? "on" : "off");
 }
 
 static int configure_rss(void)
 {
-	return run_command("sudo ethtool -X %s equal %d", ifname, start_queue);
+	return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue);
 }
 
 static int configure_channels(unsigned int rx, unsigned int tx)
@@ -153,7 +152,7 @@ static int configure_channels(unsigned int rx, unsigned int tx)
 
 static int configure_flow_steering(void)
 {
-	return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d",
+	return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d >&2",
 			   ifname, client_ip, server_ip, port, port, start_queue);
 }
 
@@ -187,7 +186,7 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
 		goto err_close;
 	}
 
-	printf("got dmabuf id=%d\n", rsp->id);
+	fprintf(stderr, "got dmabuf id=%d\n", rsp->id);
 	dmabuf_id = rsp->id;
 
 	netdev_bind_rx_req_free(req);
@@ -314,8 +313,8 @@ int do_server(void)
 	if (ret)
 		error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
 
-	printf("binding to address %s:%d\n", server_ip,
-	       ntohs(server_sin.sin_port));
+	fprintf(stderr, "binding to address %s:%d\n", server_ip,
+		ntohs(server_sin.sin_port));
 
 	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
 	if (ret)
@@ -329,14 +328,14 @@ int do_server(void)
 
 	inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
 		  sizeof(buffer));
-	printf("Waiting or connection on %s:%d\n", buffer,
-	       ntohs(server_sin.sin_port));
+	fprintf(stderr, "Waiting or connection on %s:%d\n", buffer,
+		ntohs(server_sin.sin_port));
 	client_fd = accept(socket_fd, &client_addr, &client_addr_len);
 
 	inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
 		  sizeof(buffer));
-	printf("Got connection from %s:%d\n", buffer,
-	       ntohs(client_addr.sin_port));
+	fprintf(stderr, "Got connection from %s:%d\n", buffer,
+		ntohs(client_addr.sin_port));
 
 	while (1) {
 		struct iovec iov = { .iov_base = iobuf,
@@ -349,14 +348,13 @@ int do_server(void)
 		ssize_t ret;
 
 		is_devmem = false;
-		printf("\n\n");
 
 		msg.msg_iov = &iov;
 		msg.msg_iovlen = 1;
 		msg.msg_control = ctrl_data;
 		msg.msg_controllen = sizeof(ctrl_data);
 		ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
-		printf("recvmsg ret=%ld\n", ret);
+		fprintf(stderr, "recvmsg ret=%ld\n", ret);
 		if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 			continue;
 		if (ret < 0) {
@@ -364,7 +362,7 @@ int do_server(void)
 			continue;
 		}
 		if (ret == 0) {
-			printf("client exited\n");
+			fprintf(stderr, "client exited\n");
 			goto cleanup;
 		}
 
@@ -373,7 +371,7 @@ int do_server(void)
 			if (cm->cmsg_level != SOL_SOCKET ||
 			    (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
 			     cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
-				fprintf(stdout, "skipping non-devmem cmsg\n");
+				fprintf(stderr, "skipping non-devmem cmsg\n");
 				continue;
 			}
 
@@ -384,7 +382,7 @@ int do_server(void)
 				/* TODO: process data copied from skb's linear
 				 * buffer.
 				 */
-				fprintf(stdout,
+				fprintf(stderr,
 					"SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
 					dmabuf_cmsg->frag_size);
 
@@ -395,12 +393,13 @@ int do_server(void)
 			token.token_count = 1;
 
 			total_received += dmabuf_cmsg->frag_size;
-			printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
-			       dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
-			       dmabuf_cmsg->frag_offset % getpagesize(),
-			       dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size,
-			       dmabuf_cmsg->frag_token, total_received,
-			       dmabuf_cmsg->dmabuf_id);
+			fprintf(stderr,
+				"received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
+				dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
+				dmabuf_cmsg->frag_offset % getpagesize(),
+				dmabuf_cmsg->frag_offset,
+				dmabuf_cmsg->frag_size, dmabuf_cmsg->frag_token,
+				total_received, dmabuf_cmsg->dmabuf_id);
 
 			if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
 				error(1, 0,
@@ -438,15 +437,15 @@ int do_server(void)
 		if (!is_devmem)
 			error(1, 0, "flow steering error\n");
 
-		printf("total_received=%lu\n", total_received);
+		fprintf(stderr, "total_received=%lu\n", total_received);
 	}
 
-	fprintf(stdout, "%s: ok\n", TEST_PREFIX);
+	fprintf(stderr, "%s: ok\n", TEST_PREFIX);
 
-	fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
 		page_aligned_frags, non_page_aligned_frags);
 
-	fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
 		page_aligned_frags, non_page_aligned_frags);
 
 cleanup:
@@ -551,7 +550,7 @@ int main(int argc, char *argv[])
 			ifname = optarg;
 			break;
 		case '?':
-			printf("unknown option: %c\n", optopt);
+			fprintf(stderr, "unknown option: %c\n", optopt);
 			break;
 		}
 	}
@@ -559,7 +558,7 @@ int main(int argc, char *argv[])
 	ifindex = if_nametoindex(ifname);
 
 	for (; optind < argc; optind++)
-		printf("extra arguments: %s\n", argv[optind]);
+		fprintf(stderr, "extra arguments: %s\n", argv[optind]);
 
 	run_devmem_tests();
 
-- 
cgit v1.2.3


From 8b9049af8066b4705d83bb7847ee3c960fc58d09 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:01 -0800
Subject: selftests: ncdevmem: Separate out dmabuf provider

So we can plug the other ones in the future if needed.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-3-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 203 +++++++++++++++++++--------------
 1 file changed, 119 insertions(+), 84 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 9245d3f158dd..3e7ef2eedd60 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -71,17 +71,101 @@ static char *ifname = "eth1";
 static unsigned int ifindex;
 static unsigned int dmabuf_id;
 
-void print_bytes(void *ptr, size_t size)
+struct memory_buffer {
+	int fd;
+	size_t size;
+
+	int devfd;
+	int memfd;
+	char *buf_mem;
+};
+
+struct memory_provider {
+	struct memory_buffer *(*alloc)(size_t size);
+	void (*free)(struct memory_buffer *ctx);
+	void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
+				   size_t off, int n);
+};
+
+static struct memory_buffer *udmabuf_alloc(size_t size)
 {
-	unsigned char *p = ptr;
-	int i;
+	struct udmabuf_create create;
+	struct memory_buffer *ctx;
+	int ret;
 
-	for (i = 0; i < size; i++)
-		printf("%02hhX ", p[i]);
-	printf("\n");
+	ctx = malloc(sizeof(*ctx));
+	if (!ctx)
+		error(1, ENOMEM, "malloc failed");
+
+	ctx->size = size;
+
+	ctx->devfd = open("/dev/udmabuf", O_RDWR);
+	if (ctx->devfd < 0)
+		error(1, errno,
+		      "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
+		      TEST_PREFIX);
+
+	ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
+	if (ctx->memfd < 0)
+		error(1, errno, "%s: [skip,no-memfd]\n", TEST_PREFIX);
+
+	ret = fcntl(ctx->memfd, F_ADD_SEALS, F_SEAL_SHRINK);
+	if (ret < 0)
+		error(1, errno, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+
+	ret = ftruncate(ctx->memfd, size);
+	if (ret == -1)
+		error(1, errno, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+
+	memset(&create, 0, sizeof(create));
+
+	create.memfd = ctx->memfd;
+	create.offset = 0;
+	create.size = size;
+	ctx->fd = ioctl(ctx->devfd, UDMABUF_CREATE, &create);
+	if (ctx->fd < 0)
+		error(1, errno, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
+
+	ctx->buf_mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			    ctx->fd, 0);
+	if (ctx->buf_mem == MAP_FAILED)
+		error(1, errno, "%s: [FAIL, map udmabuf]\n", TEST_PREFIX);
+
+	return ctx;
+}
+
+static void udmabuf_free(struct memory_buffer *ctx)
+{
+	munmap(ctx->buf_mem, ctx->size);
+	close(ctx->fd);
+	close(ctx->memfd);
+	close(ctx->devfd);
+	free(ctx);
 }
 
-void print_nonzero_bytes(void *ptr, size_t size)
+static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src,
+				       size_t off, int n)
+{
+	struct dma_buf_sync sync = {};
+
+	sync.flags = DMA_BUF_SYNC_START;
+	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
+
+	memcpy(dst, src->buf_mem + off, n);
+
+	sync.flags = DMA_BUF_SYNC_END;
+	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
+}
+
+static struct memory_provider udmabuf_memory_provider = {
+	.alloc = udmabuf_alloc,
+	.free = udmabuf_free,
+	.memcpy_from_device = udmabuf_memcpy_from_device,
+};
+
+static struct memory_provider *provider = &udmabuf_memory_provider;
+
+static void print_nonzero_bytes(void *ptr, size_t size)
 {
 	unsigned char *p = ptr;
 	unsigned int i;
@@ -201,42 +285,7 @@ err_close:
 	return -1;
 }
 
-static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size)
-{
-	struct udmabuf_create create;
-	int ret;
-
-	*devfd = open("/dev/udmabuf", O_RDWR);
-	if (*devfd < 0) {
-		error(70, 0,
-		      "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
-		      TEST_PREFIX);
-	}
-
-	*memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
-	if (*memfd < 0)
-		error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX);
-
-	/* Required for udmabuf */
-	ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK);
-	if (ret < 0)
-		error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
-
-	ret = ftruncate(*memfd, dmabuf_size);
-	if (ret == -1)
-		error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
-
-	memset(&create, 0, sizeof(create));
-
-	create.memfd = *memfd;
-	create.offset = 0;
-	create.size = dmabuf_size;
-	*buf = ioctl(*devfd, UDMABUF_CREATE, &create);
-	if (*buf < 0)
-		error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
-}
-
-int do_server(void)
+int do_server(struct memory_buffer *mem)
 {
 	char ctrl_data[sizeof(int) * 20000];
 	struct netdev_queue_id *queues;
@@ -244,23 +293,18 @@ int do_server(void)
 	struct sockaddr_in client_addr;
 	struct sockaddr_in server_sin;
 	size_t page_aligned_frags = 0;
-	int devfd, memfd, buf, ret;
 	size_t total_received = 0;
 	socklen_t client_addr_len;
 	bool is_devmem = false;
-	char *buf_mem = NULL;
+	char *tmp_mem = NULL;
 	struct ynl_sock *ys;
-	size_t dmabuf_size;
 	char iobuf[819200];
 	char buffer[256];
 	int socket_fd;
 	int client_fd;
 	size_t i = 0;
 	int opt = 1;
-
-	dmabuf_size = getpagesize() * NUM_PAGES;
-
-	create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
+	int ret;
 
 	if (reset_flow_steering())
 		error(1, 0, "Failed to reset flow steering\n");
@@ -284,13 +328,12 @@ int do_server(void)
 		queues[i].id = start_queue + i;
 	}
 
-	if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
 		error(1, 0, "Failed to bind\n");
 
-	buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
-		       buf, 0);
-	if (buf_mem == MAP_FAILED)
-		error(1, 0, "mmap()");
+	tmp_mem = malloc(mem->size);
+	if (!tmp_mem)
+		error(1, ENOMEM, "malloc failed");
 
 	server_sin.sin_family = AF_INET;
 	server_sin.sin_port = htons(atoi(port));
@@ -341,7 +384,6 @@ int do_server(void)
 		struct iovec iov = { .iov_base = iobuf,
 				     .iov_len = sizeof(iobuf) };
 		struct dmabuf_cmsg *dmabuf_cmsg = NULL;
-		struct dma_buf_sync sync = { 0 };
 		struct cmsghdr *cm = NULL;
 		struct msghdr msg = { 0 };
 		struct dmabuf_token token;
@@ -410,22 +452,16 @@ int do_server(void)
 			else
 				page_aligned_frags++;
 
-			sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START;
-			ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
+			provider->memcpy_from_device(tmp_mem, mem,
+						     dmabuf_cmsg->frag_offset,
+						     dmabuf_cmsg->frag_size);
 
 			if (do_validation)
-				validate_buffer(
-					((unsigned char *)buf_mem) +
-						dmabuf_cmsg->frag_offset,
-					dmabuf_cmsg->frag_size);
+				validate_buffer(tmp_mem,
+						dmabuf_cmsg->frag_size);
 			else
-				print_nonzero_bytes(
-					((unsigned char *)buf_mem) +
-						dmabuf_cmsg->frag_offset,
-					dmabuf_cmsg->frag_size);
-
-			sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END;
-			ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
+				print_nonzero_bytes(tmp_mem,
+						    dmabuf_cmsg->frag_size);
 
 			ret = setsockopt(client_fd, SOL_SOCKET,
 					 SO_DEVMEM_DONTNEED, &token,
@@ -450,12 +486,9 @@ int do_server(void)
 
 cleanup:
 
-	munmap(buf_mem, dmabuf_size);
+	free(tmp_mem);
 	close(client_fd);
 	close(socket_fd);
-	close(buf);
-	close(memfd);
-	close(devfd);
 	ynl_sock_destroy(ys);
 
 	return 0;
@@ -464,14 +497,11 @@ cleanup:
 void run_devmem_tests(void)
 {
 	struct netdev_queue_id *queues;
-	int devfd, memfd, buf;
+	struct memory_buffer *mem;
 	struct ynl_sock *ys;
-	size_t dmabuf_size;
 	size_t i = 0;
 
-	dmabuf_size = getpagesize() * NUM_PAGES;
-
-	create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
+	mem = provider->alloc(getpagesize() * NUM_PAGES);
 
 	/* Configure RSS to divert all traffic from our devmem queues */
 	if (configure_rss())
@@ -482,7 +512,7 @@ void run_devmem_tests(void)
 	if (configure_headersplit(1))
 		error(1, 0, "Failed to configure header split\n");
 
-	if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
 		error(1, 0, "Binding empty queues array should have failed\n");
 
 	for (i = 0; i < num_queues; i++) {
@@ -495,7 +525,7 @@ void run_devmem_tests(void)
 	if (configure_headersplit(0))
 		error(1, 0, "Failed to configure header split\n");
 
-	if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
 		error(1, 0, "Configure dmabuf with header split off should have failed\n");
 
 	if (configure_headersplit(1))
@@ -508,7 +538,7 @@ void run_devmem_tests(void)
 		queues[i].id = start_queue + i;
 	}
 
-	if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
 		error(1, 0, "Failed to bind\n");
 
 	/* Deactivating a bound queue should not be legal */
@@ -517,11 +547,15 @@ void run_devmem_tests(void)
 
 	/* Closing the netlink socket does an implicit unbind */
 	ynl_sock_destroy(ys);
+
+	provider->free(mem);
 }
 
 int main(int argc, char *argv[])
 {
+	struct memory_buffer *mem;
 	int is_server = 0, opt;
+	int ret;
 
 	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) {
 		switch (opt) {
@@ -562,8 +596,9 @@ int main(int argc, char *argv[])
 
 	run_devmem_tests();
 
-	if (is_server)
-		return do_server();
+	mem = provider->alloc(getpagesize() * NUM_PAGES);
+	ret = is_server ? do_server(mem) : 1;
+	provider->free(mem);
 
-	return 0;
+	return ret;
 }
-- 
cgit v1.2.3


From bfccbaac1b45f9af7d76589d7e31ad921b50c0d7 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:02 -0800
Subject: selftests: ncdevmem: Unify error handling

There are a bunch of places where error() calls look out of place.
Use the same error(1, errno, ...) pattern everywhere.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-4-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 3e7ef2eedd60..4733d1a0aab5 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -339,33 +339,33 @@ int do_server(struct memory_buffer *mem)
 	server_sin.sin_port = htons(atoi(port));
 
 	ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
-	if (socket < 0)
-		error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX);
+	if (ret < 0)
+		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
 
 	socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
-	if (socket < 0)
-		error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
+	if (socket_fd < 0)
+		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
 
 	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
 			 sizeof(opt));
 	if (ret)
-		error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
+		error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
 
 	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
 			 sizeof(opt));
 	if (ret)
-		error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
+		error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
 
 	fprintf(stderr, "binding to address %s:%d\n", server_ip,
 		ntohs(server_sin.sin_port));
 
 	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
 	if (ret)
-		error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
+		error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
 
 	ret = listen(socket_fd, 1);
 	if (ret)
-		error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
+		error(1, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
 
 	client_addr_len = sizeof(client_addr);
 
-- 
cgit v1.2.3


From 0ebd75f5f2392c2ada04c6e11447415911fe1506 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:03 -0800
Subject: selftests: ncdevmem: Make client_ip optional

Support 3-tuple filtering by making client_ip optional. When -c is
not passed, don't specify src-ip/src-port in the filter.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-5-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 4733d1a0aab5..faa9dce121c7 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -62,7 +62,7 @@
  */
 
 static char *server_ip = "192.168.1.4";
-static char *client_ip = "192.168.1.2";
+static char *client_ip;
 static char *port = "5201";
 static size_t do_validation;
 static int start_queue = 8;
@@ -236,8 +236,14 @@ static int configure_channels(unsigned int rx, unsigned int tx)
 
 static int configure_flow_steering(void)
 {
-	return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d >&2",
-			   ifname, client_ip, server_ip, port, port, start_queue);
+	return run_command("sudo ethtool -N %s flow-type tcp4 %s %s dst-ip %s %s %s dst-port %s queue %d >&2",
+			   ifname,
+			   client_ip ? "src-ip" : "",
+			   client_ip ?: "",
+			   server_ip,
+			   client_ip ? "src-port" : "",
+			   client_ip ? port : "",
+			   port, start_queue);
 }
 
 static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
-- 
cgit v1.2.3


From d3ca35c64d48daf3451851043cffe2bda3913648 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:04 -0800
Subject: selftests: ncdevmem: Remove default arguments

To make it clear what's required and what's not. Also, some of the
values don't seem like good defaults; for example eth1.

Move the invocation comment to the top, add the missing -s to the client
and clean up the client invocation a bit to make it more readable.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-6-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 61 ++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 22 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index faa9dce121c7..0feeca56c049 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -1,4 +1,31 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
+ * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
+ *
+ * Usage:
+ *
+ *     On server:
+ *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201
+ *
+ *     On client:
+ *     echo -n "hello\nworld" | nc -s <server IP> 5201 -p 5201
+ *
+ * Test data validation:
+ *
+ *     On server:
+ *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201 -v 7
+ *
+ *     On client:
+ *     yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
+ *             tr \\n \\0 | \
+ *             head -c 5G | \
+ *             nc <server IP> 5201 -p 5201
+ *
+ *
+ * Note this is compatible with regular netcat. i.e. the sender or receiver can
+ * be replaced with regular netcat to test the RX or TX path in isolation.
+ */
 #define _GNU_SOURCE
 #define __EXPORTED_HEADERS__
 
@@ -42,32 +69,13 @@
 #define MSG_SOCK_DEVMEM 0x2000000
 #endif
 
-/*
- * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
- * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
- *
- * Usage:
- *
- *	On server:
- *	ncdevmem -s <server IP> -c <client IP> -f eth1 -l -p 5201 -v 7
- *
- *	On client:
- *	yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
- *		tr \\n \\0 | \
- *		head -c 5G | \
- *		nc <server IP> 5201 -p 5201
- *
- * Note this is compatible with regular netcat. i.e. the sender or receiver can
- * be replaced with regular netcat to test the RX or TX path in isolation.
- */
-
-static char *server_ip = "192.168.1.4";
+static char *server_ip;
 static char *client_ip;
-static char *port = "5201";
+static char *port;
 static size_t do_validation;
 static int start_queue = 8;
 static int num_queues = 8;
-static char *ifname = "eth1";
+static char *ifname;
 static unsigned int ifindex;
 static unsigned int dmabuf_id;
 
@@ -595,6 +603,15 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (!server_ip)
+		error(1, 0, "Missing -s argument\n");
+
+	if (!port)
+		error(1, 0, "Missing -p argument\n");
+
+	if (!ifname)
+		error(1, 0, "Missing -f argument\n");
+
 	ifindex = if_nametoindex(ifname);
 
 	for (; optind < argc; optind++)
-- 
cgit v1.2.3


From 933056357a8cf0c9b3fb2ecc4d2d8d142614f0a3 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:05 -0800
Subject: selftests: ncdevmem: Switch to AF_INET6

Use a dual-stack socket to support both v4 and v6. A v4-mapped v6 address
can be used for v4.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-7-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 97 ++++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 29 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 0feeca56c049..645ef0bb63ec 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -242,13 +242,26 @@ static int configure_channels(unsigned int rx, unsigned int tx)
 	return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx);
 }
 
-static int configure_flow_steering(void)
+static int configure_flow_steering(struct sockaddr_in6 *server_sin)
 {
-	return run_command("sudo ethtool -N %s flow-type tcp4 %s %s dst-ip %s %s %s dst-port %s queue %d >&2",
+	const char *type = "tcp6";
+	const char *server_addr;
+	char buf[40];
+
+	inet_ntop(AF_INET6, &server_sin->sin6_addr, buf, sizeof(buf));
+	server_addr = buf;
+
+	if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) {
+		type = "tcp4";
+		server_addr = strrchr(server_addr, ':') + 1;
+	}
+
+	return run_command("sudo ethtool -N %s flow-type %s %s %s dst-ip %s %s %s dst-port %s queue %d >&2",
 			   ifname,
+			   type,
 			   client_ip ? "src-ip" : "",
 			   client_ip ?: "",
-			   server_ip,
+			   server_addr,
 			   client_ip ? "src-port" : "",
 			   client_ip ? port : "",
 			   port, start_queue);
@@ -299,13 +312,51 @@ err_close:
 	return -1;
 }
 
+static void enable_reuseaddr(int fd)
+{
+	int opt = 1;
+	int ret;
+
+	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
+	if (ret)
+		error(1, errno, "%s: [FAIL, SO_REUSEPORT]\n", TEST_PREFIX);
+
+	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+	if (ret)
+		error(1, errno, "%s: [FAIL, SO_REUSEADDR]\n", TEST_PREFIX);
+}
+
+static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
+{
+	int ret;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = htons(port);
+
+	ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr);
+	if (ret != 1) {
+		/* fallback to plain IPv4 */
+		ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]);
+		if (ret != 1)
+			return -1;
+
+		/* add ::ffff prefix */
+		sin6->sin6_addr.s6_addr32[0] = 0;
+		sin6->sin6_addr.s6_addr32[1] = 0;
+		sin6->sin6_addr.s6_addr16[4] = 0;
+		sin6->sin6_addr.s6_addr16[5] = 0xffff;
+	}
+
+	return 0;
+}
+
 int do_server(struct memory_buffer *mem)
 {
 	char ctrl_data[sizeof(int) * 20000];
 	struct netdev_queue_id *queues;
 	size_t non_page_aligned_frags = 0;
-	struct sockaddr_in client_addr;
-	struct sockaddr_in server_sin;
+	struct sockaddr_in6 client_addr;
+	struct sockaddr_in6 server_sin;
 	size_t page_aligned_frags = 0;
 	size_t total_received = 0;
 	socklen_t client_addr_len;
@@ -317,9 +368,12 @@ int do_server(struct memory_buffer *mem)
 	int socket_fd;
 	int client_fd;
 	size_t i = 0;
-	int opt = 1;
 	int ret;
 
+	ret = parse_address(server_ip, atoi(port), &server_sin);
+	if (ret < 0)
+		error(1, 0, "parse server address");
+
 	if (reset_flow_steering())
 		error(1, 0, "Failed to reset flow steering\n");
 
@@ -328,7 +382,7 @@ int do_server(struct memory_buffer *mem)
 		error(1, 0, "Failed to configure rss\n");
 
 	/* Flow steer our devmem flows to start_queue */
-	if (configure_flow_steering())
+	if (configure_flow_steering(&server_sin))
 		error(1, 0, "Failed to configure flow steering\n");
 
 	sleep(1);
@@ -349,29 +403,14 @@ int do_server(struct memory_buffer *mem)
 	if (!tmp_mem)
 		error(1, ENOMEM, "malloc failed");
 
-	server_sin.sin_family = AF_INET;
-	server_sin.sin_port = htons(atoi(port));
-
-	ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
-	if (ret < 0)
-		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
-
-	socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
+	socket_fd = socket(AF_INET6, SOCK_STREAM, 0);
 	if (socket_fd < 0)
 		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
 
-	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
-			 sizeof(opt));
-	if (ret)
-		error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
-
-	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
-			 sizeof(opt));
-	if (ret)
-		error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
+	enable_reuseaddr(socket_fd);
 
 	fprintf(stderr, "binding to address %s:%d\n", server_ip,
-		ntohs(server_sin.sin_port));
+		ntohs(server_sin.sin6_port));
 
 	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
 	if (ret)
@@ -383,16 +422,16 @@ int do_server(struct memory_buffer *mem)
 
 	client_addr_len = sizeof(client_addr);
 
-	inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
+	inet_ntop(AF_INET6, &server_sin.sin6_addr, buffer,
 		  sizeof(buffer));
 	fprintf(stderr, "Waiting or connection on %s:%d\n", buffer,
-		ntohs(server_sin.sin_port));
+		ntohs(server_sin.sin6_port));
 	client_fd = accept(socket_fd, &client_addr, &client_addr_len);
 
-	inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
+	inet_ntop(AF_INET6, &client_addr.sin6_addr, buffer,
 		  sizeof(buffer));
 	fprintf(stderr, "Got connection from %s:%d\n", buffer,
-		ntohs(client_addr.sin_port));
+		ntohs(client_addr.sin6_port));
 
 	while (1) {
 		struct iovec iov = { .iov_base = iobuf,
-- 
cgit v1.2.3


From e3c09623a53b8d11ff9e3c0f435ce1e8f52134ba Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:06 -0800
Subject: selftests: ncdevmem: Properly reset flow steering

Toggling ntuple off/on might not be enough to reset flow steering on all
NICs. Add a shell pipeline to explicitly remove the existing rules.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-8-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 645ef0bb63ec..ad6de8e0e97b 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -217,13 +217,18 @@ void validate_buffer(void *line, size_t size)
 
 static int reset_flow_steering(void)
 {
-	int ret = 0;
-
-	ret = run_command("sudo ethtool -K %s ntuple off >&2", ifname);
-	if (ret)
-		return ret;
-
-	return run_command("sudo ethtool -K %s ntuple on >&2", ifname);
+	/* Depending on the NIC, toggling ntuple off and on might not
+	 * be allowed. Additionally, attempting to delete existing filters
+	 * will fail if no filters are present. Therefore, do not enforce
+	 * the exit status.
+	 */
+
+	run_command("sudo ethtool -K %s ntuple off >&2", ifname);
+	run_command("sudo ethtool -K %s ntuple on >&2", ifname);
+	run_command(
+		"sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 ethtool -N %s delete >&2",
+		ifname, ifname);
+	return 0;
 }
 
 static int configure_headersplit(bool on)
-- 
cgit v1.2.3


From 798d822e5d34ffe3f25b66b2573928962a5d3c11 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:07 -0800
Subject: selftests: ncdevmem: Use YNL to enable TCP header split

In the next patch the hard-coded queue numbers are gonna be removed.
So introduce some initial support for ethtool YNL and use
it to enable header split.

Also, tcp-data-split requires a recent ethtool, which is unlikely
to be present in the distros right now.

(ideally, we should not shell out to ethtool at all).

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-9-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile   |  2 +-
 tools/testing/selftests/net/ncdevmem.c | 57 ++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 8c4db5199a42..61cce028f105 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -112,7 +112,7 @@ TEST_INCLUDES := forwarding/lib.sh
 include ../lib.mk
 
 # YNL build
-YNL_GENS := netdev
+YNL_GENS := ethtool netdev
 include ynl.mk
 
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index ad6de8e0e97b..9ca2da3a2f63 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -55,10 +55,12 @@
 #include <linux/netlink.h>
 #include <linux/genetlink.h>
 #include <linux/netdev.h>
+#include <linux/ethtool_netlink.h>
 #include <time.h>
 #include <net/if.h>
 
 #include "netdev-user.h"
+#include "ethtool-user.h"
 #include <ynl.h>
 
 #define PAGE_SHIFT 12
@@ -231,10 +233,58 @@ static int reset_flow_steering(void)
 	return 0;
 }
 
+static const char *tcp_data_split_str(int val)
+{
+	switch (val) {
+	case 0:
+		return "off";
+	case 1:
+		return "auto";
+	case 2:
+		return "on";
+	default:
+		return "?";
+	}
+}
+
 static int configure_headersplit(bool on)
 {
-	return run_command("sudo ethtool -G %s tcp-data-split %s >&2", ifname,
-			   on ? "on" : "off");
+	struct ethtool_rings_get_req *get_req;
+	struct ethtool_rings_get_rsp *get_rsp;
+	struct ethtool_rings_set_req *req;
+	struct ynl_error yerr;
+	struct ynl_sock *ys;
+	int ret;
+
+	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
+	if (!ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = ethtool_rings_set_req_alloc();
+	ethtool_rings_set_req_set_header_dev_index(req, ifindex);
+	/* 0 - off, 1 - auto, 2 - on */
+	ethtool_rings_set_req_set_tcp_data_split(req, on ? 2 : 0);
+	ret = ethtool_rings_set(ys, req);
+	if (ret < 0)
+		fprintf(stderr, "YNL failed: %s\n", ys->err.msg);
+	ethtool_rings_set_req_free(req);
+
+	if (ret == 0) {
+		get_req = ethtool_rings_get_req_alloc();
+		ethtool_rings_get_req_set_header_dev_index(get_req, ifindex);
+		get_rsp = ethtool_rings_get(ys, get_req);
+		ethtool_rings_get_req_free(get_req);
+		if (get_rsp)
+			fprintf(stderr, "TCP header split: %s\n",
+				tcp_data_split_str(get_rsp->tcp_data_split));
+		ethtool_rings_get_rsp_free(get_rsp);
+	}
+
+	ynl_sock_destroy(ys);
+
+	return ret;
 }
 
 static int configure_rss(void)
@@ -382,6 +432,9 @@ int do_server(struct memory_buffer *mem)
 	if (reset_flow_steering())
 		error(1, 0, "Failed to reset flow steering\n");
 
+	if (configure_headersplit(1))
+		error(1, 0, "Failed to enable TCP header split\n");
+
 	/* Configure RSS to divert all traffic from our devmem queues */
 	if (configure_rss())
 		error(1, 0, "Failed to configure rss\n");
-- 
cgit v1.2.3


From d4ef05d211315395974fa846308c693ab2ea1ff2 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:08 -0800
Subject: selftests: ncdevmem: Remove hard-coded queue numbers

Use only the last queue of the device and detect its index dynamically.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-10-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 40 ++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 9ca2da3a2f63..1ea62c129ddc 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -75,8 +75,8 @@ static char *server_ip;
 static char *client_ip;
 static char *port;
 static size_t do_validation;
-static int start_queue = 8;
-static int num_queues = 8;
+static int start_queue = -1;
+static int num_queues = 1;
 static char *ifname;
 static unsigned int ifindex;
 static unsigned int dmabuf_id;
@@ -208,6 +208,33 @@ void validate_buffer(void *line, size_t size)
 	fprintf(stdout, "Validated buffer\n");
 }
 
+static int rxq_num(int ifindex)
+{
+	struct ethtool_channels_get_req *req;
+	struct ethtool_channels_get_rsp *rsp;
+	struct ynl_error yerr;
+	struct ynl_sock *ys;
+	int num = -1;
+
+	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
+	if (!ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = ethtool_channels_get_req_alloc();
+	ethtool_channels_get_req_set_header_dev_index(req, ifindex);
+	rsp = ethtool_channels_get(ys, req);
+	if (rsp)
+		num = rsp->rx_count + rsp->combined_count;
+	ethtool_channels_get_req_free(req);
+	ethtool_channels_get_rsp_free(rsp);
+
+	ynl_sock_destroy(ys);
+
+	return num;
+}
+
 #define run_command(cmd, ...)                                           \
 	({                                                              \
 		char command[256];                                      \
@@ -711,6 +738,15 @@ int main(int argc, char *argv[])
 
 	ifindex = if_nametoindex(ifname);
 
+	if (start_queue < 0) {
+		start_queue = rxq_num(ifindex) - 1;
+
+		if (start_queue < 0)
+			error(1, 0, "couldn't detect number of queues\n");
+
+		fprintf(stderr, "using queues %d..%d\n", start_queue, start_queue + num_queues);
+	}
+
 	for (; optind < argc; optind++)
 		fprintf(stderr, "extra arguments: %s\n", argv[optind]);
 
-- 
cgit v1.2.3


From 77f870a000165f364082e06bfd8fd16d331219d8 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:09 -0800
Subject: selftests: ncdevmem: Run selftest when none of the -s or -c has been
 provided

This will be used as a 'probe' mode in the selftest to check whether
the device supports devmem or not. Use a hard-coded queue layout
(two last queues) and prevent the user from passing custom -q and/or -t.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-11-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ncdevmem.c | 49 +++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
index 1ea62c129ddc..8e502a1f8f9b 100644
--- a/tools/testing/selftests/net/ncdevmem.c
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -76,7 +76,7 @@ static char *client_ip;
 static char *port;
 static size_t do_validation;
 static int start_queue = -1;
-static int num_queues = 1;
+static int num_queues = -1;
 static char *ifname;
 static unsigned int ifindex;
 static unsigned int dmabuf_id;
@@ -727,19 +727,38 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	if (!server_ip)
-		error(1, 0, "Missing -s argument\n");
-
-	if (!port)
-		error(1, 0, "Missing -p argument\n");
-
 	if (!ifname)
 		error(1, 0, "Missing -f argument\n");
 
 	ifindex = if_nametoindex(ifname);
 
-	if (start_queue < 0) {
-		start_queue = rxq_num(ifindex) - 1;
+	if (!server_ip && !client_ip) {
+		if (start_queue < 0 && num_queues < 0) {
+			num_queues = rxq_num(ifindex);
+			if (num_queues < 0)
+				error(1, 0, "couldn't detect number of queues\n");
+			if (num_queues < 2)
+				error(1, 0,
+				      "number of device queues is too low\n");
+			/* make sure can bind to multiple queues */
+			start_queue = num_queues / 2;
+			num_queues /= 2;
+		}
+
+		if (start_queue < 0 || num_queues < 0)
+			error(1, 0, "Both -t and -q are required\n");
+
+		run_devmem_tests();
+		return 0;
+	}
+
+	if (start_queue < 0 && num_queues < 0) {
+		num_queues = rxq_num(ifindex);
+		if (num_queues < 2)
+			error(1, 0, "number of device queues is too low\n");
+
+		num_queues = 1;
+		start_queue = rxq_num(ifindex) - num_queues;
 
 		if (start_queue < 0)
 			error(1, 0, "couldn't detect number of queues\n");
@@ -750,7 +769,17 @@ int main(int argc, char *argv[])
 	for (; optind < argc; optind++)
 		fprintf(stderr, "extra arguments: %s\n", argv[optind]);
 
-	run_devmem_tests();
+	if (start_queue < 0)
+		error(1, 0, "Missing -t argument\n");
+
+	if (num_queues < 0)
+		error(1, 0, "Missing -q argument\n");
+
+	if (!server_ip)
+		error(1, 0, "Missing -s argument\n");
+
+	if (!port)
+		error(1, 0, "Missing -p argument\n");
 
 	mem = provider->alloc(getpagesize() * NUM_PAGES);
 	ret = is_server ? do_server(mem) : 1;
-- 
cgit v1.2.3


From be43a6b2382983c89b59166ba2c32ec0f1092cfe Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:10 -0800
Subject: selftests: ncdevmem: Move ncdevmem under drivers/net/hw

This is where all the tests that depend on HW functionality live,
and this is where the automated test is going to be added in the next
patch.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20241107181211.3934153-12-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/.gitignore |   1 +
 tools/testing/selftests/drivers/net/hw/Makefile   |   8 +
 tools/testing/selftests/drivers/net/hw/ncdevmem.c | 789 ++++++++++++++++++++++
 tools/testing/selftests/net/.gitignore            |   1 -
 tools/testing/selftests/net/Makefile              |   8 -
 tools/testing/selftests/net/ncdevmem.c            | 789 ----------------------
 6 files changed, 798 insertions(+), 798 deletions(-)
 create mode 100644 tools/testing/selftests/drivers/net/hw/.gitignore
 create mode 100644 tools/testing/selftests/drivers/net/hw/ncdevmem.c
 delete mode 100644 tools/testing/selftests/net/ncdevmem.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore
new file mode 100644
index 000000000000..e9fe6ede681a
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/.gitignore
@@ -0,0 +1 @@
+ncdevmem
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index c9f2f48fc30f..182348f4bd40 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -26,4 +26,12 @@ TEST_INCLUDES := \
 	../../../net/forwarding/tc_common.sh \
 	#
 
+# YNL files, must be before "include ..lib.mk"
+YNL_GEN_FILES := ncdevmem
+TEST_GEN_FILES += $(YNL_GEN_FILES)
+
 include ../../../lib.mk
+
+# YNL build
+YNL_GENS := ethtool netdev
+include ../../../net/ynl.mk
diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
new file mode 100644
index 000000000000..8e502a1f8f9b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
+ * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
+ *
+ * Usage:
+ *
+ *     On server:
+ *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201
+ *
+ *     On client:
+ *     echo -n "hello\nworld" | nc -s <server IP> 5201 -p 5201
+ *
+ * Test data validation:
+ *
+ *     On server:
+ *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201 -v 7
+ *
+ *     On client:
+ *     yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
+ *             tr \\n \\0 | \
+ *             head -c 5G | \
+ *             nc <server IP> 5201 -p 5201
+ *
+ *
+ * Note this is compatible with regular netcat. i.e. the sender or receiver can
+ * be replaced with regular netcat to test the RX or TX path in isolation.
+ */
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <linux/uio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#define __iovec_defined
+#include <fcntl.h>
+#include <malloc.h>
+#include <error.h>
+
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include <linux/memfd.h>
+#include <linux/dma-buf.h>
+#include <linux/udmabuf.h>
+#include <libmnl/libmnl.h>
+#include <linux/types.h>
+#include <linux/netlink.h>
+#include <linux/genetlink.h>
+#include <linux/netdev.h>
+#include <linux/ethtool_netlink.h>
+#include <time.h>
+#include <net/if.h>
+
+#include "netdev-user.h"
+#include "ethtool-user.h"
+#include <ynl.h>
+
+#define PAGE_SHIFT 12
+#define TEST_PREFIX "ncdevmem"
+#define NUM_PAGES 16000
+
+#ifndef MSG_SOCK_DEVMEM
+#define MSG_SOCK_DEVMEM 0x2000000
+#endif
+
+static char *server_ip;
+static char *client_ip;
+static char *port;
+static size_t do_validation;
+static int start_queue = -1;
+static int num_queues = -1;
+static char *ifname;
+static unsigned int ifindex;
+static unsigned int dmabuf_id;
+
+struct memory_buffer {
+	int fd;
+	size_t size;
+
+	int devfd;
+	int memfd;
+	char *buf_mem;
+};
+
+struct memory_provider {
+	struct memory_buffer *(*alloc)(size_t size);
+	void (*free)(struct memory_buffer *ctx);
+	void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
+				   size_t off, int n);
+};
+
+static struct memory_buffer *udmabuf_alloc(size_t size)
+{
+	struct udmabuf_create create;
+	struct memory_buffer *ctx;
+	int ret;
+
+	ctx = malloc(sizeof(*ctx));
+	if (!ctx)
+		error(1, ENOMEM, "malloc failed");
+
+	ctx->size = size;
+
+	ctx->devfd = open("/dev/udmabuf", O_RDWR);
+	if (ctx->devfd < 0)
+		error(1, errno,
+		      "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
+		      TEST_PREFIX);
+
+	ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
+	if (ctx->memfd < 0)
+		error(1, errno, "%s: [skip,no-memfd]\n", TEST_PREFIX);
+
+	ret = fcntl(ctx->memfd, F_ADD_SEALS, F_SEAL_SHRINK);
+	if (ret < 0)
+		error(1, errno, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+
+	ret = ftruncate(ctx->memfd, size);
+	if (ret == -1)
+		error(1, errno, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+
+	memset(&create, 0, sizeof(create));
+
+	create.memfd = ctx->memfd;
+	create.offset = 0;
+	create.size = size;
+	ctx->fd = ioctl(ctx->devfd, UDMABUF_CREATE, &create);
+	if (ctx->fd < 0)
+		error(1, errno, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
+
+	ctx->buf_mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			    ctx->fd, 0);
+	if (ctx->buf_mem == MAP_FAILED)
+		error(1, errno, "%s: [FAIL, map udmabuf]\n", TEST_PREFIX);
+
+	return ctx;
+}
+
+static void udmabuf_free(struct memory_buffer *ctx)
+{
+	munmap(ctx->buf_mem, ctx->size);
+	close(ctx->fd);
+	close(ctx->memfd);
+	close(ctx->devfd);
+	free(ctx);
+}
+
+static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src,
+				       size_t off, int n)
+{
+	struct dma_buf_sync sync = {};
+
+	sync.flags = DMA_BUF_SYNC_START;
+	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
+
+	memcpy(dst, src->buf_mem + off, n);
+
+	sync.flags = DMA_BUF_SYNC_END;
+	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
+}
+
+static struct memory_provider udmabuf_memory_provider = {
+	.alloc = udmabuf_alloc,
+	.free = udmabuf_free,
+	.memcpy_from_device = udmabuf_memcpy_from_device,
+};
+
+static struct memory_provider *provider = &udmabuf_memory_provider;
+
+static void print_nonzero_bytes(void *ptr, size_t size)
+{
+	unsigned char *p = ptr;
+	unsigned int i;
+
+	for (i = 0; i < size; i++)
+		putchar(p[i]);
+}
+
+void validate_buffer(void *line, size_t size)
+{
+	static unsigned char seed = 1;
+	unsigned char *ptr = line;
+	int errors = 0;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		if (ptr[i] != seed) {
+			fprintf(stderr,
+				"Failed validation: expected=%u, actual=%u, index=%lu\n",
+				seed, ptr[i], i);
+			errors++;
+			if (errors > 20)
+				error(1, 0, "validation failed.");
+		}
+		seed++;
+		if (seed == do_validation)
+			seed = 0;
+	}
+
+	fprintf(stdout, "Validated buffer\n");
+}
+
+static int rxq_num(int ifindex)
+{
+	struct ethtool_channels_get_req *req;
+	struct ethtool_channels_get_rsp *rsp;
+	struct ynl_error yerr;
+	struct ynl_sock *ys;
+	int num = -1;
+
+	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
+	if (!ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = ethtool_channels_get_req_alloc();
+	ethtool_channels_get_req_set_header_dev_index(req, ifindex);
+	rsp = ethtool_channels_get(ys, req);
+	if (rsp)
+		num = rsp->rx_count + rsp->combined_count;
+	ethtool_channels_get_req_free(req);
+	ethtool_channels_get_rsp_free(rsp);
+
+	ynl_sock_destroy(ys);
+
+	return num;
+}
+
+#define run_command(cmd, ...)                                           \
+	({                                                              \
+		char command[256];                                      \
+		memset(command, 0, sizeof(command));                    \
+		snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \
+		fprintf(stderr, "Running: %s\n", command);                       \
+		system(command);                                        \
+	})
+
+static int reset_flow_steering(void)
+{
+	/* Depending on the NIC, toggling ntuple off and on might not
+	 * be allowed. Additionally, attempting to delete existing filters
+	 * will fail if no filters are present. Therefore, do not enforce
+	 * the exit status.
+	 */
+
+	run_command("sudo ethtool -K %s ntuple off >&2", ifname);
+	run_command("sudo ethtool -K %s ntuple on >&2", ifname);
+	run_command(
+		"sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 ethtool -N %s delete >&2",
+		ifname, ifname);
+	return 0;
+}
+
+static const char *tcp_data_split_str(int val)
+{
+	switch (val) {
+	case 0:
+		return "off";
+	case 1:
+		return "auto";
+	case 2:
+		return "on";
+	default:
+		return "?";
+	}
+}
+
+static int configure_headersplit(bool on)
+{
+	struct ethtool_rings_get_req *get_req;
+	struct ethtool_rings_get_rsp *get_rsp;
+	struct ethtool_rings_set_req *req;
+	struct ynl_error yerr;
+	struct ynl_sock *ys;
+	int ret;
+
+	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
+	if (!ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = ethtool_rings_set_req_alloc();
+	ethtool_rings_set_req_set_header_dev_index(req, ifindex);
+	/* 0 - off, 1 - auto, 2 - on */
+	ethtool_rings_set_req_set_tcp_data_split(req, on ? 2 : 0);
+	ret = ethtool_rings_set(ys, req);
+	if (ret < 0)
+		fprintf(stderr, "YNL failed: %s\n", ys->err.msg);
+	ethtool_rings_set_req_free(req);
+
+	if (ret == 0) {
+		get_req = ethtool_rings_get_req_alloc();
+		ethtool_rings_get_req_set_header_dev_index(get_req, ifindex);
+		get_rsp = ethtool_rings_get(ys, get_req);
+		ethtool_rings_get_req_free(get_req);
+		if (get_rsp)
+			fprintf(stderr, "TCP header split: %s\n",
+				tcp_data_split_str(get_rsp->tcp_data_split));
+		ethtool_rings_get_rsp_free(get_rsp);
+	}
+
+	ynl_sock_destroy(ys);
+
+	return ret;
+}
+
+static int configure_rss(void)
+{
+	return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue);
+}
+
+static int configure_channels(unsigned int rx, unsigned int tx)
+{
+	return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx);
+}
+
+static int configure_flow_steering(struct sockaddr_in6 *server_sin)
+{
+	const char *type = "tcp6";
+	const char *server_addr;
+	char buf[40];
+
+	inet_ntop(AF_INET6, &server_sin->sin6_addr, buf, sizeof(buf));
+	server_addr = buf;
+
+	if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) {
+		type = "tcp4";
+		server_addr = strrchr(server_addr, ':') + 1;
+	}
+
+	return run_command("sudo ethtool -N %s flow-type %s %s %s dst-ip %s %s %s dst-port %s queue %d >&2",
+			   ifname,
+			   type,
+			   client_ip ? "src-ip" : "",
+			   client_ip ?: "",
+			   server_addr,
+			   client_ip ? "src-port" : "",
+			   client_ip ? port : "",
+			   port, start_queue);
+}
+
+static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
+			 struct netdev_queue_id *queues,
+			 unsigned int n_queue_index, struct ynl_sock **ys)
+{
+	struct netdev_bind_rx_req *req = NULL;
+	struct netdev_bind_rx_rsp *rsp = NULL;
+	struct ynl_error yerr;
+
+	*ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!*ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = netdev_bind_rx_req_alloc();
+	netdev_bind_rx_req_set_ifindex(req, ifindex);
+	netdev_bind_rx_req_set_fd(req, dmabuf_fd);
+	__netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
+
+	rsp = netdev_bind_rx(*ys, req);
+	if (!rsp) {
+		perror("netdev_bind_rx");
+		goto err_close;
+	}
+
+	if (!rsp->_present.id) {
+		perror("id not present");
+		goto err_close;
+	}
+
+	fprintf(stderr, "got dmabuf id=%d\n", rsp->id);
+	dmabuf_id = rsp->id;
+
+	netdev_bind_rx_req_free(req);
+	netdev_bind_rx_rsp_free(rsp);
+
+	return 0;
+
+err_close:
+	fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
+	netdev_bind_rx_req_free(req);
+	ynl_sock_destroy(*ys);
+	return -1;
+}
+
+static void enable_reuseaddr(int fd)
+{
+	int opt = 1;
+	int ret;
+
+	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
+	if (ret)
+		error(1, errno, "%s: [FAIL, SO_REUSEPORT]\n", TEST_PREFIX);
+
+	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+	if (ret)
+		error(1, errno, "%s: [FAIL, SO_REUSEADDR]\n", TEST_PREFIX);
+}
+
+static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
+{
+	int ret;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = htons(port);
+
+	ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr);
+	if (ret != 1) {
+		/* fallback to plain IPv4 */
+		ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]);
+		if (ret != 1)
+			return -1;
+
+		/* add ::ffff prefix */
+		sin6->sin6_addr.s6_addr32[0] = 0;
+		sin6->sin6_addr.s6_addr32[1] = 0;
+		sin6->sin6_addr.s6_addr16[4] = 0;
+		sin6->sin6_addr.s6_addr16[5] = 0xffff;
+	}
+
+	return 0;
+}
+
+int do_server(struct memory_buffer *mem)
+{
+	char ctrl_data[sizeof(int) * 20000];
+	struct netdev_queue_id *queues;
+	size_t non_page_aligned_frags = 0;
+	struct sockaddr_in6 client_addr;
+	struct sockaddr_in6 server_sin;
+	size_t page_aligned_frags = 0;
+	size_t total_received = 0;
+	socklen_t client_addr_len;
+	bool is_devmem = false;
+	char *tmp_mem = NULL;
+	struct ynl_sock *ys;
+	char iobuf[819200];
+	char buffer[256];
+	int socket_fd;
+	int client_fd;
+	size_t i = 0;
+	int ret;
+
+	ret = parse_address(server_ip, atoi(port), &server_sin);
+	if (ret < 0)
+		error(1, 0, "parse server address");
+
+	if (reset_flow_steering())
+		error(1, 0, "Failed to reset flow steering\n");
+
+	if (configure_headersplit(1))
+		error(1, 0, "Failed to enable TCP header split\n");
+
+	/* Configure RSS to divert all traffic from our devmem queues */
+	if (configure_rss())
+		error(1, 0, "Failed to configure rss\n");
+
+	/* Flow steer our devmem flows to start_queue */
+	if (configure_flow_steering(&server_sin))
+		error(1, 0, "Failed to configure flow steering\n");
+
+	sleep(1);
+
+	queues = malloc(sizeof(*queues) * num_queues);
+
+	for (i = 0; i < num_queues; i++) {
+		queues[i]._present.type = 1;
+		queues[i]._present.id = 1;
+		queues[i].type = NETDEV_QUEUE_TYPE_RX;
+		queues[i].id = start_queue + i;
+	}
+
+	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+		error(1, 0, "Failed to bind\n");
+
+	tmp_mem = malloc(mem->size);
+	if (!tmp_mem)
+		error(1, ENOMEM, "malloc failed");
+
+	socket_fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (socket_fd < 0)
+		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
+
+	enable_reuseaddr(socket_fd);
+
+	fprintf(stderr, "binding to address %s:%d\n", server_ip,
+		ntohs(server_sin.sin6_port));
+
+	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
+	if (ret)
+		error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
+
+	ret = listen(socket_fd, 1);
+	if (ret)
+		error(1, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
+
+	client_addr_len = sizeof(client_addr);
+
+	inet_ntop(AF_INET6, &server_sin.sin6_addr, buffer,
+		  sizeof(buffer));
+	fprintf(stderr, "Waiting or connection on %s:%d\n", buffer,
+		ntohs(server_sin.sin6_port));
+	client_fd = accept(socket_fd, &client_addr, &client_addr_len);
+
+	inet_ntop(AF_INET6, &client_addr.sin6_addr, buffer,
+		  sizeof(buffer));
+	fprintf(stderr, "Got connection from %s:%d\n", buffer,
+		ntohs(client_addr.sin6_port));
+
+	while (1) {
+		struct iovec iov = { .iov_base = iobuf,
+				     .iov_len = sizeof(iobuf) };
+		struct dmabuf_cmsg *dmabuf_cmsg = NULL;
+		struct cmsghdr *cm = NULL;
+		struct msghdr msg = { 0 };
+		struct dmabuf_token token;
+		ssize_t ret;
+
+		is_devmem = false;
+
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = ctrl_data;
+		msg.msg_controllen = sizeof(ctrl_data);
+		ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
+		fprintf(stderr, "recvmsg ret=%ld\n", ret);
+		if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+			continue;
+		if (ret < 0) {
+			perror("recvmsg");
+			continue;
+		}
+		if (ret == 0) {
+			fprintf(stderr, "client exited\n");
+			goto cleanup;
+		}
+
+		i++;
+		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+			if (cm->cmsg_level != SOL_SOCKET ||
+			    (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
+			     cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
+				fprintf(stderr, "skipping non-devmem cmsg\n");
+				continue;
+			}
+
+			dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
+			is_devmem = true;
+
+			if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
+				/* TODO: process data copied from skb's linear
+				 * buffer.
+				 */
+				fprintf(stderr,
+					"SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
+					dmabuf_cmsg->frag_size);
+
+				continue;
+			}
+
+			token.token_start = dmabuf_cmsg->frag_token;
+			token.token_count = 1;
+
+			total_received += dmabuf_cmsg->frag_size;
+			fprintf(stderr,
+				"received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
+				dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
+				dmabuf_cmsg->frag_offset % getpagesize(),
+				dmabuf_cmsg->frag_offset,
+				dmabuf_cmsg->frag_size, dmabuf_cmsg->frag_token,
+				total_received, dmabuf_cmsg->dmabuf_id);
+
+			if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
+				error(1, 0,
+				      "received on wrong dmabuf_id: flow steering error\n");
+
+			if (dmabuf_cmsg->frag_size % getpagesize())
+				non_page_aligned_frags++;
+			else
+				page_aligned_frags++;
+
+			provider->memcpy_from_device(tmp_mem, mem,
+						     dmabuf_cmsg->frag_offset,
+						     dmabuf_cmsg->frag_size);
+
+			if (do_validation)
+				validate_buffer(tmp_mem,
+						dmabuf_cmsg->frag_size);
+			else
+				print_nonzero_bytes(tmp_mem,
+						    dmabuf_cmsg->frag_size);
+
+			ret = setsockopt(client_fd, SOL_SOCKET,
+					 SO_DEVMEM_DONTNEED, &token,
+					 sizeof(token));
+			if (ret != 1)
+				error(1, 0,
+				      "SO_DEVMEM_DONTNEED not enough tokens");
+		}
+		if (!is_devmem)
+			error(1, 0, "flow steering error\n");
+
+		fprintf(stderr, "total_received=%lu\n", total_received);
+	}
+
+	fprintf(stderr, "%s: ok\n", TEST_PREFIX);
+
+	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+		page_aligned_frags, non_page_aligned_frags);
+
+	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+		page_aligned_frags, non_page_aligned_frags);
+
+cleanup:
+
+	free(tmp_mem);
+	close(client_fd);
+	close(socket_fd);
+	ynl_sock_destroy(ys);
+
+	return 0;
+}
+
+/* Allocate a zeroed array of num_queues queue ids. When @populate is true the
+ * entries are filled in to describe RX queues start_queue ..
+ * start_queue + num_queues - 1; otherwise every entry is left zeroed (type and
+ * id not marked present). Exits the program if the allocation fails.
+ */
+static struct netdev_queue_id *devmem_test_alloc_queues(bool populate)
+{
+	struct netdev_queue_id *queues;
+	int i;
+
+	queues = calloc(num_queues, sizeof(*queues));
+	if (!queues && num_queues > 0)
+		error(1, ENOMEM, "calloc failed");
+
+	for (i = 0; populate && i < num_queues; i++) {
+		queues[i]._present.type = 1;
+		queues[i]._present.id = 1;
+		queues[i].type = NETDEV_QUEUE_TYPE_RX;
+		queues[i].id = start_queue + i;
+	}
+
+	return queues;
+}
+
+/* Self-test of the dmabuf RX binding API (no traffic is received):
+ *  1. binding an array of empty queue ids must fail,
+ *  2. binding with header split turned off must fail,
+ *  3. binding populated queues with header split on must succeed,
+ *  4. while bound, reducing the channel count must fail.
+ * Any unexpected outcome terminates the program through error(1, ...).
+ *
+ * NOTE(review): bind_rx_queue() attaches the array to the YNL request with
+ * __netdev_bind_rx_req_set_queues() and then frees the request; the generated
+ * free helper appears to release the attached array as well (confirm against
+ * the generated netdev-user.c). Every bind therefore gets a freshly allocated
+ * array, and the arrays are neither reused nor freed here.
+ */
+void run_devmem_tests(void)
+{
+	struct netdev_queue_id *queues;
+	struct memory_buffer *mem;
+	struct ynl_sock *ys;
+
+	mem = provider->alloc(getpagesize() * NUM_PAGES);
+
+	/* Configure RSS to divert all traffic from our devmem queues */
+	if (configure_rss())
+		error(1, 0, "rss error\n");
+
+	if (configure_headersplit(1))
+		error(1, 0, "Failed to configure header split\n");
+
+	/* Zeroed entries: nothing is marked present, so the bind must fail */
+	queues = devmem_test_alloc_queues(false);
+	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+		error(1, 0, "Binding empty queues array should have failed\n");
+
+	if (configure_headersplit(0))
+		error(1, 0, "Failed to configure header split\n");
+
+	/* Valid queues, but header split is off: the bind must fail */
+	queues = devmem_test_alloc_queues(true);
+	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+		error(1, 0, "Configure dmabuf with header split off should have failed\n");
+
+	if (configure_headersplit(1))
+		error(1, 0, "Failed to configure header split\n");
+
+	/* Valid queues and header split on: the bind must succeed */
+	queues = devmem_test_alloc_queues(true);
+	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+		error(1, 0, "Failed to bind\n");
+
+	/* Deactivating a bound queue should not be legal */
+	if (!configure_channels(num_queues, num_queues - 1))
+		error(1, 0, "Deactivating a bound queue should be illegal.\n");
+
+	/* Closing the netlink socket does an implicit unbind */
+	ynl_sock_destroy(ys);
+
+	provider->free(mem);
+}
+
+/* Entry point.
+ *
+ * Options: -l run as server, -s <server ip>, -c <client ip>, -p <port>,
+ * -v <n> enable payload validation with period n, -q <number of queues>,
+ * -t <first queue>, -f <interface name> (mandatory).
+ *
+ * With neither -s nor -c, the binding self-tests in run_devmem_tests() are
+ * run and 0 is returned. Otherwise -s and -p are required; the dmabuf is
+ * allocated and, with -l, do_server() is run and its result returned.
+ * Without -l the program returns 1.
+ *
+ * When both -t and -q are omitted the queue range is derived from the number
+ * of RX queues reported by rxq_num().
+ */
+int main(int argc, char *argv[])
+{
+	struct memory_buffer *mem;
+	int is_server = 0, opt;
+	int ret;
+
+	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) {
+		switch (opt) {
+		case 'l':
+			is_server = 1;
+			break;
+		case 's':
+			server_ip = optarg;
+			break;
+		case 'c':
+			client_ip = optarg;
+			break;
+		case 'p':
+			port = optarg;
+			break;
+		case 'v':
+			do_validation = atoll(optarg);
+			break;
+		case 'q':
+			num_queues = atoi(optarg);
+			break;
+		case 't':
+			start_queue = atoi(optarg);
+			break;
+		case 'f':
+			ifname = optarg;
+			break;
+		case '?':
+			fprintf(stderr, "unknown option: %c\n", optopt);
+			break;
+		}
+	}
+
+	if (!ifname)
+		error(1, 0, "Missing -f argument\n");
+
+	/* if_nametoindex() returns 0 and sets errno for an unknown interface */
+	ifindex = if_nametoindex(ifname);
+	if (!ifindex)
+		error(1, errno, "invalid interface name '%s'", ifname);
+
+	if (!server_ip && !client_ip) {
+		if (start_queue < 0 && num_queues < 0) {
+			num_queues = rxq_num(ifindex);
+			if (num_queues < 0)
+				error(1, 0, "couldn't detect number of queues\n");
+			if (num_queues < 2)
+				error(1, 0,
+				      "number of device queues is too low\n");
+			/* make sure can bind to multiple queues */
+			start_queue = num_queues / 2;
+			num_queues /= 2;
+		}
+
+		if (start_queue < 0 || num_queues < 0)
+			error(1, 0, "Both -t and -q are required\n");
+
+		run_devmem_tests();
+		return 0;
+	}
+
+	if (start_queue < 0 && num_queues < 0) {
+		/* Query the device once and tell "unknown" from "too few" */
+		int rxqs = rxq_num(ifindex);
+
+		if (rxqs < 0)
+			error(1, 0, "couldn't detect number of queues\n");
+		if (rxqs < 2)
+			error(1, 0, "number of device queues is too low\n");
+
+		/* default to binding only the last RX queue */
+		num_queues = 1;
+		start_queue = rxqs - num_queues;
+
+		fprintf(stderr, "using queues %d..%d\n", start_queue, start_queue + num_queues);
+	}
+
+	for (; optind < argc; optind++)
+		fprintf(stderr, "extra arguments: %s\n", argv[optind]);
+
+	if (start_queue < 0)
+		error(1, 0, "Missing -t argument\n");
+
+	if (num_queues < 0)
+		error(1, 0, "Missing -q argument\n");
+
+	if (!server_ip)
+		error(1, 0, "Missing -s argument\n");
+
+	if (!port)
+		error(1, 0, "Missing -p argument\n");
+
+	mem = provider->alloc(getpagesize() * NUM_PAGES);
+	ret = is_server ? do_server(mem) : 1;
+	provider->free(mem);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 217d8b7a7365..a78debbd1fe7 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -18,7 +18,6 @@ ipv6_flowlabel_mgr
 log.txt
 msg_oob
 msg_zerocopy
-ncdevmem
 nettest
 psock_fanout
 psock_snd
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 61cce028f105..9322b904ad00 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -98,10 +98,6 @@ TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
 TEST_PROGS += ipv6_route_update_soft_lockup.sh
 
-# YNL files, must be before "include ..lib.mk"
-YNL_GEN_FILES := ncdevmem
-TEST_GEN_FILES += $(YNL_GEN_FILES)
-
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
 
@@ -111,10 +107,6 @@ TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
-# YNL build
-YNL_GENS := ethtool netdev
-include ynl.mk
-
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
deleted file mode 100644
index 8e502a1f8f9b..000000000000
--- a/tools/testing/selftests/net/ncdevmem.c
+++ /dev/null
@@ -1,789 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
- * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
- *
- * Usage:
- *
- *     On server:
- *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201
- *
- *     On client:
- *     echo -n "hello\nworld" | nc -s <server IP> 5201 -p 5201
- *
- * Test data validation:
- *
- *     On server:
- *     ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201 -v 7
- *
- *     On client:
- *     yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
- *             tr \\n \\0 | \
- *             head -c 5G | \
- *             nc <server IP> 5201 -p 5201
- *
- *
- * Note this is compatible with regular netcat. i.e. the sender or receiver can
- * be replaced with regular netcat to test the RX or TX path in isolation.
- */
-#define _GNU_SOURCE
-#define __EXPORTED_HEADERS__
-
-#include <linux/uio.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <string.h>
-#include <errno.h>
-#define __iovec_defined
-#include <fcntl.h>
-#include <malloc.h>
-#include <error.h>
-
-#include <arpa/inet.h>
-#include <sys/socket.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-#include <sys/syscall.h>
-
-#include <linux/memfd.h>
-#include <linux/dma-buf.h>
-#include <linux/udmabuf.h>
-#include <libmnl/libmnl.h>
-#include <linux/types.h>
-#include <linux/netlink.h>
-#include <linux/genetlink.h>
-#include <linux/netdev.h>
-#include <linux/ethtool_netlink.h>
-#include <time.h>
-#include <net/if.h>
-
-#include "netdev-user.h"
-#include "ethtool-user.h"
-#include <ynl.h>
-
-#define PAGE_SHIFT 12
-#define TEST_PREFIX "ncdevmem"
-#define NUM_PAGES 16000
-
-#ifndef MSG_SOCK_DEVMEM
-#define MSG_SOCK_DEVMEM 0x2000000
-#endif
-
-static char *server_ip;
-static char *client_ip;
-static char *port;
-static size_t do_validation;
-static int start_queue = -1;
-static int num_queues = -1;
-static char *ifname;
-static unsigned int ifindex;
-static unsigned int dmabuf_id;
-
-struct memory_buffer {
-	int fd;
-	size_t size;
-
-	int devfd;
-	int memfd;
-	char *buf_mem;
-};
-
-struct memory_provider {
-	struct memory_buffer *(*alloc)(size_t size);
-	void (*free)(struct memory_buffer *ctx);
-	void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
-				   size_t off, int n);
-};
-
-static struct memory_buffer *udmabuf_alloc(size_t size)
-{
-	struct udmabuf_create create;
-	struct memory_buffer *ctx;
-	int ret;
-
-	ctx = malloc(sizeof(*ctx));
-	if (!ctx)
-		error(1, ENOMEM, "malloc failed");
-
-	ctx->size = size;
-
-	ctx->devfd = open("/dev/udmabuf", O_RDWR);
-	if (ctx->devfd < 0)
-		error(1, errno,
-		      "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
-		      TEST_PREFIX);
-
-	ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
-	if (ctx->memfd < 0)
-		error(1, errno, "%s: [skip,no-memfd]\n", TEST_PREFIX);
-
-	ret = fcntl(ctx->memfd, F_ADD_SEALS, F_SEAL_SHRINK);
-	if (ret < 0)
-		error(1, errno, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
-
-	ret = ftruncate(ctx->memfd, size);
-	if (ret == -1)
-		error(1, errno, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
-
-	memset(&create, 0, sizeof(create));
-
-	create.memfd = ctx->memfd;
-	create.offset = 0;
-	create.size = size;
-	ctx->fd = ioctl(ctx->devfd, UDMABUF_CREATE, &create);
-	if (ctx->fd < 0)
-		error(1, errno, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
-
-	ctx->buf_mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
-			    ctx->fd, 0);
-	if (ctx->buf_mem == MAP_FAILED)
-		error(1, errno, "%s: [FAIL, map udmabuf]\n", TEST_PREFIX);
-
-	return ctx;
-}
-
-static void udmabuf_free(struct memory_buffer *ctx)
-{
-	munmap(ctx->buf_mem, ctx->size);
-	close(ctx->fd);
-	close(ctx->memfd);
-	close(ctx->devfd);
-	free(ctx);
-}
-
-static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src,
-				       size_t off, int n)
-{
-	struct dma_buf_sync sync = {};
-
-	sync.flags = DMA_BUF_SYNC_START;
-	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
-
-	memcpy(dst, src->buf_mem + off, n);
-
-	sync.flags = DMA_BUF_SYNC_END;
-	ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync);
-}
-
-static struct memory_provider udmabuf_memory_provider = {
-	.alloc = udmabuf_alloc,
-	.free = udmabuf_free,
-	.memcpy_from_device = udmabuf_memcpy_from_device,
-};
-
-static struct memory_provider *provider = &udmabuf_memory_provider;
-
-static void print_nonzero_bytes(void *ptr, size_t size)
-{
-	unsigned char *p = ptr;
-	unsigned int i;
-
-	for (i = 0; i < size; i++)
-		putchar(p[i]);
-}
-
-void validate_buffer(void *line, size_t size)
-{
-	static unsigned char seed = 1;
-	unsigned char *ptr = line;
-	int errors = 0;
-	size_t i;
-
-	for (i = 0; i < size; i++) {
-		if (ptr[i] != seed) {
-			fprintf(stderr,
-				"Failed validation: expected=%u, actual=%u, index=%lu\n",
-				seed, ptr[i], i);
-			errors++;
-			if (errors > 20)
-				error(1, 0, "validation failed.");
-		}
-		seed++;
-		if (seed == do_validation)
-			seed = 0;
-	}
-
-	fprintf(stdout, "Validated buffer\n");
-}
-
-static int rxq_num(int ifindex)
-{
-	struct ethtool_channels_get_req *req;
-	struct ethtool_channels_get_rsp *rsp;
-	struct ynl_error yerr;
-	struct ynl_sock *ys;
-	int num = -1;
-
-	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
-	if (!ys) {
-		fprintf(stderr, "YNL: %s\n", yerr.msg);
-		return -1;
-	}
-
-	req = ethtool_channels_get_req_alloc();
-	ethtool_channels_get_req_set_header_dev_index(req, ifindex);
-	rsp = ethtool_channels_get(ys, req);
-	if (rsp)
-		num = rsp->rx_count + rsp->combined_count;
-	ethtool_channels_get_req_free(req);
-	ethtool_channels_get_rsp_free(rsp);
-
-	ynl_sock_destroy(ys);
-
-	return num;
-}
-
-#define run_command(cmd, ...)                                           \
-	({                                                              \
-		char command[256];                                      \
-		memset(command, 0, sizeof(command));                    \
-		snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \
-		fprintf(stderr, "Running: %s\n", command);                       \
-		system(command);                                        \
-	})
-
-static int reset_flow_steering(void)
-{
-	/* Depending on the NIC, toggling ntuple off and on might not
-	 * be allowed. Additionally, attempting to delete existing filters
-	 * will fail if no filters are present. Therefore, do not enforce
-	 * the exit status.
-	 */
-
-	run_command("sudo ethtool -K %s ntuple off >&2", ifname);
-	run_command("sudo ethtool -K %s ntuple on >&2", ifname);
-	run_command(
-		"sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 ethtool -N %s delete >&2",
-		ifname, ifname);
-	return 0;
-}
-
-static const char *tcp_data_split_str(int val)
-{
-	switch (val) {
-	case 0:
-		return "off";
-	case 1:
-		return "auto";
-	case 2:
-		return "on";
-	default:
-		return "?";
-	}
-}
-
-static int configure_headersplit(bool on)
-{
-	struct ethtool_rings_get_req *get_req;
-	struct ethtool_rings_get_rsp *get_rsp;
-	struct ethtool_rings_set_req *req;
-	struct ynl_error yerr;
-	struct ynl_sock *ys;
-	int ret;
-
-	ys = ynl_sock_create(&ynl_ethtool_family, &yerr);
-	if (!ys) {
-		fprintf(stderr, "YNL: %s\n", yerr.msg);
-		return -1;
-	}
-
-	req = ethtool_rings_set_req_alloc();
-	ethtool_rings_set_req_set_header_dev_index(req, ifindex);
-	/* 0 - off, 1 - auto, 2 - on */
-	ethtool_rings_set_req_set_tcp_data_split(req, on ? 2 : 0);
-	ret = ethtool_rings_set(ys, req);
-	if (ret < 0)
-		fprintf(stderr, "YNL failed: %s\n", ys->err.msg);
-	ethtool_rings_set_req_free(req);
-
-	if (ret == 0) {
-		get_req = ethtool_rings_get_req_alloc();
-		ethtool_rings_get_req_set_header_dev_index(get_req, ifindex);
-		get_rsp = ethtool_rings_get(ys, get_req);
-		ethtool_rings_get_req_free(get_req);
-		if (get_rsp)
-			fprintf(stderr, "TCP header split: %s\n",
-				tcp_data_split_str(get_rsp->tcp_data_split));
-		ethtool_rings_get_rsp_free(get_rsp);
-	}
-
-	ynl_sock_destroy(ys);
-
-	return ret;
-}
-
-static int configure_rss(void)
-{
-	return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue);
-}
-
-static int configure_channels(unsigned int rx, unsigned int tx)
-{
-	return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx);
-}
-
-static int configure_flow_steering(struct sockaddr_in6 *server_sin)
-{
-	const char *type = "tcp6";
-	const char *server_addr;
-	char buf[40];
-
-	inet_ntop(AF_INET6, &server_sin->sin6_addr, buf, sizeof(buf));
-	server_addr = buf;
-
-	if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) {
-		type = "tcp4";
-		server_addr = strrchr(server_addr, ':') + 1;
-	}
-
-	return run_command("sudo ethtool -N %s flow-type %s %s %s dst-ip %s %s %s dst-port %s queue %d >&2",
-			   ifname,
-			   type,
-			   client_ip ? "src-ip" : "",
-			   client_ip ?: "",
-			   server_addr,
-			   client_ip ? "src-port" : "",
-			   client_ip ? port : "",
-			   port, start_queue);
-}
-
-static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
-			 struct netdev_queue_id *queues,
-			 unsigned int n_queue_index, struct ynl_sock **ys)
-{
-	struct netdev_bind_rx_req *req = NULL;
-	struct netdev_bind_rx_rsp *rsp = NULL;
-	struct ynl_error yerr;
-
-	*ys = ynl_sock_create(&ynl_netdev_family, &yerr);
-	if (!*ys) {
-		fprintf(stderr, "YNL: %s\n", yerr.msg);
-		return -1;
-	}
-
-	req = netdev_bind_rx_req_alloc();
-	netdev_bind_rx_req_set_ifindex(req, ifindex);
-	netdev_bind_rx_req_set_fd(req, dmabuf_fd);
-	__netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
-
-	rsp = netdev_bind_rx(*ys, req);
-	if (!rsp) {
-		perror("netdev_bind_rx");
-		goto err_close;
-	}
-
-	if (!rsp->_present.id) {
-		perror("id not present");
-		goto err_close;
-	}
-
-	fprintf(stderr, "got dmabuf id=%d\n", rsp->id);
-	dmabuf_id = rsp->id;
-
-	netdev_bind_rx_req_free(req);
-	netdev_bind_rx_rsp_free(rsp);
-
-	return 0;
-
-err_close:
-	fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
-	netdev_bind_rx_req_free(req);
-	ynl_sock_destroy(*ys);
-	return -1;
-}
-
-static void enable_reuseaddr(int fd)
-{
-	int opt = 1;
-	int ret;
-
-	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
-	if (ret)
-		error(1, errno, "%s: [FAIL, SO_REUSEPORT]\n", TEST_PREFIX);
-
-	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
-	if (ret)
-		error(1, errno, "%s: [FAIL, SO_REUSEADDR]\n", TEST_PREFIX);
-}
-
-static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
-{
-	int ret;
-
-	sin6->sin6_family = AF_INET6;
-	sin6->sin6_port = htons(port);
-
-	ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr);
-	if (ret != 1) {
-		/* fallback to plain IPv4 */
-		ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]);
-		if (ret != 1)
-			return -1;
-
-		/* add ::ffff prefix */
-		sin6->sin6_addr.s6_addr32[0] = 0;
-		sin6->sin6_addr.s6_addr32[1] = 0;
-		sin6->sin6_addr.s6_addr16[4] = 0;
-		sin6->sin6_addr.s6_addr16[5] = 0xffff;
-	}
-
-	return 0;
-}
-
-int do_server(struct memory_buffer *mem)
-{
-	char ctrl_data[sizeof(int) * 20000];
-	struct netdev_queue_id *queues;
-	size_t non_page_aligned_frags = 0;
-	struct sockaddr_in6 client_addr;
-	struct sockaddr_in6 server_sin;
-	size_t page_aligned_frags = 0;
-	size_t total_received = 0;
-	socklen_t client_addr_len;
-	bool is_devmem = false;
-	char *tmp_mem = NULL;
-	struct ynl_sock *ys;
-	char iobuf[819200];
-	char buffer[256];
-	int socket_fd;
-	int client_fd;
-	size_t i = 0;
-	int ret;
-
-	ret = parse_address(server_ip, atoi(port), &server_sin);
-	if (ret < 0)
-		error(1, 0, "parse server address");
-
-	if (reset_flow_steering())
-		error(1, 0, "Failed to reset flow steering\n");
-
-	if (configure_headersplit(1))
-		error(1, 0, "Failed to enable TCP header split\n");
-
-	/* Configure RSS to divert all traffic from our devmem queues */
-	if (configure_rss())
-		error(1, 0, "Failed to configure rss\n");
-
-	/* Flow steer our devmem flows to start_queue */
-	if (configure_flow_steering(&server_sin))
-		error(1, 0, "Failed to configure flow steering\n");
-
-	sleep(1);
-
-	queues = malloc(sizeof(*queues) * num_queues);
-
-	for (i = 0; i < num_queues; i++) {
-		queues[i]._present.type = 1;
-		queues[i]._present.id = 1;
-		queues[i].type = NETDEV_QUEUE_TYPE_RX;
-		queues[i].id = start_queue + i;
-	}
-
-	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
-		error(1, 0, "Failed to bind\n");
-
-	tmp_mem = malloc(mem->size);
-	if (!tmp_mem)
-		error(1, ENOMEM, "malloc failed");
-
-	socket_fd = socket(AF_INET6, SOCK_STREAM, 0);
-	if (socket_fd < 0)
-		error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
-
-	enable_reuseaddr(socket_fd);
-
-	fprintf(stderr, "binding to address %s:%d\n", server_ip,
-		ntohs(server_sin.sin6_port));
-
-	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
-	if (ret)
-		error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
-
-	ret = listen(socket_fd, 1);
-	if (ret)
-		error(1, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
-
-	client_addr_len = sizeof(client_addr);
-
-	inet_ntop(AF_INET6, &server_sin.sin6_addr, buffer,
-		  sizeof(buffer));
-	fprintf(stderr, "Waiting or connection on %s:%d\n", buffer,
-		ntohs(server_sin.sin6_port));
-	client_fd = accept(socket_fd, &client_addr, &client_addr_len);
-
-	inet_ntop(AF_INET6, &client_addr.sin6_addr, buffer,
-		  sizeof(buffer));
-	fprintf(stderr, "Got connection from %s:%d\n", buffer,
-		ntohs(client_addr.sin6_port));
-
-	while (1) {
-		struct iovec iov = { .iov_base = iobuf,
-				     .iov_len = sizeof(iobuf) };
-		struct dmabuf_cmsg *dmabuf_cmsg = NULL;
-		struct cmsghdr *cm = NULL;
-		struct msghdr msg = { 0 };
-		struct dmabuf_token token;
-		ssize_t ret;
-
-		is_devmem = false;
-
-		msg.msg_iov = &iov;
-		msg.msg_iovlen = 1;
-		msg.msg_control = ctrl_data;
-		msg.msg_controllen = sizeof(ctrl_data);
-		ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
-		fprintf(stderr, "recvmsg ret=%ld\n", ret);
-		if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
-			continue;
-		if (ret < 0) {
-			perror("recvmsg");
-			continue;
-		}
-		if (ret == 0) {
-			fprintf(stderr, "client exited\n");
-			goto cleanup;
-		}
-
-		i++;
-		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
-			if (cm->cmsg_level != SOL_SOCKET ||
-			    (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
-			     cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
-				fprintf(stderr, "skipping non-devmem cmsg\n");
-				continue;
-			}
-
-			dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
-			is_devmem = true;
-
-			if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
-				/* TODO: process data copied from skb's linear
-				 * buffer.
-				 */
-				fprintf(stderr,
-					"SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
-					dmabuf_cmsg->frag_size);
-
-				continue;
-			}
-
-			token.token_start = dmabuf_cmsg->frag_token;
-			token.token_count = 1;
-
-			total_received += dmabuf_cmsg->frag_size;
-			fprintf(stderr,
-				"received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
-				dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
-				dmabuf_cmsg->frag_offset % getpagesize(),
-				dmabuf_cmsg->frag_offset,
-				dmabuf_cmsg->frag_size, dmabuf_cmsg->frag_token,
-				total_received, dmabuf_cmsg->dmabuf_id);
-
-			if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
-				error(1, 0,
-				      "received on wrong dmabuf_id: flow steering error\n");
-
-			if (dmabuf_cmsg->frag_size % getpagesize())
-				non_page_aligned_frags++;
-			else
-				page_aligned_frags++;
-
-			provider->memcpy_from_device(tmp_mem, mem,
-						     dmabuf_cmsg->frag_offset,
-						     dmabuf_cmsg->frag_size);
-
-			if (do_validation)
-				validate_buffer(tmp_mem,
-						dmabuf_cmsg->frag_size);
-			else
-				print_nonzero_bytes(tmp_mem,
-						    dmabuf_cmsg->frag_size);
-
-			ret = setsockopt(client_fd, SOL_SOCKET,
-					 SO_DEVMEM_DONTNEED, &token,
-					 sizeof(token));
-			if (ret != 1)
-				error(1, 0,
-				      "SO_DEVMEM_DONTNEED not enough tokens");
-		}
-		if (!is_devmem)
-			error(1, 0, "flow steering error\n");
-
-		fprintf(stderr, "total_received=%lu\n", total_received);
-	}
-
-	fprintf(stderr, "%s: ok\n", TEST_PREFIX);
-
-	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
-		page_aligned_frags, non_page_aligned_frags);
-
-	fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
-		page_aligned_frags, non_page_aligned_frags);
-
-cleanup:
-
-	free(tmp_mem);
-	close(client_fd);
-	close(socket_fd);
-	ynl_sock_destroy(ys);
-
-	return 0;
-}
-
-void run_devmem_tests(void)
-{
-	struct netdev_queue_id *queues;
-	struct memory_buffer *mem;
-	struct ynl_sock *ys;
-	size_t i = 0;
-
-	mem = provider->alloc(getpagesize() * NUM_PAGES);
-
-	/* Configure RSS to divert all traffic from our devmem queues */
-	if (configure_rss())
-		error(1, 0, "rss error\n");
-
-	queues = calloc(num_queues, sizeof(*queues));
-
-	if (configure_headersplit(1))
-		error(1, 0, "Failed to configure header split\n");
-
-	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
-		error(1, 0, "Binding empty queues array should have failed\n");
-
-	for (i = 0; i < num_queues; i++) {
-		queues[i]._present.type = 1;
-		queues[i]._present.id = 1;
-		queues[i].type = NETDEV_QUEUE_TYPE_RX;
-		queues[i].id = start_queue + i;
-	}
-
-	if (configure_headersplit(0))
-		error(1, 0, "Failed to configure header split\n");
-
-	if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
-		error(1, 0, "Configure dmabuf with header split off should have failed\n");
-
-	if (configure_headersplit(1))
-		error(1, 0, "Failed to configure header split\n");
-
-	for (i = 0; i < num_queues; i++) {
-		queues[i]._present.type = 1;
-		queues[i]._present.id = 1;
-		queues[i].type = NETDEV_QUEUE_TYPE_RX;
-		queues[i].id = start_queue + i;
-	}
-
-	if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
-		error(1, 0, "Failed to bind\n");
-
-	/* Deactivating a bound queue should not be legal */
-	if (!configure_channels(num_queues, num_queues - 1))
-		error(1, 0, "Deactivating a bound queue should be illegal.\n");
-
-	/* Closing the netlink socket does an implicit unbind */
-	ynl_sock_destroy(ys);
-
-	provider->free(mem);
-}
-
-int main(int argc, char *argv[])
-{
-	struct memory_buffer *mem;
-	int is_server = 0, opt;
-	int ret;
-
-	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) {
-		switch (opt) {
-		case 'l':
-			is_server = 1;
-			break;
-		case 's':
-			server_ip = optarg;
-			break;
-		case 'c':
-			client_ip = optarg;
-			break;
-		case 'p':
-			port = optarg;
-			break;
-		case 'v':
-			do_validation = atoll(optarg);
-			break;
-		case 'q':
-			num_queues = atoi(optarg);
-			break;
-		case 't':
-			start_queue = atoi(optarg);
-			break;
-		case 'f':
-			ifname = optarg;
-			break;
-		case '?':
-			fprintf(stderr, "unknown option: %c\n", optopt);
-			break;
-		}
-	}
-
-	if (!ifname)
-		error(1, 0, "Missing -f argument\n");
-
-	ifindex = if_nametoindex(ifname);
-
-	if (!server_ip && !client_ip) {
-		if (start_queue < 0 && num_queues < 0) {
-			num_queues = rxq_num(ifindex);
-			if (num_queues < 0)
-				error(1, 0, "couldn't detect number of queues\n");
-			if (num_queues < 2)
-				error(1, 0,
-				      "number of device queues is too low\n");
-			/* make sure can bind to multiple queues */
-			start_queue = num_queues / 2;
-			num_queues /= 2;
-		}
-
-		if (start_queue < 0 || num_queues < 0)
-			error(1, 0, "Both -t and -q are required\n");
-
-		run_devmem_tests();
-		return 0;
-	}
-
-	if (start_queue < 0 && num_queues < 0) {
-		num_queues = rxq_num(ifindex);
-		if (num_queues < 2)
-			error(1, 0, "number of device queues is too low\n");
-
-		num_queues = 1;
-		start_queue = rxq_num(ifindex) - num_queues;
-
-		if (start_queue < 0)
-			error(1, 0, "couldn't detect number of queues\n");
-
-		fprintf(stderr, "using queues %d..%d\n", start_queue, start_queue + num_queues);
-	}
-
-	for (; optind < argc; optind++)
-		fprintf(stderr, "extra arguments: %s\n", argv[optind]);
-
-	if (start_queue < 0)
-		error(1, 0, "Missing -t argument\n");
-
-	if (num_queues < 0)
-		error(1, 0, "Missing -q argument\n");
-
-	if (!server_ip)
-		error(1, 0, "Missing -s argument\n");
-
-	if (!port)
-		error(1, 0, "Missing -p argument\n");
-
-	mem = provider->alloc(getpagesize() * NUM_PAGES);
-	ret = is_server ? do_server(mem) : 1;
-	provider->free(mem);
-
-	return ret;
-}
-- 
cgit v1.2.3


From 80230864b7b0fd9b54b294ab08a28f01d4193aa2 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 7 Nov 2024 10:12:11 -0800
Subject: selftests: ncdevmem: Add automated test

Only RX side for now and small message to test the setup.
In the future, we can extend it to TX side and to testing
both sides with a couple of megs of data.

  make \
  	-C tools/testing/selftests \
  	TARGETS="drivers/hw/net" \
  	install INSTALL_PATH=~/tmp/ksft

  scp ~/tmp/ksft ${HOST}:
  scp ~/tmp/ksft ${PEER}:

  cfg+="NETIF=${DEV}\n"
  cfg+="LOCAL_V6=${HOST_IP}\n"
  cfg+="REMOTE_V6=${PEER_IP}\n"
  cfg+="REMOTE_TYPE=ssh\n"
  cfg+="REMOTE_ARGS=root@${PEER}\n"

  echo -e "$cfg" | ssh root@${HOST} "cat > ksft/drivers/net/net.config"
  ssh root@${HOST} "cd ksft && ./run_kselftest.sh -t drivers/net:devmem.py"

Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Link: https://patch.msgid.link/20241107181211.3934153-13-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/Makefile  |  1 +
 tools/testing/selftests/drivers/net/hw/devmem.py | 45 ++++++++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/hw/devmem.py

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 182348f4bd40..1c6a77480923 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -3,6 +3,7 @@
 TEST_PROGS = \
 	csum.py \
 	devlink_port_split.py \
+	devmem.py \
 	ethtool.sh \
 	ethtool_extended_state.sh \
 	ethtool_mm.sh \
diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py
new file mode 100755
index 000000000000..1223f0f5c10c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/devmem.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit
+from lib.py import ksft_eq, KsftSkipEx
+from lib.py import NetDrvEpEnv
+from lib.py import bkg, cmd, rand_port, wait_port_listen
+from lib.py import ksft_disruptive
+
+
+def require_devmem(cfg):
+    if not hasattr(cfg, "_devmem_probed"):
+        port = rand_port()
+        probe_command = f"./ncdevmem -f {cfg.ifname}"
+        cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0
+        cfg._devmem_probed = True
+
+    if not cfg._devmem_supported:
+        raise KsftSkipEx("Test requires devmem support")
+
+
+@ksft_disruptive
+def check_rx(cfg) -> None:
+    cfg.require_v6()
+    require_devmem(cfg)
+
+    port = rand_port()
+    listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.v6} -p {port}"
+
+    with bkg(listen_cmd) as socat:
+        wait_port_listen(port)
+        cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.v6}]:{port}", host=cfg.remote, shell=True)
+
+    ksft_eq(socat.stdout.strip(), "hello\nworld")
+
+
+def main() -> None:
+    with NetDrvEpEnv(__file__) as cfg:
+        ksft_run([check_rx],
+                 args=(cfg, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 347fcdc414f98998df1c5969e4612e4da67d6852 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Sat, 9 Nov 2024 05:02:35 +0000
Subject: selftests: net: Add busy_poll_test

Add an epoll busy poll test using netdevsim.

This test is comprised of:
  - busy_poller (via busy_poller.c)
  - busy_poll_test.sh which loads netdevsim, sets up network namespaces,
    and runs busy_poller to receive data and socat to send data.

The selftest tests two different scenarios:
  - busy poll (the pre-existing version in the kernel)
  - busy poll with suspend enabled (what this series adds)

The transmitted data is a 1 MiB temporary file generated from /dev/urandom;
the test passes if the md5sum of the input file given to socat matches the
md5sum of the output file from busy_poller.

netdevsim was chosen instead of veth due to netdevsim's support for
netdev-genl.

For now, this test uses the functionality that netdevsim provides. In the
future, perhaps netdevsim can be extended to emulate device IRQs to more
thoroughly test all pre-existing kernel options (like defer_hard_irqs)
and suspend.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241109050245.191288-6-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore        |   1 +
 tools/testing/selftests/net/Makefile          |   9 +
 tools/testing/selftests/net/busy_poll_test.sh | 165 ++++++++++++
 tools/testing/selftests/net/busy_poller.c     | 346 ++++++++++++++++++++++++++
 4 files changed, 521 insertions(+)
 create mode 100755 tools/testing/selftests/net/busy_poll_test.sh
 create mode 100644 tools/testing/selftests/net/busy_poller.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index a78debbd1fe7..48973e78d46b 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -2,6 +2,7 @@
 bind_bhash
 bind_timewait
 bind_wildcard
+busy_poller
 cmsg_sender
 diag_uid
 epoll_busy_poll
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 9322b904ad00..2b2a5ec7fa6a 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -97,6 +97,11 @@ TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
 TEST_PROGS += ipv6_route_update_soft_lockup.sh
+TEST_PROGS += busy_poll_test.sh
+
+# YNL files, must be before "include ..lib.mk"
+YNL_GEN_FILES := busy_poller
+TEST_GEN_FILES += $(YNL_GEN_FILES)
 
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
@@ -107,6 +112,10 @@ TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
+# YNL build
+YNL_GENS := netdev
+include ynl.mk
+
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh
new file mode 100755
index 000000000000..7db292ec4884
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poll_test.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+source net_helper.sh
+
+NSIM_SV_ID=$((256 + RANDOM % 256))
+NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID
+NSIM_CL_ID=$((512 + RANDOM % 256))
+NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
+NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device
+
+SERVER_IP=192.168.1.1
+CLIENT_IP=192.168.1.2
+SERVER_PORT=48675
+
+# busy poll config
+MAX_EVENTS=8
+BUSY_POLL_USECS=0
+BUSY_POLL_BUDGET=16
+PREFER_BUSY_POLL=1
+
+# IRQ deferral config
+NAPI_DEFER_HARD_IRQS=100
+GRO_FLUSH_TIMEOUT=50000
+SUSPEND_TIMEOUT=20000000
+
+setup_ns()
+{
+	set -e
+	ip netns add nssv
+	ip netns add nscl
+
+	NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_SV_SYS/net -exec basename {} \;)
+	NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_CL_SYS/net -exec basename {} \;)
+
+	# ensure the server has 1 queue
+	ethtool -L $NSIM_SV_NAME combined 1 2>/dev/null
+
+	ip link set $NSIM_SV_NAME netns nssv
+	ip link set $NSIM_CL_NAME netns nscl
+
+	ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME
+	ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME
+
+	ip netns exec nssv ip link set dev $NSIM_SV_NAME up
+	ip netns exec nscl ip link set dev $NSIM_CL_NAME up
+
+	set +e
+}
+
+cleanup_ns()
+{
+	ip netns del nscl
+	ip netns del nssv
+}
+
+test_busypoll()
+{
+	suspend_value=${1:-0}
+	tmp_file=$(mktemp)
+	out_file=$(mktemp)
+
+	# fill a test file with random data
+	dd if=/dev/urandom of=${tmp_file} bs=1M count=1 2> /dev/null
+
+	timeout -k 1s 30s ip netns exec nssv ./busy_poller         \
+					     -p${SERVER_PORT}      \
+					     -b${SERVER_IP}        \
+					     -m${MAX_EVENTS}       \
+					     -u${BUSY_POLL_USECS}  \
+					     -P${PREFER_BUSY_POLL} \
+					     -g${BUSY_POLL_BUDGET} \
+					     -i${NSIM_SV_IFIDX}    \
+					     -s${suspend_value}    \
+					     -o${out_file}&
+
+	wait_local_port_listen nssv ${SERVER_PORT} tcp
+
+	ip netns exec nscl socat -u $tmp_file TCP:${SERVER_IP}:${SERVER_PORT}
+
+	wait
+
+	tmp_file_md5sum=$(md5sum $tmp_file | cut -f1 -d' ')
+	out_file_md5sum=$(md5sum $out_file | cut -f1 -d' ')
+
+	if [ "$tmp_file_md5sum" = "$out_file_md5sum" ]; then
+		res=0
+	else
+		echo "md5sum mismatch"
+		echo "input file md5sum: ${tmp_file_md5sum}";
+		echo "output file md5sum: ${out_file_md5sum}";
+		res=1
+	fi
+
+	rm $out_file $tmp_file
+
+	return $res
+}
+
+# Run the busy poll test with irq_suspend_timeout set to SUSPEND_TIMEOUT.
+# The exit status is that of test_busypoll.
+test_busypoll_with_suspend()
+{
+	test_busypoll ${SUSPEND_TIMEOUT}
+}
+
+###
+### Code start
+###
+
+modprobe netdevsim
+
+# linking
+
+echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+
+setup_ns
+
+# "exec {VAR}<file" makes bash pick a free fd and store its number in VAR
+exec {NSIM_SV_FD}</var/run/netns/nssv
+NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)
+
+# as above: the fd number comes from bash, there is nothing to preset
+exec {NSIM_CL_FD}</var/run/netns/nscl
+NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
+     $NSIM_DEV_SYS_LINK
+
+if [ $? -ne 0 ]; then
+	echo "linking netdevsim1 with netdevsim2 should succeed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll
+if [ $? -ne 0 ]; then
+	echo "test_busypoll failed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll_with_suspend
+if [ $? -ne 0 ]; then
+	echo "test_busypoll_with_suspend failed"
+	cleanup_ns
+	exit 1
+fi
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
+
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
+
+cleanup_ns
+
+modprobe -r netdevsim
+
+exit 0
diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c
new file mode 100644
index 000000000000..99b0e8c17fca
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poller.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <ynl.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+
+#include "netdev-user.h"
+
+/* The below ifdef blob is required because:
+ *
+ * - sys/epoll.h does not (yet) have the ioctl definitions included. So,
+ *   systems with older glibcs will not have them available. However,
+ *   sys/epoll.h does include the type definition for epoll_data, which is
+ *   needed by the user program (e.g. epoll_event.data.fd)
+ *
+ * - linux/eventpoll.h does not define the epoll_data type, it is simply an
+ *   opaque __u64. It does, however, include the ioctl definition.
+ *
+ * Including both headers is impossible (types would be redefined), so I've
+ * opted instead to take sys/epoll.h, and include the blob below.
+ *
+ * Someday, when glibc is globally up to date, the blob below can be removed.
+ */
+#if !defined(EPOLL_IOC_TYPE)
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+#endif
+
+static uint32_t cfg_port = 8000;
+static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY };
+static char *cfg_outfile;
+static int cfg_max_events = 8;
+static int cfg_ifindex;
+
+/* busy poll params */
+static uint32_t cfg_busy_poll_usecs;
+static uint32_t cfg_busy_poll_budget;
+static uint32_t cfg_prefer_busy_poll;
+
+/* IRQ params */
+static uint32_t cfg_defer_hard_irqs;
+static uint64_t cfg_gro_flush_timeout;
+static uint64_t cfg_irq_suspend_timeout;
+
+static void usage(const char *filepath)
+{
+	error(1, 0,
+	      "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -i<ifindex>",
+	      filepath);
+}
+
+/* Parse a command line number and range-check it at full width.
+ *
+ * The check has to happen before the caller narrows the value: after a
+ * truncating assignment, a comparison against ULONG_MAX or the type's
+ * maximum can no longer fail. Negative input wraps to a huge value in
+ * strtoull() and is rejected too. Exits with errmsg on overflow or val > max.
+ */
+static unsigned long long parse_uint(const char *arg, unsigned long long max,
+				     const char *errmsg)
+{
+	unsigned long long val;
+
+	errno = 0;
+	val = strtoull(arg, NULL, 0);
+	if (errno == ERANGE || val > max)
+		error(1, ERANGE, "%s", errmsg);
+
+	return val;
+}
+
+/* Fill the cfg_* globals from the command line.
+ *
+ * Prints the usage text and exits if there are no arguments, an option is
+ * unknown, -i<ifindex> or -o<outfile> is missing, or non-option arguments
+ * are left over.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	if (argc <= 1)
+		usage(argv[0]);
+
+	while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) {
+		switch (c) {
+		case 'u':
+			cfg_busy_poll_usecs = parse_uint(optarg, UINT32_MAX,
+						"busy_poll_usecs too large");
+			break;
+		case 'P':
+			cfg_prefer_busy_poll = parse_uint(optarg, 1,
+					"prefer busy poll should be 0 or 1");
+			break;
+		case 'g':
+			cfg_busy_poll_budget = parse_uint(optarg, UINT16_MAX,
+				"busy poll budget must be [0, UINT16_MAX]");
+			break;
+		case 'p':
+			cfg_port = parse_uint(optarg, UINT16_MAX,
+					      "port must be <= 65535");
+			break;
+		case 'b':
+			/* inet_aton() does not set errno, so don't print it */
+			if (inet_aton(optarg, &cfg_bind_addr) == 0)
+				error(1, 0, "bind address %s invalid", optarg);
+			break;
+		case 'o':
+			cfg_outfile = strdup(optarg);
+			if (!cfg_outfile)
+				error(1, 0, "outfile invalid");
+			break;
+		case 'm':
+			/* stored in an int and handed to epoll_wait() */
+			cfg_max_events = parse_uint(optarg, INT_MAX,
+				"max events must be > 0 and <= INT_MAX");
+			if (cfg_max_events == 0)
+				error(1, ERANGE, "max events must be > 0");
+			break;
+		case 'd':
+			cfg_defer_hard_irqs = parse_uint(optarg, INT32_MAX,
+				"defer_hard_irqs must be <= INT32_MAX");
+			break;
+		case 'r':
+			cfg_gro_flush_timeout = parse_uint(optarg, ULLONG_MAX - 1,
+				"gro_flush_timeout must be < ULLONG_MAX");
+			break;
+		case 's':
+			cfg_irq_suspend_timeout = parse_uint(optarg, ULLONG_MAX - 1,
+				"irq_suspend_timeout must be < ULLONG_MAX");
+			break;
+		case 'i':
+			cfg_ifindex = parse_uint(optarg, INT_MAX,
+						 "ifindex must be <= INT_MAX");
+			break;
+		default:
+			usage(argv[0]);
+		}
+	}
+
+	if (!cfg_ifindex || !cfg_outfile || optind != argc)
+		usage(argv[0]);
+}
+
+static void epoll_ctl_add(int epfd, int fd, uint32_t events)
+{
+	struct epoll_event ev;
+
+	ev.events = events;
+	ev.data.fd = fd;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == -1)
+		error(1, errno, "epoll_ctl add fd: %d", fd);
+}
+
+/* Put sockfd into non-blocking mode; exit if either fcntl() call fails. */
+static void setnonblock(int sockfd)
+{
+	int flags = fcntl(sockfd, F_GETFL, 0);
+
+	/* don't OR O_NONBLOCK into the -1 of a failed F_GETFL */
+	if (flags == -1 || fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) == -1)
+		error(1, errno, "unable to set socket to nonblocking mode");
+}
+
+/* Write all buflen bytes of buf to fd, exiting if write() fails. */
+static void write_chunk(int fd, char *buf, ssize_t buflen)
+{
+	ssize_t left = buflen;
+	char *pos = buf;
+	ssize_t n;
+
+	while (left > 0) {
+		n = write(fd, pos, left);
+		if (n == -1)
+			error(1, errno, "unable to write data to outfile");
+
+		/* a short write is not an error: advance and go again */
+		pos += n;
+		left -= n;
+	}
+}
+
+/* Dump the NAPIs of cfg_ifindex over netlink and set defer_hard_irqs,
+ * gro_flush_timeout and irq_suspend_timeout on the first one; exit on error.
+ */
+static void setup_queue(void)
+{
+	struct netdev_napi_get_list *napi_list;
+	struct netdev_napi_get_req_dump *req;
+	struct netdev_napi_set_req *set_req;
+	struct ynl_error yerr;
+	struct ynl_sock *ys;
+
+	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!ys)
+		error(1, 0, "YNL: %s", yerr.msg);
+
+	req = netdev_napi_get_req_dump_alloc();
+	netdev_napi_get_req_dump_set_ifindex(req, cfg_ifindex);
+	napi_list = netdev_napi_get_dump(ys, req);
+	/* don't touch ->obj if the dump failed or returned no NAPIs */
+	if (!napi_list || ynl_dump_empty(napi_list))
+		error(1, 0, "can't get NAPI list: %s", ys->err.msg);
+	/* assume there is 1 NAPI configured and take the first */
+	if (!napi_list->obj._present.id)
+		error(1, 0, "napi ID not present?");
+
+	set_req = netdev_napi_set_req_alloc();
+	netdev_napi_set_req_set_id(set_req, napi_list->obj.id);
+	netdev_napi_set_req_set_defer_hard_irqs(set_req, cfg_defer_hard_irqs);
+	netdev_napi_set_req_set_gro_flush_timeout(set_req, cfg_gro_flush_timeout);
+	netdev_napi_set_req_set_irq_suspend_timeout(set_req, cfg_irq_suspend_timeout);
+	/* yerr was only passed to ynl_sock_create(); report the socket's error */
+	if (netdev_napi_set(ys, set_req))
+		error(1, 0, "can't set NAPI params: %s\n", ys->err.msg);
+
+	netdev_napi_get_list_free(napi_list);
+	netdev_napi_get_req_dump_free(req);
+	netdev_napi_set_req_free(set_req);
+	ynl_sock_destroy(ys);
+}
+
+/* Listen on cfg_bind_addr:cfg_port, set the busy poll parameters on the
+ * epoll instance and write everything read from the accepted connection to
+ * cfg_outfile. Returns once a hangup is reported; exits on fatal errors.
+ */
+static void run_poller(void)
+{
+	struct epoll_event events[cfg_max_events];
+	struct epoll_params epoll_params = {0};
+	struct sockaddr_in server_addr = {0};
+	int outfile_fd, sockfd, conn;
+	int i, epfd, nfds;
+	ssize_t readlen;
+	char buf[1024];
+	int val = 1;
+
+	outfile_fd = open(cfg_outfile, O_WRONLY | O_CREAT, 0644);
+	if (outfile_fd == -1)
+		error(1, errno, "unable to open outfile: %s", cfg_outfile);
+
+	sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sockfd == -1)
+		error(1, errno, "unable to create listen socket");
+
+	server_addr.sin_family = AF_INET;
+	server_addr.sin_port = htons(cfg_port);
+	server_addr.sin_addr = cfg_bind_addr;
+
+	/* range checked in parse_opts, so the narrowing casts are safe */
+	epoll_params.busy_poll_usecs = cfg_busy_poll_usecs;
+	epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget;
+	epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll;
+
+	if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)))
+		error(1, errno, "poller setsockopt reuseaddr");
+
+	setnonblock(sockfd);
+
+	/* nothing can connect to an unbound socket, so failing here is fatal */
+	if (bind(sockfd, (struct sockaddr *)&server_addr,
+		 sizeof(struct sockaddr_in)))
+		error(1, errno, "poller bind to port: %d\n", cfg_port);
+
+	if (listen(sockfd, 1))
+		error(1, errno, "poller listen");
+
+	epfd = epoll_create1(0);
+	if (epfd == -1)
+		error(1, errno, "unable to create epoll instance");
+
+	if (ioctl(epfd, EPIOCSPARAMS, &epoll_params) == -1)
+		error(1, errno, "unable to set busy poll params");
+
+	epoll_ctl_add(epfd, sockfd, EPOLLIN | EPOLLOUT | EPOLLET);
+
+	for (;;) {
+		nfds = epoll_wait(epfd, events, cfg_max_events, -1);
+		/* EINTR is retried; anything else would make this loop spin */
+		if (nfds == -1 && errno != EINTR)
+			error(1, errno, "epoll_wait failed");
+		for (i = 0; i < nfds; i++) {
+			if (events[i].data.fd == sockfd) {
+				conn = accept(sockfd, NULL, NULL);
+				if (conn == -1)
+					error(1, errno,
+					      "accepting incoming connection failed");
+
+				setnonblock(conn);
+				epoll_ctl_add(epfd, conn,
+					      EPOLLIN | EPOLLET | EPOLLRDHUP |
+					      EPOLLHUP);
+			} else if (events[i].events & EPOLLIN) {
+				/* edge-triggered: drain until read() stops */
+				while ((readlen = read(events[i].data.fd, buf,
+						       sizeof(buf))) > 0)
+					write_chunk(outfile_fd, buf, readlen);
+			} else {
+				/* spurious event ? */
+			}
+			if (events[i].events & (EPOLLRDHUP | EPOLLHUP)) {
+				epoll_ctl(epfd, EPOLL_CTL_DEL,
+					  events[i].data.fd, NULL);
+				close(events[i].data.fd);
+				close(outfile_fd);
+				return;
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	parse_opts(argc, argv);
+	setup_queue();
+	run_poller();
+	return 0;
+}
-- 
cgit v1.2.3


From 43271bb5bf67e78def9c2898040505e7cb5935f3 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 8 Nov 2024 06:59:25 -0800
Subject: net: netconsole: selftests: Check if netdevsim is available

The netconsole selftest relies on the availability of the netdevsim module.
To ensure the test can run correctly, we need to check if the netdevsim
module is either loaded or built-in before proceeding.

Update the netconsole selftest to check for the existence of
the /sys/bus/netdevsim/new_device file before running the test. If the
file is not found, the test is skipped with an explanation that the
CONFIG_NETDEVSIM kernel config option may not be enabled.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241108-netcon_selftest_deps-v1-1-1789cbf3adcd@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/netcons_basic.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index 182eb1a97e59..b175f4d966e5 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -39,6 +39,7 @@ NAMESPACE=""
 # IDs for netdevsim
 NSIM_DEV_1_ID=$((256 + RANDOM % 256))
 NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
 
 # Used to create and delete namespaces
 source "${SCRIPTDIR}"/../../net/lib.sh
@@ -46,7 +47,6 @@ source "${SCRIPTDIR}"/../../net/net_helper.sh
 
 # Create netdevsim interfaces
 create_ifaces() {
-	local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
 
 	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
 	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
@@ -212,6 +212,11 @@ function check_for_dependencies() {
 		exit "${ksft_skip}"
 	fi
 
+	if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then
+		echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
 	if [ ! -d "${NETCONS_CONFIGFS}" ]; then
 		echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2
 		exit "${ksft_skip}"
-- 
cgit v1.2.3


From 7d3f3b4367f315a61fc615e3138f3d320da8c466 Mon Sep 17 00:00:00 2001
From: Vladimir Vdovin <deliran@verdict.gg>
Date: Fri, 8 Nov 2024 09:34:24 +0000
Subject: net: ipv4: Cache pmtu for all packet paths if multipath enabled

Check the number of paths with fib_info_num_path() and call
update_or_create_fnhe() for every path.
The problem is that the PMTU is cached only for the oif that
received the ICMP "need to frag" message; the other oifs will
still try to use the "default" iface MTU.

An example topology showing the problem:

                    |  host1
                +---------+
                |  dummy0 | 10.179.20.18/32  mtu9000
                +---------+
        +-----------+----------------+
    +---------+                     +---------+
    | ens17f0 |  10.179.2.141/31    | ens17f1 |  10.179.2.13/31
    +---------+                     +---------+
        |    (all here have mtu 9000)    |
    +------+                         +------+
    | ro1  |  10.179.2.140/31        | ro2  |  10.179.2.12/31
    +------+                         +------+
        |                                |
---------+------------+-------------------+------
                        |
                    +-----+
                    | ro3 | 10.10.10.10  mtu1500
                    +-----+
                        |
    ========================================
                some networks
    ========================================
                        |
                    +-----+
                    | eth0| 10.10.30.30  mtu9000
                    +-----+
                        |  host2

host1 has multipath enabled and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
        nexthop via 10.179.2.12 dev ens17f1 weight 1
        nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives on the ens17f1 iface an ICMP packet from ro3 saying that
ro3's MTU is 1500, and host1 caches it in the nexthop exceptions cache.

The problem is that it is cached only for the iface that received the ICMP
message, and there is no way that ro3 will send the ICMP message to host1
via another path.

Host1 now has these routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
    cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
    cache

So when host1 tries again to reach host2 with mtu>1500,
if the packet flow is lucky enough to be hashed with oif=ens17f1 it's ok;
if oif=ens17f0 it blackholes and still gets ICMP messages from ro3 on ens17f1,
until the lucky day when ro3 sends one through another flow to ens17f0.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/route.c                    |  13 +++++
 tools/testing/selftests/net/pmtu.sh | 112 ++++++++++++++++++++++++++++++------
 2 files changed, 108 insertions(+), 17 deletions(-)

(limited to 'tools/testing')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c5e773002fe..ccdbe9c70132 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		struct fib_nh_common *nhc;
 
 		fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		if (fib_info_num_path(res.fi) > 1) {
+			int nhsel;
+
+			for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+				nhc = fib_info_nhc(res.fi, nhsel);
+				update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+						      jiffies + net->ipv4.ip_rt_mtu_expires);
+			}
+			rcu_read_unlock();
+			return;
+		}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
 		nhc = FIB_RES_NHC(res);
 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 				      jiffies + net->ipv4.ip_rt_mtu_expires);
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 6c651c880fe8..66be7699c72c 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -197,6 +197,12 @@
 #
 # - pmtu_ipv6_route_change
 #	Same as above but with IPv6
+#
+# - pmtu_ipv4_mp_exceptions
+#	Use the same topology as in pmtu_ipv4, but add routable addresses
+#	on host A and B on lo reachable via both routers. Host A and B
+#	addresses have multipath routes to each other, b_r1 mtu = 1500.
+#	Check that PMTU exceptions are created for both paths.
 
 source lib.sh
 source net_helper.sh
@@ -266,7 +272,8 @@ tests="
 	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
 	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
 	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
-	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
+	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
+	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
 
 # Addressing and routing for tests with routers: four network segments, with
 # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
 tunnel6_b_addr="fd00:2::b"
 tunnel6_mask="64"
 
+host4_a_addr="192.168.99.99"
+host4_b_addr="192.168.88.88"
+
 dummy6_0_prefix="fc00:1000::"
 dummy6_1_prefix="fc00:1001::"
 dummy6_mask="64"
@@ -984,6 +994,52 @@ setup_ovs_bridge() {
 	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
 }
 
+setup_multipath_new() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
+	run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
+	run_cmd ${ns_a} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
+	run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
+	run_cmd ${ns_b} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
+}
+
+setup_multipath_old() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip route add ${host4_b_addr} \
+			src ${host4_a_addr} \
+			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${a_r2}.2 weight 1
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip route add ${host4_a_addr} \
+			src ${host4_b_addr} \
+			nexthop via ${prefix4}.${b_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${b_r2}.2 weight 1
+}
+
+setup_multipath() {
+	if [ "$USE_NH" = "yes" ]; then
+		setup_multipath_new
+	else
+		setup_multipath_old
+	fi
+
+	# Set up routers with routes to dummies
+	run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
+	run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
+}
+
 setup() {
 	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
 
@@ -1076,23 +1132,15 @@ link_get_mtu() {
 }
 
 route_get_dst_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	if [ -z "${dsfield}" ]; then
-		dsfield=0
-	fi
-
-	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+	${ns_cmd} ip route get "$@"
 }
 
 route_get_dst_pmtu_from_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
 }
 
 check_pmtu_value() {
@@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
 	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
 
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
 		UDP:"${dst2}":50000,tos="${dsfield}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() {
 	test_pmtu_ipvX_route_change 6
 }
 
+test_pmtu_ipv4_mp_exceptions() {
+	setup namespaces routing multipath || return $ksft_skip
+
+	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
+	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
+	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
+
+	# Set up initial MTU values
+	mtu "${ns_a}"  veth_A-R1 2000
+	mtu "${ns_r1}" veth_R1-A 2000
+	mtu "${ns_r1}" veth_R1-B 1500
+	mtu "${ns_b}"  veth_B-R1 1500
+
+	mtu "${ns_a}"  veth_A-R2 2000
+	mtu "${ns_r2}" veth_R2-A 2000
+	mtu "${ns_r2}" veth_R2-B 1500
+	mtu "${ns_b}"  veth_B-R2 1500
+
+	# Ping and expect two nexthop exceptions for two routes
+	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
+
+	# Check that exceptions have been created with the correct PMTU
+	pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
+	pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
+
+	check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
+	check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
+}
+
 usage() {
 	echo
 	echo "$0 [OPTIONS] [TEST]..."
-- 
cgit v1.2.3


From 116e50d6474e82579086c0397d2fa3999815f29e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 6 Nov 2024 17:07:51 +0000
Subject: kselftest/arm64: Check that SVCR is 0 in signal handlers

We don't currently validate that we exit streaming mode and clear ZA when
we enter a signal handler. Add simple checks for this in the SSVE and ZA
tests.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241106-arm64-fpmr-signal-test-v1-1-31fa34ce58fe@kernel.org
[catalin.marinas@arm.com: Use %lx in fprintf() as uint64_t seems to be unsigned long in glibc]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/sve_helpers.h         | 13 +++++++++++++
 tools/testing/selftests/arm64/signal/testcases/ssve_regs.c |  5 +++++
 tools/testing/selftests/arm64/signal/testcases/za_regs.c   |  5 +++++
 3 files changed, 23 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h
index 50948ce471cc..ca133b93375f 100644
--- a/tools/testing/selftests/arm64/signal/sve_helpers.h
+++ b/tools/testing/selftests/arm64/signal/sve_helpers.h
@@ -18,4 +18,17 @@ extern unsigned int nvls;
 
 int sve_fill_vls(bool use_sme, int min_vls);
 
+static inline uint64_t get_svcr(void)
+{
+	uint64_t val;
+
+	asm volatile (
+		"mrs	%0, S3_3_C4_C2_2\n"
+		: "=r"(val)
+		:
+		: "cc");
+
+	return val;
+}
+
 #endif
diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
index 6dbe48cf8b09..1dbca9afb13c 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
@@ -85,6 +85,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
 	fprintf(stderr, "Got expected size %u and VL %d\n",
 		head->size, ssve->vl);
 
+	if (get_svcr() != 0) {
+		fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr());
+		return 1;
+	}
+
 	return 0;
 }
 
diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
index b9e13f27f1f9..badaead5326a 100644
--- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
@@ -91,6 +91,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
 		return 1;
 	}
 
+	if (get_svcr() != 0) {
+		fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr());
+		return 1;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c297aa7d3fb6755890b78b483e82c9cf07370d50 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 11 Nov 2024 18:32:58 +0000
Subject: kselftest/arm64: Enable build of PAC tests with LLVM=1

Currently we don't build the PAC selftests when building with LLVM=1 since
we attempt to test for PAC support in the toolchain before we've set up the
build system to point at LLVM in lib.mk, which has to be one of the last
things in the Makefile.

Since all versions of LLVM supported for use with the kernel have PAC
support, we can sidestep the issue by simply assuming PAC is there when
doing an LLVM=1 build.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241111-arm64-selftest-pac-clang-v1-1-08599ceee418@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/pauth/Makefile | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile
index 72e290b0b10c..b5a1c80e0ead 100644
--- a/tools/testing/selftests/arm64/pauth/Makefile
+++ b/tools/testing/selftests/arm64/pauth/Makefile
@@ -7,8 +7,14 @@ CC := $(CROSS_COMPILE)gcc
 endif
 
 CFLAGS += -mbranch-protection=pac-ret
+
+# All supported LLVMs have PAC, test for GCC
+ifeq ($(LLVM),1)
+pauth_cc_support := 1
+else
 # check if the compiler supports ARMv8.3 and branch protection with PAuth
 pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi)
+endif
 
 ifeq ($(pauth_cc_support),1)
 TEST_GEN_PROGS := pac
-- 
cgit v1.2.3


From c0350076c13eac4f1d7f7ab6acd43bb252baef7a Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 12 Nov 2024 13:08:14 +0000
Subject: kselftest/arm64: Use flag bits for features in fp-ptrace assembler
 code

The assembler portions of fp-ptrace are passed feature flags by the C code
indicating which architectural features are supported. Currently these use
an entire register for each flag which is wasteful and gets cumbersome as
new flags are added. Switch to using flag bits in a single register to make
things easier to maintain.

No functional change.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-1-250b57c61254@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-ptrace-asm.S | 32 +++++++++++++-----------
 tools/testing/selftests/arm64/fp/fp-ptrace.c     | 17 ++++++++++---
 tools/testing/selftests/arm64/fp/fp-ptrace.h     | 10 ++++++++
 3 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
index 7ad59d92d02b..5e7e9c878f2c 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
@@ -15,10 +15,7 @@
 
 // Load and save register values with pauses for ptrace
 //
-// x0 - SVE in use
-// x1 - SME in use
-// x2 - SME2 in use
-// x3 - FA64 supported
+// x0 - HAVE_ flags indicating which features are in use
 
 .globl load_and_save
 load_and_save:
@@ -44,7 +41,7 @@ load_and_save:
 	ldp	q30, q31, [x7, #16 * 30]
 
 	// SME?
-	cbz	x1, check_sve_in
+	tbz	x0, #HAVE_SME_SHIFT, check_sve_in
 
 	adrp	x7, svcr_in
 	ldr	x7, [x7, :lo12:svcr_in]
@@ -64,7 +61,7 @@ load_and_save:
 	bne	1b
 
 	// ZT?
-	cbz	x2, check_sm_in
+	tbz	x0, #HAVE_SME2_SHIFT, check_sm_in
 	adrp	x6, zt_in
 	add	x6, x6, :lo12:zt_in
 	_ldr_zt 6
@@ -72,12 +69,16 @@ load_and_save:
 	// In streaming mode?
 check_sm_in:
 	tbz	x7, #SVCR_SM_SHIFT, check_sve_in
-	mov	x4, x3		// Load FFR if we have FA64
+
+	// Load FFR if we have FA64
+	mov	x4, #0
+	tbz	x0, #HAVE_FA64_SHIFT, load_sve
+	mov	x4, #1
 	b	load_sve
 
 	// SVE?
 check_sve_in:
-	cbz	x0, wait_for_writes
+	tbz	x0, #HAVE_SVE_SHIFT, wait_for_writes
 	mov	x4, #1
 
 load_sve:
@@ -165,8 +166,7 @@ wait_for_writes:
 	stp	q28, q29, [x7, #16 * 28]
 	stp	q30, q31, [x7, #16 * 30]
 
-	// SME?
-	cbz	x1, check_sve_out
+	tbz	x0, #HAVE_SME_SHIFT, check_sve_out
 
 	rdsvl	11, 1
 	adrp	x6, sme_vl_out
@@ -187,7 +187,7 @@ wait_for_writes:
 	bne	1b
 
 	// ZT?
-	cbz	x2, check_sm_out
+	tbz	x0, #HAVE_SME2_SHIFT, check_sm_out
 	adrp	x6, zt_out
 	add	x6, x6, :lo12:zt_out
 	_str_zt 6
@@ -195,12 +195,16 @@ wait_for_writes:
 	// In streaming mode?
 check_sm_out:
 	tbz	x7, #SVCR_SM_SHIFT, check_sve_out
-	mov	x4, x3				// FFR?
+
+	// Do we have FA64 and FFR?
+	mov	x4, #0
+	tbz	x0, #HAVE_FA64_SHIFT, read_sve
+	mov	x4, #1
 	b	read_sve
 
 	// SVE?
 check_sve_out:
-	cbz	x0, wait_for_reads
+	tbz	x0, #HAVE_SVE_SHIFT, wait_for_reads
 	mov	x4, #1
 
 	rdvl	x7, #1
@@ -271,7 +275,7 @@ wait_for_reads:
 	brk #0
 
 	// Ensure we don't leave ourselves in streaming mode
-	cbz	x1, out
+	tbz	x0, #HAVE_SME_SHIFT, out
 	msr	S3_3_C4_C2_2, xzr
 
 out:
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index c7ceafe5f471..d96af27487fa 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -82,7 +82,7 @@ uint64_t sve_vl_out;
 uint64_t sme_vl_out;
 uint64_t svcr_in, svcr_expected, svcr_out;
 
-void load_and_save(int sve, int sme, int sme2, int fa64);
+void load_and_save(int flags);
 
 static bool got_alarm;
 
@@ -198,7 +198,7 @@ static int vl_expected(struct test_config *config)
 
 static void run_child(struct test_config *config)
 {
-	int ret;
+	int ret, flags;
 
 	/* Let the parent attach to us */
 	ret = ptrace(PTRACE_TRACEME, 0, 0, 0);
@@ -224,8 +224,17 @@ static void run_child(struct test_config *config)
 	}
 
 	/* Load values and wait for the parent */
-	load_and_save(sve_supported(), sme_supported(),
-		      sme2_supported(), fa64_supported());
+	flags = 0;
+	if (sve_supported())
+		flags |= HAVE_SVE;
+	if (sme_supported())
+		flags |= HAVE_SME;
+	if (sme2_supported())
+		flags |= HAVE_SME2;
+	if (fa64_supported())
+		flags |= HAVE_FA64;
+
+	load_and_save(flags);
 
 	exit(0);
 }
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h
index db4f2c4d750c..36ca627e1980 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.h
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h
@@ -10,4 +10,14 @@
 #define SVCR_SM (1 << SVCR_SM_SHIFT)
 #define SVCR_ZA (1 << SVCR_ZA_SHIFT)
 
+#define HAVE_SVE_SHIFT		0
+#define HAVE_SME_SHIFT		1
+#define HAVE_SME2_SHIFT		2
+#define HAVE_FA64_SHIFT		3
+
+#define HAVE_SVE	(1 << HAVE_SVE_SHIFT)
+#define HAVE_SME	(1 << HAVE_SME_SHIFT)
+#define HAVE_SME2	(1 << HAVE_SME2_SHIFT)
+#define HAVE_FA64	(1 << HAVE_FA64_SHIFT)
+
 #endif
-- 
cgit v1.2.3


From 7e9c5b00009a625cc304c865192978c01c0cc077 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 12 Nov 2024 13:08:15 +0000
Subject: kselftest/arm64: Expand the set of ZA writes fp-ptrace does

Currently our test for implementable ZA writes is written in a bit of a
convoluted fashion which excludes all changes where we clear SVCR.SM even
though we can actually support that since changing the vector length resets
SVCR. Make the logic more direct, enabling us to actually run these cases.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-2-250b57c61254@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-ptrace.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index d96af27487fa..56cf6e02c535 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -1078,21 +1078,19 @@ static void sve_write(pid_t child, struct test_config *config)
 
 static bool za_write_supported(struct test_config *config)
 {
-	if (config->svcr_expected & SVCR_SM) {
-		if (!(config->svcr_in & SVCR_SM))
+	if (config->sme_vl_in != config->sme_vl_expected) {
+		/* Changing the SME VL exits streaming mode. */
+		if (config->svcr_expected & SVCR_SM) {
 			return false;
-
-		/* Changing the SME VL exits streaming mode */
-		if (config->sme_vl_in != config->sme_vl_expected) {
+		}
+	} else {
+		/* Otherwise we can't change streaming mode */
+		if ((config->svcr_in & SVCR_SM) !=
+		    (config->svcr_expected & SVCR_SM)) {
 			return false;
 		}
 	}
 
-	/* Can't disable SM outside a VL change */
-	if ((config->svcr_in & SVCR_SM) &&
-	    !(config->svcr_expected & SVCR_SM))
-		return false;
-
 	return true;
 }
 
-- 
cgit v1.2.3


From 7dbd26d0b22d69d36ab3e76ee7f152482a19cbed Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 12 Nov 2024 13:08:16 +0000
Subject: kselftest/arm64: Add FPMR coverage to fp-ptrace

Add coverage for FPMR to fp-ptrace. FPMR can be available independently of
SVE and SME; if SME is supported then FPMR is cleared by entering or
exiting streaming mode. As with other registers we generate random values
to load into the register, restricting these to bitfields which are always
defined. We also leave bitfields where the valid values are affected by
the set of supported FP8 formats as zero to reduce complexity; it is
unlikely that specific bitfields will be affected by ptrace issues.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-3-250b57c61254@kernel.org
[catalin.marinas@arm.com: use REG_FPMR instead of FPMR]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-ptrace-asm.S |  23 +++--
 tools/testing/selftests/arm64/fp/fp-ptrace.c     | 126 +++++++++++++++++++++++
 tools/testing/selftests/arm64/fp/fp-ptrace.h     |   2 +
 tools/testing/selftests/arm64/fp/sme-inst.h      |   2 +
 4 files changed, 146 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
index 5e7e9c878f2c..82c3ab70e1cf 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
@@ -71,14 +71,12 @@ check_sm_in:
 	tbz	x7, #SVCR_SM_SHIFT, check_sve_in
 
 	// Load FFR if we have FA64
-	mov	x4, #0
-	tbz	x0, #HAVE_FA64_SHIFT, load_sve
-	mov	x4, #1
+	ubfx	x4, x0, #HAVE_FA64_SHIFT, #1
 	b	load_sve
 
 	// SVE?
 check_sve_in:
-	tbz	x0, #HAVE_SVE_SHIFT, wait_for_writes
+	tbz	x0, #HAVE_SVE_SHIFT, check_fpmr_in
 	mov	x4, #1
 
 load_sve:
@@ -143,6 +141,13 @@ load_sve:
 	ldr	p14, [x7, #14, MUL VL]
 	ldr	p15, [x7, #15, MUL VL]
 
+	// This has to come after we set PSTATE.SM
+check_fpmr_in:
+	tbz	x0, #HAVE_FPMR_SHIFT, wait_for_writes
+	adrp	x7, fpmr_in
+	ldr	x7, [x7, :lo12:fpmr_in]
+	msr	REG_FPMR, x7
+
 wait_for_writes:
 	// Wait for the parent
 	brk #0
@@ -166,6 +171,12 @@ wait_for_writes:
 	stp	q28, q29, [x7, #16 * 28]
 	stp	q30, q31, [x7, #16 * 30]
 
+	tbz	x0, #HAVE_FPMR_SHIFT, check_sme_out
+	mrs	x7, REG_FPMR
+	adrp	x6, fpmr_out
+	str	x7, [x6, :lo12:fpmr_out]
+
+check_sme_out:
 	tbz	x0, #HAVE_SME_SHIFT, check_sve_out
 
 	rdsvl	11, 1
@@ -197,9 +208,7 @@ check_sm_out:
 	tbz	x7, #SVCR_SM_SHIFT, check_sve_out
 
 	// Do we have FA64 and FFR?
-	mov	x4, #0
-	tbz	x0, #HAVE_FA64_SHIFT, read_sve
-	mov	x4, #1
+	ubfx	x4, x0, #HAVE_FA64_SHIFT, #1
 	b	read_sve
 
 	// SVE?
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index 56cf6e02c535..4930e03a7b99 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -31,6 +31,14 @@
 
 #include "fp-ptrace.h"
 
+#include <linux/bits.h>
+
+#define FPMR_LSCALE2_MASK                               GENMASK(37, 32)
+#define FPMR_NSCALE_MASK                                GENMASK(31, 24)
+#define FPMR_LSCALE_MASK                                GENMASK(22, 16)
+#define FPMR_OSC_MASK                                   GENMASK(15, 15)
+#define FPMR_OSM_MASK                                   GENMASK(14, 14)
+
 /* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
 #ifndef NT_ARM_SVE
 #define NT_ARM_SVE 0x405
@@ -48,11 +56,22 @@
 #define NT_ARM_ZT 0x40d
 #endif
 
+#ifndef NT_ARM_FPMR
+#define NT_ARM_FPMR 0x40e
+#endif
+
 #define ARCH_VQ_MAX 256
 
 /* VL 128..2048 in powers of 2 */
 #define MAX_NUM_VLS 5
 
+/*
+ * FPMR bits we can set without doing feature checks to see if values
+ * are valid.
+ */
+#define FPMR_SAFE_BITS (FPMR_LSCALE2_MASK | FPMR_NSCALE_MASK | \
+			FPMR_LSCALE_MASK | FPMR_OSC_MASK | FPMR_OSM_MASK)
+
 #define NUM_FPR 32
 __uint128_t v_in[NUM_FPR];
 __uint128_t v_expected[NUM_FPR];
@@ -78,6 +97,8 @@ char zt_in[ZT_SIG_REG_BYTES];
 char zt_expected[ZT_SIG_REG_BYTES];
 char zt_out[ZT_SIG_REG_BYTES];
 
+uint64_t fpmr_in, fpmr_expected, fpmr_out;
+
 uint64_t sve_vl_out;
 uint64_t sme_vl_out;
 uint64_t svcr_in, svcr_expected, svcr_out;
@@ -128,6 +149,11 @@ static bool fa64_supported(void)
 	return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64;
 }
 
+static bool fpmr_supported(void)
+{
+	return getauxval(AT_HWCAP2) & HWCAP2_FPMR;
+}
+
 static bool compare_buffer(const char *name, void *out,
 			   void *expected, size_t size)
 {
@@ -233,6 +259,8 @@ static void run_child(struct test_config *config)
 		flags |= HAVE_SME2;
 	if (fa64_supported())
 		flags |= HAVE_FA64;
+	if (fpmr_supported())
+		flags |= HAVE_FPMR;
 
 	load_and_save(flags);
 
@@ -321,6 +349,14 @@ static void read_child_regs(pid_t child)
 		iov_child.iov_len = sizeof(zt_out);
 		read_one_child_regs(child, "ZT", &iov_parent, &iov_child);
 	}
+
+	if (fpmr_supported()) {
+		iov_parent.iov_base = &fpmr_out;
+		iov_parent.iov_len = sizeof(fpmr_out);
+		iov_child.iov_base = &fpmr_out;
+		iov_child.iov_len = sizeof(fpmr_out);
+		read_one_child_regs(child, "FPMR", &iov_parent, &iov_child);
+	}
 }
 
 static bool continue_breakpoint(pid_t child,
@@ -595,6 +631,26 @@ static bool check_ptrace_values_zt(pid_t child, struct test_config *config)
 	return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES);
 }
 
+static bool check_ptrace_values_fpmr(pid_t child, struct test_config *config)
+{
+	uint64_t val;
+	struct iovec iov;
+	int ret;
+
+	if (!fpmr_supported())
+		return true;
+
+	iov.iov_base = &val;
+	iov.iov_len = sizeof(val);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_FPMR, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read initial FPMR: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	return compare_buffer("initial FPMR", &val, &fpmr_in, sizeof(val));
+}
 
 static bool check_ptrace_values(pid_t child, struct test_config *config)
 {
@@ -629,6 +685,9 @@ static bool check_ptrace_values(pid_t child, struct test_config *config)
 	if (!check_ptrace_values_zt(child, config))
 		pass = false;
 
+	if (!check_ptrace_values_fpmr(child, config))
+		pass = false;
+
 	return pass;
 }
 
@@ -832,11 +891,18 @@ static void set_initial_values(struct test_config *config)
 {
 	int vq = __sve_vq_from_vl(vl_in(config));
 	int sme_vq = __sve_vq_from_vl(config->sme_vl_in);
+	bool sm_change;
 
 	svcr_in = config->svcr_in;
 	svcr_expected = config->svcr_expected;
 	svcr_out = 0;
 
+	if (sme_supported() &&
+	    (svcr_in & SVCR_SM) != (svcr_expected & SVCR_SM))
+		sm_change = true;
+	else
+		sm_change = false;
+
 	fill_random(&v_in, sizeof(v_in));
 	memcpy(v_expected, v_in, sizeof(v_in));
 	memset(v_out, 0, sizeof(v_out));
@@ -883,6 +949,21 @@ static void set_initial_values(struct test_config *config)
 			memset(zt_expected, 0, ZT_SIG_REG_BYTES);
 		memset(zt_out, 0, sizeof(zt_out));
 	}
+
+	if (fpmr_supported()) {
+		fill_random(&fpmr_in, sizeof(fpmr_in));
+		fpmr_in &= FPMR_SAFE_BITS;
+
+		/* Entering or exiting streaming mode clears FPMR */
+		if (sm_change)
+			fpmr_expected = 0;
+		else
+			fpmr_expected = fpmr_in;
+	} else {
+		fpmr_in = 0;
+		fpmr_expected = 0;
+		fpmr_out = 0;
+	}
 }
 
 static bool check_memory_values(struct test_config *config)
@@ -933,6 +1014,12 @@ static bool check_memory_values(struct test_config *config)
 	if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES))
 		pass = false;
 
+	if (fpmr_out != fpmr_expected) {
+		ksft_print_msg("Mismatch in saved FPMR: %lx != %lx\n",
+			       fpmr_out, fpmr_expected);
+		pass = false;
+	}
+
 	return pass;
 }
 
@@ -1010,6 +1097,36 @@ static void fpsimd_write(pid_t child, struct test_config *test_config)
 			       strerror(errno), errno);
 }
 
+static bool fpmr_write_supported(struct test_config *config)
+{
+	if (!fpmr_supported())
+		return false;
+
+	if (!sve_sme_same(config))
+		return false;
+
+	return true;
+}
+
+static void fpmr_write_expected(struct test_config *config)
+{
+	fill_random(&fpmr_expected, sizeof(fpmr_expected));
+	fpmr_expected &= FPMR_SAFE_BITS;
+}
+
+static void fpmr_write(pid_t child, struct test_config *config)
+{
+	struct iovec iov;
+	int ret;
+
+	iov.iov_len = sizeof(fpmr_expected);
+	iov.iov_base = &fpmr_expected;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_FPMR, &iov);
+	if (ret != 0)
+		ksft_print_msg("Failed to write FPMR: %s (%d)\n",
+			       strerror(errno), errno);
+}
+
 static void sve_write_expected(struct test_config *config)
 {
 	int vl = vl_expected(config);
@@ -1266,6 +1383,12 @@ static struct test_definition base_test_defs[] = {
 		.set_expected_values = fpsimd_write_expected,
 		.modify_values = fpsimd_write,
 	},
+	{
+		.name = "FPMR write",
+		.supported = fpmr_write_supported,
+		.set_expected_values = fpmr_write_expected,
+		.modify_values = fpmr_write,
+	},
 };
 
 static struct test_definition sve_test_defs[] = {
@@ -1475,6 +1598,9 @@ int main(void)
 	if (fa64_supported())
 		ksft_print_msg("FA64 supported\n");
 
+	if (fpmr_supported())
+		ksft_print_msg("FPMR supported\n");
+
 	ksft_set_plan(tests);
 
 	/* Get signal handers ready before we start any children */
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h
index 36ca627e1980..c06919aaf1f7 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.h
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h
@@ -14,10 +14,12 @@
 #define HAVE_SME_SHIFT		1
 #define HAVE_SME2_SHIFT		2
 #define HAVE_FA64_SHIFT		3
+#define HAVE_FPMR_SHIFT		4
 
 #define HAVE_SVE	(1 << HAVE_SVE_SHIFT)
 #define HAVE_SME	(1 << HAVE_SME_SHIFT)
 #define HAVE_SME2	(1 << HAVE_SME2_SHIFT)
 #define HAVE_FA64	(1 << HAVE_FA64_SHIFT)
+#define HAVE_FPMR	(1 << HAVE_FPMR_SHIFT)
 
 #endif
diff --git a/tools/testing/selftests/arm64/fp/sme-inst.h b/tools/testing/selftests/arm64/fp/sme-inst.h
index 9292bba5400b..85b9184e0835 100644
--- a/tools/testing/selftests/arm64/fp/sme-inst.h
+++ b/tools/testing/selftests/arm64/fp/sme-inst.h
@@ -5,6 +5,8 @@
 #ifndef SME_INST_H
 #define SME_INST_H
 
+#define REG_FPMR                                        S3_3_C4_C4_2
+
 /*
  * RDSVL X\nx, #\imm
  */
-- 
cgit v1.2.3


From 016d659e62ad9ddda1b6899468d0d0798ed71a4d Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 12 Nov 2024 14:35:05 +0000
Subject: kselftest/arm64: Fix missing printf() argument in gcs/gcs-stress.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling the child_cleanup() function results in:

gcs-stress.c: In function ‘child_cleanup’:
gcs-stress.c:266:75: warning: format ‘%d’ expects a matching ‘int’ argument [-Wformat=]
  266 |                                 ksft_print_msg("%s: Exited due to signal %d\n",
      |                                                                          ~^
      |                                                                           |
      |                                                                           int

Add the missing child->exit_signal argument.

Fixes: 05e6cfff58c4 ("kselftest/arm64: Add a GCS stress test")
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c
index 03222c36c436..bbc7f4950c13 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-stress.c
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c
@@ -264,7 +264,7 @@ static void child_cleanup(struct child_data *child)
 			if (WIFSIGNALED(status)) {
 				child->exit_signal = WTERMSIG(status);
 				ksft_print_msg("%s: Exited due to signal %d\n",
-					       child->name);
+					       child->name, child->exit_signal);
 				fail = true;
 				child->exited = true;
 			}
-- 
cgit v1.2.3


From 930d4e1344f16e20c9d9ffc3f8888ac78dfd5659 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Oct 2024 09:11:10 -0700
Subject: rcutorture: Add light-weight SRCU scenario

This commit adds an rcutorture scenario that tests light-weight SRCU
readers.  While in the area, it adjusts the size of the TREE10 scenario.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <bpf@vger.kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/CFLIST      |  1 +
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L      | 10 ++++++++++
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot |  3 +++
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot |  1 +
 tools/testing/selftests/rcutorture/configs/rcu/TREE10      |  2 +-
 5 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
index 98b6175e5aa0..45f572570a8c 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
@@ -5,6 +5,7 @@ TREE04
 TREE05
 TREE07
 TREE09
+SRCU-L
 SRCU-N
 SRCU-P
 SRCU-T
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L
new file mode 100644
index 000000000000..3b4fa8dbef8a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L
@@ -0,0 +1,10 @@
+CONFIG_RCU_TRACE=n
+CONFIG_SMP=y
+CONFIG_NR_CPUS=6
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_RCU_EXPERT=n
+CONFIG_KPROBES=n
+CONFIG_FTRACE=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot
new file mode 100644
index 000000000000..0207b3138c5b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot
@@ -0,0 +1,3 @@
+rcutorture.torture_type=srcu
+rcutorture.reader_flavor=0x4
+rcutorture.fwd_progress=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot
index ce0694fd9b92..b54cf87dc110 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot
@@ -1,2 +1,3 @@
 rcutorture.torture_type=srcu
+rcutorture.reader_flavor=0x2
 rcutorture.fwd_progress=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
index a323d8948b7c..759ee51d3ddc 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
@@ -1,5 +1,5 @@
 CONFIG_SMP=y
-CONFIG_NR_CPUS=56
+CONFIG_NR_CPUS=74
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
-- 
cgit v1.2.3


From 3e360ef0c0a1fb6ce9a302e40b8057c41ba8a9d2 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 7 Nov 2024 01:39:22 +0000
Subject: kselftest/arm64: Corrupt P0 in the irritator when testing SSVE

When building for streaming SVE the irritator for SVE skips updates of both
P0 and FFR. While FFR is skipped since it might not be present, there is no
reason to skip corrupting P0, so switch to an instruction valid in streaming
mode and move the ifdef.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-3-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-test.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index f1fb9745c681..28eb8b5cc2d2 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -302,9 +302,9 @@ function irritator_handler
 	movi	v0.8b, #1
 	movi	v9.16b, #2
 	movi	v31.8b, #3
-#ifndef SSVE
 	// And P0
-	rdffr	p0.b
+	ptrue	p0.d
+#ifndef SSVE
 	// And FFR
 	wrffr	p15.b
 #endif
-- 
cgit v1.2.3


From 27141b690547da5650a420f26ec369ba142a9ebb Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 11 Nov 2024 16:18:55 +0000
Subject: kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all()

The PAC exec_sign_all() test spawns some child processes, creating pipes
to be stdin and stdout for the child. It cleans up most of the file
descriptors that are created as part of this but neglects to clean up the
parent end of the child stdin and stdout. Add the missing close() calls.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-1-171875f37e44@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/pauth/pac.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c
index b743daa772f5..5a07b3958fbf 100644
--- a/tools/testing/selftests/arm64/pauth/pac.c
+++ b/tools/testing/selftests/arm64/pauth/pac.c
@@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val)
 		return -1;
 	}
 
+	close(new_stdin[1]);
+	close(new_stdout[0]);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 91a6533811bb81139c5a44d039b9b0a6af238bc8 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 11 Nov 2024 16:18:56 +0000
Subject: kselftest/arm64: Try harder to generate different keys during PAC
 tests

We very intermittently see failures in the single_thread_different_keys
PAC test. As noted in the comment in the test, the PAC field can be quite
narrow, so collisions are possible even with different keys: the chance is
5% for 7 bit keys, and narrower keys are possible. The test tries to avoid
this by running repeatedly, but only tries 10 times, which even with a 5%
chance of collisions isn't enough.

Increase the number of times we attempt to look for collisions by a factor
of 100. This also affects other tests which follow a similar pattern of
running the test repeatedly, and which either don't care, like
pac_instruction_not_nop, or potentially have the same issue, like
exec_sign_all.

The PAC tests are very fast, running in a second or two even in emulation,
so the 100x increased cost is mildly irritating but not a huge issue. The
bulk of the overhead is in the exec_sign_all test which does a fork() and
exec() per iteration.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-2-171875f37e44@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/pauth/pac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c
index 5a07b3958fbf..6d21b2fc758d 100644
--- a/tools/testing/selftests/arm64/pauth/pac.c
+++ b/tools/testing/selftests/arm64/pauth/pac.c
@@ -13,7 +13,7 @@
 #include "../../kselftest_harness.h"
 #include "helper.h"
 
-#define PAC_COLLISION_ATTEMPTS 10
+#define PAC_COLLISION_ATTEMPTS 1000
 /*
  * The kernel sets TBID by default. So bits 55 and above should remain
  * untouched no matter what.
-- 
cgit v1.2.3


From 7156cd9ef24583c88bcc2f6d213f469aef38bfd9 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:28 -0800
Subject: iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage

Add a new iommufd_viommu FIXTURE and set it up with a vIOMMU object.

Any new vIOMMU feature will be added as a TEST_F under that.

Link: https://patch.msgid.link/r/abe267c9d004b29cb1712ceba2f378209d4b7e01.1730836219.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/iommufd.c          | 137 +++++++++++++++++++++++
 tools/testing/selftests/iommu/iommufd_fail_nth.c |  11 ++
 tools/testing/selftests/iommu/iommufd_utils.h    |  28 +++++
 3 files changed, 176 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 88b92bb69756..37c7da283824 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -133,6 +133,7 @@ TEST_F(iommufd, cmd_length)
 	TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
 	TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
 	TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
+	TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id);
 #undef TEST_LENGTH
 }
 
@@ -2480,4 +2481,140 @@ TEST_F(vfio_compat_mock_domain, huge_map)
 	}
 }
 
+FIXTURE(iommufd_viommu)
+{
+	int fd;
+	uint32_t ioas_id;
+	uint32_t stdev_id;
+	uint32_t hwpt_id;
+	uint32_t nested_hwpt_id;
+	uint32_t device_id;
+	uint32_t viommu_id;
+};
+
+FIXTURE_VARIANT(iommufd_viommu)
+{
+	unsigned int viommu;
+};
+
+FIXTURE_SETUP(iommufd_viommu)
+{
+	self->fd = open("/dev/iommu", O_RDWR);
+	ASSERT_NE(-1, self->fd);
+	test_ioctl_ioas_alloc(&self->ioas_id);
+	test_ioctl_set_default_memory_limit();
+
+	if (variant->viommu) {
+		struct iommu_hwpt_selftest data = {
+			.iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+		};
+
+		test_cmd_mock_domain(self->ioas_id, &self->stdev_id, NULL,
+				     &self->device_id);
+
+		/* Allocate a nesting parent hwpt */
+		test_cmd_hwpt_alloc(self->device_id, self->ioas_id,
+				    IOMMU_HWPT_ALLOC_NEST_PARENT,
+				    &self->hwpt_id);
+
+		/* Allocate a vIOMMU taking refcount of the parent hwpt */
+		test_cmd_viommu_alloc(self->device_id, self->hwpt_id,
+				      IOMMU_VIOMMU_TYPE_SELFTEST,
+				      &self->viommu_id);
+
+		/* Allocate a regular nested hwpt */
+		test_cmd_hwpt_alloc_nested(self->device_id, self->viommu_id, 0,
+					   &self->nested_hwpt_id,
+					   IOMMU_HWPT_DATA_SELFTEST, &data,
+					   sizeof(data));
+	}
+}
+
+FIXTURE_TEARDOWN(iommufd_viommu)
+{
+	teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, no_viommu)
+{
+	.viommu = 0,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, mock_viommu)
+{
+	.viommu = 1,
+};
+
+TEST_F(iommufd_viommu, viommu_auto_destroy)
+{
+}
+
+TEST_F(iommufd_viommu, viommu_negative_tests)
+{
+	uint32_t device_id = self->device_id;
+	uint32_t ioas_id = self->ioas_id;
+	uint32_t hwpt_id;
+
+	if (self->device_id) {
+		/* Negative test -- invalid hwpt (hwpt_id=0) */
+		test_err_viommu_alloc(ENOENT, device_id, 0,
+				      IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+
+		/* Negative test -- not a nesting parent hwpt */
+		test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id);
+		test_err_viommu_alloc(EINVAL, device_id, hwpt_id,
+				      IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+		test_ioctl_destroy(hwpt_id);
+
+		/* Negative test -- unsupported viommu type */
+		test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id,
+				      0xdead, NULL);
+		EXPECT_ERRNO(EBUSY,
+			     _test_ioctl_destroy(self->fd, self->hwpt_id));
+		EXPECT_ERRNO(EBUSY,
+			     _test_ioctl_destroy(self->fd, self->viommu_id));
+	} else {
+		test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id,
+				      IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+	}
+}
+
+TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
+{
+	struct iommu_hwpt_selftest data = {
+		.iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+	};
+	uint32_t viommu_id = self->viommu_id;
+	uint32_t dev_id = self->device_id;
+	uint32_t iopf_hwpt_id;
+	uint32_t fault_id;
+	uint32_t fault_fd;
+
+	if (self->device_id) {
+		test_ioctl_fault_alloc(&fault_id, &fault_fd);
+		test_err_hwpt_alloc_iopf(
+			ENOENT, dev_id, viommu_id, UINT32_MAX,
+			IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id,
+			IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+		test_err_hwpt_alloc_iopf(
+			EOPNOTSUPP, dev_id, viommu_id, fault_id,
+			IOMMU_HWPT_FAULT_ID_VALID | (1 << 31), &iopf_hwpt_id,
+			IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+		test_cmd_hwpt_alloc_iopf(
+			dev_id, viommu_id, fault_id, IOMMU_HWPT_FAULT_ID_VALID,
+			&iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data,
+			sizeof(data));
+
+		test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id);
+		EXPECT_ERRNO(EBUSY,
+			     _test_ioctl_destroy(self->fd, iopf_hwpt_id));
+		test_cmd_trigger_iopf(dev_id, fault_fd);
+
+		test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id);
+		test_ioctl_destroy(iopf_hwpt_id);
+		close(fault_fd);
+		test_ioctl_destroy(fault_id);
+	}
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index 2d7d01638be8..fb618485d7ca 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -621,6 +621,7 @@ TEST_FAIL_NTH(basic_fail_nth, device)
 	uint32_t stdev_id;
 	uint32_t idev_id;
 	uint32_t hwpt_id;
+	uint32_t viommu_id;
 	__u64 iova;
 
 	self->fd = open("/dev/iommu", O_RDWR);
@@ -663,6 +664,16 @@ TEST_FAIL_NTH(basic_fail_nth, device)
 
 	if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL))
 		return -1;
+
+	if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0,
+				 IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id,
+				 IOMMU_HWPT_DATA_NONE, 0, 0))
+		return -1;
+
+	if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id,
+				   IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id))
+		return -1;
+
 	return 0;
 }
 
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 6a11c26370f3..7dabc261fae2 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -819,3 +819,31 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd)
 
 #define test_cmd_trigger_iopf(device_id, fault_fd) \
 	ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd))
+
+static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
+				  __u32 type, __u32 flags, __u32 *viommu_id)
+{
+	struct iommu_viommu_alloc cmd = {
+		.size = sizeof(cmd),
+		.flags = flags,
+		.type = type,
+		.dev_id = device_id,
+		.hwpt_id = hwpt_id,
+	};
+	int ret;
+
+	ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &cmd);
+	if (ret)
+		return ret;
+	if (viommu_id)
+		*viommu_id = cmd.out_viommu_id;
+	return 0;
+}
+
+#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id)        \
+	ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \
+					    type, 0, viommu_id))
+#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \
+	EXPECT_ERRNO(_errno,                                               \
+		     _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id,  \
+					    type, 0, viommu_id))
-- 
cgit v1.2.3


From 5778c75703c6e01ffd70a429b9015bed8008a5fd Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:10 -0800
Subject: iommufd/selftest: Add IOMMU_VDEVICE_ALLOC test coverage

Add a vdevice_alloc op to the viommu mock_viommu_ops for the coverage of
IOMMU_VIOMMU_TYPE_SELFTEST allocations. Then, add a vdevice_alloc TEST_F
to cover the IOMMU_VDEVICE_ALLOC ioctl.

Link: https://patch.msgid.link/r/4b9607e5b86726c8baa7b89bd48123fb44104a23.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/iommufd.c          | 20 ++++++++++++++++++
 tools/testing/selftests/iommu/iommufd_fail_nth.c |  4 ++++
 tools/testing/selftests/iommu/iommufd_utils.h    | 27 ++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 37c7da283824..f3cb628753c9 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -134,6 +134,7 @@ TEST_F(iommufd, cmd_length)
 	TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
 	TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
 	TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id);
+	TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id);
 #undef TEST_LENGTH
 }
 
@@ -2617,4 +2618,23 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
 	}
 }
 
+TEST_F(iommufd_viommu, vdevice_alloc)
+{
+	uint32_t viommu_id = self->viommu_id;
+	uint32_t dev_id = self->device_id;
+	uint32_t vdev_id = 0;
+
+	if (dev_id) {
+		/* Set vdev_id to 0x99, unset it, and set to 0x88 */
+		test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+		test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99,
+				       &vdev_id);
+		test_ioctl_destroy(vdev_id);
+		test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id);
+		test_ioctl_destroy(vdev_id);
+	} else {
+		test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL);
+	}
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index fb618485d7ca..22f6fd5f0f74 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -622,6 +622,7 @@ TEST_FAIL_NTH(basic_fail_nth, device)
 	uint32_t idev_id;
 	uint32_t hwpt_id;
 	uint32_t viommu_id;
+	uint32_t vdev_id;
 	__u64 iova;
 
 	self->fd = open("/dev/iommu", O_RDWR);
@@ -674,6 +675,9 @@ TEST_FAIL_NTH(basic_fail_nth, device)
 				   IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id))
 		return -1;
 
+	if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id))
+		return -1;
+
 	return 0;
 }
 
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 7dabc261fae2..7fe905924d72 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -847,3 +847,30 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
 	EXPECT_ERRNO(_errno,                                               \
 		     _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id,  \
 					    type, 0, viommu_id))
+
+static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id,
+				   __u64 virt_id, __u32 *vdev_id)
+{
+	struct iommu_vdevice_alloc cmd = {
+		.size = sizeof(cmd),
+		.dev_id = idev_id,
+		.viommu_id = viommu_id,
+		.virt_id = virt_id,
+	};
+	int ret;
+
+	ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &cmd);
+	if (ret)
+		return ret;
+	if (vdev_id)
+		*vdev_id = cmd.out_vdevice_id;
+	return 0;
+}
+
+#define test_cmd_vdevice_alloc(viommu_id, idev_id, virt_id, vdev_id)       \
+	ASSERT_EQ(0, _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \
+					     virt_id, vdev_id))
+#define test_err_vdevice_alloc(_errno, viommu_id, idev_id, virt_id, vdev_id) \
+	EXPECT_ERRNO(_errno,                                                 \
+		     _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id,   \
+					     virt_id, vdev_id))
-- 
cgit v1.2.3


From 54ce69e36c71c88f258b1a322c54343d90954858 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:12 -0800
Subject: iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE

With a vIOMMU object, user space can flush any IOMMU related cache that can
be directed via a vIOMMU object. It is similar to the IOMMU_HWPT_INVALIDATE
uAPI, but can cover a wider range than IOTLB, e.g. device/descriptor cache.

Allow hwpt_id of the iommu_hwpt_invalidate structure to carry a viommu_id,
and reuse the IOMMU_HWPT_INVALIDATE uAPI for vIOMMU invalidations. Drivers
can define different structures for vIOMMU invalidations v.s. HWPT ones.

Since both the HWPT-based and vIOMMU-based invalidation pathways check
their own cache invalidation op, remove the WARN_ON_ONCE in the allocator.

Update the uAPI, kdoc, and selftest case accordingly.

Link: https://patch.msgid.link/r/b411e2245e303b8a964f39f49453a5dff280968f.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 40 +++++++++++++++++++++++++--------
 include/uapi/linux/iommufd.h            |  9 +++++---
 tools/testing/selftests/iommu/iommufd.c |  4 ++--
 3 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'tools/testing')

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 982bf4a35a2b..702057655a81 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -251,8 +251,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 	}
 	hwpt->domain->owner = ops;
 
-	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
-			 !hwpt->domain->ops->cache_invalidate_user)) {
+	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
 		rc = -EINVAL;
 		goto out_abort;
 	}
@@ -483,7 +482,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
 		.entry_len = cmd->entry_len,
 		.entry_num = cmd->entry_num,
 	};
-	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_object *pt_obj;
 	u32 done_num = 0;
 	int rc;
 
@@ -497,17 +496,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
 		goto out;
 	}
 
-	hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id);
-	if (IS_ERR(hwpt)) {
-		rc = PTR_ERR(hwpt);
+	pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY);
+	if (IS_ERR(pt_obj)) {
+		rc = PTR_ERR(pt_obj);
 		goto out;
 	}
+	if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) {
+		struct iommufd_hw_pagetable *hwpt =
+			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+		if (!hwpt->domain->ops ||
+		    !hwpt->domain->ops->cache_invalidate_user) {
+			rc = -EOPNOTSUPP;
+			goto out_put_pt;
+		}
+		rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
+							      &data_array);
+	} else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+		struct iommufd_viommu *viommu =
+			container_of(pt_obj, struct iommufd_viommu, obj);
+
+		if (!viommu->ops || !viommu->ops->cache_invalidate) {
+			rc = -EOPNOTSUPP;
+			goto out_put_pt;
+		}
+		rc = viommu->ops->cache_invalidate(viommu, &data_array);
+	} else {
+		rc = -EINVAL;
+		goto out_put_pt;
+	}
 
-	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
-						      &data_array);
 	done_num = data_array.entry_num;
 
-	iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_put_pt:
+	iommufd_put_object(ucmd->ictx, pt_obj);
 out:
 	cmd->entry_num = done_num;
 	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 9b5236004b8e..badb41c5bfa4 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -700,7 +700,7 @@ struct iommu_hwpt_vtd_s1_invalidate {
 /**
  * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
  * @size: sizeof(struct iommu_hwpt_invalidate)
- * @hwpt_id: ID of a nested HWPT for cache invalidation
+ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation
  * @data_uptr: User pointer to an array of driver-specific cache invalidation
  *             data.
  * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data
@@ -711,8 +711,11 @@ struct iommu_hwpt_vtd_s1_invalidate {
  *             Output the number of requests successfully handled by kernel.
  * @__reserved: Must be 0.
  *
- * Invalidate the iommu cache for user-managed page table. Modifications on a
- * user-managed page table should be followed by this operation to sync cache.
+ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications
+ * on a user-managed page table should be followed by this operation, if a HWPT
+ * is passed in via @hwpt_id. Other caches, such as device cache or descriptor
+ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field.
+ *
  * Each ioctl can support one or more cache invalidation requests in the array
  * that has a total size of @entry_len * @entry_num.
  *
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index f3cb628753c9..8cb3e835ca97 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -367,9 +367,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
 		EXPECT_ERRNO(EBUSY,
 			     _test_ioctl_destroy(self->fd, parent_hwpt_id));
 
-		/* hwpt_invalidate only supports a user-managed hwpt (nested) */
+		/* hwpt_invalidate does not support a parent hwpt */
 		num_inv = 1;
-		test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
+		test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs,
 					 IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
 					 sizeof(*inv_reqs), &num_inv);
 		assert(!num_inv);
-- 
cgit v1.2.3


From 576ad6eb45d6458c1a5e646dc35e3ec23c73fd1b Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:16 -0800
Subject: iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command

Similar to IOMMU_TEST_OP_MD_CHECK_IOTLB verifying a mock_domain's iotlb,
IOMMU_TEST_OP_DEV_CHECK_CACHE will be used to verify a mock_dev's cache.

Link: https://patch.msgid.link/r/cd4082079d75427bd67ed90c3c825e15b5720a5f.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_test.h          |  5 +++++
 drivers/iommu/iommufd/selftest.c              | 22 ++++++++++++++++++++++
 tools/testing/selftests/iommu/iommufd.c       |  7 ++++++-
 tools/testing/selftests/iommu/iommufd_utils.h | 24 ++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 46558f83e734..a6b7a163f636 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -23,6 +23,7 @@ enum {
 	IOMMU_TEST_OP_DIRTY,
 	IOMMU_TEST_OP_MD_CHECK_IOTLB,
 	IOMMU_TEST_OP_TRIGGER_IOPF,
+	IOMMU_TEST_OP_DEV_CHECK_CACHE,
 };
 
 enum {
@@ -140,6 +141,10 @@ struct iommu_test_cmd {
 			__u32 perm;
 			__u64 addr;
 		} trigger_iopf;
+		struct {
+			__u32 id;
+			__u32 cache;
+		} check_dev_cache;
 	};
 	__u32 last;
 };
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index e20498667a2c..2f9de177dffc 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1125,6 +1125,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
 	return rc;
 }
 
+static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id,
+					unsigned int cache_id, u32 cache)
+{
+	struct iommufd_device *idev;
+	struct mock_dev *mdev;
+	int rc = 0;
+
+	idev = iommufd_get_device(ucmd, idev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	mdev = container_of(idev->dev, struct mock_dev, dev);
+
+	if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache)
+		rc = -EINVAL;
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}
+
 struct selftest_access {
 	struct iommufd_access *access;
 	struct file *file;
@@ -1634,6 +1652,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
 		return iommufd_test_md_check_iotlb(ucmd, cmd->id,
 						   cmd->check_iotlb.id,
 						   cmd->check_iotlb.iotlb);
+	case IOMMU_TEST_OP_DEV_CHECK_CACHE:
+		return iommufd_test_dev_check_cache(ucmd, cmd->id,
+						    cmd->check_dev_cache.id,
+						    cmd->check_dev_cache.cache);
 	case IOMMU_TEST_OP_CREATE_ACCESS:
 		return iommufd_test_create_access(ucmd, cmd->id,
 						  cmd->create_access.flags);
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 8cb3e835ca97..4bc9dd2e620a 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -227,6 +227,8 @@ FIXTURE_SETUP(iommufd_ioas)
 	for (i = 0; i != variant->mock_domains; i++) {
 		test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
 				     &self->hwpt_id, &self->device_id);
+		test_cmd_dev_check_cache_all(self->device_id,
+					     IOMMU_TEST_DEV_CACHE_DEFAULT);
 		self->base_iova = MOCK_APERTURE_START;
 	}
 }
@@ -1392,9 +1394,12 @@ FIXTURE_SETUP(iommufd_mock_domain)
 
 	ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains);
 
-	for (i = 0; i != variant->mock_domains; i++)
+	for (i = 0; i != variant->mock_domains; i++) {
 		test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i],
 				     &self->hwpt_ids[i], &self->idev_ids[i]);
+		test_cmd_dev_check_cache_all(self->idev_ids[0],
+					     IOMMU_TEST_DEV_CACHE_DEFAULT);
+	}
 	self->hwpt_id = self->hwpt_ids[0];
 
 	self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS;
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 7fe905924d72..619ffdb1e5e8 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -250,6 +250,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i
 			test_cmd_hwpt_check_iotlb(hwpt_id, i, expected);       \
 	})
 
+#define test_cmd_dev_check_cache(device_id, cache_id, expected)                \
+	({                                                                     \
+		struct iommu_test_cmd test_cmd = {                             \
+			.size = sizeof(test_cmd),                              \
+			.op = IOMMU_TEST_OP_DEV_CHECK_CACHE,                   \
+			.id = device_id,                                       \
+			.check_dev_cache = {                                   \
+				.id = cache_id,                                \
+				.cache = expected,                             \
+			},                                                     \
+		};                                                             \
+		ASSERT_EQ(0, ioctl(self->fd,                                   \
+				   _IOMMU_TEST_CMD(                            \
+					   IOMMU_TEST_OP_DEV_CHECK_CACHE),     \
+				   &test_cmd));                                \
+	})
+
+#define test_cmd_dev_check_cache_all(device_id, expected)                      \
+	({                                                                     \
+		int c;                                                         \
+		for (c = 0; c < MOCK_DEV_CACHE_NUM; c++)                       \
+			test_cmd_dev_check_cache(device_id, c, expected);      \
+	})
+
 static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
 				     uint32_t data_type, uint32_t lreq,
 				     uint32_t *nreqs)
-- 
cgit v1.2.3


From 49ad127719243420b355fd95b0d51ac46ae586e5 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:17 -0800
Subject: iommufd/selftest: Add vIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl

Add a viommu_cache test function to cover vIOMMU invalidations using the
updated IOMMU_HWPT_INVALIDATE ioctl, which now allows passing in a vIOMMU
via its hwpt_id field.

Link: https://patch.msgid.link/r/f317f902041f3d05deaee4ca3fdd8ef4b8297361.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/iommufd.c       | 173 ++++++++++++++++++++++++++
 tools/testing/selftests/iommu/iommufd_utils.h |  32 +++++
 2 files changed, 205 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 4bc9dd2e620a..94fe038d2eee 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -2642,4 +2642,177 @@ TEST_F(iommufd_viommu, vdevice_alloc)
 	}
 }
 
+TEST_F(iommufd_viommu, vdevice_cache)
+{
+	struct iommu_viommu_invalidate_selftest inv_reqs[2] = {};
+	uint32_t viommu_id = self->viommu_id;
+	uint32_t dev_id = self->device_id;
+	uint32_t vdev_id = 0;
+	uint32_t num_inv;
+
+	if (dev_id) {
+		test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+
+		test_cmd_dev_check_cache_all(dev_id,
+					     IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+		/* Check data_type by passing zero-length array */
+		num_inv = 0;
+		test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: Invalid data_type */
+		num_inv = 1;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: structure size sanity */
+		num_inv = 1;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs) + 1, &num_inv);
+		assert(!num_inv);
+
+		num_inv = 1;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   1, &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: invalid flag is passed */
+		num_inv = 1;
+		inv_reqs[0].flags = 0xffffffff;
+		inv_reqs[0].vdev_id = 0x99;
+		test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: invalid data_uptr when array is not empty */
+		num_inv = 1;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		test_err_viommu_invalidate(EINVAL, viommu_id, NULL,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: invalid entry_len when array is not empty */
+		num_inv = 1;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   0, &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: invalid cache_id */
+		num_inv = 1;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/* Negative test: invalid vdev_id */
+		num_inv = 1;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x9;
+		inv_reqs[0].cache_id = 0;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(!num_inv);
+
+		/*
+		 * Invalidate the 1st cache entry but fail the 2nd request
+		 * due to invalid flags configuration in the 2nd request.
+		 */
+		num_inv = 2;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].cache_id = 0;
+		inv_reqs[1].flags = 0xffffffff;
+		inv_reqs[1].vdev_id = 0x99;
+		inv_reqs[1].cache_id = 1;
+		test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(num_inv == 1);
+		test_cmd_dev_check_cache(dev_id, 0, 0);
+		test_cmd_dev_check_cache(dev_id, 1,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+		test_cmd_dev_check_cache(dev_id, 2,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+		test_cmd_dev_check_cache(dev_id, 3,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+		/*
+		 * Invalidate the 1st cache entry but fail the 2nd request
+		 * due to invalid cache_id configuration in the 2nd request.
+		 */
+		num_inv = 2;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].cache_id = 0;
+		inv_reqs[1].flags = 0;
+		inv_reqs[1].vdev_id = 0x99;
+		inv_reqs[1].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+		test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+					   IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(num_inv == 1);
+		test_cmd_dev_check_cache(dev_id, 0, 0);
+		test_cmd_dev_check_cache(dev_id, 1,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+		test_cmd_dev_check_cache(dev_id, 2,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+		test_cmd_dev_check_cache(dev_id, 3,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+		/* Invalidate the 2nd cache entry and verify */
+		num_inv = 1;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].cache_id = 1;
+		test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(num_inv == 1);
+		test_cmd_dev_check_cache(dev_id, 0, 0);
+		test_cmd_dev_check_cache(dev_id, 1, 0);
+		test_cmd_dev_check_cache(dev_id, 2,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+		test_cmd_dev_check_cache(dev_id, 3,
+					 IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+		/* Invalidate the 3rd and 4th cache entries and verify */
+		num_inv = 2;
+		inv_reqs[0].flags = 0;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].cache_id = 2;
+		inv_reqs[1].flags = 0;
+		inv_reqs[1].vdev_id = 0x99;
+		inv_reqs[1].cache_id = 3;
+		test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(num_inv == 2);
+		test_cmd_dev_check_cache_all(dev_id, 0);
+
+		/* Invalidate all cache entries for nested_dev_id[1] and verify */
+		num_inv = 1;
+		inv_reqs[0].vdev_id = 0x99;
+		inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL;
+		test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+					   sizeof(*inv_reqs), &num_inv);
+		assert(num_inv == 1);
+		test_cmd_dev_check_cache_all(dev_id, 0);
+		test_ioctl_destroy(vdev_id);
+	}
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 619ffdb1e5e8..c0239f86f2f8 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -305,6 +305,38 @@ static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
 					     data_type, lreq, nreqs));   \
 	})
 
+static int _test_cmd_viommu_invalidate(int fd, __u32 viommu_id, void *reqs,
+				       uint32_t data_type, uint32_t lreq,
+				       uint32_t *nreqs)
+{
+	struct iommu_hwpt_invalidate cmd = {
+		.size = sizeof(cmd),
+		.hwpt_id = viommu_id,
+		.data_type = data_type,
+		.data_uptr = (uint64_t)reqs,
+		.entry_len = lreq,
+		.entry_num = *nreqs,
+	};
+	int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd);
+	*nreqs = cmd.entry_num;
+	return rc;
+}
+
+#define test_cmd_viommu_invalidate(viommu, reqs, lreq, nreqs)                  \
+	({                                                                     \
+		ASSERT_EQ(0,                                                   \
+			  _test_cmd_viommu_invalidate(self->fd, viommu, reqs,  \
+					IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, \
+					lreq, nreqs));                         \
+	})
+#define test_err_viommu_invalidate(_errno, viommu_id, reqs, data_type, lreq,   \
+				 nreqs)                                        \
+	({                                                                     \
+		EXPECT_ERRNO(_errno, _test_cmd_viommu_invalidate(              \
+					     self->fd, viommu_id, reqs,        \
+					     data_type, lreq, nreqs));         \
+	})
+
 static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
 					 unsigned int ioas_id)
 {
-- 
cgit v1.2.3


From fae1980347bfd23325099b69db6638b94149a94c Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Sun, 10 Nov 2024 00:49:03 -0600
Subject: selftests: hugetlb_dio: fixup check for initial conditions to skip in
 the start

This test verifies that a hugepage, used as a user buffer for DIO
operations, is correctly freed upon unmapping.  To test this, we read the
count of free hugepages before and after the mmap, DIO, and munmap
operations, then check if the free hugepage count is the same.

Reading free hugepages before the test was removed by commit 0268d4579901
('selftests: hugetlb_dio: check for initial conditions to skip in the
start'), causing the test to always fail.

This patch adds back reading the free hugepages before starting the test.
With this patch, the tests are now passing.

Test results without this patch:

./tools/testing/selftests/mm/hugetlb_dio
TAP version 13
1..4
 # No. Free pages before allocation : 0
 # No. Free pages after munmap : 100
not ok 1 : Huge pages not freed!
 # No. Free pages before allocation : 0
 # No. Free pages after munmap : 100
not ok 2 : Huge pages not freed!
 # No. Free pages before allocation : 0
 # No. Free pages after munmap : 100
not ok 3 : Huge pages not freed!
 # No. Free pages before allocation : 0
 # No. Free pages after munmap : 100
not ok 4 : Huge pages not freed!
 # Totals: pass:0 fail:4 xfail:0 xpass:0 skip:0 error:0

Test results with this patch:

/tools/testing/selftests/mm/hugetlb_dio
TAP version 13
1..4
# No. Free pages before allocation : 100
# No. Free pages after munmap : 100
ok 1 : Huge pages freed successfully !
# No. Free pages before allocation : 100
# No. Free pages after munmap : 100
ok 2 : Huge pages freed successfully !
# No. Free pages before allocation : 100
# No. Free pages after munmap : 100
ok 3 : Huge pages freed successfully !
# No. Free pages before allocation : 100
# No. Free pages after munmap : 100
ok 4 : Huge pages freed successfully !

# Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0

Link: https://lkml.kernel.org/r/20241110064903.23626-1-donettom@linux.ibm.com
Fixes: 0268d4579901 ("selftests: hugetlb_dio: check for initial conditions to skip in the start")
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb_dio.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index 60001c142ce9..432d5af15e66 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -44,6 +44,13 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
 	if (fd < 0)
 		ksft_exit_fail_perror("Error opening file\n");
 
+	/* Get the free huge pages before allocation */
+	free_hpage_b = get_free_hugepages();
+	if (free_hpage_b == 0) {
+		close(fd);
+		ksft_exit_skip("No free hugepage, exiting!\n");
+	}
+
 	/* Allocate a hugetlb page */
 	orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
 	if (orig_buffer == MAP_FAILED) {
-- 
cgit v1.2.3


From d9d4d127e813427afb26ff7e0f0c58989501be84 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 12 Nov 2024 03:09:03 -0800
Subject: selftests/bpf: watchdog timer for test_progs

This commit provides a watchdog timer that sets a limit of how long a
single sub-test could run:
- if sub-test runs for 10 seconds, the name of the test is printed
  (currently the name of the test is printed only after it finishes);
- if sub-test runs for 120 seconds, the running thread is terminated
  with SIGSEGV (to trigger crash_handler() and get a stack trace).

Specifically:
- the timer is armed on each call to run_one_test();
- re-armed at each call to test__start_subtest();
- is stopped when exiting run_one_test().

Default timeout could be overridden using '-w' or '--watchdog-timeout'
options. Value 0 can be used to turn the timer off.
Here is an example execution:

    $ ./ssh-exec.sh ./test_progs -w 5 -t \
      send_signal/send_signal_perf_thread_remote,send_signal/send_signal_nmi_thread_remote
    WATCHDOG: test case send_signal/send_signal_nmi_thread_remote executes for 5 seconds, terminating with SIGSEGV
    Caught signal #11!
    Stack trace:
    ./test_progs(crash_handler+0x1f)[0x9049ef]
    /lib64/libc.so.6(+0x40d00)[0x7f1f1184fd00]
    /lib64/libc.so.6(read+0x4a)[0x7f1f1191cc4a]
    ./test_progs[0x720dd3]
    ./test_progs[0x71ef7a]
    ./test_progs(test_send_signal+0x1db)[0x71edeb]
    ./test_progs[0x9066c5]
    ./test_progs(main+0x5ed)[0x9054ad]
    /lib64/libc.so.6(+0x2a088)[0x7f1f11839088]
    /lib64/libc.so.6(__libc_start_main+0x8b)[0x7f1f1183914b]
    ./test_progs(_start+0x25)[0x527385]
    #292     send_signal:FAIL
    test_send_signal_common:PASS:reading pipe 0 nsec
    test_send_signal_common:PASS:reading pipe error: size 0 0 nsec
    test_send_signal_common:PASS:incorrect result 0 nsec
    test_send_signal_common:PASS:pipe_write 0 nsec
    test_send_signal_common:PASS:setpriority 0 nsec

Timer is implemented using timer_{create,start} librt API.
Internally librt uses pthreads for SIGEV_THREAD timers,
so this change adds a background timer thread to the test process.
Because of this a few checks in tests 'bpf_iter' and 'iters'
need an update to account for an extra thread.

For the parallelized scenario, the watchdog is also created for each worker
fork. If one of the workers gets stuck, it would be terminated by a
watchdog. In theory, this might lead to a scenario when all worker
threads are exhausted, however this should not be a problem for
server_main(), as it would exit with some of the tests not run.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20241112110906.3045278-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c |   8 +-
 tools/testing/selftests/bpf/prog_tests/iters.c    |   4 +-
 tools/testing/selftests/bpf/test_progs.c          | 104 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_progs.h          |   6 ++
 4 files changed, 116 insertions(+), 6 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index b8e1224cfd19..6f1bfacd7375 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -265,10 +265,10 @@ static void *run_test_task_tid(void *arg)
 
 	linfo.task.tid = 0;
 	linfo.task.pid = getpid();
-	/* This includes the parent thread, this thread,
+	/* This includes the parent thread, this thread, watchdog timer thread
 	 * and the do_nothing_wait thread
 	 */
-	test_task_common(&opts, 2, 1);
+	test_task_common(&opts, 3, 1);
 
 	test_task_common_nocheck(NULL, &num_unknown_tid, &num_known_tid);
 	ASSERT_GT(num_unknown_tid, 2, "check_num_unknown_tid");
@@ -297,7 +297,7 @@ static void test_task_pid(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 
-	test_task_common(&opts, 1, 1);
+	test_task_common(&opts, 2, 1);
 }
 
 static void test_task_pidfd(void)
@@ -315,7 +315,7 @@ static void test_task_pidfd(void)
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
 
-	test_task_common(&opts, 1, 1);
+	test_task_common(&opts, 2, 1);
 
 	close(pidfd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
index 89ff23c4a8bc..3cea71f9c500 100644
--- a/tools/testing/selftests/bpf/prog_tests/iters.c
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -192,8 +192,8 @@ static void subtest_task_iters(void)
 	syscall(SYS_getpgid);
 	iters_task__detach(skel);
 	ASSERT_EQ(skel->bss->procs_cnt, 1, "procs_cnt");
-	ASSERT_EQ(skel->bss->threads_cnt, thread_num + 1, "threads_cnt");
-	ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 1, "proc_threads_cnt");
+	ASSERT_EQ(skel->bss->threads_cnt, thread_num + 2, "threads_cnt");
+	ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 2, "proc_threads_cnt");
 	ASSERT_EQ(skel->bss->invalid_cnt, 0, "invalid_cnt");
 	pthread_mutex_unlock(&do_nothing_mutex);
 	for (int i = 0; i < thread_num; i++)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 7421874380c2..6088d8222d59 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -16,6 +16,7 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 #include <bpf/btf.h>
+#include <time.h>
 #include "json_writer.h"
 
 #include "network_helpers.h"
@@ -179,6 +180,88 @@ int usleep(useconds_t usec)
 	return syscall(__NR_nanosleep, &ts, NULL);
 }
 
+/* Watchdog timer is started by watchdog_start() and stopped by watchdog_stop().
+ * If timer is active for longer than env.secs_till_notify,
+ * it prints the name of the current test to the stderr.
+ * If timer is active for longer than env.secs_till_kill,
+ * it kills the thread executing the test by sending a SIGSEGV signal to it.
+ */
+static void watchdog_timer_func(union sigval sigval)
+{
+	struct itimerspec timeout = {};
+	char test_name[256];
+	int err;
+
+	if (env.subtest_state)
+		snprintf(test_name, sizeof(test_name), "%s/%s",
+			 env.test->test_name, env.subtest_state->name);
+	else
+		snprintf(test_name, sizeof(test_name), "%s",
+			 env.test->test_name);
+
+	switch (env.watchdog_state) {
+	case WD_NOTIFY:
+		fprintf(env.stderr_saved, "WATCHDOG: test case %s executes for %d seconds...\n",
+			test_name, env.secs_till_notify);
+		timeout.it_value.tv_sec = env.secs_till_kill - env.secs_till_notify;
+		env.watchdog_state = WD_KILL;
+		err = timer_settime(env.watchdog, 0, &timeout, NULL);
+		if (err)
+			fprintf(env.stderr_saved, "Failed to arm watchdog timer\n");
+		break;
+	case WD_KILL:
+		fprintf(env.stderr_saved,
+			"WATCHDOG: test case %s executes for %d seconds, terminating with SIGSEGV\n",
+			test_name, env.secs_till_kill);
+		pthread_kill(env.main_thread, SIGSEGV);
+		break;
+	}
+}
+
+static void watchdog_start(void)
+{
+	struct itimerspec timeout = {};
+	int err;
+
+	if (env.secs_till_kill == 0)
+		return;
+	if (env.secs_till_notify > 0) {
+		env.watchdog_state = WD_NOTIFY;
+		timeout.it_value.tv_sec = env.secs_till_notify;
+	} else {
+		env.watchdog_state = WD_KILL;
+		timeout.it_value.tv_sec = env.secs_till_kill;
+	}
+	err = timer_settime(env.watchdog, 0, &timeout, NULL);
+	if (err)
+		fprintf(env.stderr_saved, "Failed to start watchdog timer\n");
+}
+
+static void watchdog_stop(void)
+{
+	struct itimerspec timeout = {};
+	int err;
+
+	env.watchdog_state = WD_NOTIFY;
+	err = timer_settime(env.watchdog, 0, &timeout, NULL);
+	if (err)
+		fprintf(env.stderr_saved, "Failed to stop watchdog timer\n");
+}
+
+static void watchdog_init(void)
+{
+	struct sigevent watchdog_sev = {
+		.sigev_notify = SIGEV_THREAD,
+		.sigev_notify_function = watchdog_timer_func,
+	};
+	int err;
+
+	env.main_thread = pthread_self();
+	err = timer_create(CLOCK_MONOTONIC, &watchdog_sev, &env.watchdog);
+	if (err)
+		fprintf(stderr, "Failed to initialize watchdog timer\n");
+}
+
 static bool should_run(struct test_selector *sel, int num, const char *name)
 {
 	int i;
@@ -515,6 +598,7 @@ bool test__start_subtest(const char *subtest_name)
 
 	env.subtest_state = subtest_state;
 	stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt);
+	watchdog_start();
 
 	return true;
 }
@@ -780,6 +864,7 @@ enum ARG_KEYS {
 	ARG_DEBUG = -1,
 	ARG_JSON_SUMMARY = 'J',
 	ARG_TRAFFIC_MONITOR = 'm',
+	ARG_WATCHDOG_TIMEOUT = 'w',
 };
 
 static const struct argp_option opts[] = {
@@ -810,6 +895,8 @@ static const struct argp_option opts[] = {
 	{ "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0,
 	  "Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." },
 #endif
+	{ "watchdog-timeout", ARG_WATCHDOG_TIMEOUT, "SECONDS", 0,
+	  "Kill the process if tests are not making progress for specified number of seconds." },
 	{},
 };
 
@@ -1035,6 +1122,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 					      true);
 		break;
 #endif
+	case ARG_WATCHDOG_TIMEOUT:
+		env->secs_till_kill = atoi(arg);
+		if (env->secs_till_kill < 0) {
+			fprintf(stderr, "Invalid watchdog timeout: %s.\n", arg);
+			return -EINVAL;
+		}
+		if (env->secs_till_kill < env->secs_till_notify) {
+			env->secs_till_notify = 0;
+		}
+		break;
 	default:
 		return ARGP_ERR_UNKNOWN;
 	}
@@ -1263,10 +1360,12 @@ static void run_one_test(int test_num)
 
 	stdio_hijack(&state->log_buf, &state->log_cnt);
 
+	watchdog_start();
 	if (test->run_test)
 		test->run_test();
 	else if (test->run_serial_test)
 		test->run_serial_test();
+	watchdog_stop();
 
 	/* ensure last sub-test is finalized properly */
 	if (env.subtest_state)
@@ -1707,6 +1806,7 @@ out:
 static int worker_main(int sock)
 {
 	save_netns();
+	watchdog_init();
 
 	while (true) {
 		/* receive command */
@@ -1816,6 +1916,8 @@ int main(int argc, char **argv)
 
 	sigaction(SIGSEGV, &sigact, NULL);
 
+	env.secs_till_notify = 10;
+	env.secs_till_kill = 120;
 	err = argp_parse(&argp, argc, argv, 0, NULL, &env);
 	if (err)
 		return err;
@@ -1824,6 +1926,8 @@ int main(int argc, char **argv)
 	if (err)
 		return err;
 
+	watchdog_init();
+
 	/* Use libbpf 1.0 API mode */
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
 	libbpf_set_print(libbpf_print_fn);
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 7a58895867c3..74de33ae37e5 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -131,6 +131,12 @@ struct test_env {
 	pid_t *worker_pids; /* array of worker pids */
 	int *worker_socks; /* array of worker socks */
 	int *worker_current_test; /* array of current running test for each worker */
+
+	pthread_t main_thread;
+	int secs_till_notify;
+	int secs_till_kill;
+	timer_t watchdog; /* watch for stalled tests/subtests */
+	enum { WD_NOTIFY, WD_KILL } watchdog_state;
 };
 
 #define MAX_LOG_TRUNK_SIZE 8192
-- 
cgit v1.2.3


From 03066ed3105a71c2b0ad39ea44b6e5733ddd4a68 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 12 Nov 2024 03:09:04 -0800
Subject: selftests/bpf: add read_with_timeout() utility function

int read_with_timeout(int fd, char *buf, size_t count, long usec)

Works like a regular read(2), but allows specifying a timeout in
microseconds. Returns -EAGAIN on timeout.
Implemented using select().

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20241112110906.3045278-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile     |  1 +
 tools/testing/selftests/bpf/io_helpers.c | 21 +++++++++++++++++++++
 tools/testing/selftests/bpf/io_helpers.h |  7 +++++++
 3 files changed, 29 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/io_helpers.c
 create mode 100644 tools/testing/selftests/bpf/io_helpers.h

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index edef5df08cb2..b1080284522d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -742,6 +742,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c		\
 			 unpriv_helpers.c 	\
 			 netlink_helpers.c	\
 			 jit_disasm_helpers.c	\
+			 io_helpers.c		\
 			 test_loader.c		\
 			 xsk.c			\
 			 disasm.c		\
diff --git a/tools/testing/selftests/bpf/io_helpers.c b/tools/testing/selftests/bpf/io_helpers.c
new file mode 100644
index 000000000000..4ada0a74aa1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/io_helpers.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/select.h>
+#include <unistd.h>
+#include <errno.h>
+
+int read_with_timeout(int fd, char *buf, size_t count, long usec)
+{
+	const long M = 1000 * 1000;
+	struct timeval tv = { usec / M, usec % M };
+	fd_set fds;
+	int err;
+
+	FD_ZERO(&fds);
+	FD_SET(fd, &fds);
+	err = select(fd + 1, &fds, NULL, NULL, &tv);
+	if (err < 0)
+		return err;
+	if (FD_ISSET(fd, &fds))
+		return read(fd, buf, count);
+	return -EAGAIN;
+}
diff --git a/tools/testing/selftests/bpf/io_helpers.h b/tools/testing/selftests/bpf/io_helpers.h
new file mode 100644
index 000000000000..21e1134cd3ce
--- /dev/null
+++ b/tools/testing/selftests/bpf/io_helpers.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <unistd.h>
+
+/* As a regular read(2), but allows to specify a timeout in micro-seconds.
+ * Returns -EAGAIN on timeout.
+ */
+int read_with_timeout(int fd, char *buf, size_t count, long usec);
-- 
cgit v1.2.3


From 3209139d00e594e30abc2429ea54c36bfbb9528a Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 12 Nov 2024 03:09:05 -0800
Subject: selftests/bpf: allow send_signal test to timeout

The following invocation:

  $ t1=send_signal/send_signal_perf_thread_remote \
    t2=send_signal/send_signal_nmi_thread_remote  \
    ./test_progs -t $t1,$t2

Leads to send_signal_nmi_thread_remote getting stuck
on line 180:

  /* wait for result */
  err = read(pipe_c2p[0], buf, 1);

In this test case:
- perf event PERF_COUNT_HW_CPU_CYCLES is created for parent process;
- BPF program is attached to perf event, and sends a signal to child
  process when event occurs;
- parent program burns some CPU in busy loop and calls read() to get
  notification from child that it received a signal.

The perf event is declared with .sample_period = 1.
This forces perf to throttle events, and under some unclear conditions
the event does not always occur while parent is in busy loop.
After parent enters read() system call CPU cycles event won't be
generated for parent anymore. Thus, if perf event had not occurred
already the test is stuck.

This commit updates the parent to wait for notification with a timeout,
doing several iterations of busy loop + read_with_timeout().

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20241112110906.3045278-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/send_signal.c | 32 +++++++++++++---------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 1aed94ec14ef..4e03d7a4c6f7 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -3,6 +3,7 @@
 #include <sys/time.h>
 #include <sys/resource.h>
 #include "test_send_signal_kern.skel.h"
+#include "io_helpers.h"
 
 static int sigusr1_received;
 
@@ -24,6 +25,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	int pipe_c2p[2], pipe_p2c[2];
 	int err = -1, pmu_fd = -1;
 	volatile int j = 0;
+	int retry_count;
 	char buf[256];
 	pid_t pid;
 	int old_prio;
@@ -163,21 +165,25 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	/* notify child that bpf program can send_signal now */
 	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
-	/* For the remote test, the BPF program is triggered from this
-	 * process but the other process/thread is signaled.
-	 */
-	if (remote) {
-		if (!attr) {
-			for (int i = 0; i < 10; i++)
-				usleep(1);
-		} else {
-			for (int i = 0; i < 100000000; i++)
-				j /= i + 1;
+	for (retry_count = 0;;) {
+		/* For the remote test, the BPF program is triggered from this
+		 * process but the other process/thread is signaled.
+		 */
+		if (remote) {
+			if (!attr) {
+				for (int i = 0; i < 10; i++)
+					usleep(1);
+			} else {
+				for (int i = 0; i < 100000000; i++)
+					j /= i + 1;
+			}
 		}
+		/* wait for result */
+		err = read_with_timeout(pipe_c2p[0], buf, 1, 100);
+		if (err == -EAGAIN && retry_count++ < 10000)
+			continue;
+		break;
 	}
-
-	/* wait for result */
-	err = read(pipe_c2p[0], buf, 1);
 	if (!ASSERT_GE(err, 0, "reading pipe"))
 		goto disable_pmu;
 	if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) {
-- 
cgit v1.2.3


From 4edab4c55d2d070ec7ff3526f93ec6d90d9105d4 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 12 Nov 2024 03:09:06 -0800
Subject: selftests/bpf: update send_signal to lower perf events frequency

Similar to commit [1] sample perf events less often in
test_send_signal_nmi(). This should reduce perf events throttling.

[1] 7015843afcaf ("selftests/bpf: Fix send_signal test with nested CONFIG_PARAVIRT")

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20241112110906.3045278-5-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/send_signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 4e03d7a4c6f7..1702aa592c2c 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -229,7 +229,8 @@ static void test_send_signal_perf(bool signal_thread, bool remote)
 static void test_send_signal_nmi(bool signal_thread, bool remote)
 {
 	struct perf_event_attr attr = {
-		.sample_period = 1,
+		.freq = 1,
+		.sample_freq = 1000,
 		.type = PERF_TYPE_HARDWARE,
 		.config = PERF_COUNT_HW_CPU_CYCLES,
 	};
-- 
cgit v1.2.3


From 32693634cdf90e56bd167e5226db7ca569707437 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 9 Oct 2024 11:02:50 -0700
Subject: torture: Add --no-affinity parameter to kvm.sh

In performance tests, it can be counter-productive to spread torture-test
guest OSes across sockets.  Plus the experimenter might have ideas about
what CPUs individual guest OSes are to run on.  This commit therefore
adds a --no-affinity parameter to kvm.sh to prevent it from running
taskset on its guest OSes.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 .../rcutorture/bin/kvm-test-1-run-batch.sh         | 43 ++++++++++++----------
 tools/testing/selftests/rcutorture/bin/kvm.sh      |  6 +++
 2 files changed, 29 insertions(+), 20 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh
index c3808c490d92..f87046b702d8 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh
@@ -56,27 +56,30 @@ do
 	echo > $i/kvm-test-1-run-qemu.sh.out
 	export TORTURE_AFFINITY=
 	kvm-get-cpus-script.sh $T/cpuarray.awk $T/cpubatches.awk $T/cpustate
-	cat << '	___EOF___' >> $T/cpubatches.awk
-	END {
-		affinitylist = "";
-		if (!gotcpus()) {
-			print "echo No CPU-affinity information, so no taskset command.";
-		} else if (cpu_count !~ /^[0-9][0-9]*$/) {
-			print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command.";
-		} else {
-			affinitylist = nextcpus(cpu_count);
-			if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/))
-				print "echo " scenario ": Bogus CPU-affinity information, so no taskset command.";
-			else if (!dumpcpustate())
-				print "echo " scenario ": Could not dump state, so no taskset command.";
-			else
-				print "export TORTURE_AFFINITY=" affinitylist;
+	if test -z "${TORTURE_NO_AFFINITY}"
+	then
+		cat << '		___EOF___' >> $T/cpubatches.awk
+		END {
+			affinitylist = "";
+			if (!gotcpus()) {
+				print "echo No CPU-affinity information, so no taskset command.";
+			} else if (cpu_count !~ /^[0-9][0-9]*$/) {
+				print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command.";
+			} else {
+				affinitylist = nextcpus(cpu_count);
+				if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/))
+					print "echo " scenario ": Bogus CPU-affinity information, so no taskset command.";
+				else if (!dumpcpustate())
+					print "echo " scenario ": Could not dump state, so no taskset command.";
+				else
+					print "export TORTURE_AFFINITY=" affinitylist;
+			}
 		}
-	}
-	___EOF___
-	cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`"
-	affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`"
-	$affinity_export
+		___EOF___
+		cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`"
+		affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`"
+		$affinity_export
+	fi
 	kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 &
 done
 for i in $runfiles
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 7af73ddc148d..42e5e8597a1a 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -42,6 +42,7 @@ TORTURE_JITTER_STOP=""
 TORTURE_KCONFIG_KASAN_ARG=""
 TORTURE_KCONFIG_KCSAN_ARG=""
 TORTURE_KMAKE_ARG=""
+TORTURE_NO_AFFINITY=""
 TORTURE_QEMU_MEM=512
 torture_qemu_mem_default=1
 TORTURE_REMOTE=
@@ -82,6 +83,7 @@ usage () {
 	echo "       --kmake-arg kernel-make-arguments"
 	echo "       --mac nn:nn:nn:nn:nn:nn"
 	echo "       --memory megabytes|nnnG"
+	echo "       --no-affinity"
 	echo "       --no-initrd"
 	echo "       --qemu-args qemu-arguments"
 	echo "       --qemu-cmd qemu-system-..."
@@ -220,6 +222,9 @@ do
 		torture_qemu_mem_default=
 		shift
 		;;
+	--no-affinity)
+		TORTURE_NO_AFFINITY="no-affinity"
+		;;
 	--no-initrd)
 		TORTURE_INITRD=""; export TORTURE_INITRD
 		;;
@@ -417,6 +422,7 @@ TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_K
 TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG
 TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
 TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD
+TORTURE_NO_AFFINITY="$TORTURE_NO_AFFINITY"; export TORTURE_NO_AFFINITY
 TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
 TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
 TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC
-- 
cgit v1.2.3


From f4b295ab65980435d7dc8b12d110387d1d1c653c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:27 -0800
Subject: selftests/bpf: Add tracing prog private stack tests

Some private stack tests are added including:
  - main prog only with stack size greater than BPF_PSTACK_MIN_SIZE.
  - main prog only with stack size smaller than BPF_PSTACK_MIN_SIZE.
  - prog with one subprog having MAX_BPF_STACK stack size and another
    subprog having non-zero small stack size.
  - prog with callback function.
  - prog with exception in main prog or subprog.
  - prog with async callback without nesting
  - prog with async callback with possible nesting

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163927.2224750-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verifier.c  |   2 +
 .../selftests/bpf/progs/verifier_private_stack.c   | 272 +++++++++++++++++++++
 2 files changed, 274 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_private_stack.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 75f7a2ce334b..d9f65adb456b 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -61,6 +61,7 @@
 #include "verifier_or_jmp32_k.skel.h"
 #include "verifier_precision.skel.h"
 #include "verifier_prevent_map_lookup.skel.h"
+#include "verifier_private_stack.skel.h"
 #include "verifier_raw_stack.skel.h"
 #include "verifier_raw_tp_writable.skel.h"
 #include "verifier_reg_equal.skel.h"
@@ -188,6 +189,7 @@ void test_verifier_bpf_fastcall(void)         { RUN(verifier_bpf_fastcall); }
 void test_verifier_or_jmp32_k(void)           { RUN(verifier_or_jmp32_k); }
 void test_verifier_precision(void)            { RUN(verifier_precision); }
 void test_verifier_prevent_map_lookup(void)   { RUN(verifier_prevent_map_lookup); }
+void test_verifier_private_stack(void)        { RUN(verifier_private_stack); }
 void test_verifier_raw_stack(void)            { RUN(verifier_raw_stack); }
 void test_verifier_raw_tp_writable(void)      { RUN(verifier_raw_tp_writable); }
 void test_verifier_reg_equal(void)            { RUN(verifier_reg_equal); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
new file mode 100644
index 000000000000..b1fbdf119553
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+/* From include/linux/filter.h */
+#define MAX_BPF_STACK    512
+
+#if defined(__TARGET_ARCH_x86)
+
+struct elem {
+	struct bpf_timer t;
+	char pad[256];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, struct elem);
+} array SEC(".maps");
+
+SEC("kprobe")
+__description("Private stack, single prog")
+__success
+__arch_x86_64
+__jited("	movabsq	$0x{{.*}}, %r9")
+__jited("	addq	%gs:0x{{.*}}, %r9")
+__jited("	movl	$0x2a, %edi")
+__jited("	movq	%rdi, -0x100(%r9)")
+__naked void private_stack_single_prog(void)
+{
+	asm volatile ("			\
+	r1 = 42;			\
+	*(u64 *)(r10 - 256) = r1;	\
+	r0 = 0;				\
+	exit;				\
+"	::: __clobber_all);
+}
+
+SEC("raw_tp")
+__description("No private stack")
+__success
+__arch_x86_64
+__jited("	subq	$0x8, %rsp")
+__naked void no_private_stack_nested(void)
+{
+	asm volatile ("			\
+	r1 = 42;			\
+	*(u64 *)(r10 - 8) = r1;		\
+	r0 = 0;				\
+	exit;				\
+"	::: __clobber_all);
+}
+
+__used
+__naked static void cumulative_stack_depth_subprog(void)
+{
+	asm volatile ("				\
+	r1 = 41;				\
+	*(u64 *)(r10 - 32) = r1;		\
+	call %[bpf_get_smp_processor_id];	\
+	exit;					\
+"	:
+	: __imm(bpf_get_smp_processor_id)
+	: __clobber_all);
+}
+
+SEC("kprobe")
+__description("Private stack, subtree > MAX_BPF_STACK")
+__success
+__arch_x86_64
+/* private stack fp for the main prog */
+__jited("	movabsq	$0x{{.*}}, %r9")
+__jited("	addq	%gs:0x{{.*}}, %r9")
+__jited("	movl	$0x2a, %edi")
+__jited("	movq	%rdi, -0x200(%r9)")
+__jited("	pushq	%r9")
+__jited("	callq	0x{{.*}}")
+__jited("	popq	%r9")
+__jited("	xorl	%eax, %eax")
+__naked void private_stack_nested_1(void)
+{
+	asm volatile ("				\
+	r1 = 42;				\
+	*(u64 *)(r10 - %[max_bpf_stack]) = r1;	\
+	call cumulative_stack_depth_subprog;	\
+	r0 = 0;					\
+	exit;					\
+"	:
+	: __imm_const(max_bpf_stack, MAX_BPF_STACK)
+	: __clobber_all);
+}
+
+__naked __noinline __used
+static unsigned long loop_callback(void)
+{
+	asm volatile ("				\
+	call %[bpf_get_prandom_u32];		\
+	r1 = 42;				\
+	*(u64 *)(r10 - 512) = r1;		\
+	call cumulative_stack_depth_subprog;	\
+	r0 = 0;					\
+	exit;					\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_common);
+}
+
+SEC("raw_tp")
+__description("Private stack, callback")
+__success
+__arch_x86_64
+/* for func loop_callback */
+__jited("func #1")
+__jited("	endbr64")
+__jited("	nopl	(%rax,%rax)")
+__jited("	nopl	(%rax)")
+__jited("	pushq	%rbp")
+__jited("	movq	%rsp, %rbp")
+__jited("	endbr64")
+__jited("	movabsq	$0x{{.*}}, %r9")
+__jited("	addq	%gs:0x{{.*}}, %r9")
+__jited("	pushq	%r9")
+__jited("	callq")
+__jited("	popq	%r9")
+__jited("	movl	$0x2a, %edi")
+__jited("	movq	%rdi, -0x200(%r9)")
+__jited("	pushq	%r9")
+__jited("	callq")
+__jited("	popq	%r9")
+__naked void private_stack_callback(void)
+{
+	asm volatile ("			\
+	r1 = 1;				\
+	r2 = %[loop_callback];		\
+	r3 = 0;				\
+	r4 = 0;				\
+	call %[bpf_loop];		\
+	r0 = 0;				\
+	exit;				\
+"	:
+	: __imm_ptr(loop_callback),
+	  __imm(bpf_loop)
+	: __clobber_common);
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in main prog")
+__success __retval(0)
+__arch_x86_64
+__jited("	pushq	%r9")
+__jited("	callq")
+__jited("	popq	%r9")
+int private_stack_exception_main_prog(void)
+{
+	asm volatile ("			\
+	r1 = 42;			\
+	*(u64 *)(r10 - 512) = r1;	\
+"	::: __clobber_common);
+
+	bpf_throw(0);
+	return 0;
+}
+
+__used static int subprog_exception(void)
+{
+	bpf_throw(0);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in subprog")
+__success __retval(0)
+__arch_x86_64
+__jited("	movq	%rdi, -0x200(%r9)")
+__jited("	pushq	%r9")
+__jited("	callq")
+__jited("	popq	%r9")
+int private_stack_exception_sub_prog(void)
+{
+	asm volatile ("			\
+	r1 = 42;			\
+	*(u64 *)(r10 - 512) = r1;	\
+	call subprog_exception;		\
+"	::: __clobber_common);
+
+	return 0;
+}
+
+int glob;
+__noinline static void subprog2(int *val)
+{
+	glob += val[0] * 2;
+}
+
+__noinline static void subprog1(int *val)
+{
+	int tmp[64] = {};
+
+	tmp[0] = *val;
+	subprog2(tmp);
+}
+
+__noinline static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
+{
+	subprog1(key);
+	return 0;
+}
+
+__noinline static int timer_cb2(void *map, int *key, struct bpf_timer *timer)
+{
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, not nested")
+__success __retval(0)
+__arch_x86_64
+__jited("	movabsq	$0x{{.*}}, %r9")
+int private_stack_async_callback_1(void)
+{
+	struct bpf_timer *arr_timer;
+	int array_key = 0;
+
+	arr_timer = bpf_map_lookup_elem(&array, &array_key);
+	if (!arr_timer)
+		return 0;
+
+	bpf_timer_init(arr_timer, &array, 1);
+	bpf_timer_set_callback(arr_timer, timer_cb2);
+	bpf_timer_start(arr_timer, 0, 0);
+	subprog1(&array_key);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, potential nesting")
+__success __retval(0)
+__arch_x86_64
+__jited("	subq	$0x100, %rsp")
+int private_stack_async_callback_2(void)
+{
+	struct bpf_timer *arr_timer;
+	int array_key = 0;
+
+	arr_timer = bpf_map_lookup_elem(&array, &array_key);
+	if (!arr_timer)
+		return 0;
+
+	bpf_timer_init(arr_timer, &array, 1);
+	bpf_timer_set_callback(arr_timer, timer_cb1);
+	bpf_timer_start(arr_timer, 0, 0);
+	subprog1(&array_key);
+	return 0;
+}
+
+#else
+
+SEC("kprobe")
+__description("private stack is not supported, use a dummy test")
+__success
+int dummy_test(void)
+{
+	return 0;
+}
+
+#endif
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From becfe32b57c7d323fbd94c1a2c6d7eba918ddde8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 12 Nov 2024 08:39:38 -0800
Subject: selftests/bpf: Add struct_ops prog private stack tests

Add three tests for struct_ops using private stack.
  ./test_progs -t struct_ops_private_stack
  #336/1   struct_ops_private_stack/private_stack:OK
  #336/2   struct_ops_private_stack/private_stack_fail:OK
  #336/3   struct_ops_private_stack/private_stack_recur:OK
  #336     struct_ops_private_stack:OK

The following is a snippet of a struct_ops check_member() implementation:

	u32 moff = __btf_member_bit_offset(t, member) / 8;
	switch (moff) {
	case offsetof(struct bpf_testmod_ops3, test_1):
        	prog->aux->priv_stack_requested = true;
                prog->aux->recursion_detected = test_1_recursion_detected;
        	fallthrough;
	default:
        	break;
	}
	return 0;

The first test nests two different callback functions, where the
first prog has more than 512 byte stack size (including subprogs) with
private stack enabled.

The second test is a negative test where the second prog has more than 512
byte stack size without private stack enabled.

The third test has the same callback function recursing into itself. At run
time, the jit trampoline recursion check kicks in to prevent the recursion. The
recursion_detected() callback function is implemented by the bpf_testmod;
the following message in dmesg
  bpf_testmod: oh no, recursing into test_1, recursion_misses 1
demonstrates that the callback function is indeed triggered when a recursion
miss happens.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163938.2225528-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        | 104 ++++++++++++++++++++
 .../selftests/bpf/bpf_testmod/bpf_testmod.h        |   5 +
 .../bpf/prog_tests/struct_ops_private_stack.c      | 106 +++++++++++++++++++++
 .../selftests/bpf/progs/struct_ops_private_stack.c |  62 ++++++++++++
 .../bpf/progs/struct_ops_private_stack_fail.c      |  62 ++++++++++++
 .../bpf/progs/struct_ops_private_stack_recur.c     |  50 ++++++++++
 6 files changed, 389 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 987d41af71d2..cc9dde507aba 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -245,6 +245,39 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx)
 		call_rcu(&ctx->rcu, testmod_free_cb);
 }
 
+static struct bpf_testmod_ops3 *st_ops3;
+
+static int bpf_testmod_test_3(void)
+{
+	return 0;
+}
+
+static int bpf_testmod_test_4(void)
+{
+	return 0;
+}
+
+static struct bpf_testmod_ops3 __bpf_testmod_ops3 = {
+	.test_1 = bpf_testmod_test_3,
+	.test_2 = bpf_testmod_test_4,
+};
+
+static void bpf_testmod_test_struct_ops3(void)
+{
+	if (st_ops3)
+		st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_1(void)
+{
+	st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_2(void)
+{
+	st_ops3->test_2();
+}
+
 struct bpf_testmod_btf_type_tag_1 {
 	int a;
 };
@@ -382,6 +415,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 
 	(void)trace_bpf_testmod_test_raw_tp_null(NULL);
 
+	bpf_testmod_test_struct_ops3();
+
 	struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) +
 				sizeof(int)), GFP_KERNEL);
 	if (struct_arg3 != NULL) {
@@ -586,6 +621,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU)
 BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2)
 BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
 
 BTF_ID_LIST(bpf_testmod_dtor_ids)
@@ -1096,6 +1133,10 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = {
 	.is_valid_access = bpf_testmod_ops_is_valid_access,
 };
 
+static const struct bpf_verifier_ops bpf_testmod_verifier_ops3 = {
+	.is_valid_access = bpf_testmod_ops_is_valid_access,
+};
+
 static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
 {
 	struct bpf_testmod_ops *ops = kdata;
@@ -1175,6 +1216,68 @@ struct bpf_struct_ops bpf_testmod_ops2 = {
 	.owner = THIS_MODULE,
 };
 
+static int st_ops3_reg(void *kdata, struct bpf_link *link)
+{
+	int err = 0;
+
+	mutex_lock(&st_ops_mutex);
+	if (st_ops3) {
+		pr_err("st_ops has already been registered\n");
+		err = -EEXIST;
+		goto unlock;
+	}
+	st_ops3 = kdata;
+
+unlock:
+	mutex_unlock(&st_ops_mutex);
+	return err;
+}
+
+static void st_ops3_unreg(void *kdata, struct bpf_link *link)
+{
+	mutex_lock(&st_ops_mutex);
+	st_ops3 = NULL;
+	mutex_unlock(&st_ops_mutex);
+}
+
+static void test_1_recursion_detected(struct bpf_prog *prog)
+{
+	struct bpf_prog_stats *stats;
+
+	stats = this_cpu_ptr(prog->stats);
+	printk("bpf_testmod: oh no, recursing into test_1, recursion_misses %llu",
+	       u64_stats_read(&stats->misses));
+}
+
+static int st_ops3_check_member(const struct btf_type *t,
+				const struct btf_member *member,
+				const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct bpf_testmod_ops3, test_1):
+		prog->aux->priv_stack_requested = true;
+		prog->aux->recursion_detected = test_1_recursion_detected;
+		fallthrough;
+	default:
+		break;
+	}
+	return 0;
+}
+
+struct bpf_struct_ops bpf_testmod_ops3 = {
+	.verifier_ops = &bpf_testmod_verifier_ops3,
+	.init = bpf_testmod_ops_init,
+	.init_member = bpf_testmod_ops_init_member,
+	.reg = st_ops3_reg,
+	.unreg = st_ops3_unreg,
+	.check_member = st_ops3_check_member,
+	.cfi_stubs = &__bpf_testmod_ops3,
+	.name = "bpf_testmod_ops3",
+	.owner = THIS_MODULE,
+};
+
 static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args)
 {
 	return 0;
@@ -1333,6 +1436,7 @@ static int bpf_testmod_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set);
 	ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops);
 	ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2);
+	ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3);
 	ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops);
 	ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors,
 						 ARRAY_SIZE(bpf_testmod_dtors),
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
index fb7dff47597a..356803d1c10e 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
@@ -94,6 +94,11 @@ struct bpf_testmod_ops2 {
 	int (*test_1)(void);
 };
 
+struct bpf_testmod_ops3 {
+	int (*test_1)(void);
+	int (*test_2)(void);
+};
+
 struct st_ops_args {
 	u64 a;
 };
diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
new file mode 100644
index 000000000000..4006879ca3fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "struct_ops_private_stack.skel.h"
+#include "struct_ops_private_stack_fail.skel.h"
+#include "struct_ops_private_stack_recur.skel.h"
+
+static void test_private_stack(void)
+{
+	struct struct_ops_private_stack *skel;
+	struct bpf_link *link;
+	int err;
+
+	skel = struct_ops_private_stack__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack__open"))
+		return;
+
+	if (skel->data->skip) {
+		test__skip();
+		goto cleanup;
+	}
+
+	err = struct_ops_private_stack__load(skel);
+	if (!ASSERT_OK(err, "struct_ops_private_stack__load"))
+		goto cleanup;
+
+	link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+	if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+	ASSERT_EQ(skel->bss->val_i, 3, "val_i");
+	ASSERT_EQ(skel->bss->val_j, 8, "val_j");
+
+	bpf_link__destroy(link);
+
+cleanup:
+	struct_ops_private_stack__destroy(skel);
+}
+
+/* Negative test: loading struct_ops_private_stack_fail is expected to fail. */
+static void test_private_stack_fail(void)
+{
+	struct struct_ops_private_stack_fail *skel;
+	int err;
+
+	skel = struct_ops_private_stack_fail__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_fail__open"))
+		return;
+
+	if (skel->data->skip) {
+		test__skip();
+		goto cleanup;
+	}
+
+	/* Whatever the load result, fall through so the skeleton is freed. */
+	err = struct_ops_private_stack_fail__load(skel);
+	ASSERT_ERR(err, "struct_ops_private_stack_fail__load");
+
+cleanup:
+	struct_ops_private_stack_fail__destroy(skel);
+}
+
+static void test_private_stack_recur(void)
+{
+	struct struct_ops_private_stack_recur *skel;
+	struct bpf_link *link;
+	int err;
+
+	skel = struct_ops_private_stack_recur__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_recur__open"))
+		return;
+
+	if (skel->data->skip) {
+		test__skip();
+		goto cleanup;
+	}
+
+	err = struct_ops_private_stack_recur__load(skel);
+	if (!ASSERT_OK(err, "struct_ops_private_stack_recur__load"))
+		goto cleanup;
+
+	link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+	if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+	ASSERT_EQ(skel->bss->val_j, 3, "val_j");
+
+	bpf_link__destroy(link);
+
+cleanup:
+	struct_ops_private_stack_recur__destroy(skel);
+}
+
+void test_struct_ops_private_stack(void)
+{
+	if (test__start_subtest("private_stack"))
+		test_private_stack();
+	if (test__start_subtest("private_stack_fail"))
+		test_private_stack_fail();
+	if (test__start_subtest("private_stack_recur"))
+		test_private_stack_recur();
+}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
new file mode 100644
index 000000000000..8ea57e5348ab
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+	return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+	/* stack size 200 bytes */
+	int b[50] = {};
+
+	b[20] = 2;
+	return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+	/* stack size 400 bytes */
+	int a[100] = {};
+
+	a[10] = 1;
+	val_i = subprog1(a);
+	bpf_testmod_ops3_call_test_2();
+	return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+	/* stack size 200 bytes */
+	int a[50] = {};
+
+	a[10] = 3;
+	val_j = subprog1(a);
+	return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+	.test_1 = (void *)test_1,
+	.test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
new file mode 100644
index 000000000000..1f55ec4cee37
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+	return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+	/* stack size 200 bytes */
+	int b[50] = {};
+
+	b[20] = 2;
+	return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+	/* stack size 100 bytes */
+	int a[25] = {};
+
+	a[10] = 1;
+	val_i = subprog1(a);
+	bpf_testmod_ops3_call_test_2();
+	return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+	/* stack size 400 bytes */
+	int a[100] = {};
+
+	a[10] = 3;
+	val_j = subprog1(a);
+	return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+	.test_1 = (void *)test_1,
+	.test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
new file mode 100644
index 000000000000..f2f300d50988
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_1(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+	return val_i + a[1] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+	/* stack size 400 bytes */
+	int b[100] = {};
+
+	b[20] = 2;
+	return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+	/* stack size 20 bytes */
+	int a[5] = {};
+
+	a[1] = 1;
+	val_j += subprog1(a);
+	bpf_testmod_ops3_call_test_1();
+	return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+	.test_1 = (void *)test_1,
+};
-- 
cgit v1.2.3


From e58358afa84e8e271a296459d35d1715c7572013 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Nov 2024 18:56:16 -0800
Subject: selftests/bpf: Add a test for arena range tree algorithm

Add a test that verifies specific behavior of arena range tree
algorithm and adjust existing big_alloc1 test due to use
of global data in arena.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20241108025616.17625-3-alexei.starovoitov@gmail.com
---
 .../selftests/bpf/progs/verifier_arena_large.c     | 110 ++++++++++++++++++++-
 1 file changed, 108 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index 6065f862d964..8a9af79db884 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -29,12 +29,12 @@ int big_alloc1(void *ctx)
 	if (!page1)
 		return 1;
 	*page1 = 1;
-	page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE,
+	page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2,
 				      1, NUMA_NO_NODE, 0);
 	if (!page2)
 		return 2;
 	*page2 = 2;
-	no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE,
+	no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE,
 					1, NUMA_NO_NODE, 0);
 	if (no_page)
 		return 3;
@@ -66,4 +66,110 @@ int big_alloc1(void *ctx)
 #endif
 	return 0;
 }
+
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+#define PAGE_CNT 100
+__u8 __arena * __arena page[PAGE_CNT]; /* occupies the first page */
+__u8 __arena *base;
+
+/*
+ * Check that arena's range_tree algorithm allocates pages sequentially
+ * on the first pass and then fills in all gaps on the second pass.
+ */
+__noinline int alloc_pages(int page_cnt, int pages_atonce, bool first_pass,
+		int max_idx, int step)
+{
+	__u8 __arena *pg;
+	int i, pg_idx;
+
+	for (i = 0; i < page_cnt; i++) {
+		pg = bpf_arena_alloc_pages(&arena, NULL, pages_atonce,
+					   NUMA_NO_NODE, 0);
+		if (!pg)
+			return step;
+		pg_idx = (pg - base) / PAGE_SIZE;
+		if (first_pass) {
+			/* Pages must be allocated sequentially */
+			if (pg_idx != i)
+				return step + 100;
+		} else {
+			/* Allocator must fill into gaps */
+			if (pg_idx >= max_idx || (pg_idx & 1))
+				return step + 200;
+		}
+		*pg = pg_idx;
+		page[pg_idx] = pg;
+		cond_break;
+	}
+	return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int big_alloc2(void *ctx)
+{
+	__u8 __arena *pg;
+	int i, err;
+
+	base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!base)
+		return 1;
+	bpf_arena_free_pages(&arena, (void __arena *)base, 1);
+
+	err = alloc_pages(PAGE_CNT, 1, true, PAGE_CNT, 2);
+	if (err)
+		return err;
+
+	/* Clear all even pages */
+	for (i = 0; i < PAGE_CNT; i += 2) {
+		pg = page[i];
+		if (*pg != i)
+			return 3;
+		bpf_arena_free_pages(&arena, (void __arena *)pg, 1);
+		page[i] = NULL;
+		cond_break;
+	}
+
+	/* Allocate into freed gaps */
+	err = alloc_pages(PAGE_CNT / 2, 1, false, PAGE_CNT, 4);
+	if (err)
+		return err;
+
+	/* Free pairs of pages */
+	for (i = 0; i < PAGE_CNT; i += 4) {
+		pg = page[i];
+		if (*pg != i)
+			return 5;
+		bpf_arena_free_pages(&arena, (void __arena *)pg, 2);
+		page[i] = NULL;
+		page[i + 1] = NULL;
+		cond_break;
+	}
+
+	/* Allocate 2 pages at a time into freed gaps */
+	err = alloc_pages(PAGE_CNT / 4, 2, false, PAGE_CNT, 6);
+	if (err)
+		return err;
+
+	/* Check pages without freeing */
+	for (i = 0; i < PAGE_CNT; i += 2) {
+		pg = page[i];
+		if (*pg != i)
+			return 7;
+		cond_break;
+	}
+
+	pg = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+
+	if (!pg)
+		return 8;
+	/*
+	 * The first PAGE_CNT pages are occupied. The new page
+	 * must be above.
+	 */
+	if ((pg - base) / PAGE_SIZE < PAGE_CNT)
+		return 9;
+	return 0;
+}
+#endif
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 86fb6173d11e773a00a5b6d1b7bd17caff8692b8 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 11 Nov 2024 10:16:50 +0000
Subject: selftests: bonding: add ns multicast group testing

Add a test to make sure the backup slaves join the correct multicast groups
when arp_validate is enabled and ns_ip6_target is set. Here is the result:

TEST: arp_validate (active-backup ns_ip6_target arp_validate 0)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 1)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 2)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 3)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 4)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 5)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]
TEST: arp_validate (active-backup ns_ip6_target arp_validate 6)     [ OK ]
TEST: arp_validate (join mcast group)                               [ OK ]

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/bonding/bond_options.sh  | 54 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
index 41d0859feb7d..edc56e2cc606 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
@@ -11,6 +11,8 @@ ALL_TESTS="
 
 lib_dir=$(dirname "$0")
 source ${lib_dir}/bond_topo_3d1c.sh
+c_maddr="33:33:00:00:00:10"
+g_maddr="33:33:00:00:02:54"
 
 skip_prio()
 {
@@ -240,6 +242,54 @@ arp_validate_test()
 	done
 }
 
+# Verify that only backup slaves join the multicast groups of the ns targets
+arp_validate_mcast()
+{
+	RET=0
+	local arp_valid=$(cmd_jq "ip -n ${s_ns} -j -d link show bond0" ".[].linkinfo.info_data.arp_validate")
+	local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+
+	for i in $(seq 0 2); do
+		maddr_list=$(ip -n ${s_ns} maddr show dev eth${i})
+
+		# arp_valid == 0 or active_slave should not join any maddrs
+		if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \
+			echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then
+			RET=1
+			check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+		# arp_valid != 0 and backup_slave should join both maddrs
+		elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \
+		     ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \
+		       ! echo "$maddr_list" | grep -q "${g_maddr}"); then
+			RET=1
+			check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i is missing mcast group"
+		fi
+	done
+
+	# Do failover
+	ip -n ${s_ns} link set ${active_slave} down
+	# wait for active link change
+	slowwait 2 active_slave_changed $active_slave
+	active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+
+	for i in $(seq 0 2); do
+		maddr_list=$(ip -n ${s_ns} maddr show dev eth${i})
+
+		# arp_valid == 0 or active_slave should not join any maddrs
+		if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \
+			echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then
+			RET=1
+			check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+		# arp_valid != 0 and backup_slave should join both maddrs
+		elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \
+		     ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \
+		       ! echo "$maddr_list" | grep -q "${g_maddr}"); then
+			RET=1
+			check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i is missing mcast group"
+		fi
+	done
+}
+
 arp_validate_arp()
 {
 	local mode=$1
@@ -261,8 +311,10 @@ arp_validate_ns()
 	fi
 
 	for val in $(seq 0 6); do
-		arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} arp_validate $val"
+		arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6},${c_ip6} arp_validate $val"
 		log_test "arp_validate" "$mode ns_ip6_target arp_validate $val"
+		arp_validate_mcast
+		log_test "arp_validate" "join mcast group"
 	done
 }
 
-- 
cgit v1.2.3


From ca34aceb322bfcd6ab498884f1805ee12f983259 Mon Sep 17 00:00:00 2001
From: Alexandre Ferrieux <alexandre.ferrieux@gmail.com>
Date: Wed, 13 Nov 2024 11:04:28 +0100
Subject: net: sched: u32: Add test case for systematic hnode IDR leaks

Add a tdc test case to exercise the just-fixed systematic leak of
IDR entries in u32 hnode disposal. Given the IDR in question is
confined to the range [1..0x7FF], it is sufficient to create/delete
the same filter 2048 times to fill it up and get a nonzero exit
status from "tc filter add".

Signed-off-by: Alexandre Ferrieux <alexandre.ferrieux@orange.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Link: https://patch.msgid.link/20241113100428.360460-1-alexandre.ferrieux@orange.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/tc-testing/tc-tests/filters/u32.json | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
index 24bd0c2a3014..b2ca9d4e991b 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
@@ -329,5 +329,29 @@
         "teardown": [
             "$TC qdisc del dev $DEV1 parent root drr"
         ]
+    },
+    {
+        "id": "1234",
+        "name": "Exercise IDR leaks by creating/deleting a filter many (2048) times",
+        "category": [
+            "filter",
+            "u32"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$TC qdisc add dev $DEV1 parent root handle 10: drr",
+            "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 2 u32 match ip src 0.0.0.2/32 action drop",
+            "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop"
+        ],
+        "cmdUnderTest": "bash -c 'for i in {1..2048} ;do echo filter delete dev $DEV1 pref 3;echo filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop;done | $TC -b -'",
+        "expExitCode": "0",
+        "verifyCmd": "$TC filter show dev $DEV1",
+        "matchPattern": "protocol ip pref 3 u32",
+        "matchCount": "3",
+        "teardown": [
+            "$TC qdisc del dev $DEV1 parent root drr"
+        ]
     }
 ]
-- 
cgit v1.2.3


From 7ca93aa9204b706e4afcd4fae0dc8798500598d5 Mon Sep 17 00:00:00 2001
From: zhang jiao <zhangjiao2@cmss.chinamobile.com>
Date: Mon, 30 Sep 2024 09:27:57 +0800
Subject: selftests/powerpc: Remove the path after initialization.

If there were no anomalies noted, then we can simply remove the log file
and return, but only after the path variable has been initialized.

Signed-off-by: zhang jiao <zhangjiao2@cmss.chinamobile.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20240930012757.2395-1-zhangjiao2@cmss.chinamobile.com
---
 tools/testing/selftests/powerpc/mm/tlbie_test.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c
index 48344a74b212..35f0098399cc 100644
--- a/tools/testing/selftests/powerpc/mm/tlbie_test.c
+++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
@@ -313,16 +313,16 @@ static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
 
 	fclose(f);
 
-	if (nr_anamolies == 0) {
-		remove(path);
-		return;
-	}
-
 	sprintf(logfile, logfilename, tid);
 	strcpy(path, logdir);
 	strcat(path, separator);
 	strcat(path, logfile);
 
+	if (nr_anamolies == 0) {
+		remove(path);
+		return;
+	}
+
 	printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
 		tid, nr_anamolies, path);
 }
-- 
cgit v1.2.3


From df6cb25f07794b39e7993938ee3ca6749a88f300 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Wed, 30 Oct 2024 09:00:02 +0800
Subject: selftests: netfilter: Add missing gitignore file

Compiled binary files should be added to .gitignore.
Otherwise 'git status' complains:
   Untracked files:
   (use "git add <file>..." to include in what will be committed)
         net/netfilter/conntrack_reverse_clash

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/net/netfilter/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore
index 0a64d6d0e29a..64c4f8d9aa6c 100644
--- a/tools/testing/selftests/net/netfilter/.gitignore
+++ b/tools/testing/selftests/net/netfilter/.gitignore
@@ -2,5 +2,6 @@
 audit_logread
 connect_close
 conntrack_dump_flush
+conntrack_reverse_clash
 sctp_collision
 nf_queue
-- 
cgit v1.2.3


From 041bd1e4f2d82859690cd8b41c35f0f9404c3770 Mon Sep 17 00:00:00 2001
From: guanjing <guanjing@cmss.chinamobile.com>
Date: Fri, 8 Nov 2024 16:13:58 +0800
Subject: selftests: netfilter: Fix missing return values in
 conntrack_dump_flush

Fix a bug where some non-void functions were missing return statements.

Fixes: eff3c558bb7e ("netfilter: ctnetlink: support filtering by zone")
Signed-off-by: Guan Jing <guanjing@cmss.chinamobile.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/net/netfilter/conntrack_dump_flush.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
index 254ff03297f0..5f827e10717d 100644
--- a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
+++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
@@ -43,6 +43,8 @@ static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type,
 	mnl_attr_nest_end(nlh, nest_proto);
 
 	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
 }
 
 static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type,
@@ -71,6 +73,8 @@ static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type,
 	mnl_attr_nest_end(nlh, nest_proto);
 
 	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
 }
 
 static int build_cta_proto(struct nlmsghdr *nlh)
@@ -90,6 +94,8 @@ static int build_cta_proto(struct nlmsghdr *nlh)
 	mnl_attr_nest_end(nlh, nest_proto);
 
 	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
 }
 
 static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh,
-- 
cgit v1.2.3


From c0dec4b848ce5110e95095d0d0ae46724beb70ec Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Wed, 13 Nov 2024 11:51:37 -0800
Subject: iommufd: IOMMU_IOAS_CHANGE_PROCESS selftest

Add selftest cases for IOMMU_IOAS_CHANGE_PROCESS.

Link: https://patch.msgid.link/r/1731527497-16091-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/Makefile        |   1 +
 tools/testing/selftests/iommu/iommufd.c       | 141 ++++++++++++++++++++++++++
 tools/testing/selftests/iommu/iommufd_utils.h |   6 ++
 3 files changed, 148 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index fd6477911f24..84abeb2f0949 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -Wall -O2 -Wno-unused-function
 CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lcap
 
 TEST_GEN_PROGS :=
 TEST_GEN_PROGS += iommufd
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 94fe038d2eee..a1b2b657999d 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
 #include <asm/unistd.h>
 #include <stdlib.h>
+#include <sys/capability.h>
 #include <sys/mman.h>
 #include <sys/eventfd.h>
 
@@ -135,6 +136,8 @@ TEST_F(iommufd, cmd_length)
 	TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
 	TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id);
 	TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id);
+	TEST_LENGTH(iommu_ioas_change_process, IOMMU_IOAS_CHANGE_PROCESS,
+		    __reserved);
 #undef TEST_LENGTH
 }
 
@@ -193,6 +196,144 @@ TEST_F(iommufd, global_options)
 	EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd));
 }
 
+static void drop_cap_ipc_lock(struct __test_metadata *_metadata)
+{
+	cap_t caps;
+	cap_value_t cap_list[1] = { CAP_IPC_LOCK };
+
+	caps = cap_get_proc();
+	ASSERT_NE(caps, NULL);
+	ASSERT_NE(-1,
+		  cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
+	ASSERT_NE(-1, cap_set_proc(caps));
+	cap_free(caps);
+}
+
+/* Return the number following @var in /proc/<pid>/status, or -1 if absent */
+static long get_proc_status_value(pid_t pid, const char *var)
+{
+	char buf[256], tag[80];
+	long val, ret = -1;
+	FILE *fp;
+
+	snprintf(buf, sizeof(buf), "/proc/%d/status", pid);
+	fp = fopen(buf, "r");
+	if (!fp)
+		return ret;
+	/* Parse each line on its own; only a matching tag may set the result */
+	while (ret == -1 && fgets(buf, sizeof(buf), fp))
+		if (sscanf(buf, "%79s %ld", tag, &val) == 2 && !strcmp(tag, var))
+			ret = val;
+	fclose(fp);
+	return ret;
+}
+
+static long get_vm_pinned(pid_t pid)
+{
+	return get_proc_status_value(pid, "VmPin:");
+}
+
+static long get_vm_locked(pid_t pid)
+{
+	return get_proc_status_value(pid, "VmLck:");
+}
+
+FIXTURE(change_process)
+{
+	int fd;
+	uint32_t ioas_id;
+};
+
+FIXTURE_VARIANT(change_process)
+{
+	int accounting;
+};
+
+FIXTURE_SETUP(change_process)
+{
+	self->fd = open("/dev/iommu", O_RDWR);
+	ASSERT_NE(-1, self->fd);
+
+	drop_cap_ipc_lock(_metadata);
+	if (variant->accounting != IOPT_PAGES_ACCOUNT_NONE) {
+		struct iommu_option set_limit_cmd = {
+			.size = sizeof(set_limit_cmd),
+			.option_id = IOMMU_OPTION_RLIMIT_MODE,
+			.op = IOMMU_OPTION_OP_SET,
+			.val64 = (variant->accounting == IOPT_PAGES_ACCOUNT_MM),
+		};
+		ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &set_limit_cmd));
+	}
+
+	test_ioctl_ioas_alloc(&self->ioas_id);
+	test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL);
+}
+
+FIXTURE_TEARDOWN(change_process)
+{
+	teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(change_process, account_none)
+{
+	.accounting = IOPT_PAGES_ACCOUNT_NONE,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_user)
+{
+	.accounting = IOPT_PAGES_ACCOUNT_USER,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_mm)
+{
+	.accounting = IOPT_PAGES_ACCOUNT_MM,
+};
+
+TEST_F(change_process, basic)
+{
+	struct iommu_ioas_change_process cmd = { .size = sizeof(cmd) };
+	pid_t parent = getpid();
+	pid_t child;
+	__u64 iova;
+	int status;
+
+	/* Expect failure if non-file maps exist */
+	test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova);
+	EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+	test_ioctl_ioas_unmap(iova, PAGE_SIZE);
+
+	/* Change process works in current process. */
+	test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+	ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+	/* Change process works in another process (checked via exit status) */
+	child = fork();
+	if (!child) {
+		int nlock = PAGE_SIZE / 1024;
+
+		/* Parent accounts for locked memory before */
+		ASSERT_EQ(nlock, get_vm_pinned(parent));
+		if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+			ASSERT_EQ(nlock, get_vm_locked(parent));
+		ASSERT_EQ(0, get_vm_pinned(getpid()));
+		ASSERT_EQ(0, get_vm_locked(getpid()));
+
+		ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+		/* Child accounts for locked memory after */
+		ASSERT_EQ(0, get_vm_pinned(parent));
+		ASSERT_EQ(0, get_vm_locked(parent));
+		ASSERT_EQ(nlock, get_vm_pinned(getpid()));
+		if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+			ASSERT_EQ(nlock, get_vm_locked(getpid()));
+
+		exit(0);
+	}
+	ASSERT_NE(-1, child);
+	ASSERT_EQ(child, waitpid(child, &status, 0));
+	ASSERT_EQ(0, status);
+}
+
 FIXTURE(iommufd_ioas)
 {
 	int fd;
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index c0239f86f2f8..d979f5b0efe8 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -22,6 +22,12 @@
 #define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG))
 #define BIT_WORD(nr) ((nr) / __BITS_PER_LONG)
 
+enum {
+	IOPT_PAGES_ACCOUNT_NONE = 0,
+	IOPT_PAGES_ACCOUNT_USER = 1,
+	IOPT_PAGES_ACCOUNT_MM = 2,
+};
+
 #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
 static inline void set_bit(unsigned int nr, unsigned long *addr)
-- 
cgit v1.2.3


From f01750aecdfb8bfb02842f60af3d805a3ae7267a Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@pm.me>
Date: Fri, 15 Nov 2024 00:38:55 +0000
Subject: selftests/bpf: Set test path for token/obj_priv_implicit_token_envvar

token/obj_priv_implicit_token_envvar test may fail in an environment
where the process executing tests can not write to the root path.

Example:
https://github.com/libbpf/libbpf/actions/runs/11844507007/job/33007897936

Change default path used by the test to /tmp/bpf-token-fs, and make it
runtime configurable via an environment variable.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241115003853.864397-1-ihor.solodrai@pm.me
---
 tools/testing/selftests/bpf/prog_tests/token.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c
index fe86e4fdb89c..c3ab9b6fb069 100644
--- a/tools/testing/selftests/bpf/prog_tests/token.c
+++ b/tools/testing/selftests/bpf/prog_tests/token.c
@@ -828,8 +828,12 @@ static int userns_obj_priv_btf_success(int mnt_fd, struct token_lsm *lsm_skel)
 	return validate_struct_ops_load(mnt_fd, true /* should succeed */);
 }
 
+static const char *token_bpffs_custom_dir(void)
+{
+	return getenv("BPF_SELFTESTS_BPF_TOKEN_DIR") ?: "/tmp/bpf-token-fs";
+}
+
 #define TOKEN_ENVVAR "LIBBPF_BPF_TOKEN_PATH"
-#define TOKEN_BPFFS_CUSTOM "/bpf-token-fs"
 
 static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel)
 {
@@ -892,6 +896,7 @@ static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel
 
 static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *lsm_skel)
 {
+	const char *custom_dir = token_bpffs_custom_dir();
 	LIBBPF_OPTS(bpf_object_open_opts, opts);
 	struct dummy_st_ops_success *skel;
 	int err;
@@ -909,10 +914,10 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
 	 * BPF token implicitly, unless pointed to it through
 	 * LIBBPF_BPF_TOKEN_PATH envvar
 	 */
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	if (!ASSERT_OK(mkdir(TOKEN_BPFFS_CUSTOM, 0777), "mkdir_bpffs_custom"))
+	rmdir(custom_dir);
+	if (!ASSERT_OK(mkdir(custom_dir, 0777), "mkdir_bpffs_custom"))
 		goto err_out;
-	err = sys_move_mount(mnt_fd, "", AT_FDCWD, TOKEN_BPFFS_CUSTOM, MOVE_MOUNT_F_EMPTY_PATH);
+	err = sys_move_mount(mnt_fd, "", AT_FDCWD, custom_dir, MOVE_MOUNT_F_EMPTY_PATH);
 	if (!ASSERT_OK(err, "move_mount_bpffs"))
 		goto err_out;
 
@@ -925,7 +930,7 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
 		goto err_out;
 	}
 
-	err = setenv(TOKEN_ENVVAR, TOKEN_BPFFS_CUSTOM, 1 /*overwrite*/);
+	err = setenv(TOKEN_ENVVAR, custom_dir, 1 /*overwrite*/);
 	if (!ASSERT_OK(err, "setenv_token_path"))
 		goto err_out;
 
@@ -951,11 +956,11 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l
 	if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
 		goto err_out;
 
-	rmdir(TOKEN_BPFFS_CUSTOM);
+	rmdir(custom_dir);
 	unsetenv(TOKEN_ENVVAR);
 	return 0;
 err_out:
-	rmdir(TOKEN_BPFFS_CUSTOM);
+	rmdir(custom_dir);
 	unsetenv(TOKEN_ENVVAR);
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From b2d5b4c468568b60916bdd5e7c10ab11b09bd164 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree.xilinx@gmail.com>
Date: Wed, 13 Nov 2024 12:13:11 +0000
Subject: selftest: include dst-ip in ethtool ntuple rules

sfc hardware does not support filters with only ipproto + dst-port;
 adding dst-ip to the flow spec allows the rss_ctx test to be run on
 these devices.

Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Martin Habets <habetsm.xilinx@gmail.com>
Link: https://patch.msgid.link/8e5d23c8f21310c23c080cc7bcd31b76f8fd3096.1731499022.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_ctx.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index 29995586993c..fb61dae20fd8 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -215,7 +215,7 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True):
         defer(ethtool, f"-X {cfg.ifname} default")
     else:
         other_key = 'noise'
-        flow = f"flow-type tcp{cfg.addr_ipver} dst-port {port} context {ctx_id}"
+        flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {ctx_id}"
         ntuple = ethtool_create(cfg, "-N", flow)
         defer(ethtool, f"-N {cfg.ifname} delete {ntuple}")
 
@@ -429,7 +429,7 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None):
         ksft_eq(max(data['rss-indirection-table']), 2 + i * 2 + 1, "Unexpected context cfg: " + str(data))
 
         ports.append(rand_port())
-        flow = f"flow-type tcp{cfg.addr_ipver} dst-port {ports[i]} context {ctx_id}"
+        flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {ports[i]} context {ctx_id}"
         ntuple = ethtool_create(cfg, "-N", flow)
         defer(ethtool, f"-N {cfg.ifname} delete {ntuple}")
 
@@ -516,7 +516,7 @@ def test_rss_context_out_of_order(cfg, ctx_cnt=4):
         ctx.append(defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete"))
 
         ports.append(rand_port())
-        flow = f"flow-type tcp{cfg.addr_ipver} dst-port {ports[i]} context {ctx_id}"
+        flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {ports[i]} context {ctx_id}"
         ntuple_id = ethtool_create(cfg, "-N", flow)
         ntuple.append(defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}"))
 
@@ -569,7 +569,7 @@ def test_rss_context_overlap(cfg, other_ctx=0):
 
     port = rand_port()
     if other_ctx:
-        flow = f"flow-type tcp{cfg.addr_ipver} dst-port {port} context {other_ctx}"
+        flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {other_ctx}"
         ntuple_id = ethtool_create(cfg, "-N", flow)
         ntuple = defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
 
@@ -587,7 +587,7 @@ def test_rss_context_overlap(cfg, other_ctx=0):
     # Now create a rule for context 1 and make sure traffic goes to a subset
     if other_ctx:
         ntuple.exec()
-    flow = f"flow-type tcp{cfg.addr_ipver} dst-port {port} context {ctx_id}"
+    flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {ctx_id}"
     ntuple_id = ethtool_create(cfg, "-N", flow)
     defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
 
@@ -620,7 +620,7 @@ def test_delete_rss_context_busy(cfg):
 
     # utilize context from ntuple filter
     port = rand_port()
-    flow = f"flow-type tcp{cfg.addr_ipver} dst-port {port} context {ctx_id}"
+    flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {ctx_id}"
     ntuple_id = ethtool_create(cfg, "-N", flow)
     defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
 
-- 
cgit v1.2.3


From e9e8abfec214b6a47f6c21d533c43c7f3c1f8887 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree.xilinx@gmail.com>
Date: Wed, 13 Nov 2024 12:13:12 +0000
Subject: selftest: validate RSS+ntuple filters with nonzero ring_cookie

Test creates an ntuple filter with 'action 2' and an RSS context whose
 indirection table has entries 0 and 1.  Resulting traffic should go to
 queues 2 and 3; verify that it never hits queues 0 and 1.

Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://patch.msgid.link/114afdf4d2867f72ed27751e8e08fe8b128a8529.1731499022.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_ctx.py | 41 ++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index fb61dae20fd8..8f62dc29bd26 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -633,6 +633,45 @@ def test_delete_rss_context_busy(cfg):
         pass
 
 
+def test_rss_ntuple_addition(cfg):
+    """
+    Test that the queue offset (ring_cookie) of an ntuple rule is added
+    to the queue number read from the indirection table.
+    """
+
+    require_ntuple(cfg)
+
+    queue_cnt = len(_get_rx_cnts(cfg))
+    if queue_cnt < 4:
+        try:
+            ksft_pr(f"Increasing queue count {queue_cnt} -> 4")
+            ethtool(f"-L {cfg.ifname} combined 4")
+            defer(ethtool, f"-L {cfg.ifname} combined {queue_cnt}")
+        except:
+            raise KsftSkipEx("Not enough queues for the test")
+
+    # Use queue 0 for normal traffic
+    ethtool(f"-X {cfg.ifname} equal 1")
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    # create additional rss context
+    ctx_id = ethtool_create(cfg, "-X", "context new equal 2")
+    defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete")
+
+    # utilize context from ntuple filter
+    port = rand_port()
+    flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {ctx_id} action 2"
+    try:
+        ntuple_id = ethtool_create(cfg, "-N", flow)
+    except CmdExitFailure:
+        raise KsftSkipEx("Ntuple filter with RSS and nonzero action not supported")
+    defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
+
+    _send_traffic_check(cfg, port, f"context {ctx_id}", { 'target': (2, 3),
+                                                          'empty' : (1,),
+                                                          'noise' : (0,) })
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
         cfg.ethnl = EthtoolFamily()
@@ -644,7 +683,7 @@ def main() -> None:
                   test_rss_context_dump, test_rss_context_queue_reconfigure,
                   test_rss_context_overlap, test_rss_context_overlap2,
                   test_rss_context_out_of_order, test_rss_context4_create_with_cfg,
-                  test_delete_rss_context_busy],
+                  test_delete_rss_context_busy, test_rss_ntuple_addition],
                  args=(cfg, ))
     ksft_exit()
 
-- 
cgit v1.2.3


From 29a4bc1fe961caea52a5b945be2b4267b02002d7 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree.xilinx@gmail.com>
Date: Wed, 13 Nov 2024 12:13:13 +0000
Subject: selftest: extend test_rss_context_queue_reconfigure for action
 addition

The combination of ntuple action (ring_cookie) and RSS context can
 cause an ntuple rule to target a higher queue than appears in any
 RSS indirection table or directly in the ntuple rule, since the two
 numbers are added together.  Verify the logic that prevents reducing
 the queue count in this case.

Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://patch.msgid.link/58276b800ab78c0a79c1918046ccae7fe45ba802.1731499022.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_ctx.py | 26 +++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index 8f62dc29bd26..0b49ce7ae678 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -238,6 +238,32 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True):
     else:
         raise Exception(f"Driver didn't prevent us from deactivating a used queue (context {ctx_id})")
 
+    if not main_ctx:
+        ethtool(f"-L {cfg.ifname} combined 4")
+        flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port} context {ctx_id} action 1"
+        try:
+            # this targets queue 4, which doesn't exist
+            ntuple2 = ethtool_create(cfg, "-N", flow)
+        except CmdExitFailure:
+            pass
+        else:
+            raise Exception(f"Driver didn't prevent us from targeting a nonexistent queue (context {ctx_id})")
+        # change the table to target queues 0 and 2
+        ethtool(f"-X {cfg.ifname} {ctx_ref} weight 1 0 1 0")
+        # ntuple rule therefore targets queues 1 and 3
+        ntuple2 = ethtool_create(cfg, "-N", flow)
+        # should replace existing filter
+        ksft_eq(ntuple, ntuple2)
+        _send_traffic_check(cfg, port, ctx_ref, { 'target': (1, 3),
+                                                  'noise' : (0, 2) })
+        # Setting queue count to 3 should fail, queue 3 is used
+        try:
+            ethtool(f"-L {cfg.ifname} combined 3")
+        except CmdExitFailure:
+            pass
+        else:
+            raise Exception(f"Driver didn't prevent us from deactivating a used queue (context {ctx_id})")
+
 
 def test_rss_resize(cfg):
     """Test resizing of the RSS table.
-- 
cgit v1.2.3


From b219bcfcc92e9bd50c6277ac68cb75f64b403e5e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:55 +0100
Subject: selftests: net: lib: Move logging from forwarding/lib.sh here

Many net selftests invent their own logging helpers. These really should be
in a library sourced by these tests. Currently forwarding/lib.sh has a
suite of perfectly fine logging helpers, but sourcing a forwarding/ library
from a higher-level directory smells of layering violation. In this patch,
move the logging helpers to net/lib.sh so that every net test can use them.

Together with the logging helpers, it's also necessary to move
pause_on_fail(), and EXIT_STATUS and RET.

Existing lib.sh users might be using these same names for their functions
or variables. However lib.sh is always sourced near the top of the
file (checked), and whatever new definitions will simply override the ones
provided by lib.sh.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/edd3785a3bd72ffbe1409300989e993ee50ae98b.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/lib.sh | 113 -------------------------
 tools/testing/selftests/net/lib.sh            | 115 ++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 113 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 89c25f72b10c..41dd14c42c48 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -48,7 +48,6 @@ declare -A NETIFS=(
 : "${WAIT_TIME:=5}"
 
 # Whether to pause on, respectively, after a failure and before cleanup.
-: "${PAUSE_ON_FAIL:=no}"
 : "${PAUSE_ON_CLEANUP:=no}"
 
 # Whether to create virtual interfaces, and what netdevice type they should be.
@@ -446,22 +445,6 @@ done
 ##############################################################################
 # Helpers
 
-# Exit status to return at the end. Set in case one of the tests fails.
-EXIT_STATUS=0
-# Per-test return value. Clear at the beginning of each test.
-RET=0
-
-ret_set_ksft_status()
-{
-	local ksft_status=$1; shift
-	local msg=$1; shift
-
-	RET=$(ksft_status_merge $RET $ksft_status)
-	if (( $? )); then
-		retmsg=$msg
-	fi
-}
-
 # Whether FAILs should be interpreted as XFAILs. Internal.
 FAIL_TO_XFAIL=
 
@@ -535,102 +518,6 @@ xfail_on_veth()
 	fi
 }
 
-log_test_result()
-{
-	local test_name=$1; shift
-	local opt_str=$1; shift
-	local result=$1; shift
-	local retmsg=$1; shift
-
-	printf "TEST: %-60s  [%s]\n" "$test_name $opt_str" "$result"
-	if [[ $retmsg ]]; then
-		printf "\t%s\n" "$retmsg"
-	fi
-}
-
-pause_on_fail()
-{
-	if [[ $PAUSE_ON_FAIL == yes ]]; then
-		echo "Hit enter to continue, 'q' to quit"
-		read a
-		[[ $a == q ]] && exit 1
-	fi
-}
-
-handle_test_result_pass()
-{
-	local test_name=$1; shift
-	local opt_str=$1; shift
-
-	log_test_result "$test_name" "$opt_str" " OK "
-}
-
-handle_test_result_fail()
-{
-	local test_name=$1; shift
-	local opt_str=$1; shift
-
-	log_test_result "$test_name" "$opt_str" FAIL "$retmsg"
-	pause_on_fail
-}
-
-handle_test_result_xfail()
-{
-	local test_name=$1; shift
-	local opt_str=$1; shift
-
-	log_test_result "$test_name" "$opt_str" XFAIL "$retmsg"
-	pause_on_fail
-}
-
-handle_test_result_skip()
-{
-	local test_name=$1; shift
-	local opt_str=$1; shift
-
-	log_test_result "$test_name" "$opt_str" SKIP "$retmsg"
-}
-
-log_test()
-{
-	local test_name=$1
-	local opt_str=$2
-
-	if [[ $# -eq 2 ]]; then
-		opt_str="($opt_str)"
-	fi
-
-	if ((RET == ksft_pass)); then
-		handle_test_result_pass "$test_name" "$opt_str"
-	elif ((RET == ksft_xfail)); then
-		handle_test_result_xfail "$test_name" "$opt_str"
-	elif ((RET == ksft_skip)); then
-		handle_test_result_skip "$test_name" "$opt_str"
-	else
-		handle_test_result_fail "$test_name" "$opt_str"
-	fi
-
-	EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET)
-	return $RET
-}
-
-log_test_skip()
-{
-	RET=$ksft_skip retmsg= log_test "$@"
-}
-
-log_test_xfail()
-{
-	RET=$ksft_xfail retmsg= log_test "$@"
-}
-
-log_info()
-{
-	local msg=$1
-
-	echo "INFO: $msg"
-}
-
 not()
 {
 	"$@"
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index c8991cc6bf28..691318b1ec55 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -9,6 +9,9 @@ source "$net_dir/lib/sh/defer.sh"
 
 : "${WAIT_TIMEOUT:=20}"
 
+# Whether to pause after a failure.
+: "${PAUSE_ON_FAIL:=no}"
+
 BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms
 
 # Kselftest framework constants.
@@ -20,6 +23,11 @@ ksft_skip=4
 # namespace list created by setup_ns
 NS_LIST=()
 
+# Exit status to return at the end. Set in case one of the tests fails.
+EXIT_STATUS=0
+# Per-test return value. Clear at the beginning of each test.
+RET=0
+
 ##############################################################################
 # Helpers
 
@@ -236,3 +244,110 @@ tc_rule_handle_stats_get()
 	    | jq ".[] | select(.options.handle == $handle) | \
 		  .options.actions[0].stats$selector"
 }
+
+ret_set_ksft_status()
+{
+	local ksft_status=$1; shift
+	local msg=$1; shift
+
+	RET=$(ksft_status_merge $RET $ksft_status)
+	if (( $? )); then
+		retmsg=$msg
+	fi
+}
+
+log_test_result()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+	local result=$1; shift
+	local retmsg=$1; shift
+
+	printf "TEST: %-60s  [%s]\n" "$test_name $opt_str" "$result"
+	if [[ $retmsg ]]; then
+		printf "\t%s\n" "$retmsg"
+	fi
+}
+
+pause_on_fail()
+{
+	if [[ $PAUSE_ON_FAIL == yes ]]; then
+		echo "Hit enter to continue, 'q' to quit"
+		read a
+		[[ $a == q ]] && exit 1
+	fi
+}
+
+handle_test_result_pass()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" " OK "
+}
+
+handle_test_result_fail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" FAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_xfail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" XFAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_skip()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" SKIP "$retmsg"
+}
+
+log_test()
+{
+	local test_name=$1
+	local opt_str=$2
+
+	if [[ $# -eq 2 ]]; then
+		opt_str="($opt_str)"
+	fi
+
+	if ((RET == ksft_pass)); then
+		handle_test_result_pass "$test_name" "$opt_str"
+	elif ((RET == ksft_xfail)); then
+		handle_test_result_xfail "$test_name" "$opt_str"
+	elif ((RET == ksft_skip)); then
+		handle_test_result_skip "$test_name" "$opt_str"
+	else
+		handle_test_result_fail "$test_name" "$opt_str"
+	fi
+
+	EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET)
+	return $RET
+}
+
+log_test_skip()
+{
+	RET=$ksft_skip retmsg= log_test "$@"
+}
+
+log_test_xfail()
+{
+	RET=$ksft_xfail retmsg= log_test "$@"
+}
+
+log_info()
+{
+	local msg=$1
+
+	echo "INFO: $msg"
+}
-- 
cgit v1.2.3


From 601d9d70a40a8ccf93f41a153dd4c9aa1db60d57 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:56 +0100
Subject: selftests: net: lib: Move tests_run from forwarding/lib.sh here

It would be good to use the same mechanism for scheduling and dispatching
general net tests as the many forwarding tests already use. To that end,
move tests_run() to net/lib.sh so that every net test can use it.

Existing lib.sh users might be using the name themselves. However lib.sh is
always sourced near the top of the file (checked), and whatever new
definition will simply override the one provided by lib.sh.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/a6fc083486493425b2c61185c327845b6ce3233a.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/lib.sh | 10 ----------
 tools/testing/selftests/net/lib.sh            | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 41dd14c42c48..d28dbf27c1f0 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1285,16 +1285,6 @@ matchall_sink_create()
 	   action drop
 }
 
-tests_run()
-{
-	local current_test
-
-	for current_test in ${TESTS:-$ALL_TESTS}; do
-		in_defer_scope \
-			$current_test
-	done
-}
-
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 691318b1ec55..4f52b8e48a3a 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -351,3 +351,13 @@ log_info()
 
 	echo "INFO: $msg"
 }
+
+tests_run()
+{
+	local current_test
+
+	for current_test in ${TESTS:-$ALL_TESTS}; do
+		in_defer_scope \
+			$current_test
+	done
+}
-- 
cgit v1.2.3


From af76b4431818cf7a73cf0ec19465ad3b01cdb159 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:57 +0100
Subject: selftests: net: lib: Move checks from forwarding/lib.sh here

For logging to be useful, something has to set RET and retmsg by calling
ret_set_ksft_status(). There is a suite of functions to that end in
forwarding/lib: check_err, check_fail et al. Move them to net/lib.sh so
that every net test can use them.

Existing lib.sh users might be using these same names for their functions.
However lib.sh is always sourced near the top of the file (checked), and
whatever new definitions will simply override the ones provided by lib.sh.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/f488a00dc85b8e0c1f3c71476b32b21b5189a847.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/lib.sh | 73 ---------------------------
 tools/testing/selftests/net/lib.sh            | 73 +++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 73 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index d28dbf27c1f0..8625e3c99f55 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -445,79 +445,6 @@ done
 ##############################################################################
 # Helpers
 
-# Whether FAILs should be interpreted as XFAILs. Internal.
-FAIL_TO_XFAIL=
-
-check_err()
-{
-	local err=$1
-	local msg=$2
-
-	if ((err)); then
-		if [[ $FAIL_TO_XFAIL = yes ]]; then
-			ret_set_ksft_status $ksft_xfail "$msg"
-		else
-			ret_set_ksft_status $ksft_fail "$msg"
-		fi
-	fi
-}
-
-check_fail()
-{
-	local err=$1
-	local msg=$2
-
-	check_err $((!err)) "$msg"
-}
-
-check_err_fail()
-{
-	local should_fail=$1; shift
-	local err=$1; shift
-	local what=$1; shift
-
-	if ((should_fail)); then
-		check_fail $err "$what succeeded, but should have failed"
-	else
-		check_err $err "$what failed"
-	fi
-}
-
-xfail()
-{
-	FAIL_TO_XFAIL=yes "$@"
-}
-
-xfail_on_slow()
-{
-	if [[ $KSFT_MACHINE_SLOW = yes ]]; then
-		FAIL_TO_XFAIL=yes "$@"
-	else
-		"$@"
-	fi
-}
-
-omit_on_slow()
-{
-	if [[ $KSFT_MACHINE_SLOW != yes ]]; then
-		"$@"
-	fi
-}
-
-xfail_on_veth()
-{
-	local dev=$1; shift
-	local kind
-
-	kind=$(ip -j -d link show dev $dev |
-			jq -r '.[].linkinfo.info_kind')
-	if [[ $kind = veth ]]; then
-		FAIL_TO_XFAIL=yes "$@"
-	else
-		"$@"
-	fi
-}
-
 not()
 {
 	"$@"
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 4f52b8e48a3a..6bcf5d13879d 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -361,3 +361,76 @@ tests_run()
 			$current_test
 	done
 }
+
+# Whether FAILs should be interpreted as XFAILs. Internal.
+FAIL_TO_XFAIL=
+
+check_err()
+{
+	local err=$1
+	local msg=$2
+
+	if ((err)); then
+		if [[ $FAIL_TO_XFAIL = yes ]]; then
+			ret_set_ksft_status $ksft_xfail "$msg"
+		else
+			ret_set_ksft_status $ksft_fail "$msg"
+		fi
+	fi
+}
+
+check_fail()
+{
+	local err=$1
+	local msg=$2
+
+	check_err $((!err)) "$msg"
+}
+
+check_err_fail()
+{
+	local should_fail=$1; shift
+	local err=$1; shift
+	local what=$1; shift
+
+	if ((should_fail)); then
+		check_fail $err "$what succeeded, but should have failed"
+	else
+		check_err $err "$what failed"
+	fi
+}
+
+xfail()
+{
+	FAIL_TO_XFAIL=yes "$@"
+}
+
+xfail_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW = yes ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
+
+omit_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW != yes ]]; then
+		"$@"
+	fi
+}
+
+xfail_on_veth()
+{
+	local dev=$1; shift
+	local kind
+
+	kind=$(ip -j -d link show dev $dev |
+			jq -r '.[].linkinfo.info_kind')
+	if [[ $kind = veth ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
-- 
cgit v1.2.3


From 46f6569cf0754e27816403c3701c7070ff281ad0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:58 +0100
Subject: selftests: net: lib: Add kill_process

A number of selftests run processes in the background and need to kill them
afterwards. Instead of having everyone open-code the kill / wait / redirect
mantra, add a helper in net/lib.sh. Convert existing open-coded sites.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Link: https://patch.msgid.link/a9db102067d741c118f0bd93b10c75e2a34665ea.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh    |  2 +-
 .../selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh     |  4 ++--
 .../drivers/net/mlxsw/devlink_trap_l3_exceptions.sh          | 12 ++++++------
 .../selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh  |  4 ++--
 .../selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh |  4 ++--
 .../selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh |  4 ++--
 .../drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh      |  4 ++--
 tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh       |  4 ++--
 .../selftests/drivers/net/netdevsim/fib_notifications.sh     |  6 +++---
 tools/testing/selftests/net/drop_monitor_tests.sh            |  2 +-
 tools/testing/selftests/net/fib_tests.sh                     |  8 ++++----
 tools/testing/selftests/net/forwarding/devlink_lib.sh        |  2 +-
 tools/testing/selftests/net/forwarding/lib.sh                |  3 +--
 tools/testing/selftests/net/forwarding/tc_police.sh          |  8 ++++----
 tools/testing/selftests/net/lib.sh                           |  8 ++++++++
 15 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh
index 89b55e946eed..36055279ba92 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh
@@ -116,7 +116,7 @@ dev_del_test()
 
 	log_test "Device delete"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 }
 
 trap cleanup EXIT
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh
index 160891dcb4bc..db5806d189bb 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh
@@ -595,7 +595,7 @@ irif_disabled_test()
 
 	log_test "Ingress RIF disabled"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	ip link set dev $rp1 nomaster
 	__addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
 	ip link del dev br0 type bridge
@@ -645,7 +645,7 @@ erif_disabled_test()
 
 	log_test "Egress RIF disabled"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	__addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
 	ip link del dev br0 type bridge
 	devlink_trap_action_set $trap_name "drop"
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh
index 190c1b6b5365..5d6d88b600f0 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh
@@ -202,7 +202,7 @@ mtu_value_is_too_small_test()
 
 	mtu_restore $rp2
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower
 }
 
@@ -235,7 +235,7 @@ __ttl_value_is_too_small_test()
 
 	log_test "TTL value is too small: TTL=$ttl_val"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower
 }
 
@@ -299,7 +299,7 @@ __mc_reverse_path_forwarding_test()
 
 	log_test "Multicast reverse path forwarding: $desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower
 }
 
@@ -347,7 +347,7 @@ __reject_route_test()
 
 	log_test "Reject route: $desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	ip route del unreachable $unreachable
 	tc filter del dev $h1 ingress protocol $proto pref 1 handle 101 flower
 }
@@ -542,7 +542,7 @@ ipv4_lpm_miss_test()
 
 	log_test "LPM miss: IPv4"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	vrf_without_routes_destroy
 }
 
@@ -569,7 +569,7 @@ ipv6_lpm_miss_test()
 
 	log_test "LPM miss: IPv6"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	vrf_without_routes_destroy
 }
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh
index e9a82cae8c9a..4ac1dae92d0f 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh
@@ -176,7 +176,7 @@ ecn_decap_test()
 
 	log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
 }
 
@@ -207,7 +207,7 @@ no_matching_tunnel_test()
 
 	log_test "$desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh
index 878125041fc3..fce885184404 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh
@@ -176,7 +176,7 @@ ecn_decap_test()
 
 	log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
 }
 
@@ -207,7 +207,7 @@ no_matching_tunnel_test()
 
 	log_test "$desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
index 5f6eb965cfd1..7aca8e5922cf 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
@@ -183,7 +183,7 @@ ecn_decap_test()
 
 	log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
 }
 
@@ -253,7 +253,7 @@ corrupted_packet_test()
 
 	log_test "$desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh
index f6c16cbb6cf7..4599c331240b 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh
@@ -188,7 +188,7 @@ ecn_decap_test()
 
 	log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
 }
 
@@ -262,7 +262,7 @@ corrupted_packet_test()
 
 	log_test "$desc"
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh
index 83a0210e7544..bc7ea2df49fb 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh
@@ -218,7 +218,7 @@ psample_capture_start()
 
 psample_capture_stop()
 {
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 }
 
 __tc_sample_rate_test()
@@ -499,7 +499,7 @@ tc_sample_md_out_tc_occ_test()
 	backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]')
 
 	# Kill mausezahn.
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 
 	psample_capture_stop
 
diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
index 8d91191a098c..9896580c3d85 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
@@ -94,7 +94,7 @@ route_addition_check()
 	sleep 1
 	$IP route add $route dev dummy1
 	sleep 1
-	kill %% && wait %% &> /dev/null
+	kill_process %%
 
 	route_notify_check $outfile $expected_num_notifications $offload_failed
 	rm -f $outfile
@@ -148,7 +148,7 @@ route_deletion_check()
 	sleep 1
 	$IP route del $route dev dummy1
 	sleep 1
-	kill %% && wait %% &> /dev/null
+	kill_process %%
 
 	route_notify_check $outfile $expected_num_notifications
 	rm -f $outfile
@@ -191,7 +191,7 @@ route_replacement_check()
 	sleep 1
 	$IP route replace $route dev dummy2
 	sleep 1
-	kill %% && wait %% &> /dev/null
+	kill_process %%
 
 	route_notify_check $outfile $expected_num_notifications
 	rm -f $outfile
diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh
index 7c4818c971fc..507d0a82f5f0 100755
--- a/tools/testing/selftests/net/drop_monitor_tests.sh
+++ b/tools/testing/selftests/net/drop_monitor_tests.sh
@@ -77,7 +77,7 @@ sw_drops_test()
 
 	rm ${dir}/packets.pcap
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	timeout 5 dwdump -o sw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0))
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 5f3c28fc8624..3ea6f886a210 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -689,7 +689,7 @@ fib6_notify_test()
 
 	log_test $ret 0 "ipv6 route add notify"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 
 	#rm errors.txt
 
@@ -736,7 +736,7 @@ fib_notify_test()
 
 	log_test $ret 0 "ipv4 route add notify"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 
 	rm  errors.txt
 
@@ -2328,7 +2328,7 @@ ipv4_mangle_test()
 	$IP route del table 123 172.16.101.0/24 dev veth1
 	$IP rule del pref 100
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	rm $tmp_file
 
 	route_cleanup
@@ -2386,7 +2386,7 @@ ipv6_mangle_test()
 	$IP -6 route del table 123 2001:db8:101::/64 dev veth1
 	$IP -6 rule del pref 100
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	rm $tmp_file
 
 	route_cleanup
diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh
index 62a05bca1e82..18afa89ebbcc 100644
--- a/tools/testing/selftests/net/forwarding/devlink_lib.sh
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -501,7 +501,7 @@ devlink_trap_drop_cleanup()
 	local pref=$1; shift
 	local handle=$1; shift
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower
 }
 
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 8625e3c99f55..7337f398f9cc 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1574,8 +1574,7 @@ stop_traffic()
 {
 	local pid=${1-%%}; shift
 
-	# Suppress noise from killing mausezahn.
-	{ kill $pid && wait $pid; } 2>/dev/null
+	kill_process "$pid"
 }
 
 declare -A cappid
diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh
index 5103f64a71d6..509fdedfcfa1 100755
--- a/tools/testing/selftests/net/forwarding/tc_police.sh
+++ b/tools/testing/selftests/net/forwarding/tc_police.sh
@@ -148,7 +148,7 @@ police_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
 }
 
@@ -198,7 +198,7 @@ police_shared_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 }
 
 police_shared_test()
@@ -278,7 +278,7 @@ police_mirror_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $pol_if $dir protocol ip pref 1 handle 101 flower
 	tc filter del dev $h3 ingress protocol ip pref 1 handle 101 flower
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
@@ -320,7 +320,7 @@ police_pps_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 6bcf5d13879d..24f63e45735d 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -434,3 +434,11 @@ xfail_on_veth()
 		"$@"
 	fi
 }
+
+kill_process()
+{
+	local pid=$1; shift
+
+	# Suppress noise from killing the process.
+	{ kill $pid && wait $pid; } 2>/dev/null
+}
-- 
cgit v1.2.3


From 15880bec9bc32ddc8f70f8c551745c2344233372 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 14 Nov 2024 15:09:59 +0100
Subject: selftests: net: fdb_notify: Add a test for FDB notifications

Check that only one notification is produced for various FDB edit
operations.

Regarding the ip_link_add() and ip_link_master() helpers. This pattern of
action plus corresponding defer is bound to come up often, and a dedicated
vocabulary to capture it will be handy. tunnel_create() and vlan_create()
from forwarding/lib.sh are somewhat opaque and perhaps too kitchen-sinky,
so I tried to go in the opposite direction with these ones, and wrapped
only the bare minimum to schedule a corresponding cleanup.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/910c5880ae6d3b558d6889cbdba2be690c2615c6.1731589511.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile      |  2 +-
 tools/testing/selftests/net/fdb_notify.sh | 96 +++++++++++++++++++++++++++++++
 tools/testing/selftests/net/lib.sh        | 17 ++++++
 3 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/fdb_notify.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index d323898c466c..3d487b03c4a0 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -93,7 +93,7 @@ TEST_PROGS += test_vxlan_mdb.sh
 TEST_PROGS += test_bridge_neigh_suppress.sh
 TEST_PROGS += test_vxlan_nolocalbypass.sh
 TEST_PROGS += test_bridge_backup_port.sh
-TEST_PROGS += fdb_flush.sh
+TEST_PROGS += fdb_flush.sh fdb_notify.sh
 TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh
new file mode 100755
index 000000000000..c03151e7791c
--- /dev/null
+++ b/tools/testing/selftests/net/fdb_notify.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+ALL_TESTS="
+	test_dup_bridge
+	test_dup_vxlan_self
+	test_dup_vxlan_master
+	test_dup_macvlan_self
+	test_dup_macvlan_master
+"
+
+do_test_dup()
+{
+	local op=$1; shift
+	local what=$1; shift
+	local tmpf
+
+	RET=0
+
+	tmpf=$(mktemp)
+	defer rm "$tmpf"
+
+	defer_scope_push
+		bridge monitor fdb &> "$tmpf" &
+		defer kill_process $!
+
+		sleep 0.5
+		bridge fdb "$op" 00:11:22:33:44:55 vlan 1 "$@"
+		sleep 0.5
+	defer_scope_pop
+
+	local count=$(grep -c -e 00:11:22:33:44:55 $tmpf)
+	((count == 1))
+	check_err $? "Got $count notifications, expected 1"
+
+	log_test "$what $op: Duplicate notifications"
+}
+
+test_dup_bridge()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	do_test_dup add "bridge" dev br self
+	do_test_dup del "bridge" dev br self
+}
+
+test_dup_vxlan_self()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add vx up type vxlan id 2000 dstport 4789
+	ip_link_master vx br
+
+	do_test_dup add "vxlan" dev vx self dst 192.0.2.1
+	do_test_dup del "vxlan" dev vx self dst 192.0.2.1
+}
+
+test_dup_vxlan_master()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add vx up type vxlan id 2000 dstport 4789
+	ip_link_master vx br
+
+	do_test_dup add "vxlan master" dev vx master
+	do_test_dup del "vxlan master" dev vx master
+}
+
+test_dup_macvlan_self()
+{
+	ip_link_add dd up type dummy
+	ip_link_add mv up link dd type macvlan mode passthru
+
+	do_test_dup add "macvlan self" dev mv self
+	do_test_dup del "macvlan self" dev mv self
+}
+
+test_dup_macvlan_master()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add dd up type dummy
+	ip_link_add mv up link dd type macvlan mode passthru
+	ip_link_master mv br
+
+	do_test_dup add "macvlan master" dev mv self
+	do_test_dup del "macvlan master" dev mv self
+}
+
+cleanup()
+{
+	defer_scopes_cleanup
+}
+
+trap cleanup EXIT
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 24f63e45735d..8994fec1c38f 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -442,3 +442,20 @@ kill_process()
 	# Suppress noise from killing the process.
 	{ kill $pid && wait $pid; } 2>/dev/null
 }
+
+ip_link_add()
+{
+	local name=$1; shift
+
+	ip link add name "$name" "$@"
+	defer ip link del dev "$name"
+}
+
+ip_link_master()
+{
+	local member=$1; shift
+	local master=$1; shift
+
+	ip link set dev "$member" master "$master"
+	defer ip link set dev "$member" nomaster
+}
-- 
cgit v1.2.3


From 838f12c3d551f8941295ed7085ad360c3d3ad665 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:55 +0300
Subject: selftests/pcie_bwctrl: Create selftests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create selftests for PCIe BW control through the PCIe cooling device sysfs
interface.

First, the BW control selftest finds the PCIe Port to test with. By
default, the PCIe Port with the highest Link Speed is selected but
another PCIe Port can be provided with -d parameter.

The actual test steps the cur_state of the cooling device one-by-one
from max_state to what the cur_state was initially. The speed change
is confirmed by observing the current_link_speed for the corresponding
PCIe Port.

Link: https://lore.kernel.org/r/20241018144755.7875-10-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 MAINTAINERS                                        |   1 +
 tools/testing/selftests/Makefile                   |   1 +
 tools/testing/selftests/pcie_bwctrl/Makefile       |   2 +
 .../pcie_bwctrl/set_pcie_cooling_state.sh          | 122 +++++++++++++++++++++
 .../selftests/pcie_bwctrl/set_pcie_speed.sh        |  67 +++++++++++
 5 files changed, 193 insertions(+)
 create mode 100644 tools/testing/selftests/pcie_bwctrl/Makefile
 create mode 100755 tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh
 create mode 100755 tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh

(limited to 'tools/testing')

diff --git a/MAINTAINERS b/MAINTAINERS
index 393ed7ce5ea1..d7ffef4382df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17940,6 +17940,7 @@ S:	Supported
 F:	drivers/pci/pcie/bwctrl.c
 F:	drivers/thermal/pcie_cooling.c
 F:	include/linux/pci-bwctrl.h
+F:	tools/testing/selftests/pcie_bwctrl/
 
 PCIE DRIVER FOR AMAZON ANNAPURNA LABS
 M:	Jonathan Chocron <jonnyc@amazon.com>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index b38199965f99..7181756f47ff 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -72,6 +72,7 @@ TARGETS += net/packetdrill
 TARGETS += net/rds
 TARGETS += net/tcp_ao
 TARGETS += nsfs
+TARGETS += pcie_bwctrl
 TARGETS += perf_events
 TARGETS += pidfd
 TARGETS += pid_namespace
diff --git a/tools/testing/selftests/pcie_bwctrl/Makefile b/tools/testing/selftests/pcie_bwctrl/Makefile
new file mode 100644
index 000000000000..3e84e26341d1
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/Makefile
@@ -0,0 +1,2 @@
+TEST_PROGS = set_pcie_cooling_state.sh
+include ../lib.mk
diff --git a/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh b/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh
new file mode 100755
index 000000000000..9df606552af3
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+SYSFS=
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+retval=0
+skipmsg="skip all tests:"
+
+PCIEPORTTYPE="PCIe_Port_Link_Speed"
+
+prerequisite()
+{
+	local ports
+
+	if [ $UID != 0 ]; then
+		echo $skipmsg must be run as root >&2
+		exit $ksft_skip
+	fi
+
+	SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+	if [ ! -d "$SYSFS" ]; then
+		echo $skipmsg sysfs is not mounted >&2
+		exit $ksft_skip
+	fi
+
+	if ! ls $SYSFS/class/thermal/cooling_device* > /dev/null 2>&1; then
+		echo $skipmsg thermal cooling devices missing >&2
+		exit $ksft_skip
+	fi
+
+	ports=`grep -e "^$PCIEPORTTYPE" $SYSFS/class/thermal/cooling_device*/type | wc -l`
+	if [ $ports -eq 0 ]; then
+		echo $skipmsg pcie cooling devices missing >&2
+		exit $ksft_skip
+	fi
+}
+
+testport=
+find_pcie_port()
+{
+	local patt="$1"
+	local pcieports
+	local max
+	local cur
+	local delta
+	local bestdelta=-1
+
+	pcieports=`grep -l -F -e "$patt" $SYSFS/class/thermal/cooling_device*/type`
+	if [ -z "$pcieports" ]; then
+		return
+	fi
+	pcieports=${pcieports//\/type/}
+	# Find the port with the highest PCIe Link Speed
+	for port in $pcieports; do
+		max=`cat $port/max_state`
+		cur=`cat $port/cur_state`
+		delta=$((max-cur))
+		if [ $delta -gt $bestdelta ]; then
+			testport="$port"
+			bestdelta=$delta
+		fi
+	done
+}
+
+sysfspcidev=
+find_sysfs_pci_dev()
+{
+	local typefile="$1/type"
+	local pcidir
+
+	pcidir="$SYSFS/bus/pci/devices/`sed -e "s|^${PCIEPORTTYPE}_||g" $typefile`"
+
+	if [ -r "$pcidir/current_link_speed" ]; then
+		sysfspcidev="$pcidir/current_link_speed"
+	fi
+}
+
+usage()
+{
+	echo "Usage $0 [ -d dev ]"
+	echo -e "\t-d: PCIe port BDF string (e.g., 0000:00:04.0)"
+}
+
+pattern="$PCIEPORTTYPE"
+parse_arguments()
+{
+	while getopts d:h opt; do
+		case $opt in
+			h)
+				usage "$0"
+				exit 0
+				;;
+			d)
+				pattern="${PCIEPORTTYPE}_$OPTARG"	# braces keep '_' out of the variable name
+				;;
+			*)
+				usage "$0"
+				exit 0
+				;;
+		esac
+	done
+}
+
+parse_arguments "$@"
+prerequisite
+find_pcie_port "$pattern"
+if [ -z "$testport" ]; then
+	echo $skipmsg "pcie cooling device not found from sysfs" >&2
+	exit $ksft_skip
+fi
+find_sysfs_pci_dev "$testport"
+if [ -z "$sysfspcidev" ]; then
+	echo $skipmsg "PCIe port device not found from sysfs" >&2
+	exit $ksft_skip
+fi
+
+./set_pcie_speed.sh "$testport" "$sysfspcidev"
+retval=$?
+
+exit $retval
diff --git a/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh b/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh
new file mode 100755
index 000000000000..584596949312
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+set -e
+
+TESTNAME=set_pcie_speed
+
+declare -a PCIELINKSPEED=(
+	"2.5 GT/s PCIe"
+	"5.0 GT/s PCIe"
+	"8.0 GT/s PCIe"
+	"16.0 GT/s PCIe"
+	"32.0 GT/s PCIe"
+	"64.0 GT/s PCIe"
+)
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+retval=0
+
+coolingdev="$1"
+statefile="$coolingdev/cur_state"
+maxfile="$coolingdev/max_state"
+linkspeedfile="$2"
+
+oldstate=`cat $statefile`
+maxstate=`cat $maxfile`
+
+set_state()
+{
+	local state=$1
+	local linkspeed
+	local expected_linkspeed
+
+	echo $state > $statefile
+
+	sleep 1
+
+	linkspeed="`cat $linkspeedfile`"
+	expected_linkspeed=$((maxstate-state))
+	expected_str="${PCIELINKSPEED[$expected_linkspeed]}"
+	if [ ! "${expected_str}" = "${linkspeed}" ]; then
+		echo "$TESTNAME failed: expected: ${expected_str}; got ${linkspeed}"
+		retval=1
+	fi
+}
+
+cleanup_skip ()
+{
+	set_state $oldstate
+	exit $ksft_skip
+}
+
+trap cleanup_skip EXIT
+
+echo "$TESTNAME: testing states $maxstate .. $oldstate with $coolingdev"
+for i in $(seq $maxstate -1 $oldstate); do
+	set_state "$i"
+done
+
+trap EXIT
+if [ $retval -eq 0 ]; then
+	echo "$TESTNAME [PASS]"
+else
+	echo "$TESTNAME [FAIL]"
+fi
+exit $retval
-- 
cgit v1.2.3


From 608e99f7869e3a6e028c7cba14a896c7797e8746 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 16 Nov 2024 10:56:17 -0800
Subject: selftests/bpf: Fix build error with llvm 19

llvm 19 fails to compile arena self test:
CLNG-BPF [test_progs] verifier_arena_large.bpf.o
progs/verifier_arena_large.c:90:24: error: unsupported signed division, please convert to unsigned div/mod.
   90 |                 pg_idx = (pg - base) / PAGE_SIZE;

Though llvm <= 18 and llvm >= 20 don't have this issue,
fix the test to avoid the build error.

Reported-by: Jiri Olsa <olsajiri@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_arena_large.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index 8a9af79db884..f94f30cf1bb8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -87,7 +87,7 @@ __noinline int alloc_pages(int page_cnt, int pages_atonce, bool first_pass,
 					   NUMA_NO_NODE, 0);
 		if (!pg)
 			return step;
-		pg_idx = (pg - base) / PAGE_SIZE;
+		pg_idx = (unsigned long) (pg - base) / PAGE_SIZE;
 		if (first_pass) {
 			/* Pages must be allocated sequentially */
 			if (pg_idx != i)
-- 
cgit v1.2.3


From 357c52ff860b3d047de5d2c605c46dd9a8448821 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Nov 2024 16:32:48 -0800
Subject: selftests: net: netlink-dumps: validation checks

The sanity checks are going to get silently cast to unsigned
and always pass. Cast the sizeof to signed size.

Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241115003248.733862-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/netlink-dumps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
index 7ee6dcd334df..84e29b7dffb6 100644
--- a/tools/testing/selftests/net/netlink-dumps.c
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -56,10 +56,10 @@ TEST(test_sanity)
 	ASSERT_EQ(n, sizeof(dump_policies));
 
 	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
-	ASSERT_GE(n, sizeof(struct nlmsghdr));
+	ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
 
 	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
-	ASSERT_GE(n, sizeof(struct nlmsghdr));
+	ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
 
 	close(netlink_sock);
 }
-- 
cgit v1.2.3


From 920efe3e13f7eb5711d4ad8ecc0cced16b1a84cf Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 15 Nov 2024 12:12:36 -0800
Subject: selftests: net: add more info to error in bpf_offload

bpf_offload caught a spurious warning in TC recently, but the error
message did not provide enough information to know what the problem
is:

  FAIL: Found 'netdevsim' in command output, leaky extack?

Add the extack to the output:

  FAIL: Unexpected command output, leaky extack? ('netdevsim', 'Warning: Filter with specified priority/protocol not found.')

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/bpf_offload.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py
index 3efe44f6e92a..d10f420e4ef6 100755
--- a/tools/testing/selftests/net/bpf_offload.py
+++ b/tools/testing/selftests/net/bpf_offload.py
@@ -594,8 +594,9 @@ def check_extack_nsim(output, reference, args):
     check_extack(output, "netdevsim: " + reference, args)
 
 def check_no_extack(res, needle):
-    fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"),
-         "Found '%s' in command output, leaky extack?" % (needle))
+    haystack = (res[1] + res[2]).strip()
+    fail(haystack.count(needle) or haystack.count("Warning:"),
+         "Unexpected command output, leaky extack? ('%s', '%s')" % (needle, haystack))
 
 def check_verifier_log(output, reference):
     lines = output.split("\n")
-- 
cgit v1.2.3


From 0290abc9860917f1ee8b58309c2bbd740a39ee8e Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Sun, 17 Nov 2024 22:20:29 +0100
Subject: wireguard: selftests: load nf_conntrack if not present

Some distros may not load nf_conntrack by default, which will cause
subsequent nf_conntrack sets to fail. Load this module if it is not
already loaded.

Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
[ Jason: add [[ -e ... ]] check so this works in the qemu harness. ]
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://patch.msgid.link/20241117212030.629159-4-Jason@zx2c4.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/wireguard/netns.sh | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh
index 405ff262ca93..55500f901fbc 100755
--- a/tools/testing/selftests/wireguard/netns.sh
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -332,6 +332,7 @@ waitiface $netns1 vethc
 waitiface $netns2 veths
 
 n0 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward'
+[[ -e /proc/sys/net/netfilter/nf_conntrack_udp_timeout ]] || modprobe nf_conntrack
 n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout'
 n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream'
 n0 iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -d 10.0.0.0/24 -j SNAT --to 10.0.0.1
-- 
cgit v1.2.3


From 0c4d5cb9a1c3583b61df199a51eccebe759e3c18 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <mrpre@163.com>
Date: Mon, 18 Nov 2024 11:09:10 +0800
Subject: selftests/bpf: Add some tests with sockmap SK_PASS

Add a new test in sockmap_basic.c to test SK_PASS for sockmap.

Signed-off-by: Jiayuan Chen <mrpre@163.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20241118030910.36230-3-mrpre@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/bpf/prog_tests/sockmap_basic.c       | 54 ++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 82bfb266741c..a2041f8e32eb 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -501,6 +501,58 @@ out:
 	test_sockmap_pass_prog__destroy(skel);
 }
 
+/* Data passed by both the stream parser and verdict must reach socket 'p'. */
+static void test_sockmap_stream_pass(void)
+{
+	int zero = 0, sent, recvd, verdict, parser, err, map;
+	int c = -1, p = -1;
+	struct test_sockmap_pass_prog *pass = NULL;
+	char snd[256] = "0123456789";
+	char rcv[256] = "0";
+
+	pass = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(pass, "open_and_load"))	/* pass is dereferenced below */
+		return;
+	verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
+	parser = bpf_program__fd(pass->progs.prog_skb_parser);
+	map = bpf_map__fd(pass->maps.sock_map_rx);
+
+	err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach stream parser"))
+		goto out;
+
+	err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach stream verdict"))
+		goto out;
+
+	err = create_pair(AF_INET, SOCK_STREAM, &c, &p);
+	if (err)
+		goto out;
+
+	/* sk_data_ready of 'p' will be replaced by strparser handler */
+	err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(p)"))
+		goto out_close;
+
+	/*
+	 * As 'prog_skb_parser' returns the original skb len and
+	 * 'prog_skb_verdict' returns SK_PASS, the kernel will just
+	 * pass the data through to the original socket 'p'.
+	 */
+	sent = xsend(c, snd, sizeof(snd), 0);
+	ASSERT_EQ(sent, sizeof(snd), "xsend(c)");
+
+	recvd = recv_timeout(p, rcv, sizeof(rcv), SOCK_NONBLOCK, IO_TIMEOUT_SEC);
+	ASSERT_EQ(recvd, sizeof(rcv), "recv_timeout(p)");
+
+out_close:
+	close(c);
+	close(p);
+
+out:
+	test_sockmap_pass_prog__destroy(pass);
+}
+
 static void test_sockmap_skb_verdict_fionread(bool pass_prog)
 {
 	int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1;
@@ -923,6 +975,8 @@ void test_sockmap_basic(void)
 		test_sockmap_progs_query(BPF_SK_SKB_VERDICT);
 	if (test__start_subtest("sockmap skb_verdict shutdown"))
 		test_sockmap_skb_verdict_shutdown();
+	if (test__start_subtest("sockmap stream parser and verdict pass"))
+		test_sockmap_stream_pass();
 	if (test__start_subtest("sockmap skb_verdict fionread"))
 		test_sockmap_skb_verdict_fionread(true);
 	if (test__start_subtest("sockmap skb_verdict fionread on drop"))
-- 
cgit v1.2.3


From 6116075e18f79698419f2606d9cb34d23198f7e3 Mon Sep 17 00:00:00 2001
From: Mohan Prasad J <mohan.prasad@microchip.com>
Date: Fri, 15 Nov 2024 00:55:18 +0530
Subject: selftests: nic_link_layer: Add link layer selftest for NIC driver

Add selftest file for the link layer tests of a NIC driver.
Test for auto-negotiation is added.
Add LinkConfig class for changing link layer configs.
Selftest makes use of ksft modules and ethtool.
Include selftest file in the Makefile.

Signed-off-by: Mohan Prasad J <mohan.prasad@microchip.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/Makefile    |   1 +
 .../selftests/drivers/net/hw/lib/py/__init__.py    |   1 +
 .../selftests/drivers/net/hw/lib/py/linkconfig.py  | 222 +++++++++++++++++++++
 .../selftests/drivers/net/hw/nic_link_layer.py     |  91 +++++++++
 4 files changed, 315 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
 create mode 100644 tools/testing/selftests/drivers/net/hw/nic_link_layer.py

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 1c6a77480923..a87f1f17a10a 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -11,6 +11,7 @@ TEST_PROGS = \
 	hw_stats_l3.sh \
 	hw_stats_l3_gre.sh \
 	loopback.sh \
+	nic_link_layer.py \
 	pp_alloc_fail.py \
 	rss_ctx.py \
 	#
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index b582885786f5..399789a9676a 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -9,6 +9,7 @@ try:
     sys.path.append(KSFT_DIR.as_posix())
     from net.lib.py import *
     from drivers.net.lib.py import *
+    from .linkconfig import LinkConfig
 except ModuleNotFoundError as e:
     ksft_pr("Failed importing `net` library from kernel sources")
     ksft_pr(str(e))
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py b/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
new file mode 100644
index 000000000000..db84000fc75b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import cmd, ethtool, ip
+from lib.py import ksft_pr, ksft_eq, KsftSkipEx
+from typing import Optional
+import re
+import time
+import json
+
+#The LinkConfig class is implemented to handle the link layer configurations.
+#Required minimum ethtool version is 6.10
+
+class LinkConfig:
+    """Class for handling the link layer configurations"""
+    def __init__(self, cfg: object) -> None:
+        self.cfg = cfg
+        self.partner_netif = self.get_partner_netif_name()
+
+        """Get the initial link configuration of local interface"""
+        self.common_link_modes = self.get_common_link_modes()
+
+    def get_partner_netif_name(self) -> Optional[str]:
+        partner_netif = None
+        try:
+            if not self.verify_link_up():
+                return None
+            """Get partner interface name"""
+            partner_json_output = ip("addr show", json=True, host=self.cfg.remote)
+            for interface in partner_json_output:
+                for addr in interface.get('addr_info', []):
+                    if addr.get('local') == self.cfg.remote_addr:
+                        partner_netif = interface['ifname']
+                        ksft_pr(f"Partner Interface name: {partner_netif}")
+            if partner_netif is None:
+                ksft_pr("Unable to get the partner interface name")
+        except Exception as e:
+            print(f"Unexpected error occurred while getting partner interface name: {e}")
+        self.partner_netif = partner_netif
+        return partner_netif
+
+    def verify_link_up(self) -> bool:
+        """Verify whether the local interface link is up"""
+        with open(f"/sys/class/net/{self.cfg.ifname}/operstate", "r") as fp:
+            link_state = fp.read().strip()
+
+        if link_state == "down":
+            ksft_pr(f"Link state of interface {self.cfg.ifname} is DOWN")
+            return False
+        else:
+            return True
+
+    def reset_interface(self, local: bool = True, remote: bool = True) -> bool:
+        ksft_pr("Resetting interfaces in local and remote")
+        if remote:
+            if self.verify_link_up():
+                if self.partner_netif is not None:
+                    ifname = self.partner_netif
+                    link_up_cmd = f"ip link set up {ifname}"
+                    link_down_cmd = f"ip link set down {ifname}"
+                    reset_cmd = f"{link_down_cmd} && sleep 5 && {link_up_cmd}"
+                    try:
+                        cmd(reset_cmd, host=self.cfg.remote)
+                    except Exception as e:
+                        ksft_pr(f"Unexpected error occurred while resetting remote: {e}")
+                else:
+                    ksft_pr("Partner interface not available")
+        if local:
+            ifname = self.cfg.ifname
+            link_up_cmd = f"ip link set up {ifname}"
+            link_down_cmd = f"ip link set down {ifname}"
+            reset_cmd = f"{link_down_cmd} && sleep 5 && {link_up_cmd}"
+            try:
+                cmd(reset_cmd)
+            except Exception as e:
+                ksft_pr(f"Unexpected error occurred while resetting local: {e}")
+        time.sleep(10)
+        if self.verify_link_up() and self.get_ethtool_field("link-detected"):
+            ksft_pr("Local and remote interfaces reset to original state")
+            return True
+        else:
+            ksft_pr("Error occurred after resetting interfaces. Link is DOWN.")
+            return False
+
+    def set_speed_and_duplex(self, speed: str, duplex: str, autoneg: bool = True) -> bool:
+        """Set the speed and duplex state for the interface"""
+        autoneg_state = "on" if autoneg is True else "off"
+        process = None
+        try:
+            process = ethtool(f"--change {self.cfg.ifname} speed {speed} duplex {duplex} autoneg {autoneg_state}")
+        except Exception as e:
+            ksft_pr(f"Unexpected error occurred while setting speed/duplex: {e}")
+        if process is None or process.ret != 0:
+            return False
+        else:
+            ksft_pr(f"Speed: {speed} Mbps, Duplex: {duplex} set for Interface: {self.cfg.ifname}")
+            return True
+
+    def verify_speed_and_duplex(self, expected_speed: str, expected_duplex: str) -> bool:
+        if not self.verify_link_up():
+            return False
+        """Verifying the speed and duplex state for the interface"""
+        with open(f"/sys/class/net/{self.cfg.ifname}/speed", "r") as fp:
+            actual_speed = fp.read().strip()
+        with open(f"/sys/class/net/{self.cfg.ifname}/duplex", "r") as fp:
+            actual_duplex = fp.read().strip()
+
+        ksft_eq(actual_speed, expected_speed)
+        ksft_eq(actual_duplex, expected_duplex)
+        return True
+
+    def set_autonegotiation_state(self, state: str, remote: bool = False) -> bool:
+        """Set autoneg to `state` on the local interface, or on the partner when remote=True; False on failure."""
+        speeds, duplex_modes = self.get_speed_duplex_values(self.common_link_modes)
+        # Validate before indexing: the lists are None or empty when no common link mode could be parsed.
+        if not speeds or not duplex_modes or not speeds[0] or not duplex_modes[0]:
+            ksft_pr("No speed or duplex modes found")
+            return False
+        speed = speeds[0]
+        duplex = duplex_modes[0]
+        speed_duplex_cmd = f"speed {speed} duplex {duplex}" if state == "off" else ""
+        if remote:
+            if not self.verify_link_up():
+                return False
+            # Set the autonegotiation state for the partner
+            command = f"-s {self.partner_netif} {speed_duplex_cmd} autoneg {state}"
+            partner_autoneg_change = None
+            # Set autonegotiation state for interface in remote pc
+            try:
+                partner_autoneg_change = ethtool(command, host=self.cfg.remote)
+            except Exception as e:
+                ksft_pr(f"Unexpected error occurred while changing auto-neg in remote: {e}")
+            if partner_autoneg_change is None or partner_autoneg_change.ret != 0:
+                ksft_pr(f"Not able to set autoneg parameter for interface {self.partner_netif}.")
+                return False
+            ksft_pr(f"Autoneg set as {state} for {self.partner_netif}")
+        else:
+            # Set the autonegotiation state for the interface
+            try:
+                process = ethtool(f"-s {self.cfg.ifname} {speed_duplex_cmd} autoneg {state}")
+                if process.ret != 0:
+                    ksft_pr(f"Not able to set autoneg parameter for interface {self.cfg.ifname}")
+                    return False
+            except Exception as e:
+                ksft_pr(f"Unexpected error occurred while changing auto-neg in local: {e}")
+                return False
+            ksft_pr(f"Autoneg set as {state} for {self.cfg.ifname}")
+        return True
+
+    def check_autoneg_supported(self, remote: bool = False) -> bool:
+        if not remote:
+            local_autoneg = self.get_ethtool_field("supports-auto-negotiation")
+            if local_autoneg is None:
+                ksft_pr(f"Unable to fetch auto-negotiation status for interface {self.cfg.ifname}")
+            """Return autoneg status of the local interface"""
+            return local_autoneg
+        else:
+            if not self.verify_link_up():
+                raise KsftSkipEx("Link is DOWN")
+            """Check remote auto-negotiation support status"""
+            partner_autoneg = False
+            if self.partner_netif is not None:
+                partner_autoneg = self.get_ethtool_field("supports-auto-negotiation", remote=True)
+                if partner_autoneg is None:
+                    ksft_pr(f"Unable to fetch auto-negotiation status for interface {self.partner_netif}")
+            return partner_autoneg
+
+    def get_common_link_modes(self) -> set[str]:
+        """Return the link modes supported locally that the link partner also advertises."""
+        link_modes = self.get_ethtool_field("supported-link-modes")
+        partner_link_modes = self.get_ethtool_field("link-partner-advertised-link-modes")
+        if link_modes is None:
+            raise KsftSkipEx(f"Link modes not available for {self.cfg.ifname}")
+        if partner_link_modes is None:
+            raise KsftSkipEx(f"Partner link modes not available for {self.cfg.ifname}")
+        # Set intersection: a boolean `and` would simply return the partner's set
+        # whenever the local set is non-empty, keeping modes the local NIC lacks.
+        return set(link_modes) & set(partner_link_modes)
+
+    def get_speed_duplex_values(self, link_modes: list[str]) -> tuple[list[str], list[str]]:
+        """Return (speeds, duplexes) parsed from link modes; (None, None) if a mode has no leading digits."""
+        speed = []
+        duplex = []
+        for data in link_modes:
+            parts = data.split('/')
+            speed_value = re.match(r'\d+', parts[0])
+            if speed_value:
+                speed.append(speed_value.group())
+            else:
+                ksft_pr(f"No speed value found for interface {self.cfg.ifname}")
+                return None, None
+            duplex.append(parts[1].lower())
+        return speed, duplex
+
+    def get_ethtool_field(self, field: str, remote: bool = False) -> Optional[str]:
+        """Return `field` from the interface's ethtool JSON; None if it cannot be read, KsftSkipEx if absent."""
+        if not remote:
+            # Get the ethtool field value for the local interface
+            try:
+                process = ethtool(self.cfg.ifname, json=True)
+            except Exception as e:
+                ksft_pr("Required minimum ethtool version is 6.10")
+                ksft_pr(f"Unexpected error occurred while getting ethtool field in local: {e}")
+                return None
+        else:
+            if not self.verify_link_up():
+                return None
+            # Get the ethtool field value for the remote interface
+            self.cfg.require_cmd("ethtool", remote=True)
+            if self.partner_netif is None:
+                ksft_pr("Partner interface name is unavailable.")
+                return None
+            try:
+                process = ethtool(self.partner_netif, json=True, host=self.cfg.remote)
+            except Exception as e:
+                ksft_pr("Required minimum ethtool version is 6.10")
+                ksft_pr(f"Unexpected error occurred while getting ethtool field in remote: {e}")
+                return None
+        json_data = process[0]
+        # Inner single quotes: reusing the outer quote type in an f-string is a SyntaxError before Python 3.12.
+        if field not in json_data:
+            raise KsftSkipEx(f"Field {field} does not exist in the output of interface {json_data['ifname']}")
+        return json_data[field]
diff --git a/tools/testing/selftests/drivers/net/hw/nic_link_layer.py b/tools/testing/selftests/drivers/net/hw/nic_link_layer.py
new file mode 100644
index 000000000000..d8cc12e84a40
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nic_link_layer.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+#Introduction:
+#This file has basic link layer tests for generic NIC drivers.
+#The test comprises auto-negotiation, speed and duplex checks.
+#
+#Setup:
+#Connect the DUT PC (with the NIC card) directly to the partner PC via an ethernet medium of your choice (RJ45, T1)
+#
+#        DUT PC                                              Partner PC
+#┌───────────────────────┐                         ┌──────────────────────────┐
+#│                       │                         │                          │
+#│                       │                         │                          │
+#│           ┌───────────┐                         │                          │
+#│           │DUT NIC    │         Eth             │                          │
+#│           │Interface ─┼─────────────────────────┼─    any eth Interface    │
+#│           └───────────┘                         │                          │
+#│                       │                         │                          │
+#│                       │                         │                          │
+#└───────────────────────┘                         └──────────────────────────┘
+#
+#Configurations:
+#Required minimum ethtool version is 6.10 (supports json)
+#Default values:
+#time_delay = 8 #time taken to wait for transitions to happen, in seconds.
+
+import time
+import argparse
+from lib.py import ksft_run, ksft_exit, ksft_pr, ksft_eq
+from lib.py import KsftFailEx, KsftSkipEx
+from lib.py import NetDrvEpEnv
+from lib.py import LinkConfig
+
+def _pre_test_checks(cfg: object, link_config: LinkConfig) -> None:  # raises KsftSkipEx when a precondition fails
+    if link_config.partner_netif is None:
+        raise KsftSkipEx("Partner interface is not available")
+    if not link_config.check_autoneg_supported() or not link_config.check_autoneg_supported(remote=True):
+        raise KsftSkipEx(f"Auto-negotiation not supported for interface {cfg.ifname} or {link_config.partner_netif}")
+    if not link_config.verify_link_up():
+        raise KsftSkipEx(f"Link state of interface {cfg.ifname} is DOWN")
+
+def verify_autonegotiation(cfg: object, expected_state: str, link_config: LinkConfig) -> None:
+    """Check that both ends report `expected_state` for auto-negotiation and that the link is detected."""
+    if not link_config.verify_link_up():
+        raise KsftSkipEx(f"Link state of interface {cfg.ifname} is DOWN")
+    # Verifying the autonegotiation state in partner
+    partner_autoneg_output = link_config.get_ethtool_field("auto-negotiation", remote=True)
+    if partner_autoneg_output is None:
+        raise KsftSkipEx(f"Auto-negotiation state not available for interface {link_config.partner_netif}")
+    partner_autoneg_state = "on" if partner_autoneg_output is True else "off"
+    ksft_eq(partner_autoneg_state, expected_state)
+
+    # Verifying the autonegotiation state of local
+    autoneg_output = link_config.get_ethtool_field("auto-negotiation")
+    if autoneg_output is None:
+        raise KsftSkipEx(f"Auto-negotiation state not available for interface {cfg.ifname}")
+    actual_state = "on" if autoneg_output is True else "off"
+
+    ksft_eq(actual_state, expected_state)
+
+    # Verifying the link establishment
+    link_available = link_config.get_ethtool_field("link-detected")
+    if link_available is None:
+        raise KsftSkipEx(f"Link status not available for interface {cfg.ifname}")
+    if link_available != True:
+        raise KsftSkipEx(f"Link not established at interface {cfg.ifname} after changing auto-negotiation")
+
+def test_autonegotiation(cfg: object, link_config: LinkConfig, time_delay: int) -> None:
+    _pre_test_checks(cfg, link_config)
+    for state in ["off", "on"]:
+        if not link_config.set_autonegotiation_state(state, remote=True):
+            raise KsftSkipEx(f"Unable to set auto-negotiation state for interface {link_config.partner_netif}")
+        if not link_config.set_autonegotiation_state(state):
+            raise KsftSkipEx(f"Unable to set auto-negotiation state for interface {cfg.ifname}")
+        time.sleep(time_delay)
+        verify_autonegotiation(cfg, state, link_config)
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run basic link layer tests for NIC driver")
+    parser.add_argument('--time-delay', type=int, default=8, help='Time taken to wait for transitions to happen(in seconds). Default is 8 seconds.')
+    args = parser.parse_args()
+    time_delay = args.time_delay
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+        link_config = LinkConfig(cfg)
+        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, link_config, time_delay,))
+        link_config.reset_interface()
+    ksft_exit()
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From c087dc54394b3f8a2950007fb17a8937a38fe73a Mon Sep 17 00:00:00 2001
From: Mohan Prasad J <mohan.prasad@microchip.com>
Date: Fri, 15 Nov 2024 00:55:19 +0530
Subject: selftests: nic_link_layer: Add selftest case for speed and duplex
 states

Add selftest case for testing the speed and duplex state of
local NIC driver and the partner based on the supported
link modes obtained from the ethtool. Speed and duplex states
are varied and verified using ethtool.

Signed-off-by: Mohan Prasad J <mohan.prasad@microchip.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/hw/nic_link_layer.py     | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/nic_link_layer.py b/tools/testing/selftests/drivers/net/hw/nic_link_layer.py
index d8cc12e84a40..efd921180532 100644
--- a/tools/testing/selftests/drivers/net/hw/nic_link_layer.py
+++ b/tools/testing/selftests/drivers/net/hw/nic_link_layer.py
@@ -76,6 +76,28 @@ def test_autonegotiation(cfg: object, link_config: LinkConfig, time_delay: int)
         time.sleep(time_delay)
         verify_autonegotiation(cfg, state, link_config)
 
+def test_network_speed(cfg: object, link_config: LinkConfig, time_delay: int) -> None:
+    """Apply each common speed/duplex pair on the local interface and verify it.
+    Raises KsftSkipEx when preconditions are not met, KsftFailEx when a mode cannot be set."""
+    _pre_test_checks(cfg, link_config)
+    common_link_modes = link_config.common_link_modes
+    if not common_link_modes:
+        raise KsftSkipEx("No common link modes exist")
+    speeds, duplex_modes = link_config.get_speed_duplex_values(common_link_modes)
+
+    # Guard clauses: the exceptions must be raised, constructing them alone has no effect.
+    if not speeds or not duplex_modes:
+        raise KsftSkipEx(f"No supported speeds or duplex modes found for interface {cfg.ifname}")
+    if len(speeds) != len(duplex_modes):
+        raise KsftSkipEx("Mismatch in the number of speeds and duplex modes")
+
+    for speed, duplex in zip(speeds, duplex_modes):
+        if not link_config.set_speed_and_duplex(speed, duplex):
+            raise KsftFailEx(f"Unable to set speed and duplex parameters for {cfg.ifname}")
+        time.sleep(time_delay)
+        if not link_config.verify_speed_and_duplex(speed, duplex):
+            raise KsftSkipEx(f"Error occurred while verifying speed and duplex states for interface {cfg.ifname}")
+
 def main() -> None:
     parser = argparse.ArgumentParser(description="Run basic link layer tests for NIC driver")
     parser.add_argument('--time-delay', type=int, default=8, help='Time taken to wait for transitions to happen(in seconds). Default is 8 seconds.')
-- 
cgit v1.2.3


From fbbf93556f0c1ad9b53fd1ec8fd2e67b2debb740 Mon Sep 17 00:00:00 2001
From: Mohan Prasad J <mohan.prasad@microchip.com>
Date: Fri, 15 Nov 2024 00:55:20 +0530
Subject: selftests: nic_performance: Add selftest for performance of NIC
 driver

Add selftest case to check the send and receive throughput.
Supported link modes between local NIC driver and partner
are varied. Then send and receive throughput is captured
and verified. Test uses iperf3 tool.
Add iperf3 server/client function in GenerateTraffic class.

Signed-off-by: Mohan Prasad J <mohan.prasad@microchip.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/Makefile    |   1 +
 .../selftests/drivers/net/hw/nic_performance.py    | 137 +++++++++++++++++++++
 tools/testing/selftests/drivers/net/lib/py/load.py |  20 ++-
 3 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/drivers/net/hw/nic_performance.py

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index a87f1f17a10a..21ba64ce1e34 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -12,6 +12,7 @@ TEST_PROGS = \
 	hw_stats_l3_gre.sh \
 	loopback.sh \
 	nic_link_layer.py \
+	nic_performance.py \
 	pp_alloc_fail.py \
 	rss_ctx.py \
 	#
diff --git a/tools/testing/selftests/drivers/net/hw/nic_performance.py b/tools/testing/selftests/drivers/net/hw/nic_performance.py
new file mode 100644
index 000000000000..201403b76ea3
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nic_performance.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+#Introduction:
+#This file has basic performance test for generic NIC drivers.
+#The test comprises throughput checks for TCP and UDP streams.
+#
+#Setup:
+#Connect the DUT PC (with the NIC card) directly to the partner PC via an ethernet medium of your choice (RJ45, T1)
+#
+#        DUT PC                                              Partner PC
+#┌───────────────────────┐                         ┌──────────────────────────┐
+#│                       │                         │                          │
+#│                       │                         │                          │
+#│           ┌───────────┐                         │                          │
+#│           │DUT NIC    │         Eth             │                          │
+#│           │Interface ─┼─────────────────────────┼─    any eth Interface    │
+#│           └───────────┘                         │                          │
+#│                       │                         │                          │
+#│                       │                         │                          │
+#└───────────────────────┘                         └──────────────────────────┘
+#
+#Configurations:
+#To prevent interruptions, add ethtool and ip to the sudoers list on the remote PC and get the ssh key from the remote PC.
+#Required minimum ethtool version is 6.10
+#Change the below configuration based on your hw needs.
+# """Default values"""
+#time_delay = 8 #time taken to wait for transitions to happen, in seconds.
+#test_duration = 10  #performance test duration for the throughput check, in seconds.
+#send_throughput_threshold = 80 #percentage of send throughput required to pass the check
+#receive_throughput_threshold = 50 #percentage of receive throughput required to pass the check
+
+import time
+import json
+import argparse
+from lib.py import ksft_run, ksft_exit, ksft_pr, ksft_true
+from lib.py import KsftFailEx, KsftSkipEx, GenerateTraffic
+from lib.py import NetDrvEpEnv, bkg, wait_port_listen
+from lib.py import cmd
+from lib.py import LinkConfig
+
+class TestConfig:
+    def __init__(self, time_delay: int, test_duration: int, send_throughput_threshold: int, receive_throughput_threshold: int) -> None:
+        self.time_delay = time_delay
+        self.test_duration = test_duration
+        self.send_throughput_threshold = send_throughput_threshold
+        self.receive_throughput_threshold = receive_throughput_threshold
+
+def _pre_test_checks(cfg: object, link_config: LinkConfig) -> None:
+    """Raise KsftSkipEx unless link, common modes, partner, autoneg support and iperf3 are all available."""
+    if not link_config.verify_link_up():
+        raise KsftSkipEx(f"Link state of interface {cfg.ifname} is DOWN")
+    if not link_config.common_link_modes:
+        raise KsftSkipEx("No common link modes found")
+    if link_config.partner_netif is None:
+        raise KsftSkipEx("Partner interface is not available")
+    if not link_config.check_autoneg_supported():
+        raise KsftSkipEx("Auto-negotiation not supported by local")
+    if not link_config.check_autoneg_supported(remote=True):
+        raise KsftSkipEx("Auto-negotiation not supported by remote")
+    cfg.require_cmd("iperf3", remote=True)
+
+def check_throughput(cfg: object, link_config: LinkConfig, test_config: TestConfig, protocol: str, traffic: GenerateTraffic) -> None:
+    """Measure send/receive throughput for every common link mode and compare it with the configured thresholds."""
+    common_link_modes = link_config.common_link_modes
+    speeds, duplex_modes = link_config.get_speed_duplex_values(common_link_modes)
+    duration = test_config.test_duration  # Test duration in seconds
+
+    ksft_pr(f"{protocol} test")
+    test_type = "-u" if protocol == "UDP" else ""
+
+    send_throughput = []
+    receive_throughput = []
+    for idx in range(0, len(speeds)):
+        if not link_config.set_speed_and_duplex(speeds[idx], duplex_modes[idx]):
+            raise KsftFailEx(f"Not able to set speed and duplex parameters for {cfg.ifname}")
+        time.sleep(test_config.time_delay)
+        if not link_config.verify_link_up():
+            raise KsftSkipEx(f"Link state of interface {cfg.ifname} is DOWN")
+
+        send_command=f"{test_type} -b 0 -t {duration} --json"
+        receive_command=f"{test_type} -b 0 -t {duration} --reverse --json"
+
+        send_result = traffic.run_remote_test(cfg, command=send_command)
+        if send_result.ret != 0:
+            raise KsftSkipEx(f"Error occurred during data transmit: {send_result.stdout}")
+
+        send_output = send_result.stdout
+        send_data = json.loads(send_output)
+
+        # Convert throughput to Mbps
+        send_throughput.append(round(send_data['end']['sum_sent']['bits_per_second'] / 1e6, 2))
+        ksft_pr(f"{protocol}: Send throughput: {send_throughput[idx]} Mbps")
+
+        receive_result = traffic.run_remote_test(cfg, command=receive_command)
+        if receive_result.ret != 0:
+            raise KsftSkipEx(f"Error occurred during data receive: {receive_result.stdout}")
+
+        receive_output = receive_result.stdout
+        receive_data = json.loads(receive_output)
+
+        # Convert throughput to Mbps
+        receive_throughput.append(round(receive_data['end']['sum_received']['bits_per_second'] / 1e6, 2))
+        ksft_pr(f"{protocol}: Receive throughput: {receive_throughput[idx]} Mbps")
+
+    # Check whether throughput is not below the threshold (default values set at start)
+    for idx in range(0, len(speeds)):
+        send_threshold = float(speeds[idx]) * float(test_config.send_throughput_threshold / 100)
+        receive_threshold = float(speeds[idx]) * float(test_config.receive_throughput_threshold / 100)
+        ksft_true(send_throughput[idx] >= send_threshold, f"{protocol}: Send throughput is below threshold for {speeds[idx]} Mbps in {duplex_modes[idx]} duplex")
+        ksft_true(receive_throughput[idx] >= receive_threshold, f"{protocol}: Receive throughput is below threshold for {speeds[idx]} Mbps in {duplex_modes[idx]} duplex")
+
+def test_tcp_throughput(cfg: object, link_config: LinkConfig, test_config: TestConfig, traffic: GenerateTraffic) -> None:
+    _pre_test_checks(cfg, link_config)
+    check_throughput(cfg, link_config, test_config, 'TCP', traffic)
+
+def test_udp_throughput(cfg: object, link_config: LinkConfig, test_config: TestConfig, traffic: GenerateTraffic) -> None:
+    _pre_test_checks(cfg, link_config)
+    check_throughput(cfg, link_config, test_config, 'UDP', traffic)
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run basic performance test for NIC driver")
+    parser.add_argument('--time-delay', type=int, default=8, help='Time taken to wait for transitions to happen(in seconds). Default is 8 seconds.')
+    parser.add_argument('--test-duration', type=int, default=10, help='Performance test duration for the throughput check, in seconds. Default is 10 seconds.')
+    parser.add_argument('--stt', type=int, default=80, help='Send throughput Threshold: Percentage of send throughput upon actual throughput required to pass the throughput check (in percentage). Default is 80.')
+    parser.add_argument('--rtt', type=int, default=50, help='Receive throughput Threshold: Percentage of receive throughput upon actual throughput required to pass the throughput check (in percentage). Default is 50.')
+    args=parser.parse_args()
+    test_config = TestConfig(args.time_delay, args.test_duration, args.stt, args.rtt)
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+        traffic = GenerateTraffic(cfg)
+        link_config = LinkConfig(cfg)
+        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, link_config, test_config, traffic,  ))
+        link_config.reset_interface()
+    ksft_exit()
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py
index d9c10613ae67..da5af2c680fa 100644
--- a/tools/testing/selftests/drivers/net/lib/py/load.py
+++ b/tools/testing/selftests/drivers/net/lib/py/load.py
@@ -2,7 +2,7 @@
 
 import time
 
-from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen
+from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen, bkg
 
 class GenerateTraffic:
     def __init__(self, env, port=None):
@@ -23,6 +23,24 @@ class GenerateTraffic:
             self.stop(verbose=True)
             raise Exception("iperf3 traffic did not ramp up")
 
+    def run_remote_test(self, env: object, port=None, command=None):
+        if port is None:
+            port = rand_port()
+        try:
+            server_cmd = f"iperf3 -s 1 -p {port} --one-off"
+            with bkg(server_cmd, host=env.remote):
+                #iperf3 opens TCP connection as default in server
+                #-u to be specified in client command for UDP
+                wait_port_listen(port, host=env.remote)
+        except Exception as e:
+            raise Exception(f"Unexpected error occurred while running server command: {e}")
+        try:
+            client_cmd = f"iperf3 -c {env.remote_addr} -p {port} {command}"
+            proc = cmd(client_cmd)
+            return proc
+        except Exception as e:
+            raise Exception(f"Unexpected error occurred while running client command: {e}")
+
     def _wait_pkts(self, pkt_cnt=None, pps=None):
         """
         Wait until we've seen pkt_cnt or until traffic ramps up to pps.
-- 
cgit v1.2.3


From 96ed62ea02984f14b6d4f2e4aed327d803875b7a Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Tue, 19 Nov 2024 11:30:11 +0800
Subject: mm: page_frag: fix a compile error when kernel is not compiled

The page_frag test module is an out-of-tree module, but it is built
using KDIR as the main kernel tree. As a result, the whole mm test
suite gets skipped if the newly added page_frag test module fails to
compile because the kernel has not been built yet.

Fix the above problem by ensuring both that the kernel is built first
and that a newer kernel which has page_frag_cache.h is used.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexander Duyck <alexanderduyck@fb.com>
CC: Linux-MM <linux-mm@kvack.org>
Fixes: 7fef0dec415c ("mm: page_frag: add a test module for page_frag")
Fixes: 65941f10caf2 ("mm: move the page fragment allocator from page_alloc into its own file")
Reported-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://patch.msgid.link/20241119033012.257525-1-linyunsheng@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/mm/Makefile           | 18 ++++++++++++++++++
 tools/testing/selftests/mm/page_frag/Makefile |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index acec529baaca..04e04733fc8a 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -36,7 +36,16 @@ MAKEFLAGS += --no-builtin-rules
 CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS = -lrt -lpthread -lm
 
+KDIR ?= /lib/modules/$(shell uname -r)/build
+ifneq (,$(wildcard $(KDIR)/Module.symvers))
+ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h))
 TEST_GEN_MODS_DIR := page_frag
+else
+PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel"
+endif
+else
+PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first"
+endif
 
 TEST_GEN_FILES = cow
 TEST_GEN_FILES += compaction_test
@@ -214,3 +223,12 @@ warn_missing_liburing:
 	echo "Warning: missing liburing support. Some tests will be skipped." ; \
 	echo
 endif
+
+ifneq ($(PAGE_FRAG_WARNING),)
+all: warn_missing_page_frag
+
+warn_missing_page_frag:
+	@echo ; \
+	echo "Warning: $(PAGE_FRAG_WARNING). page_frag test will be skipped." ; \
+	echo
+endif
diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile
index 58dda74d50a3..8c8bb39ffa28 100644
--- a/tools/testing/selftests/mm/page_frag/Makefile
+++ b/tools/testing/selftests/mm/page_frag/Makefile
@@ -1,5 +1,5 @@
 PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
-KDIR ?= $(abspath $(PAGE_FRAG_TEST_DIR)/../../../../..)
+KDIR ?= /lib/modules/$(shell uname -r)/build
 
 ifeq ($(V),1)
 Q =
-- 
cgit v1.2.3


From 062a9dd9bad7d802a6f6f23b09118b69d8766c61 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Wed, 13 Nov 2024 22:24:05 +0000
Subject: kunit: tool: Only print the summary

Allow printing only the summary at the end of a test run, rather than all
individual test results. If there are any failing tests, the summary will
list a few of them.

To use:

./tools/testing/kunit/kunit.py run --summary

Link: https://lore.kernel.org/r/20241113222406.1590372-1-rmoar@google.com
Signed-off-by: Rae Moar <rmoar@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           |  18 +++++-
 tools/testing/kunit/kunit_parser.py    | 109 ++++++++++++++++++---------------
 tools/testing/kunit/kunit_printer.py   |  14 +++--
 tools/testing/kunit/kunit_tool_test.py |  55 +++++++++--------
 4 files changed, 112 insertions(+), 84 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index bc74088c458a..27c55a7fc1a0 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -23,7 +23,7 @@ from typing import Iterable, List, Optional, Sequence, Tuple
 import kunit_json
 import kunit_kernel
 import kunit_parser
-from kunit_printer import stdout
+from kunit_printer import stdout, null_printer
 
 class KunitStatus(Enum):
 	SUCCESS = auto()
@@ -49,6 +49,7 @@ class KunitBuildRequest(KunitConfigRequest):
 class KunitParseRequest:
 	raw_output: Optional[str]
 	json: Optional[str]
+	summary: bool
 
 @dataclass
 class KunitExecRequest(KunitParseRequest):
@@ -235,11 +236,16 @@ def parse_tests(request: KunitParseRequest, metadata: kunit_json.Metadata, input
 		parse_time = time.time() - parse_start
 		return KunitResult(KunitStatus.SUCCESS, parse_time), fake_test
 
+	default_printer = stdout
+	if request.summary:
+		default_printer = null_printer
 
 	# Actually parse the test results.
-	test = kunit_parser.parse_run_tests(input_data)
+	test = kunit_parser.parse_run_tests(input_data, default_printer)
 	parse_time = time.time() - parse_start
 
+	kunit_parser.print_summary_line(test, stdout)
+
 	if request.json:
 		json_str = kunit_json.get_json_result(
 					test=test,
@@ -413,6 +419,10 @@ def add_parse_opts(parser: argparse.ArgumentParser) -> None:
 			    help='Prints parsed test results as JSON to stdout or a file if '
 			    'a filename is specified. Does nothing if --raw_output is set.',
 			    type=str, const='stdout', default=None, metavar='FILE')
+	parser.add_argument('--summary',
+			    help='Prints only the summary line for parsed test results.'
+				'Does nothing if --raw_output is set.',
+			    action='store_true')
 
 
 def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree:
@@ -448,6 +458,7 @@ def run_handler(cli_args: argparse.Namespace) -> None:
 					jobs=cli_args.jobs,
 					raw_output=cli_args.raw_output,
 					json=cli_args.json,
+					summary=cli_args.summary,
 					timeout=cli_args.timeout,
 					filter_glob=cli_args.filter_glob,
 					filter=cli_args.filter,
@@ -495,6 +506,7 @@ def exec_handler(cli_args: argparse.Namespace) -> None:
 	exec_request = KunitExecRequest(raw_output=cli_args.raw_output,
 					build_dir=cli_args.build_dir,
 					json=cli_args.json,
+					summary=cli_args.summary,
 					timeout=cli_args.timeout,
 					filter_glob=cli_args.filter_glob,
 					filter=cli_args.filter,
@@ -520,7 +532,7 @@ def parse_handler(cli_args: argparse.Namespace) -> None:
 	# We know nothing about how the result was created!
 	metadata = kunit_json.Metadata()
 	request = KunitParseRequest(raw_output=cli_args.raw_output,
-					json=cli_args.json)
+					json=cli_args.json, summary=cli_args.summary)
 	result, _ = parse_tests(request, metadata, kunit_output)
 	if result.status != KunitStatus.SUCCESS:
 		sys.exit(1)
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index ce34be15c929..732f448263de 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -17,7 +17,7 @@ import textwrap
 from enum import Enum, auto
 from typing import Iterable, Iterator, List, Optional, Tuple
 
-from kunit_printer import stdout
+from kunit_printer import Printer, stdout
 
 class Test:
 	"""
@@ -54,10 +54,10 @@ class Test:
 		"""Returns string representation of a Test class object."""
 		return str(self)
 
-	def add_error(self, error_message: str) -> None:
+	def add_error(self, printer: Printer, error_message: str) -> None:
 		"""Records an error that occurred while parsing this test."""
 		self.counts.errors += 1
-		stdout.print_with_timestamp(stdout.red('[ERROR]') + f' Test: {self.name}: {error_message}')
+		printer.print_with_timestamp(stdout.red('[ERROR]') + f' Test: {self.name}: {error_message}')
 
 	def ok_status(self) -> bool:
 		"""Returns true if the status was ok, i.e. passed or skipped."""
@@ -251,7 +251,7 @@ KTAP_VERSIONS = [1]
 TAP_VERSIONS = [13, 14]
 
 def check_version(version_num: int, accepted_versions: List[int],
-			version_type: str, test: Test) -> None:
+			version_type: str, test: Test, printer: Printer) -> None:
 	"""
 	Adds error to test object if version number is too high or too
 	low.
@@ -263,13 +263,14 @@ def check_version(version_num: int, accepted_versions: List[int],
 	version_type - 'KTAP' or 'TAP' depending on the type of
 		version line.
 	test - Test object for current test being parsed
+	printer - Printer object to output error
 	"""
 	if version_num < min(accepted_versions):
-		test.add_error(f'{version_type} version lower than expected!')
+		test.add_error(printer, f'{version_type} version lower than expected!')
 	elif version_num > max(accepted_versions):
-		test.add_error(f'{version_type} version higer than expected!')
+		test.add_error(printer, f'{version_type} version higer than expected!')
 
-def parse_ktap_header(lines: LineStream, test: Test) -> bool:
+def parse_ktap_header(lines: LineStream, test: Test, printer: Printer) -> bool:
 	"""
 	Parses KTAP/TAP header line and checks version number.
 	Returns False if fails to parse KTAP/TAP header line.
@@ -281,6 +282,7 @@ def parse_ktap_header(lines: LineStream, test: Test) -> bool:
 	Parameters:
 	lines - LineStream of KTAP output to parse
 	test - Test object for current test being parsed
+	printer - Printer object to output results
 
 	Return:
 	True if successfully parsed KTAP/TAP header line
@@ -289,10 +291,10 @@ def parse_ktap_header(lines: LineStream, test: Test) -> bool:
 	tap_match = TAP_START.match(lines.peek())
 	if ktap_match:
 		version_num = int(ktap_match.group(1))
-		check_version(version_num, KTAP_VERSIONS, 'KTAP', test)
+		check_version(version_num, KTAP_VERSIONS, 'KTAP', test, printer)
 	elif tap_match:
 		version_num = int(tap_match.group(1))
-		check_version(version_num, TAP_VERSIONS, 'TAP', test)
+		check_version(version_num, TAP_VERSIONS, 'TAP', test, printer)
 	else:
 		return False
 	lines.pop()
@@ -380,7 +382,7 @@ def peek_test_name_match(lines: LineStream, test: Test) -> bool:
 	return name == test.name
 
 def parse_test_result(lines: LineStream, test: Test,
-			expected_num: int) -> bool:
+			expected_num: int, printer: Printer) -> bool:
 	"""
 	Parses test result line and stores the status and name in the test
 	object. Reports an error if the test number does not match expected
@@ -398,6 +400,7 @@ def parse_test_result(lines: LineStream, test: Test,
 	lines - LineStream of KTAP output to parse
 	test - Test object for current test being parsed
 	expected_num - expected test number for current test
+	printer - Printer object to output results
 
 	Return:
 	True if successfully parsed a test result line.
@@ -420,7 +423,7 @@ def parse_test_result(lines: LineStream, test: Test,
 	# Check test num
 	num = int(match.group(2))
 	if num != expected_num:
-		test.add_error(f'Expected test number {expected_num} but found {num}')
+		test.add_error(printer, f'Expected test number {expected_num} but found {num}')
 
 	# Set status of test object
 	status = match.group(1)
@@ -486,7 +489,7 @@ def format_test_divider(message: str, len_message: int) -> str:
 		len_2 = difference - len_1
 	return ('=' * len_1) + f' {message} ' + ('=' * len_2)
 
-def print_test_header(test: Test) -> None:
+def print_test_header(test: Test, printer: Printer) -> None:
 	"""
 	Prints test header with test name and optionally the expected number
 	of subtests.
@@ -496,6 +499,7 @@ def print_test_header(test: Test) -> None:
 
 	Parameters:
 	test - Test object representing current test being printed
+	printer - Printer object to output results
 	"""
 	message = test.name
 	if message != "":
@@ -507,15 +511,15 @@ def print_test_header(test: Test) -> None:
 			message += '(1 subtest)'
 		else:
 			message += f'({test.expected_count} subtests)'
-	stdout.print_with_timestamp(format_test_divider(message, len(message)))
+	printer.print_with_timestamp(format_test_divider(message, len(message)))
 
-def print_log(log: Iterable[str]) -> None:
+def print_log(log: Iterable[str], printer: Printer) -> None:
 	"""Prints all strings in saved log for test in yellow."""
 	formatted = textwrap.dedent('\n'.join(log))
 	for line in formatted.splitlines():
-		stdout.print_with_timestamp(stdout.yellow(line))
+		printer.print_with_timestamp(printer.yellow(line))
 
-def format_test_result(test: Test) -> str:
+def format_test_result(test: Test, printer: Printer) -> str:
 	"""
 	Returns string with formatted test result with colored status and test
 	name.
@@ -525,23 +529,24 @@ def format_test_result(test: Test) -> str:
 
 	Parameters:
 	test - Test object representing current test being printed
+	printer - Printer object to output results
 
 	Return:
 	String containing formatted test result
 	"""
 	if test.status == TestStatus.SUCCESS:
-		return stdout.green('[PASSED] ') + test.name
+		return printer.green('[PASSED] ') + test.name
 	if test.status == TestStatus.SKIPPED:
-		return stdout.yellow('[SKIPPED] ') + test.name
+		return printer.yellow('[SKIPPED] ') + test.name
 	if test.status == TestStatus.NO_TESTS:
-		return stdout.yellow('[NO TESTS RUN] ') + test.name
+		return printer.yellow('[NO TESTS RUN] ') + test.name
 	if test.status == TestStatus.TEST_CRASHED:
-		print_log(test.log)
+		print_log(test.log, printer)
 		return stdout.red('[CRASHED] ') + test.name
-	print_log(test.log)
-	return stdout.red('[FAILED] ') + test.name
+	print_log(test.log, printer)
+	return printer.red('[FAILED] ') + test.name
 
-def print_test_result(test: Test) -> None:
+def print_test_result(test: Test, printer: Printer) -> None:
 	"""
 	Prints result line with status of test.
 
@@ -550,10 +555,11 @@ def print_test_result(test: Test) -> None:
 
 	Parameters:
 	test - Test object representing current test being printed
+	printer - Printer object
 	"""
-	stdout.print_with_timestamp(format_test_result(test))
+	printer.print_with_timestamp(format_test_result(test, printer))
 
-def print_test_footer(test: Test) -> None:
+def print_test_footer(test: Test, printer: Printer) -> None:
 	"""
 	Prints test footer with status of test.
 
@@ -562,10 +568,11 @@ def print_test_footer(test: Test) -> None:
 
 	Parameters:
 	test - Test object representing current test being printed
+	printer - Printer object to output results
 	"""
-	message = format_test_result(test)
-	stdout.print_with_timestamp(format_test_divider(message,
-		len(message) - stdout.color_len()))
+	message = format_test_result(test, printer)
+	printer.print_with_timestamp(format_test_divider(message,
+		len(message) - printer.color_len()))
 
 
@@ -601,7 +608,7 @@ def _summarize_failed_tests(test: Test) -> str:
 	return 'Failures: ' + ', '.join(failures)
 
 
-def print_summary_line(test: Test) -> None:
+def print_summary_line(test: Test, printer: Printer) -> None:
 	"""
 	Prints summary line of test object. Color of line is dependent on
 	status of test. Color is green if test passes, yellow if test is
@@ -614,6 +621,7 @@ def print_summary_line(test: Test) -> None:
 	Errors: 0"
 
 	test - Test object representing current test being printed
+	printer - Printer object to output results
 	"""
 	if test.status == TestStatus.SUCCESS:
 		color = stdout.green
@@ -621,7 +629,7 @@ def print_summary_line(test: Test) -> None:
 		color = stdout.yellow
 	else:
 		color = stdout.red
-	stdout.print_with_timestamp(color(f'Testing complete. {test.counts}'))
+	printer.print_with_timestamp(color(f'Testing complete. {test.counts}'))
 
 	# Summarize failures that might have gone off-screen since we had a lot
 	# of tests (arbitrarily defined as >=100 for now).
@@ -630,7 +638,7 @@ def print_summary_line(test: Test) -> None:
 	summarized = _summarize_failed_tests(test)
 	if not summarized:
 		return
-	stdout.print_with_timestamp(color(summarized))
+	printer.print_with_timestamp(color(summarized))
 
 # Other methods:
 
@@ -654,7 +662,7 @@ def bubble_up_test_results(test: Test) -> None:
 	elif test.counts.get_status() == TestStatus.TEST_CRASHED:
 		test.status = TestStatus.TEST_CRASHED
 
-def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool) -> Test:
+def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool, printer: Printer) -> Test:
 	"""
 	Finds next test to parse in LineStream, creates new Test object,
 	parses any subtests of the test, populates Test object with all
@@ -710,6 +718,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
 	log - list of strings containing any preceding diagnostic lines
 		corresponding to the current test
 	is_subtest - boolean indicating whether test is a subtest
+	printer - Printer object to output results
 
 	Return:
 	Test object populated with characteristics and any subtests
@@ -725,14 +734,14 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
 		# If parsing the main/top-level test, parse KTAP version line and
 		# test plan
 		test.name = "main"
-		ktap_line = parse_ktap_header(lines, test)
+		ktap_line = parse_ktap_header(lines, test, printer)
 		test.log.extend(parse_diagnostic(lines))
 		parse_test_plan(lines, test)
 		parent_test = True
 	else:
 		# If not the main test, attempt to parse a test header containing
 		# the KTAP version line and/or subtest header line
-		ktap_line = parse_ktap_header(lines, test)
+		ktap_line = parse_ktap_header(lines, test, printer)
 		subtest_line = parse_test_header(lines, test)
 		parent_test = (ktap_line or subtest_line)
 		if parent_test:
@@ -740,7 +749,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
 			# to parse test plan and print test header
 			test.log.extend(parse_diagnostic(lines))
 			parse_test_plan(lines, test)
-			print_test_header(test)
+			print_test_header(test, printer)
 	expected_count = test.expected_count
 	subtests = []
 	test_num = 1
@@ -758,16 +767,16 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
 				# If parser reaches end of test before
 				# parsing expected number of subtests, print
 				# crashed subtest and record error
-				test.add_error('missing expected subtest!')
+				test.add_error(printer, 'missing expected subtest!')
 				sub_test.log.extend(sub_log)
 				test.counts.add_status(
 					TestStatus.TEST_CRASHED)
-				print_test_result(sub_test)
+				print_test_result(sub_test, printer)
 			else:
 				test.log.extend(sub_log)
 				break
 		else:
-			sub_test = parse_test(lines, test_num, sub_log, True)
+			sub_test = parse_test(lines, test_num, sub_log, True, printer)
 		subtests.append(sub_test)
 		test_num += 1
 	test.subtests = subtests
@@ -775,51 +784,51 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
 		# If not main test, look for test result line
 		test.log.extend(parse_diagnostic(lines))
 		if test.name != "" and not peek_test_name_match(lines, test):
-			test.add_error('missing subtest result line!')
+			test.add_error(printer, 'missing subtest result line!')
 		else:
-			parse_test_result(lines, test, expected_num)
+			parse_test_result(lines, test, expected_num, printer)
 
 	# Check for there being no subtests within parent test
 	if parent_test and len(subtests) == 0:
 		# Don't override a bad status if this test had one reported.
 		# Assumption: no subtests means CRASHED is from Test.__init__()
 		if test.status in (TestStatus.TEST_CRASHED, TestStatus.SUCCESS):
-			print_log(test.log)
+			print_log(test.log, printer)
 			test.status = TestStatus.NO_TESTS
-			test.add_error('0 tests run!')
+			test.add_error(printer, '0 tests run!')
 
 	# Add statuses to TestCounts attribute in Test object
 	bubble_up_test_results(test)
 	if parent_test and is_subtest:
 		# If test has subtests and is not the main test object, print
 		# footer.
-		print_test_footer(test)
+		print_test_footer(test, printer)
 	elif is_subtest:
-		print_test_result(test)
+		print_test_result(test, printer)
 	return test
 
-def parse_run_tests(kernel_output: Iterable[str]) -> Test:
+def parse_run_tests(kernel_output: Iterable[str], printer: Printer) -> Test:
 	"""
 	Using kernel output, extract KTAP lines, parse the lines for test
 	results and print condensed test results and summary line.
 
 	Parameters:
 	kernel_output - Iterable object contains lines of kernel output
+	printer - Printer object to output results
 
 	Return:
 	Test - the main test object with all subtests.
 	"""
-	stdout.print_with_timestamp(DIVIDER)
+	printer.print_with_timestamp(DIVIDER)
 	lines = extract_tap_lines(kernel_output)
 	test = Test()
 	if not lines:
 		test.name = '<missing>'
-		test.add_error('Could not find any KTAP output. Did any KUnit tests run?')
+		test.add_error(printer, 'Could not find any KTAP output. Did any KUnit tests run?')
 		test.status = TestStatus.FAILURE_TO_PARSE_TESTS
 	else:
-		test = parse_test(lines, 0, [], False)
+		test = parse_test(lines, 0, [], False, printer)
 		if test.status != TestStatus.NO_TESTS:
 			test.status = test.counts.get_status()
-	stdout.print_with_timestamp(DIVIDER)
-	print_summary_line(test)
+	printer.print_with_timestamp(DIVIDER)
 	return test
diff --git a/tools/testing/kunit/kunit_printer.py b/tools/testing/kunit/kunit_printer.py
index 015adf87dc2c..ca119f61fe79 100644
--- a/tools/testing/kunit/kunit_printer.py
+++ b/tools/testing/kunit/kunit_printer.py
@@ -15,12 +15,17 @@ _RESET = '\033[0;0m'
 class Printer:
 	"""Wraps a file object, providing utilities for coloring output, etc."""
 
-	def __init__(self, output: typing.IO[str]):
+	def __init__(self, print: bool=True, output: typing.IO[str]=sys.stdout):
 		self._output = output
-		self._use_color = output.isatty()
+		self._print = print
+		if print:
+			self._use_color = output.isatty()
+		else:
+			self._use_color = False
 
 	def print(self, message: str) -> None:
-		print(message, file=self._output)
+		if self._print:
+			print(message, file=self._output)
 
 	def print_with_timestamp(self, message: str) -> None:
 		ts = datetime.datetime.now().strftime('%H:%M:%S')
@@ -45,4 +50,5 @@ class Printer:
 		return len(self.red(''))
 
 # Provides a default instance that prints to stdout
-stdout = Printer(sys.stdout)
+stdout = Printer()
+null_printer = Printer(print=False)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 2beb7327e53f..02aa296d8850 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -23,6 +23,7 @@ import kunit_parser
 import kunit_kernel
 import kunit_json
 import kunit
+from kunit_printer import stdout
 
 test_tmpdir = ''
 abs_test_data_dir = ''
@@ -139,28 +140,28 @@ class KUnitParserTest(unittest.TestCase):
 	def test_parse_successful_test_log(self):
 		all_passed_log = test_data_path('test_is_test_passed-all_passed.log')
 		with open(all_passed_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_parse_successful_nested_tests_log(self):
 		all_passed_log = test_data_path('test_is_test_passed-all_passed_nested.log')
 		with open(all_passed_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_kselftest_nested(self):
 		kselftest_log = test_data_path('test_is_test_passed-kselftest.log')
 		with open(kselftest_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_parse_failed_test_log(self):
 		failed_log = test_data_path('test_is_test_passed-failure.log')
 		with open(failed_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
@@ -168,7 +169,7 @@ class KUnitParserTest(unittest.TestCase):
 		empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log')
 		with open(empty_log) as file:
 			result = kunit_parser.parse_run_tests(
-				kunit_parser.extract_tap_lines(file.readlines()))
+				kunit_parser.extract_tap_lines(file.readlines()), stdout)
 		self.assertEqual(0, len(result.subtests))
 		self.assertEqual(kunit_parser.TestStatus.FAILURE_TO_PARSE_TESTS, result.status)
 		self.assertEqual(result.counts.errors, 1)
@@ -179,7 +180,7 @@ class KUnitParserTest(unittest.TestCase):
 		with open(missing_plan_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(
-				file.readlines()))
+				file.readlines()), stdout)
 		# A missing test plan is not an error.
 		self.assertEqual(result.counts, kunit_parser.TestCounts(passed=10, errors=0))
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -188,7 +189,7 @@ class KUnitParserTest(unittest.TestCase):
 		header_log = test_data_path('test_is_test_passed-no_tests_run_with_header.log')
 		with open(header_log) as file:
 			result = kunit_parser.parse_run_tests(
-				kunit_parser.extract_tap_lines(file.readlines()))
+				kunit_parser.extract_tap_lines(file.readlines()), stdout)
 		self.assertEqual(0, len(result.subtests))
 		self.assertEqual(kunit_parser.TestStatus.NO_TESTS, result.status)
 		self.assertEqual(result.counts.errors, 1)
@@ -197,7 +198,7 @@ class KUnitParserTest(unittest.TestCase):
 		no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log')
 		with open(no_plan_log) as file:
 			result = kunit_parser.parse_run_tests(
-				kunit_parser.extract_tap_lines(file.readlines()))
+				kunit_parser.extract_tap_lines(file.readlines()), stdout)
 		self.assertEqual(0, len(result.subtests[0].subtests[0].subtests))
 		self.assertEqual(
 			kunit_parser.TestStatus.NO_TESTS,
@@ -210,7 +211,7 @@ class KUnitParserTest(unittest.TestCase):
 		print_mock = mock.patch('kunit_printer.Printer.print').start()
 		with open(crash_log) as file:
 			result = kunit_parser.parse_run_tests(
-				kunit_parser.extract_tap_lines(file.readlines()))
+				kunit_parser.extract_tap_lines(file.readlines()), stdout)
 		print_mock.assert_any_call(StrContains('Could not find any KTAP output.'))
 		print_mock.stop()
 		self.assertEqual(0, len(result.subtests))
@@ -219,7 +220,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_skipped_test(self):
 		skipped_log = test_data_path('test_skip_tests.log')
 		with open(skipped_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
 		# A skipped test does not fail the whole suite.
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -228,7 +229,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_skipped_all_tests(self):
 		skipped_log = test_data_path('test_skip_all_tests.log')
 		with open(skipped_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
 		self.assertEqual(kunit_parser.TestStatus.SKIPPED, result.status)
 		self.assertEqual(result.counts, kunit_parser.TestCounts(skipped=5))
@@ -236,7 +237,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_ignores_hyphen(self):
 		hyphen_log = test_data_path('test_strip_hyphen.log')
 		with open(hyphen_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
 		# A skipped test does not fail the whole suite.
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -250,7 +251,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_ignores_prefix_printk_time(self):
 		prefix_log = test_data_path('test_config_printk_time.log')
 		with open(prefix_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertEqual(result.counts.errors, 0)
@@ -258,7 +259,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_ignores_multiple_prefixes(self):
 		prefix_log = test_data_path('test_multiple_prefixes.log')
 		with open(prefix_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertEqual(result.counts.errors, 0)
@@ -266,7 +267,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_prefix_mixed_kernel_output(self):
 		mixed_prefix_log = test_data_path('test_interrupted_tap_output.log')
 		with open(mixed_prefix_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertEqual(result.counts.errors, 0)
@@ -274,7 +275,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_prefix_poundsign(self):
 		pound_log = test_data_path('test_pound_sign.log')
 		with open(pound_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertEqual(result.counts.errors, 0)
@@ -282,7 +283,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_kernel_panic_end(self):
 		panic_log = test_data_path('test_kernel_panic_interrupt.log')
 		with open(panic_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertGreaterEqual(result.counts.errors, 1)
@@ -290,7 +291,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_pound_no_prefix(self):
 		pound_log = test_data_path('test_pound_no_prefix.log')
 		with open(pound_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual('kunit-resource-test', result.subtests[0].name)
 		self.assertEqual(result.counts.errors, 0)
@@ -310,7 +311,7 @@ class KUnitParserTest(unittest.TestCase):
 			not ok 2 - test2
 		not ok 1 - some_failed_suite
 		"""
-		result = kunit_parser.parse_run_tests(output.splitlines())
+		result = kunit_parser.parse_run_tests(output.splitlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 
 		self.assertEqual(kunit_parser._summarize_failed_tests(result),
@@ -319,7 +320,7 @@ class KUnitParserTest(unittest.TestCase):
 	def test_ktap_format(self):
 		ktap_log = test_data_path('test_parse_ktap_output.log')
 		with open(ktap_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(result.counts, kunit_parser.TestCounts(passed=3))
 		self.assertEqual('suite', result.subtests[0].name)
 		self.assertEqual('case_1', result.subtests[0].subtests[0].name)
@@ -328,13 +329,13 @@ class KUnitParserTest(unittest.TestCase):
 	def test_parse_subtest_header(self):
 		ktap_log = test_data_path('test_parse_subtest_header.log')
 		with open(ktap_log) as file:
-			kunit_parser.parse_run_tests(file.readlines())
+			kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.print_mock.assert_any_call(StrContains('suite (1 subtest)'))
 
 	def test_parse_attributes(self):
 		ktap_log = test_data_path('test_parse_attributes.log')
 		with open(ktap_log) as file:
-			result = kunit_parser.parse_run_tests(file.readlines())
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
 		# Test should pass with no errors
 		self.assertEqual(result.counts, kunit_parser.TestCounts(passed=1, errors=0))
@@ -355,7 +356,7 @@ class KUnitParserTest(unittest.TestCase):
 		    Indented more.
 		not ok 1 test1
 		"""
-		result = kunit_parser.parse_run_tests(output.splitlines())
+		result = kunit_parser.parse_run_tests(output.splitlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 
 		self.print_mock.assert_any_call(StrContains('Test output.'))
@@ -544,7 +545,7 @@ class KUnitJsonTest(unittest.TestCase):
 
 	def _json_for(self, log_file):
 		with open(test_data_path(log_file)) as file:
-			test_result = kunit_parser.parse_run_tests(file)
+			test_result = kunit_parser.parse_run_tests(file, stdout)
 			json_obj = kunit_json.get_json_result(
 				test=test_result,
 				metadata=kunit_json.Metadata())
@@ -810,7 +811,7 @@ class KUnitMainTest(unittest.TestCase):
 		self.linux_source_mock.run_kernel.return_value = ['TAP version 14', 'init: random output'] + want
 
 		got = kunit._list_tests(self.linux_source_mock,
-				     kunit.KunitExecRequest(None, None, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False))
+				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False))
 		self.assertEqual(got, want)
 		# Should respect the user's filter glob when listing tests.
 		self.linux_source_mock.run_kernel.assert_called_once_with(
@@ -823,7 +824,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(None, None, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False))
+				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test*', filter='', filter_action=None, timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test*', filter='', filter_action=None, timeout=300),
@@ -836,7 +837,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(None, None, '.kunit', 300, 'suite*', '', None, None, 'test', False, False))
+				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*', '', None, None, 'test', False, False))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test1', filter='', filter_action=None, timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test2', filter='', filter_action=None, timeout=300),
-- 
cgit v1.2.3


From 3c67a2c09b3c32fd9fc5caf2afacd15267d08071 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 13 Nov 2024 22:24:06 +0000
Subject: kunit: tool: print failed tests only

Add a --failed flag to kunit.py to print only failed tests. The printing
is done after the test run is over.

This patch also adds the function print_test(), which prints a given
Test object. Previously, all printing of tests occurred during parsing.
This function could be useful in the future when converting between KTAP
and this pretty-printed output.

Link: https://lore.kernel.org/r/20241113222406.1590372-2-rmoar@google.com
Signed-off-by: Rae Moar <rmoar@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           | 14 ++++++++++++--
 tools/testing/kunit/kunit_parser.py    | 25 +++++++++++++++++++++++++
 tools/testing/kunit/kunit_tool_test.py |  6 +++---
 3 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 27c55a7fc1a0..676fa99a8b19 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -50,6 +50,7 @@ class KunitParseRequest:
 	raw_output: Optional[str]
 	json: Optional[str]
 	summary: bool
+	failed: bool
 
 @dataclass
 class KunitExecRequest(KunitParseRequest):
@@ -237,13 +238,15 @@ def parse_tests(request: KunitParseRequest, metadata: kunit_json.Metadata, input
 		return KunitResult(KunitStatus.SUCCESS, parse_time), fake_test
 
 	default_printer = stdout
-	if request.summary:
+	if request.summary or request.failed:
 		default_printer = null_printer
 
 	# Actually parse the test results.
 	test = kunit_parser.parse_run_tests(input_data, default_printer)
 	parse_time = time.time() - parse_start
 
+	if request.failed:
+		kunit_parser.print_test(test, request.failed, stdout)
 	kunit_parser.print_summary_line(test, stdout)
 
 	if request.json:
@@ -423,6 +426,10 @@ def add_parse_opts(parser: argparse.ArgumentParser) -> None:
 			    help='Prints only the summary line for parsed test results.'
 				'Does nothing if --raw_output is set.',
 			    action='store_true')
+	parser.add_argument('--failed',
+			    help='Prints only the failed parsed test results and summary line.'
+				'Does nothing if --raw_output is set.',
+			    action='store_true')
 
 
 def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree:
@@ -459,6 +466,7 @@ def run_handler(cli_args: argparse.Namespace) -> None:
 					raw_output=cli_args.raw_output,
 					json=cli_args.json,
 					summary=cli_args.summary,
+					failed=cli_args.failed,
 					timeout=cli_args.timeout,
 					filter_glob=cli_args.filter_glob,
 					filter=cli_args.filter,
@@ -507,6 +515,7 @@ def exec_handler(cli_args: argparse.Namespace) -> None:
 					build_dir=cli_args.build_dir,
 					json=cli_args.json,
 					summary=cli_args.summary,
+					failed=cli_args.failed,
 					timeout=cli_args.timeout,
 					filter_glob=cli_args.filter_glob,
 					filter=cli_args.filter,
@@ -532,7 +541,8 @@ def parse_handler(cli_args: argparse.Namespace) -> None:
 	# We know nothing about how the result was created!
 	metadata = kunit_json.Metadata()
 	request = KunitParseRequest(raw_output=cli_args.raw_output,
-					json=cli_args.json, summary=cli_args.summary)
+					json=cli_args.json, summary=cli_args.summary,
+					failed=cli_args.failed)
 	result, _ = parse_tests(request, metadata, kunit_output)
 	if result.status != KunitStatus.SUCCESS:
 		sys.exit(1)
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 732f448263de..29fc27e8949b 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -574,7 +574,32 @@ def print_test_footer(test: Test, printer: Printer) -> None:
 	printer.print_with_timestamp(format_test_divider(message,
 		len(message) - printer.color_len()))
 
+def print_test(test: Test, failed_only: bool, printer: Printer) -> None:
+	"""
+	Prints Test object to given printer. For a child test, the result line is
+	printed. For a parent test, the test header, all child test results, and
+	the test footer are all printed. If failed_only is true, only failed/crashed
+	tests will be printed.
 
+	Parameters:
+	test - Test object to print
+	failed_only - True if only failed/crashed tests should be printed.
+	printer - Printer object to output results
+	"""
+	if test.name == "main":
+		printer.print_with_timestamp(DIVIDER)
+		for subtest in test.subtests:
+			print_test(subtest, failed_only, printer)
+		printer.print_with_timestamp(DIVIDER)
+	elif test.subtests != []:
+		if not failed_only or not test.ok_status():
+			print_test_header(test, printer)
+			for subtest in test.subtests:
+				print_test(subtest, failed_only, printer)
+			print_test_footer(test, printer)
+	else:
+		if not failed_only or not test.ok_status():
+			print_test_result(test, printer)
 
 def _summarize_failed_tests(test: Test) -> str:
 	"""Tries to summarize all the failing subtests in `test`."""
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 02aa296d8850..0bcb0cc002f8 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -811,7 +811,7 @@ class KUnitMainTest(unittest.TestCase):
 		self.linux_source_mock.run_kernel.return_value = ['TAP version 14', 'init: random output'] + want
 
 		got = kunit._list_tests(self.linux_source_mock,
-				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False))
+				     kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False))
 		self.assertEqual(got, want)
 		# Should respect the user's filter glob when listing tests.
 		self.linux_source_mock.run_kernel.assert_called_once_with(
@@ -824,7 +824,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False))
+				     kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test*', filter='', filter_action=None, timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test*', filter='', filter_action=None, timeout=300),
@@ -837,7 +837,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(None, None, False, '.kunit', 300, 'suite*', '', None, None, 'test', False, False))
+				     kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'test', False, False))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test1', filter='', filter_action=None, timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test2', filter='', filter_action=None, timeout=300),
-- 
cgit v1.2.3


From d28252440428d37076de166c3d576a5d2f4a53e8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 11 Nov 2024 06:29:18 +0100
Subject: kunit: qemu_configs: Add LoongArch config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a basic config to run kunit tests on LoongArch.
This requires QEMU 9.1.0 or later for the necessary direct kernel boot
support.

Link: https://lore.kernel.org/r/20241111-kunit-loongarch-v2-1-7676eb5f2da3@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Bibo Mao <maobibo@loongson.cn>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/qemu_configs/loongarch.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 tools/testing/kunit/qemu_configs/loongarch.py

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/qemu_configs/loongarch.py b/tools/testing/kunit/qemu_configs/loongarch.py
new file mode 100644
index 000000000000..a874a2156e0f
--- /dev/null
+++ b/tools/testing/kunit/qemu_configs/loongarch.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from ..qemu_config import QemuArchParams
+
+QEMU_ARCH = QemuArchParams(linux_arch='loongarch',
+			   kconfig='''
+CONFIG_EFI_STUB=n
+CONFIG_PCI_HOST_GENERIC=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+''',
+			   qemu_arch='loongarch64',
+			   kernel_path='arch/loongarch/boot/vmlinux.elf',
+			   kernel_command_line='console=ttyS0',
+			   extra_qemu_params=[
+					   '-machine', 'virt',
+					   '-cpu', 'max',])
-- 
cgit v1.2.3


From 0a1111d4cbaf45100e30ebd98d1e1a175b8ce22d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 11 Nov 2024 06:29:19 +0100
Subject: kunit: tool: Allow overriding the shutdown mode from qemu config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Not all platforms support machine reboot.
If a proper reboot is not supported, the machine will hang.
Allow the QEMU configuration to override the necessary shutdown mode for
the specific system under test.

Link: https://lore.kernel.org/r/20241111-kunit-loongarch-v2-2-7676eb5f2da3@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_kernel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 61931c4926fd..e76d7894b6c5 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -105,7 +105,9 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
 		self._kconfig = qemu_arch_params.kconfig
 		self._qemu_arch = qemu_arch_params.qemu_arch
 		self._kernel_path = qemu_arch_params.kernel_path
-		self._kernel_command_line = qemu_arch_params.kernel_command_line + ' kunit_shutdown=reboot'
+		self._kernel_command_line = qemu_arch_params.kernel_command_line
+		if 'kunit_shutdown=' not in self._kernel_command_line:
+			self._kernel_command_line += ' kunit_shutdown=reboot'
 		self._extra_qemu_params = qemu_arch_params.extra_qemu_params
 		self._serial = qemu_arch_params.serial
 
-- 
cgit v1.2.3


From 62adcae479fe5bc04fa3b6c3f93bd340441f8b25 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 11 Nov 2024 06:29:20 +0100
Subject: kunit: qemu_configs: loongarch: Enable shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

QEMU for LoongArch does not yet support shutdown/restart through ACPI.
Use the pvpanic driver to enable shutdowns.
This requires QEMU 9.1.0 for shutdown support in pvpanic, but that is
already the minimum version required for kunit on LoongArch anyway.

Link: https://lore.kernel.org/r/20241111-kunit-loongarch-v2-3-7676eb5f2da3@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Bibo Mao <maobibo@loongson.cn>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/qemu_configs/loongarch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/qemu_configs/loongarch.py b/tools/testing/kunit/qemu_configs/loongarch.py
index a874a2156e0f..a92422967d1d 100644
--- a/tools/testing/kunit/qemu_configs/loongarch.py
+++ b/tools/testing/kunit/qemu_configs/loongarch.py
@@ -6,13 +6,16 @@ QEMU_ARCH = QemuArchParams(linux_arch='loongarch',
 			   kconfig='''
 CONFIG_EFI_STUB=n
 CONFIG_PCI_HOST_GENERIC=y
+CONFIG_PVPANIC=y
+CONFIG_PVPANIC_PCI=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
 ''',
 			   qemu_arch='loongarch64',
 			   kernel_path='arch/loongarch/boot/vmlinux.elf',
-			   kernel_command_line='console=ttyS0',
+			   kernel_command_line='console=ttyS0 kunit_shutdown=poweroff',
 			   extra_qemu_params=[
 					   '-machine', 'virt',
+					   '-device', 'pvpanic-pci',
 					   '-cpu', 'max',])
-- 
cgit v1.2.3


From f13242a46438e690067a4bf47068fde4d5719947 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Sat, 16 Nov 2024 00:41:14 +1100
Subject: selftests/mount_setattr: Fix failures on 64K PAGE_SIZE kernels

Currently the mount_setattr_test fails on machines with a 64K PAGE_SIZE,
with errors such as:

  #  RUN           mount_setattr_idmapped.invalid_fd_negative ...
  mkfs.ext4: No space left on device while writing out and closing file system
  # mount_setattr_test.c:1055:invalid_fd_negative:Expected system("mkfs.ext4 -q /mnt/C/ext4.img") (256) == 0 (0)
  # invalid_fd_negative: Test terminated by assertion
  #          FAIL  mount_setattr_idmapped.invalid_fd_negative
  not ok 12 mount_setattr_idmapped.invalid_fd_negative

The code creates a 100,000 byte tmpfs:

	ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV,
			"size=100000,mode=700"), 0);

And then a little later creates a 2MB ext4 filesystem in that tmpfs:

	ASSERT_EQ(ftruncate(img_fd, 1024 * 2048), 0);
	ASSERT_EQ(system("mkfs.ext4 -q /mnt/C/ext4.img"), 0);

At first glance it seems like that should never work, after all 2MB is
larger than 100,000 bytes. However the filesystem image doesn't actually
occupy 2MB on "disk" (actually RAM, due to tmpfs). On 4K kernels the
ext4.img uses ~84KB of actual space (according to du), which just fits.

However on 64K PAGE_SIZE kernels the ext4.img takes at least 256KB,
which is too large to fit in the tmpfs, hence the errors.

It seems fraught to rely on the ext4.img taking less space on disk than
the allocated size, so instead create the tmpfs with a size of 2MB. With
that all 21 tests pass on 64K PAGE_SIZE kernels.

Fixes: 01eadc8dd96d ("tests: add mount_setattr() selftests")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20241115134114.1219555-1-mpe@ellerman.id.au
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/mount_setattr/mount_setattr_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 68801e1a9ec2..70f65eb320a7 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -1026,7 +1026,7 @@ FIXTURE_SETUP(mount_setattr_idmapped)
 			"size=100000,mode=700"), 0);
 
 	ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV,
-			"size=100000,mode=700"), 0);
+			"size=2m,mode=700"), 0);
 
 	ASSERT_EQ(mkdir("/mnt/A", 0777), 0);
 
-- 
cgit v1.2.3


From c74bfe4ffe8c1ca94e3d60ec7af06cf679e23583 Mon Sep 17 00:00:00 2001
From: Yong-Xuan Wang <yongxuan.wang@sifive.com>
Date: Fri, 26 Jul 2024 16:49:30 +0800
Subject: KVM: riscv: selftests: Add Svade and Svadu Extension to get-reg-list
 test

Update the get-reg-list test to check that the Svade and Svadu extensions
are available to the guest OS.

Signed-off-by: Yong-Xuan Wang <yongxuan.wang@sifive.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20240726084931.28924-6-yongxuan.wang@sifive.com
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 tools/testing/selftests/kvm/riscv/get-reg-list.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index 8e34f7fa44e9..aac40652e181 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -45,6 +45,8 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVADE:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVADU:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVPBMT:
@@ -418,6 +420,8 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(SSAIA),
 		KVM_ISA_EXT_ARR(SSCOFPMF),
 		KVM_ISA_EXT_ARR(SSTC),
+		KVM_ISA_EXT_ARR(SVADE),
+		KVM_ISA_EXT_ARR(SVADU),
 		KVM_ISA_EXT_ARR(SVINVAL),
 		KVM_ISA_EXT_ARR(SVNAPOT),
 		KVM_ISA_EXT_ARR(SVPBMT),
@@ -949,6 +953,8 @@ KVM_ISA_EXT_SIMPLE_CONFIG(h, H);
 KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN);
 KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF);
 KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC);
+KVM_ISA_EXT_SIMPLE_CONFIG(svade, SVADE);
+KVM_ISA_EXT_SIMPLE_CONFIG(svadu, SVADU);
 KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL);
 KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT);
 KVM_ISA_EXT_SIMPLE_CONFIG(svpbmt, SVPBMT);
@@ -1012,6 +1018,8 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_smstateen,
 	&config_sscofpmf,
 	&config_sstc,
+	&config_svade,
+	&config_svadu,
 	&config_svinval,
 	&config_svnapot,
 	&config_svpbmt,
-- 
cgit v1.2.3


From 9d5ce1aa91db1b9dec9e06128b1ba241aeb004c2 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 22 Nov 2024 15:36:00 +0800
Subject: selftests/alsa: Add a few missing gitignore files

Compiled binary files should be added to .gitignore

'git status' complains:
Untracked files:
(use "git add <file>..." to include in what will be committed)
     alsa/global-timer
     alsa/utimer-test

Cc: Mark Brown <broonie@kernel.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Link: https://patch.msgid.link/20241122073600.1530791-1-lizhijian@fujitsu.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/.gitignore | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/alsa/.gitignore b/tools/testing/selftests/alsa/.gitignore
index 12dc3fcd3456..3dd8e1176b89 100644
--- a/tools/testing/selftests/alsa/.gitignore
+++ b/tools/testing/selftests/alsa/.gitignore
@@ -1,3 +1,5 @@
+global-timer
 mixer-test
 pcm-test
 test-pcmtest-driver
+utimer-test
-- 
cgit v1.2.3


From 078f644cb81b78afdfbc42b9cc2c11959f2ed65c Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 21 Nov 2024 22:48:21 -0800
Subject: selftests: fix nested double quotes in f-string

Replace nested double quotes in f-string with outer single quotes.

Fixes: 6116075e18f7 ("selftests: nic_link_layer: Add link layer selftest for NIC driver")
Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20241122064821.2821199-1-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py b/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
index db84000fc75b..79fde603cbbc 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py
@@ -218,5 +218,5 @@ class LinkConfig:
         json_data = process[0]
         """Check if the field exist in the json data"""
         if field not in json_data:
-            raise KsftSkipEx(f"Field {field} does not exist in the output of interface {json_data["ifname"]}")
+            raise KsftSkipEx(f'Field {field} does not exist in the output of interface {json_data["ifname"]}')
         return json_data[field]
-- 
cgit v1.2.3


From 9bb88c659673003453fd42e0ddf95c9628409094 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 19 Nov 2024 14:44:32 -0800
Subject: selftests: net: test extacks in netlink dumps

Test that extacks in dumps work. The test fills up the receive buffer
to test both the inline dump (as part of sendmsg()) and delayed one
(run during recvmsg()).

Use YNL helpers to parse the messages. We need to add the test to the YNL
files in the Makefile to make sure the right include paths are used.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20241119224432.1713040-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile        |   3 +-
 tools/testing/selftests/net/netlink-dumps.c | 129 ++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3d487b03c4a0..cb2fc601de66 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -78,7 +78,6 @@ TEST_PROGS += test_vxlan_vnifiltering.sh
 TEST_GEN_FILES += io_uring_zerocopy_tx
 TEST_PROGS += io_uring_zerocopy_tx.sh
 TEST_GEN_FILES += bind_bhash
-TEST_GEN_PROGS += netlink-dumps
 TEST_GEN_PROGS += sk_bind_sendto_listen
 TEST_GEN_PROGS += sk_connect_zero_addr
 TEST_GEN_PROGS += sk_so_peek_off
@@ -101,7 +100,7 @@ TEST_PROGS += ipv6_route_update_soft_lockup.sh
 TEST_PROGS += busy_poll_test.sh
 
 # YNL files, must be before "include ..lib.mk"
-YNL_GEN_FILES := busy_poller
+YNL_GEN_FILES := busy_poller netlink-dumps
 TEST_GEN_FILES += $(YNL_GEN_FILES)
 
 TEST_FILES := settings
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
index 84e29b7dffb6..07423f256f96 100644
--- a/tools/testing/selftests/net/netlink-dumps.c
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -12,11 +12,140 @@
 #include <unistd.h>
 
 #include <linux/genetlink.h>
+#include <linux/neighbour.h>
+#include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <linux/mqueue.h>
+#include <linux/rtnetlink.h>
 
 #include "../kselftest_harness.h"
 
+#include <ynl.h>
+
+struct ext_ack {
+	int err;
+
+	__u32 attr_offs;
+	__u32 miss_type;
+	__u32 miss_nest;
+	const char *str;
+};
+
+/* 0: no done, 1: done found, 2: extack found, -1: error */
+static int nl_get_extack(char *buf, size_t n, struct ext_ack *ea)
+{
+	const struct nlmsghdr *nlh;
+	const struct nlattr *attr;
+	ssize_t rem;
+
+	for (rem = n; rem > 0; NLMSG_NEXT(nlh, rem)) {
+		nlh = (struct nlmsghdr *)&buf[n - rem];
+		if (!NLMSG_OK(nlh, rem))
+			return -1;
+
+		if (nlh->nlmsg_type != NLMSG_DONE)
+			continue;
+
+		ea->err = -*(int *)NLMSG_DATA(nlh);
+
+		if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+			return 1;
+
+		ynl_attr_for_each(attr, nlh, sizeof(int)) {
+			switch (ynl_attr_type(attr)) {
+			case NLMSGERR_ATTR_OFFS:
+				ea->attr_offs = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MISS_TYPE:
+				ea->miss_type = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MISS_NEST:
+				ea->miss_nest = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MSG:
+				ea->str = ynl_attr_get_str(attr);
+				break;
+			}
+		}
+
+		return 2;
+	}
+
+	return 0;
+}
+
+static const struct {
+	struct nlmsghdr nlhdr;
+	struct ndmsg ndm;
+	struct nlattr ahdr;
+	__u32 val;
+} dump_neigh_bad = {
+	.nlhdr = {
+		.nlmsg_len	= sizeof(dump_neigh_bad),
+		.nlmsg_type	= RTM_GETNEIGH,
+		.nlmsg_flags	= NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		.nlmsg_seq	= 1,
+	},
+	.ndm = {
+		.ndm_family	= 123,
+	},
+	.ahdr = {
+		.nla_len	= 4 + 4,
+		.nla_type	= NDA_FLAGS_EXT,
+	},
+	.val = -1, // should fail MASK validation
+};
+
+TEST(dump_extack)
+{
+	int netlink_sock;
+	char buf[8192];
+	int one = 1;
+	int i, cnt;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_CAP_ACK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+
+	/* Dump so many times we fill up the buffer */
+	cnt = 64;
+	for (i = 0; i < cnt; i++) {
+		n = send(netlink_sock, &dump_neigh_bad,
+			 sizeof(dump_neigh_bad), 0);
+		ASSERT_EQ(n, sizeof(dump_neigh_bad));
+	}
+
+	/* Read out the ENOBUFS */
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	EXPECT_EQ(n, -1);
+	EXPECT_EQ(errno, ENOBUFS);
+
+	for (i = 0; i < cnt; i++) {
+		struct ext_ack ea = {};
+
+		n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+		if (n < 0) {
+			ASSERT_GE(i, 10);
+			break;
+		}
+		ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
+
+		EXPECT_EQ(nl_get_extack(buf, n, &ea), 2);
+		EXPECT_EQ(ea.attr_offs,
+			  sizeof(struct nlmsghdr) + sizeof(struct ndmsg));
+	}
+}
+
 static const struct {
 	struct nlmsghdr nlhdr;
 	struct genlmsghdr genlhdr;
-- 
cgit v1.2.3


From 919464deeca24e5bf13b6c8efd0b1d25cc43866f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 25 Nov 2024 09:21:21 -0800
Subject: Revert "HID: bpf: allow write access to quirks field in struct
 hid_device"

This reverts commit 6fd47effe92b, and the related self-test update
commit e14e0eaeb040 ("selftests/hid: add test for assigning a given
device to hid-generic").

It results in things like the scroll wheel on Logitech mice not working
after a reboot due to the kernel being confused about the state of the
high-resolution mode.

Quoting Benjamin Tissoires:
 "The idea of 6fd47effe92b was to be able to call hid_bpf_rdesc_fixup()
  once per reprobe of the device.

  However, because the bpf filter can now change the quirk value, the
  call had to be moved before the driver gets bound (which was
  previously ensuring the unicity of the call).

  The net effect is that now, in the case hid-generic gets loaded first
  and then the specific driver gets loaded once the disk is available,
  the value of ->quirks is not reset, but kept to the value that was set
  by hid-generic (HID_QUIRK_INPUT_PER_APP).

  Once hid-logitech-hidpp kicks in, that quirk is now set, which creates
  two inputs for the single mouse: one keyboard for fancy shortcuts, and
  one mouse node.

  However, hid-logitech-hidpp expects only one input node to be attached
  (it stores it into hidpp->input), and when a wheel event is received,
  because there is some processing with high-resolution wheel events,
  the wheel event is injected into hidpp->input.

  And of course, when HID_QUIRK_INPUT_PER_APP is set, hidpp->input gets
  the keyboard node, which doesn't have wheel event type, and the events
  are ignored"

Reported-and-bisected-by: Mike Galbraith <efault@gmx.de>
Link: https://lore.kernel.org/all/CAHk-=wiUkQM3uheit2cNM0Y0OOY5qqspJgC8LkmOkJ2p2LDxcw@mail.gmail.com/
Acked-by: Benjamin Tissoires <bentiss@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hid/bpf/hid_bpf_struct_ops.c               |  1 -
 drivers/hid/hid-core.c                             | 11 +--
 tools/testing/selftests/hid/hid_bpf.c              | 80 +---------------------
 tools/testing/selftests/hid/progs/hid.c            | 12 ----
 .../testing/selftests/hid/progs/hid_bpf_helpers.h  |  6 +-
 5 files changed, 4 insertions(+), 106 deletions(-)

(limited to 'tools/testing')

diff --git a/drivers/hid/bpf/hid_bpf_struct_ops.c b/drivers/hid/bpf/hid_bpf_struct_ops.c
index 0e611a9d79d7..702c22fae136 100644
--- a/drivers/hid/bpf/hid_bpf_struct_ops.c
+++ b/drivers/hid/bpf/hid_bpf_struct_ops.c
@@ -79,7 +79,6 @@ static int hid_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
 		WRITE_RANGE(hid_device, name, true),
 		WRITE_RANGE(hid_device, uniq, true),
 		WRITE_RANGE(hid_device, phys, true),
-		WRITE_RANGE(hid_device, quirks, false),
 	};
 #undef WRITE_RANGE
 	const struct btf_type *state = NULL;
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 81d6c734c8bc..98bef39642a9 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2692,12 +2692,6 @@ static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv)
 	int ret;
 
 	if (!hdev->bpf_rsize) {
-		unsigned int quirks;
-
-		/* reset the quirks that has been previously set */
-		quirks = hid_lookup_quirk(hdev);
-		hdev->quirks = quirks;
-
 		/* in case a bpf program gets detached, we need to free the old one */
 		hid_free_bpf_rdesc(hdev);
 
@@ -2707,9 +2701,6 @@ static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv)
 		/* call_hid_bpf_rdesc_fixup will always return a valid pointer */
 		hdev->bpf_rdesc = call_hid_bpf_rdesc_fixup(hdev, hdev->dev_rdesc,
 							   &hdev->bpf_rsize);
-		if (quirks ^ hdev->quirks)
-			hid_info(hdev, "HID-BPF toggled quirks on the device: %04x",
-				 quirks ^ hdev->quirks);
 	}
 
 	if (!hid_check_device_match(hdev, hdrv, &id))
@@ -2719,6 +2710,8 @@ static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv)
 	if (!hdev->devres_group_id)
 		return -ENOMEM;
 
+	/* reset the quirks that has been previously set */
+	hdev->quirks = hid_lookup_quirk(hdev);
 	hdev->driver = hdrv;
 
 	if (hdrv->probe) {
diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index ca58bfa3ca65..1e979fb3542b 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -54,41 +54,11 @@ FIXTURE_TEARDOWN(hid_bpf) {
 	hid_bpf_teardown(_metadata, self, variant); \
 } while (0)
 
-struct specific_device {
-	const char test_name[64];
-	__u16 bus;
-	__u32 vid;
-	__u32 pid;
-};
-
 FIXTURE_SETUP(hid_bpf)
 {
-	const struct specific_device *match = NULL;
 	int err;
 
-	const struct specific_device devices[] = {
-	{
-		.test_name = "test_hid_driver_probe",
-		.bus = BUS_BLUETOOTH,
-		.vid = 0x05ac,  /* USB_VENDOR_ID_APPLE */
-		.pid = 0x022c,  /* USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI */
-	}, {
-		.test_name = "*",
-		.bus = BUS_USB,
-		.vid = 0x0001,
-		.pid = 0x0a36,
-	}};
-
-	for (int i = 0; i < ARRAY_SIZE(devices); i++) {
-		match = &devices[i];
-		if (!strncmp(_metadata->name, devices[i].test_name, sizeof(devices[i].test_name)))
-			break;
-	}
-
-	ASSERT_OK_PTR(match);
-
-	err = setup_uhid(_metadata, &self->hid, match->bus, match->vid, match->pid,
-			 rdesc, sizeof(rdesc));
+	err = setup_uhid(_metadata, &self->hid, BUS_USB, 0x0001, 0x0a36, rdesc, sizeof(rdesc));
 	ASSERT_OK(err);
 }
 
@@ -885,54 +855,6 @@ TEST_F(hid_bpf, test_hid_attach_flags)
 	ASSERT_EQ(buf[3], 3);
 }
 
-static bool is_using_driver(struct __test_metadata *_metadata, struct uhid_device *hid,
-			    const char *driver)
-{
-	char driver_line[512];
-	char uevent[1024];
-	char temp[512];
-	int fd, nread;
-	bool found = false;
-
-	sprintf(uevent, "/sys/bus/hid/devices/%04X:%04X:%04X.%04X/uevent",
-		hid->bus, hid->vid, hid->pid, hid->hid_id);
-
-	fd = open(uevent, O_RDONLY | O_NONBLOCK);
-	if (fd < 0) {
-		TH_LOG("couldn't open '%s': %d, %d", uevent, fd, errno);
-		return false;
-	}
-
-	sprintf(driver_line, "DRIVER=%s", driver);
-
-	nread = read(fd, temp, ARRAY_SIZE(temp));
-	if (nread > 0 && (strstr(temp, driver_line)) != NULL)
-		found = true;
-
-	close(fd);
-
-	return found;
-}
-
-/*
- * Attach hid_driver_probe to the given uhid device,
- * check that the device is now using hid-generic.
- */
-TEST_F(hid_bpf, test_hid_driver_probe)
-{
-	const struct test_program progs[] = {
-		{
-			.name = "hid_test_driver_probe",
-		},
-	};
-
-	ASSERT_TRUE(is_using_driver(_metadata, &self->hid, "apple"));
-
-	LOAD_PROGRAMS(progs);
-
-	ASSERT_TRUE(is_using_driver(_metadata, &self->hid, "hid-generic"));
-}
-
 /*
  * Attach hid_rdesc_fixup to the given uhid device,
  * retrieve and open the matching hidraw node,
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 9b22e9a0e658..5ecc845ef792 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -598,15 +598,3 @@ SEC(".struct_ops.link")
 struct hid_bpf_ops test_infinite_loop_input_report = {
 	.hid_device_event = (void *)hid_test_infinite_loop_input_report,
 };
-
-SEC("?struct_ops.s/hid_rdesc_fixup")
-int BPF_PROG(hid_test_driver_probe, struct hid_bpf_ctx *hid_ctx)
-{
-	hid_ctx->hid->quirks |= HID_QUIRK_IGNORE_SPECIAL_DRIVER;
-	return 0;
-}
-
-SEC(".struct_ops.link")
-struct hid_bpf_ops test_driver_probe = {
-	.hid_rdesc_fixup = (void *)hid_test_driver_probe,
-};
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index 1a645684a117..e5db897586bb 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -84,14 +84,10 @@ struct hid_bpf_ops {
 	struct hid_device *hdev;
 };
 
-#define BIT(n) (1U << n)
-
 #ifndef BPF_F_BEFORE
-#define BPF_F_BEFORE BIT(3)
+#define BPF_F_BEFORE (1U << 3)
 #endif
 
-#define HID_QUIRK_IGNORE_SPECIAL_DRIVER		BIT(22)
-
 /* following are kfuncs exported by HID for HID-BPF */
 extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx,
 			      unsigned int offset,
-- 
cgit v1.2.3


From f6e1dcd6444485356d1df9d29df702acc7bba00c Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 20 Nov 2024 09:51:08 +0000
Subject: selftests/rtnetlink.sh: add mngtmpaddr test

Add a test to check that temporary addresses are added/removed
correctly when mngtmpaddr is set or removed/unmanaged.

Signed-off-by: Sam Edwards <cfsworks@gmail.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/rtnetlink.sh | 95 ++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 7f05b5f9b76f..2e8243a65b50 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -29,6 +29,7 @@ ALL_TESTS="
 	kci_test_bridge_parent_id
 	kci_test_address_proto
 	kci_test_enslave_bonding
+	kci_test_mngtmpaddr
 "
 
 devdummy="test-dummy0"
@@ -44,6 +45,7 @@ check_err()
 	if [ $ret -eq 0 ]; then
 		ret=$1
 	fi
+	[ -n "$2" ] && echo "$2"
 }
 
 # same but inverted -- used when command must fail for test to pass
@@ -1239,6 +1241,99 @@ kci_test_enslave_bonding()
 	ip netns del "$testns"
 }
 
+# Called to validate the addresses on $IFNAME:
+#
+# 1. Every `temporary` address must have a matching `mngtmpaddr`
+# 2. Every `mngtmpaddr` address must have some un`deprecated` `temporary`
+#
+# If the mngtmpaddr or tempaddr checking failed, return 0 and stop slowwait
+validate_mngtmpaddr()
+{
+	local dev=$1
+	local prefix=""
+	local addr_list=$(ip -j -n $testns addr show dev ${dev})
+	local temp_addrs=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.temporary == true) | .local')
+	local mng_prefixes=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.mngtmpaddr == true) | .local' | \
+		cut -d: -f1-4 | tr '\n' ' ')
+	local undep_prefixes=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.temporary == true and .deprecated != true) | .local' | \
+		cut -d: -f1-4 | tr '\n' ' ')
+
+	# 1. All temporary addresses (temp and dep) must have a matching mngtmpaddr
+	for address in ${temp_addrs}; do
+		prefix=$(echo ${address} | cut -d: -f1-4)
+		if [[ ! " ${mng_prefixes} " =~ " $prefix " ]]; then
+			check_err 1 "FAIL: Temporary $address with no matching mngtmpaddr!";
+			return 0
+		fi
+	done
+
+	# 2. All mngtmpaddr addresses must have a temporary address (not dep)
+	for prefix in ${mng_prefixes}; do
+		if [[ ! " ${undep_prefixes} " =~ " $prefix " ]]; then
+			check_err 1 "FAIL: No undeprecated temporary in $prefix!";
+			return 0
+		fi
+	done
+
+	return 1
+}
+
+kci_test_mngtmpaddr()
+{
+	local ret=0
+
+	setup_ns testns
+	if [ $? -ne 0 ]; then
+		end_test "SKIP mngtmpaddr tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+
+	# 1. Create a dummy Ethernet interface
+	run_cmd ip -n $testns link add ${devdummy} type dummy
+	run_cmd ip -n $testns link set ${devdummy} up
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.use_tempaddr=1
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.temp_prefered_lft=10
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.temp_valid_lft=25
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.max_desync_factor=1
+
+	# 2. Create several mngtmpaddr addresses on that interface.
+	# with temp_*_lft configured to be pretty short (10 and 25 seconds
+	# for preferred/valid respectively)
+	for i in $(seq 1 9); do
+		run_cmd ip -n $testns addr add 2001:db8:7e57:${i}::1/64 mngtmpaddr dev ${devdummy}
+	done
+
+	# 3. Confirm that a preferred temporary address exists for each mngtmpaddr
+	# address at all times, polling once per second for 30 seconds.
+	slowwait 30 validate_mngtmpaddr ${devdummy}
+
+	# 4. Delete each mngtmpaddr address, one at a time (alternating between
+	# deleting and merely un-mngtmpaddr-ing), and confirm that the other
+	# mngtmpaddr addresses still have preferred temporaries.
+	for i in $(seq 1 9); do
+		(( $i % 4 == 0 )) && mng_flag="mngtmpaddr" || mng_flag=""
+		if (( $i % 2 == 0 )); then
+			run_cmd ip -n $testns addr del 2001:db8:7e57:${i}::1/64 $mng_flag dev ${devdummy}
+		else
+			run_cmd ip -n $testns addr change 2001:db8:7e57:${i}::1/64 dev ${devdummy}
+		fi
+		# the temp addr should be deleted
+		validate_mngtmpaddr ${devdummy}
+	done
+
+	if [ $ret -ne 0 ]; then
+		end_test "FAIL: mngtmpaddr add/remove incorrect"
+	else
+		end_test "PASS: mngtmpaddr add/remove correctly"
+	fi
+
+	ip netns del "$testns"
+	return $ret
+}
+
 kci_test_rtnl()
 {
 	local current_test
-- 
cgit v1.2.3


From 663a917475530feff868a4f2bda286ea4171f420 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Sun, 24 Nov 2024 07:32:43 +0000
Subject: selftests: rds: move test.py to TEST_FILES

The test.py should not be run separately. It should be run via run.sh,
which will do some sanity checks first. Move the test.py from TEST_PROGS
to TEST_FILES.

Reported-by: Maximilian Heyne <mheyne@amazon.de>
Closes: https://lore.kernel.org/netdev/20241122150129.GB18887@dev-dsk-mheyne-1b-55676e6a.eu-west-1.amazon.com
Fixes: 3ade6ce1255e ("selftests: rds: add testing infrastructure")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20241124073243.847932-1-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/rds/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile
index 1803c39dbacb..612a7219990e 100644
--- a/tools/testing/selftests/net/rds/Makefile
+++ b/tools/testing/selftests/net/rds/Makefile
@@ -3,10 +3,9 @@
 all:
 	@echo mk_build_dir="$(shell pwd)" > include.sh
 
-TEST_PROGS := run.sh \
-	test.py
+TEST_PROGS := run.sh
 
-TEST_FILES := include.sh
+TEST_FILES := include.sh test.py
 
 EXTRA_CLEAN := /tmp/rds_logs include.sh
 
-- 
cgit v1.2.3


From c5efad88a94613cf60fed010b96dbc3044389316 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 28 Nov 2024 15:04:52 +0100
Subject: selftests: find_symbol: Actually use load_mod() parameter

The parameter passed to load_mod() is stored in $MOD, but never used.
Obviously it was intended to be used instead of the hardcoded
"test_kallsyms_b" module name.

Fixes: 84b4a51fce4ccc66 ("selftests: add new kallsyms selftests")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 tools/testing/selftests/module/find_symbol.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/module/find_symbol.sh b/tools/testing/selftests/module/find_symbol.sh
index 140364d3c49f..2c56805c9b6e 100755
--- a/tools/testing/selftests/module/find_symbol.sh
+++ b/tools/testing/selftests/module/find_symbol.sh
@@ -44,10 +44,10 @@ load_mod()
 	local ARCH="$(uname -m)"
 	case "${ARCH}" in
 	x86_64)
-		perf stat $STATS $MODPROBE test_kallsyms_b
+		perf stat $STATS $MODPROBE $MOD
 		;;
 	*)
-		time $MODPROBE test_kallsyms_b
+		time $MODPROBE $MOD
 		exit 1
 		;;
 	esac
-- 
cgit v1.2.3