From f8fded7536a9350ce849f21eee124d66056aa54c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 31 Jul 2025 14:09:14 +0300 Subject: selftests: net: Fix flaky neighbor garbage collection test The purpose of the "Periodic garbage collection" test case is to make sure that "extern_valid" neighbors are not flushed during periodic garbage collection, unlike regular neighbor entries. The test case is currently doing the following: 1. Changing the base reachable time to 10 seconds so that periodic garbage collection will run every 5 seconds. 2. Changing the garbage collection stale time to 5 seconds so that neighbors that have not been used in the last 5 seconds will be considered for removal. 3. Waiting for the base reachable time change to take effect. 4. Adding an "extern_valid" neighbor, a non-"extern_valid" neighbor and a bunch of other neighbors so that the threshold ("thresh1") will be crossed and stale neighbors will be flushed during garbage collection. 5. Waiting for 10 seconds to give garbage collection a chance to run. 6. Checking that the "extern_valid" neighbor was not flushed and that the non-"extern_valid" neighbor was flushed. The test sometimes fails in the netdev CI because the non-"extern_valid" neighbor was not flushed. I am unable to reproduce this locally, but my theory that since we do not know exactly when the periodic garbage collection runs, it is possible for it to run at a time when the non-"extern_valid" neighbor is still not considered stale. Fix by moving the addition of the two neighbors before step 3 and by reducing the garbage collection stale time to 1 second, to ensure that both neighbors are considered stale when garbage collection runs. Fixes: 171f2ee31a42 ("selftests: net: Add a selftest for externally validated neighbor entries") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20250728093504.4ebbd73c@kernel.org/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250731110914.506890-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/test_neigh.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/test_neigh.sh b/tools/testing/selftests/net/test_neigh.sh index 388056472b5b..7c594bf6ead0 100755 --- a/tools/testing/selftests/net/test_neigh.sh +++ b/tools/testing/selftests/net/test_neigh.sh @@ -289,11 +289,11 @@ extern_valid_common() orig_base_reachable=$(ip -j ntable show name "$tbl_name" | jq '.[] | select(has("thresh1")) | .["base_reachable"]') run_cmd "ip ntable change name $tbl_name thresh1 10 base_reachable 10000" orig_gc_stale=$(ip -n "$ns1" -j ntable show name "$tbl_name" dev veth0 | jq '.[]["gc_stale"]') - run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 5000" - # Wait orig_base_reachable/2 for the new interval to take effect. - run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))" + run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 1000" run_cmd "ip -n $ns1 neigh add $ip_addr lladdr $mac nud stale dev veth0 extern_valid" run_cmd "ip -n $ns1 neigh add ${subnet}3 lladdr $mac nud stale dev veth0" + # Wait orig_base_reachable/2 for the new interval to take effect. + run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))" for i in {1..20}; do run_cmd "ip -n $ns1 neigh add ${subnet}$((i + 4)) nud none dev veth0" done -- cgit v1.2.3 From 7cbd49795d4ca86fba5830084e94fece3b343b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Jul 2025 11:53:13 +0000 Subject: selftests: avoid using ifconfig ifconfig is deprecated and not always present, use ip command instead. Fixes: e0f3b3e5c77a ("selftests: Add test cases for vlan_filter modification during runtime") Signed-off-by: Eric Dumazet Cc: Dong Chenchen Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20250730115313.3356036-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/vlan_hw_filter.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/vlan_hw_filter.sh b/tools/testing/selftests/net/vlan_hw_filter.sh index 0fb56baf28e4..e195d5cab6f7 100755 --- a/tools/testing/selftests/net/vlan_hw_filter.sh +++ b/tools/testing/selftests/net/vlan_hw_filter.sh @@ -55,10 +55,10 @@ test_vlan0_del_crash_01() { ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on - ip netns exec ${NETNS} ifconfig bond0 down - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 down + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } @@ -68,11 +68,11 @@ test_vlan0_del_crash_02() { setup ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q - ip netns exec ${NETNS} ifconfig bond0 down - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 down + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } @@ -84,9 +84,9 @@ test_vlan0_del_crash_03() { ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on - ip netns exec ${NETNS} ifconfig bond0 down + ip netns exec ${NETNS} ip link set dev bond0 down ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } -- cgit v1.2.3 From 5ef7fdf52c0f2b792802aac3438e67e5ebe7e63d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Aug 2025 11:16:38 -0700 Subject: selftests: net: packetdrill: xfail all problems on slow machines We keep seeing flakes on packetdrill on debug kernels, while non-debug kernels are stable, not a single flake in 200 runs. Time to give up, debug kernels appear to suffer from 10msec latency spikes and any timing-sensitive test is bound to flake. Reviewed-by: Willem de Bruijn Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250801181638.2483531-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/packetdrill/ksft_runner.sh | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index c5b01e1bd4c7..a7e790af38ff 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -35,24 +35,7 @@ failfunc=ktap_test_fail if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then optargs+=('--tolerance_usecs=14000') - - # xfail tests that are known flaky with dbg config, not fixable. - # still run them for coverage (and expect 100% pass without dbg). - declare -ar xfail_list=( - "tcp_blocking_blocking-connect.pkt" - "tcp_blocking_blocking-read.pkt" - "tcp_eor_no-coalesce-retrans.pkt" - "tcp_fast_recovery_prr-ss.*.pkt" - "tcp_sack_sack-route-refresh-ip-tos.pkt" - "tcp_slow_start_slow-start-after-win-update.pkt" - "tcp_timestamping.*.pkt" - "tcp_user_timeout_user-timeout-probe.pkt" - "tcp_zerocopy_cl.*.pkt" - "tcp_zerocopy_epoll_.*.pkt" - "tcp_tcp_info_tcp-info-.*-limited.pkt" - ) - readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$" - [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail + failfunc=ktap_test_xfail fi ktap_print_header -- cgit v1.2.3 From 8d22aea8af0d57a1daff046d65b7c18552e35e29 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 4 Aug 2025 14:43:20 +0300 Subject: selftests: netdevsim: Xfail nexthop test on slow machines A lot of test cases in the file are related to the idle and unbalanced timers of resilient nexthop groups and these tests are reported to be flaky on slow machines running debug kernels. Rather than marking a lot of individual tests with xfail_on_slow(), simply mark all the tests. Note that the test is stable on non-debug machines and that with debug kernels we are mainly interested in the output of various sanitizers in order to determine pass / fail. Before: # make -C tools/testing/selftests KSFT_MACHINE_SLOW=yes \ TARGETS=drivers/net/netdevsim TEST_PROGS=nexthop.sh \ TEST_GEN_PROGS="" run_tests [...] # TEST: Bucket migration after idle timer (with delete) [FAIL] # Group expected to still be unbalanced [...] not ok 1 selftests: drivers/net/netdevsim: nexthop.sh # exit=1 After: # make -C tools/testing/selftests KSFT_MACHINE_SLOW=yes \ TARGETS=drivers/net/netdevsim TEST_PROGS=nexthop.sh \ TEST_GEN_PROGS="" run_tests [...] # TEST: Bucket migration after idle timer (with delete) [XFAIL] # Group expected to still be unbalanced [...] ok 1 selftests: drivers/net/netdevsim: nexthop.sh Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20250729160609.02e0f157@kernel.org/ Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250804114320.193203-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/netdevsim/nexthop.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh index e8e0dc088d6a..01d0c044a5fc 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -1053,6 +1053,6 @@ trap cleanup EXIT setup_prepare -tests_run +xfail_on_slow tests_run exit $EXIT_STATUS -- cgit v1.2.3 From e6d76268813dc64cc0b74ea9c274501f2de05344 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Mon, 4 Aug 2025 16:44:57 +0000 Subject: net: Update threaded state in napi config in netif_set_threaded Commit 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") added support to enable/disable threaded napi using netlink. This also extended the napi config save/restore functionality to set the napi threaded state. This breaks netdev reset for drivers that use napi threaded at device level and also use napi config save/restore on napi_disable/napi_enable. Basically on netdev with napi threaded enabled at device level, a napi_enable call will get stuck trying to stop the napi kthread. This is because the napi->config->threaded is set to disabled when threaded is enabled at device level. The issue can be reproduced on virtio-net device using qemu. To reproduce the issue run following, echo 1 > /sys/class/net/threaded ethtool -L eth0 combined 1 Update the threaded state in napi config in netif_set_threaded and add a new test that verifies this scenario. Tested on qemu with virtio-net: NETIF=eth0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250804164457.2494390-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 26 ++--- tools/testing/selftests/drivers/net/Makefile | 1 + .../testing/selftests/drivers/net/napi_threaded.py | 111 +++++++++++++++++++++ 3 files changed, 121 insertions(+), 17 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/napi_threaded.py (limited to 'tools') diff --git a/net/core/dev.c b/net/core/dev.c index b28ce68830b2..68dc47d7e700 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6978,6 +6978,12 @@ int napi_set_threaded(struct napi_struct *napi, if (napi->config) napi->config->threaded = threaded; + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ if (!threaded && napi->thread) { napi_stop_kthread(napi); } else { @@ -7011,23 +7017,9 @@ int netif_set_threaded(struct net_device *dev, WRITE_ONCE(dev->threaded, threaded); - /* Make sure kthread is created before THREADED bit - * is set. - */ - smp_mb__before_atomic(); - - /* Setting/unsetting threaded mode on a napi might not immediately - * take effect, if the current napi instance is actively being - * polled. In this case, the switch between threaded mode and - * softirq mode will happen in the next round of napi_schedule(). - * This should not cause hiccups/stalls to the live traffic. - */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (!threaded && napi->thread) - napi_stop_kthread(napi); - else - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); - } + /* The error should not occur as the kthreads are already created. */ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); return err; } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3556f3563e08..984ece05f7f9 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -11,6 +11,7 @@ TEST_GEN_FILES := \ TEST_PROGS := \ napi_id.py \ + napi_threaded.py \ netcons_basic.sh \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py new file mode 100755 index 000000000000..b2698db39817 --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Test napi threaded states. +""" + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_ne, ksft_ge +from lib.py import NetDrvEnv, NetdevFamily +from lib.py import cmd, defer, ethtool + + +def _assert_napi_threaded_enabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'enabled') + ksft_ne(napi.get('pid'), None) + + +def _assert_napi_threaded_disabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'disabled') + ksft_eq(napi.get('pid'), None) + + +def _set_threaded_state(cfg, threaded) -> None: + cmd(f"echo {threaded} > /sys/class/net/{cfg.ifname}/threaded") + + +def _setup_deferred_cleanup(cfg) -> None: + combined = ethtool(f"-l {cfg.ifname}", json=True)[0].get("combined", 0) + ksft_ge(combined, 2) + defer(ethtool, f"-L {cfg.ifname} combined {combined}") + + threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout + defer(_set_threaded_state, cfg, threaded) + + +def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level and + then disabled at napi level for one napi, the threaded state + of all napis is preserved after a change in number of queues. + """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + # disable threaded for napi1 + nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_disabled(nl, napi1_id) + + +def change_num_queues(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level, + the napi threaded state is preserved after a change in + number of queues. + """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEnv(__file__, queue_count=2) as cfg: + ksft_run([change_num_queues, + enable_dev_threaded_disable_napi_threaded], + args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3