Diffstat (limited to 'tools')
-rw-r--r--  tools/testing/selftests/rseq/.gitignore          |   1
-rw-r--r--  tools/testing/selftests/rseq/Makefile            |   5
-rw-r--r--  tools/testing/selftests/rseq/rseq-abi.h          |  27
-rw-r--r--  tools/testing/selftests/rseq/rseq-slice-hist.py  | 132
-rw-r--r--  tools/testing/selftests/rseq/slice_test.c        | 219
-rw-r--r--  tools/testing/selftests/sched_ext/Makefile       |   2
-rw-r--r--  tools/testing/selftests/sched_ext/rt_stall.bpf.c |  23
-rw-r--r--  tools/testing/selftests/sched_ext/rt_stall.c     | 240
-rw-r--r--  tools/testing/selftests/sched_ext/total_bw.c     | 281
9 files changed, 929 insertions(+), 1 deletion(-)
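
For orientation before the diffs: the user space side of the time slice extension protocol, as exercised by slice_test.c below, is to set slice_ctrl.request around a short critical section and, if the kernel granted an extension in the meantime, hand the CPU back immediately via the new rseq_slice_yield() syscall. A minimal sketch distilled from the test (error handling omitted; the prctl() constants and the syscall number are provisional, which is why the test defines fallbacks for them):

	struct rseq_abi *rs = rseq_get_abi();

	/* Opt in once, using the provisional prctl() constants */
	prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
	      PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);

	/* Request an extension around a short critical section */
	RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1);
	/* ... critical section ... */
	RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0);

	/* A granted extension must be paid back immediately */
	if (RSEQ_READ_ONCE(rs->slice_ctrl.granted))
		syscall(__NR_rseq_slice_yield);
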
diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 0fda241fa62b..ec01d164c1f0 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -10,3 +10,4 @@ param_test_mm_cid
 param_test_mm_cid_benchmark
 param_test_mm_cid_compare_twice
 syscall_errors_test
+slice_test
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 0d0a5fae5954..4ef90823b652 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1
 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
 		param_test_benchmark param_test_compare_twice param_test_mm_cid \
 		param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \
-		syscall_errors_test
+		syscall_errors_test slice_test
 
 TEST_GEN_PROGS_EXTENDED = librseq.so
 
@@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE
 $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \
 	rseq.h rseq-*.h
 	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
+	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index fb4ec8a75dd4..ecef315204b2 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -53,6 +53,27 @@ struct rseq_abi_cs {
 	__u64 abort_ip;
 } __attribute__((aligned(4 * sizeof(__u64))));
 
+/**
+ * rseq_abi_slice_ctrl - Time slice extension control structure
+ * @all:	Compound value
+ * @request:	Request for a time slice extension
+ * @granted:	Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space. @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_abi_slice_ctrl {
+	union {
+		__u32	all;
+		struct {
+			__u8	request;
+			__u8	granted;
+			__u16	__reserved;
+		};
+	};
+};
+
 /*
  * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always
  * contained within a single cache-line.
@@ -165,6 +186,12 @@ struct rseq_abi {
 	__u32 mm_cid;
 
 	/*
+	 * Time slice extension control structure. CPU local updates from
+	 * kernel and user space.
+	 */
+	struct rseq_abi_slice_ctrl slice_ctrl;
+
+	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
 	char end[];
diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py
new file mode 100644
index 000000000000..b7933eeaefb9
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-slice-hist.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+
+#
+# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd
+#
+
+from tracecmd import *
+
+def load_kallsyms(file_path='/proc/kallsyms'):
+    """
+    Parses /proc/kallsyms into a dictionary.
+    Returns: { address_int: symbol_name }
+    """
+    kallsyms_map = {}
+
+    try:
+        with open(file_path, 'r') as f:
+            for line in f:
+                # The format is: [address] [type] [name] [module]
+                parts = line.split()
+                if len(parts) < 3:
+                    continue
+
+                addr = int(parts[0], 16)
+                name = parts[2]
+
+                kallsyms_map[addr] = name
+
+    except PermissionError:
+        print(f"Error: Permission denied reading {file_path}. Try running with sudo.")
+    except FileNotFoundError:
+        print(f"Error: {file_path} not found.")
+
+    return kallsyms_map
+
+ksyms = load_kallsyms()
+
+# pending[timer_ptr] = {'ts': timestamp, 'comm': comm}
+pending = {}
+
+# histograms[comm][bucket] = count
+histograms = {}
+
+class OnlineHarmonicMean:
+    def __init__(self):
+        self.n = 0    # Count of elements
+        self.S = 0.0  # Cumulative sum of reciprocals
+
+    def update(self, x):
+        if x == 0:
+            raise ValueError("Harmonic mean is undefined for zero.")
+
+        self.n += 1
+        self.S += 1.0 / x
+        return self.n / self.S
+
+    @property
+    def mean(self):
+        return self.n / self.S if self.n > 0 else 0
+
+ohms = {}
+
+def handle_start(record):
+    # Fall back to "" so an empty kallsyms map does not raise a KeyError
+    func_name = ksyms.get(record.num_field("function"), "")
+    if "rseq_slice_expired" in func_name:
+        timer_ptr = record.num_field("hrtimer")
+        pending[timer_ptr] = {
+            'ts': record.ts,
+            'comm': record.comm
+        }
+    return None
+
+def handle_cancel(record):
+    timer_ptr = record.num_field("hrtimer")
+
+    if timer_ptr in pending:
+        start_data = pending.pop(timer_ptr)
+        duration_ns = record.ts - start_data['ts']
+        duration_us = duration_ns // 1000
+
+        comm = start_data['comm']
+
+        if comm not in ohms:
+            ohms[comm] = OnlineHarmonicMean()
+
+        ohms[comm].update(duration_ns)
+
+        if comm not in histograms:
+            histograms[comm] = {}
+
+        histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1
+    return None
+
+def handle_expire(record):
+    timer_ptr = record.num_field("hrtimer")
+
+    if timer_ptr in pending:
+        start_data = pending.pop(timer_ptr)
+        comm = start_data['comm']
+
+        if comm not in histograms:
+            histograms[comm] = {}
+
+        # Record -1 bucket for expired (failed to cancel)
+        histograms[comm][-1] = histograms[comm].get(-1, 0) + 1
+    return None
+
+if __name__ == "__main__":
+    t = Trace("trace.dat")
+    for cpu in range(0, t.cpus):
+        ev = t.read_event(cpu)
+        while ev:
+            if "hrtimer_start" in ev.name:
+                handle_start(ev)
+            if "hrtimer_cancel" in ev.name:
+                handle_cancel(ev)
+            if "hrtimer_expire_entry" in ev.name:
+                handle_expire(ev)
+
+            ev = t.read_event(cpu)
+
+    print("\n" + "="*40)
+    print("RSEQ SLICE HISTOGRAM (us)")
+    print("="*40)
+    for comm, buckets in histograms.items():
+        print(f"\nTask: {comm} Mean: {ohms[comm].mean:.3f} ns")
+        print(f"  {'Latency (us)':<15} | {'Count'}")
+        print(f"  {'-'*30}")
+        # Sort buckets numerically, putting -1 at the top
+        for bucket in sorted(buckets.keys()):
+            label = "EXPIRED" if bucket == -1 else f"{bucket} us"
+            print(f"  {label:<15} | {buckets[bucket]}")
diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c
new file mode 100644
index 000000000000..357122dcb487
--- /dev/null
+++ b/tools/testing/selftests/rseq/slice_test.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <linux/prctl.h>
+#include <sys/prctl.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+#include "../kselftest_harness.h"
+
+#ifndef __NR_rseq_slice_yield
+# define __NR_rseq_slice_yield	471
+#endif
+
+#define BITS_PER_INT	32
+#define BITS_PER_BYTE	8
+
+#ifndef PR_RSEQ_SLICE_EXTENSION
+# define PR_RSEQ_SLICE_EXTENSION	79
+# define PR_RSEQ_SLICE_EXTENSION_GET	1
+# define PR_RSEQ_SLICE_EXTENSION_SET	2
+# define PR_RSEQ_SLICE_EXT_ENABLE	0x01
+#endif
+
+#ifndef RSEQ_SLICE_EXT_REQUEST_BIT
+# define RSEQ_SLICE_EXT_REQUEST_BIT	0
+# define RSEQ_SLICE_EXT_GRANTED_BIT	1
+#endif
+
+#ifndef asm_inline
+# define asm_inline	asm __inline
+#endif
+
+#define NSEC_PER_SEC	1000000000L
+#define NSEC_PER_USEC	1000L
+
+struct noise_params {
+	int64_t	noise_nsecs;
+	int64_t	sleep_nsecs;
+	int64_t	run;
+};
+
+FIXTURE(slice_ext)
+{
+	pthread_t		noise_thread;
+	struct noise_params	noise_params;
+};
+
+FIXTURE_VARIANT(slice_ext)
+{
+	int64_t	total_nsecs;
+	int64_t	slice_nsecs;
+	int64_t	noise_nsecs;
+	int64_t	sleep_nsecs;
+	bool	no_yield;
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50)
+{
+	.total_nsecs	= 5LL * NSEC_PER_SEC,
+	.slice_nsecs	= 2LL * NSEC_PER_USEC,
+	.noise_nsecs	= 2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n50_2_50)
+{
+	.total_nsecs	= 5LL * NSEC_PER_SEC,
+	.slice_nsecs	= 50LL * NSEC_PER_USEC,
+	.noise_nsecs	= 2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield)
+{
+	.total_nsecs	= 5LL * NSEC_PER_SEC,
+	.slice_nsecs	= 2LL * NSEC_PER_USEC,
+	.noise_nsecs	= 2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+	.no_yield	= true,
+};
+
+static inline bool elapsed(struct timespec *start, struct timespec *now,
+			   int64_t span)
+{
+	int64_t delta = now->tv_sec - start->tv_sec;
+
+	delta *= NSEC_PER_SEC;
+	delta += now->tv_nsec - start->tv_nsec;
+	return delta >= span;
+}
+
+static void *noise_thread(void *arg)
+{
+	struct noise_params *p = arg;
+
+	while (RSEQ_READ_ONCE(p->run)) {
+		struct timespec ts_start, ts_now;
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_start);
+		do {
+			clock_gettime(CLOCK_MONOTONIC, &ts_now);
+		} while (!elapsed(&ts_start, &ts_now, p->noise_nsecs));
+
+		ts_start.tv_sec = 0;
+		ts_start.tv_nsec = p->sleep_nsecs;
+		clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL);
+	}
+	return NULL;
+}
+
+FIXTURE_SETUP(slice_ext)
+{
+	cpu_set_t affinity;
+
+	ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0);
+
+	/* Pin it on a single CPU. Avoid CPU 0 */
+	for (int i = 1; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &affinity))
+			continue;
+
+		CPU_ZERO(&affinity);
+		CPU_SET(i, &affinity);
+		ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0);
+		break;
+	}
+
+	ASSERT_EQ(rseq_register_current_thread(), 0);
+
+	ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
+			PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0);
+
+	self->noise_params.noise_nsecs = variant->noise_nsecs;
+	self->noise_params.sleep_nsecs = variant->sleep_nsecs;
+	self->noise_params.run = 1;
+
+	ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0);
+}
+
+FIXTURE_TEARDOWN(slice_ext)
+{
+	self->noise_params.run = 0;
+	pthread_join(self->noise_thread, NULL);
+}
+
+TEST_F(slice_ext, slice_test)
+{
+	unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0;
+	unsigned long total = 0, aborted = 0;
+	struct rseq_abi *rs = rseq_get_abi();
+	struct timespec ts_start, ts_now;
+
+	ASSERT_NE(rs, NULL);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+	do {
+		struct timespec ts_cs;
+		bool req = false;
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_cs);
+
+		total++;
+		RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1);
+		do {
+			clock_gettime(CLOCK_MONOTONIC, &ts_now);
+		} while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs));
+
+		/*
+		 * The request flag could be cleared unconditionally, but
+		 * checking it first keeps the statistics meaningful.
+		 */
+		if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) {
+			RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0);
+			/* Race between check and clear! */
+			req = true;
+			success++;
+		}
+
+		if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) {
+			/* The above raced against a late grant */
+			if (req)
+				success--;
+			if (variant->no_yield) {
+				syscall(__NR_getpid);
+				aborted++;
+			} else {
+				yielded++;
+				if (!syscall(__NR_rseq_slice_yield))
+					raced++;
+			}
+		} else {
+			if (!req)
+				scheduled++;
+		}
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_now);
+	} while (!elapsed(&ts_start, &ts_now, variant->total_nsecs));
+
+	printf("# Total     %12ld\n", total);
+	printf("# Success   %12ld\n", success);
+	printf("# Yielded   %12ld\n", yielded);
+	printf("# Aborted   %12ld\n", aborted);
+	printf("# Scheduled %12ld\n", scheduled);
+	printf("# Raced     %12ld\n", raced);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8f..2c601a7eaff5 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -183,7 +183,9 @@ auto-test-targets := \
 	select_cpu_dispatch_bad_dsq \
 	select_cpu_dispatch_dbl_dsp \
 	select_cpu_vtime \
+	rt_stall \
 	test_example \
+	total_bw \
 
 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler used to verify whether RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+	.exit		= (void *)rt_stall_exit,
+	.name		= "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..015200f80f6e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sched.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <time.h> +#include <linux/sched.h> +#include <signal.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include <unistd.h> +#include "rt_stall.bpf.skel.h" +#include "scx_test.h" +#include "../kselftest.h" + +#define CORE_ID 0 /* CPU to pin tasks to */ +#define RUN_TIME 5 /* How long to run the test in seconds */ + +/* Simple busy-wait function for test tasks */ +static void process_func(void) +{ + while (1) { + /* Busy wait */ + for (volatile unsigned long i = 0; i < 10000000UL; i++) + ; + } +} + +/* Set CPU affinity to a specific core */ +static void set_affinity(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask) != 0) { + perror("sched_setaffinity"); + exit(EXIT_FAILURE); + } +} + +/* Set task scheduling policy and priority */ +static void set_sched(int policy, int priority) +{ + struct sched_param param; + + param.sched_priority = priority; + if (sched_setscheduler(0, policy, ¶m) != 0) { + perror("sched_setscheduler"); + exit(EXIT_FAILURE); + } +} + +/* Get process runtime from /proc/<pid>/stat */ +static float get_process_runtime(int pid) +{ + char path[256]; + FILE *file; + long utime, stime; + int fields; + + snprintf(path, sizeof(path), "/proc/%d/stat", pid); + file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open stat file"); + return -1; + } + + /* Skip the first 13 fields and read the 14th and 15th */ + fields = fscanf(file, + "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu", + &utime, &stime); + fclose(file); + + if (fields != 2) { + fprintf(stderr, "Failed to read stat file\n"); + return -1; + } + + /* Calculate the total time spent in the process */ + long total_time = utime + stime; + long ticks_per_second = sysconf(_SC_CLK_TCK); + float runtime_seconds = total_time * 1.0 / ticks_per_second; + + return runtime_seconds; +} + +static enum scx_test_status setup(void **ctx) +{ + struct rt_stall *skel; + + skel = rt_stall__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static bool sched_stress_test(bool is_ext) +{ + /* + * We're expecting the EXT task to get around 5% of CPU time when + * competing with the RT task (small 1% fluctuations are expected). + * + * However, the EXT task should get at least 4% of the CPU to prove + * that the EXT deadline server is working correctly. A percentage + * less than 4% indicates a bug where RT tasks can potentially + * stall SCHED_EXT tasks, causing the test to fail. + */ + const float expected_min_ratio = 0.04; /* 4% */ + const char *class_str = is_ext ? 
"EXT" : "FAIR"; + + float ext_runtime, rt_runtime, actual_ratio; + int ext_pid, rt_pid; + + ksft_print_header(); + ksft_set_plan(1); + + /* Create and set up a EXT task */ + ext_pid = fork(); + if (ext_pid == 0) { + set_affinity(CORE_ID); + process_func(); + exit(0); + } else if (ext_pid < 0) { + perror("fork task"); + ksft_exit_fail(); + } + + /* Create an RT task */ + rt_pid = fork(); + if (rt_pid == 0) { + set_affinity(CORE_ID); + set_sched(SCHED_FIFO, 50); + process_func(); + exit(0); + } else if (rt_pid < 0) { + perror("fork for RT task"); + ksft_exit_fail(); + } + + /* Let the processes run for the specified time */ + sleep(RUN_TIME); + + /* Get runtime for the EXT task */ + ext_runtime = get_process_runtime(ext_pid); + if (ext_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n", + class_str, ext_pid); + ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n", + class_str, ext_pid, ext_runtime); + + /* Get runtime for the RT task */ + rt_runtime = get_process_runtime(rt_pid); + if (rt_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid); + ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime); + + /* Kill the processes */ + kill(ext_pid, SIGKILL); + kill(rt_pid, SIGKILL); + waitpid(ext_pid, NULL, 0); + waitpid(rt_pid, NULL, 0); + + /* Verify that the scx task got enough runtime */ + actual_ratio = ext_runtime / (ext_runtime + rt_runtime); + ksft_print_msg("%s task got %.2f%% of total runtime\n", + class_str, actual_ratio * 100); + + if (actual_ratio >= expected_min_ratio) { + ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return true; + } + ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return false; +} + +static enum scx_test_status run(void *ctx) +{ + struct rt_stall *skel = ctx; + struct bpf_link *link = NULL; + bool res; + int i; + + /* + * Test if the dl_server is working both with and without the + * sched_ext scheduler attached. + * + * This ensures all the scenarios are covered: + * - fair_server stop -> ext_server start + * - ext_server stop -> fair_server stop + */ + for (i = 0; i < 4; i++) { + bool is_ext = i % 2; + + if (is_ext) { + memset(&skel->data->uei, 0, sizeof(skel->data->uei)); + link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + } + res = sched_stress_test(is_ext); + if (is_ext) { + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + } + + if (!res) + ksft_exit_fail(); + } + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct rt_stall *skel = ctx; + + rt_stall__destroy(skel); +} + +struct scx_test rt_stall = { + .name = "rt_stall", + .description = "Verify that RT tasks cannot stall SCHED_EXT tasks", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&rt_stall) diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c new file mode 100644 index 000000000000..5b0a619bab86 --- /dev/null +++ b/tools/testing/selftests/sched_ext/total_bw.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test to verify that total_bw value remains consistent across all CPUs + * in different BPF program states. + * + * Copyright (C) 2025 NVIDIA Corporation. 
+ */ +#include <bpf/bpf.h> +#include <errno.h> +#include <pthread.h> +#include <scx/common.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/wait.h> +#include <unistd.h> +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +#define MAX_CPUS 512 +#define STRESS_DURATION_SEC 5 + +struct total_bw_ctx { + struct minimal *skel; + long baseline_bw[MAX_CPUS]; + int nr_cpus; +}; + +static void *cpu_stress_thread(void *arg) +{ + volatile int i; + time_t end_time = time(NULL) + STRESS_DURATION_SEC; + + while (time(NULL) < end_time) + for (i = 0; i < 1000000; i++) + ; + + return NULL; +} + +/* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. + */ +static int run_cpu_stress(int nr_cpus) +{ + pthread_t *threads; + int i, ret = 0; + + threads = calloc(nr_cpus, sizeof(pthread_t)); + if (!threads) + return -ENOMEM; + + /* Create threads to run on each CPU */ + for (i = 0; i < nr_cpus; i++) { + if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) { + ret = -errno; + fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret)); + break; + } + } + + /* Wait for all threads to complete */ + for (i = 0; i < nr_cpus; i++) { + if (threads[i]) + pthread_join(threads[i], NULL); + } + + free(threads); + return ret; +} + +static int read_total_bw_values(long *bw_values, int max_cpus) +{ + FILE *fp; + char line[256]; + int cpu_count = 0; + + fp = fopen("/sys/kernel/debug/sched/debug", "r"); + if (!fp) { + SCX_ERR("Failed to open debug file"); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + char *bw_str = strstr(line, "total_bw"); + + if (bw_str) { + bw_str = strchr(bw_str, ':'); + if (bw_str) { + /* Only store up to max_cpus values */ + if (cpu_count < max_cpus) + bw_values[cpu_count] = atol(bw_str + 1); + cpu_count++; + } + } + } + + fclose(fp); + return cpu_count; +} + +static bool verify_total_bw_consistency(long *bw_values, int count) +{ + int i; + long first_value; + + if (count <= 0) + return false; + + first_value = bw_values[0]; + + for (i = 1; i < count; i++) { + if (bw_values[i] != first_value) { + SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld", + first_value, i, bw_values[i]); + return false; + } + } + + return true; +} + +static int fetch_verify_total_bw(long *bw_values, int nr_cpus) +{ + int attempts = 0; + int max_attempts = 10; + int count; + + /* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. 
+ */ + if (run_cpu_stress(nr_cpus) < 0) { + SCX_ERR("Failed to run CPU stress"); + return -1; + } + + /* Try multiple times to get stable values */ + while (attempts < max_attempts) { + count = read_total_bw_values(bw_values, nr_cpus); + fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus); + /* If system has more CPUs than we're testing, that's OK */ + if (count < nr_cpus) { + SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count); + attempts++; + sleep(1); + continue; + } + + /* Only verify the CPUs we're testing */ + if (verify_total_bw_consistency(bw_values, nr_cpus)) { + fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]); + return 0; + } + + attempts++; + sleep(1); + } + + return -1; +} + +static enum scx_test_status setup(void **ctx) +{ + struct total_bw_ctx *test_ctx; + + if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) { + fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n"); + return SCX_TEST_SKIP; + } + + test_ctx = calloc(1, sizeof(*test_ctx)); + if (!test_ctx) + return SCX_TEST_FAIL; + + test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (test_ctx->nr_cpus <= 0) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */ + if (test_ctx->nr_cpus > MAX_CPUS) + test_ctx->nr_cpus = MAX_CPUS; + + /* Test scenario 1: BPF program not loaded */ + /* Read and verify baseline total_bw before loading BPF program */ + fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable baseline values"); + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* Load the BPF skeleton */ + test_ctx->skel = minimal__open(); + if (!test_ctx->skel) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + SCX_ENUM_INIT(test_ctx->skel); + if (minimal__load(test_ctx->skel)) { + minimal__destroy(test_ctx->skel); + free(test_ctx); + return SCX_TEST_FAIL; + } + + *ctx = test_ctx; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + struct bpf_link *link; + long loaded_bw[MAX_CPUS]; + long unloaded_bw[MAX_CPUS]; + int i; + + /* Test scenario 2: BPF program loaded */ + link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + fprintf(stderr, "BPF program loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values with BPF loaded"); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + bpf_link__destroy(link); + + /* Test scenario 3: BPF program unloaded */ + fprintf(stderr, "BPF program unloaded, reading total_bw values\n"); + if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after BPF unload"); + return SCX_TEST_FAIL; + } + + /* Verify all three scenarios have the same total_bw values */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (test_ctx->baseline_bw[i] != loaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld", + i, test_ctx->baseline_bw[i], loaded_bw[i]); + return SCX_TEST_FAIL; + } + + if (test_ctx->baseline_bw[i] != unloaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld", + i, test_ctx->baseline_bw[i], unloaded_bw[i]); + return SCX_TEST_FAIL; + } + } + + fprintf(stderr, "All total_bw values are consistent across all 
scenarios\n"); + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + + if (test_ctx) { + if (test_ctx->skel) + minimal__destroy(test_ctx->skel); + free(test_ctx); + } +} + +struct scx_test total_bw = { + .name = "total_bw", + .description = "Verify total_bw consistency across BPF program states", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&total_bw) |
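
As the header comment of rseq-slice-hist.py notes, the histogram is produced from a trace-cmd recording of the three hrtimer events; the script reads trace.dat from the current directory and needs a readable /proc/kallsyms, so it is typically run as root. A typical session, assuming slice_test was built from this series:

	trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- ./slice_test
	./rseq-slice-hist.py

The per-task mean it prints is an online harmonic mean, H = n / (1/x_1 + ... + 1/x_n), computed over the cancel latencies in nanoseconds, while the histogram buckets are in microseconds; the EXPIRED bucket counts timers that fired before they could be cancelled.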
