Diffstat (limited to 'tools')
77 files changed, 4499 insertions, 2801 deletions
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 64796c0223be..50792df27707 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -196,20 +196,20 @@ static int get_family_id(int sd) #define delay_ms(t) (t / 1000000ULL) /* - * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS) + * Format __kernel_timespec to human readable string (YYYY-MM-DD HH:MM:SS) * Returns formatted string or "N/A" if timestamp is zero */ -static const char *format_timespec64(struct timespec64 *ts) +static const char *format_timespec(struct __kernel_timespec *ts) { static char buffer[32]; struct tm tm_info; - time_t time_sec; + __kernel_time_t time_sec; /* Check if timestamp is zero (not set) */ if (ts->tv_sec == 0 && ts->tv_nsec == 0) return "N/A"; - time_sec = (time_t)ts->tv_sec; + time_sec = ts->tv_sec; /* Use thread-safe localtime_r */ if (localtime_r(&time_sec, &tm_info) == NULL) @@ -257,7 +257,7 @@ static const char *format_timespec64(struct timespec64 *ts) average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ delay_ms((double)(t)->cpu_delay_max), \ delay_ms((double)(t)->cpu_delay_min), \ - format_timespec64(&(t)->cpu_delay_max_ts)); \ + format_timespec(&(t)->cpu_delay_max_ts)); \ } else if (version >= 16) { \ printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ "CPU", "count", "real total", "virtual total", \ @@ -316,7 +316,7 @@ static const char *format_timespec64(struct timespec64 *ts) average_ms((double)(t)->total, (t)->count), \ delay_ms((double)(t)->max), \ delay_ms((double)(t)->min), \ - format_timespec64(&(t)->max_ts)); \ + format_timespec(&(t)->max_ts)); \ } else if (version >= 16) { \ printf("%-10s%15s%15s%15s%15s%15s\n", \ name, "count", "delay total", "delay average", \ diff --git a/tools/bootconfig/samples/bad-array-after-comment.bconf b/tools/bootconfig/samples/bad-array-after-comment.bconf new file mode 100644 index 000000000000..fdb6d4e04447 --- /dev/null +++ b/tools/bootconfig/samples/bad-array-after-comment.bconf @@ -0,0 +1,4 @@ +# the first array value must be on the same line as the key +key = # comment + value1, + value2 diff --git a/tools/bootconfig/samples/bad-array-in-next-line.bconf b/tools/bootconfig/samples/bad-array-in-next-line.bconf new file mode 100644 index 000000000000..95a99a3bde8c --- /dev/null +++ b/tools/bootconfig/samples/bad-array-in-next-line.bconf @@ -0,0 +1,4 @@ +# the first array value must be on the same line as the key +key = + value1, + value2 diff --git a/tools/bootconfig/samples/exp-good-array-space-comment.bconf b/tools/bootconfig/samples/exp-good-array-space-comment.bconf new file mode 100644 index 000000000000..8d3278fa6af5 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-array-space-comment.bconf @@ -0,0 +1 @@ +key = "value1", "value2", "value3"; diff --git a/tools/bootconfig/samples/exp-good-comment-after-value.bconf b/tools/bootconfig/samples/exp-good-comment-after-value.bconf new file mode 100644 index 000000000000..a8e8450db3c0 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-comment-after-value.bconf @@ -0,0 +1 @@ +key = "value"; diff --git a/tools/bootconfig/samples/exp-good-mixed-append.bconf b/tools/bootconfig/samples/exp-good-mixed-append.bconf new file mode 100644 index 000000000000..c2b407901ddd --- /dev/null +++ b/tools/bootconfig/samples/exp-good-mixed-append.bconf @@ -0,0 +1,2 @@ +key = "foo", "bar"; +keyx.subkey = "value"; diff --git a/tools/bootconfig/samples/exp-good-mixed-kv1.bconf b/tools/bootconfig/samples/exp-good-mixed-kv1.bconf new file mode 100644 index 
000000000000..8346287d9251 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-mixed-kv1.bconf @@ -0,0 +1,2 @@ +key = "value"; +key.subkey = "another-value"; diff --git a/tools/bootconfig/samples/exp-good-mixed-kv2.bconf b/tools/bootconfig/samples/exp-good-mixed-kv2.bconf new file mode 100644 index 000000000000..40c6232c7cdd --- /dev/null +++ b/tools/bootconfig/samples/exp-good-mixed-kv2.bconf @@ -0,0 +1,2 @@ +key = "another-value"; +key.subkey = "value"; diff --git a/tools/bootconfig/samples/exp-good-mixed-kv3.bconf b/tools/bootconfig/samples/exp-good-mixed-kv3.bconf new file mode 100644 index 000000000000..8368a7bef60a --- /dev/null +++ b/tools/bootconfig/samples/exp-good-mixed-kv3.bconf @@ -0,0 +1,5 @@ +key = "value"; +key { + subkey1; + subkey2 = "foo"; +} diff --git a/tools/bootconfig/samples/exp-good-mixed-override.bconf b/tools/bootconfig/samples/exp-good-mixed-override.bconf new file mode 100644 index 000000000000..58757712ca45 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-mixed-override.bconf @@ -0,0 +1,2 @@ +key = "value2"; +key.foo = "bar"; diff --git a/tools/bootconfig/samples/exp-good-override.bconf b/tools/bootconfig/samples/exp-good-override.bconf new file mode 100644 index 000000000000..00bbd30e99ae --- /dev/null +++ b/tools/bootconfig/samples/exp-good-override.bconf @@ -0,0 +1,4 @@ +key { + word = "2", "3"; + new.word = "new"; +} diff --git a/tools/bootconfig/samples/exp-good-printables.bconf b/tools/bootconfig/samples/exp-good-printables.bconf new file mode 100644 index 000000000000..5981d304eacb --- /dev/null +++ b/tools/bootconfig/samples/exp-good-printables.bconf @@ -0,0 +1,2 @@ +key = " + !#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; diff --git a/tools/bootconfig/samples/exp-good-simple.bconf b/tools/bootconfig/samples/exp-good-simple.bconf new file mode 100644 index 000000000000..d17f39421c86 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-simple.bconf @@ -0,0 +1,8 @@ +key { + word1 = "1"; + word2 = "2"; + word3 = "3"; + word4 = "4"; + word5 = "5"; + word6 = "6"; +} diff --git a/tools/bootconfig/samples/exp-good-single.bconf b/tools/bootconfig/samples/exp-good-single.bconf new file mode 100644 index 000000000000..01196910d7f4 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-single.bconf @@ -0,0 +1,3 @@ +key = "1"; +key2 = "2"; +key3 = "alpha", "beta"; diff --git a/tools/bootconfig/samples/exp-good-space-after-value.bconf b/tools/bootconfig/samples/exp-good-space-after-value.bconf new file mode 100644 index 000000000000..a8e8450db3c0 --- /dev/null +++ b/tools/bootconfig/samples/exp-good-space-after-value.bconf @@ -0,0 +1 @@ +key = "value"; diff --git a/tools/bootconfig/samples/exp-good-tree.bconf b/tools/bootconfig/samples/exp-good-tree.bconf new file mode 100644 index 000000000000..b711d38d86fd --- /dev/null +++ b/tools/bootconfig/samples/exp-good-tree.bconf @@ -0,0 +1,8 @@ +key { + word.tree.value = "0"; + word2.tree.value = "1", "2"; +} +other.tree { + value = "2"; + value2 = "3"; +} diff --git a/tools/bootconfig/samples/good-array-space-comment.bconf b/tools/bootconfig/samples/good-array-space-comment.bconf index 45b938dc0695..416fa2ed4109 100644 --- a/tools/bootconfig/samples/good-array-space-comment.bconf +++ b/tools/bootconfig/samples/good-array-space-comment.bconf @@ -1,4 +1,3 @@ -key = # comment - "value1", # comment1 +key = "value1", # comment1 "value2" , # comment2 "value3" diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index 
7594659af1e1..be9bd18b1d56 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -179,6 +179,9 @@ done echo "=== expected success cases ===" for i in samples/good-* ; do xpass $BOOTCONF -a $i $INITRD + x="samples/exp-"`basename $i` + $BOOTCONF $i > $TEMPCONF + xpass diff $x $TEMPCONF done diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index f25d66c8395e..974189da8a91 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -156,7 +156,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, bool multipart = true; struct nlmsgerr *err; struct nlmsghdr *nh; - char buf[4096]; + char buf[8192]; int len, ret; while (multipart) { @@ -201,6 +201,9 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, return ret; } } + + if (len) + p_err("Invalid message or trailing data in Netlink response: %d bytes left", len); } ret = 0; done: diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 0d992245c600..250883090a5d 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -24,6 +24,10 @@ void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); +bool __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) @@ -81,6 +85,15 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } +static __always_inline +bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; + return __bitmap_andnot(dst, src1, src2, nbits); +} + static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags __maybe_unused) { return malloc(bitmap_size(nbits)); @@ -157,6 +170,15 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } +static __always_inline +bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return ! 
((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_subset(src1, src2, nbits); +} + static inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index 677c37e4a18c..028f3faf46e7 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -4,6 +4,7 @@ #include <linux/align.h> #include <linux/mmzone.h> +#include <linux/sizes.h> #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 51255c69754d..aa83d22c45e3 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -140,3 +140,32 @@ void __bitmap_clear(unsigned long *map, unsigned int start, int len) *p &= ~mask_to_clear; } } + +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; + unsigned long result = 0; + + for (k = 0; k < lim; k++) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); + return result != 0; +} + +bool __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (bitmap1[k] & ~bitmap2[k]) + return false; + + if (bits % BITS_PER_LONG) + if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) + return false; + return true; +} diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index c145da05a67c..9d160b5b9c0e 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,9 +315,6 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz) __weak __ksym; - #define bpf_stream_printk(stream_id, fmt, args...) \ ({ \ static const char ___fmt[] = fmt; \ diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index b842b83e2480..2fa434f09cce 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -506,6 +506,68 @@ static int probe_kern_arg_ctx_tag(int token_fd) return probe_fd(prog_fd); } +static int probe_ldimm64_full_range_off(int token_fd) +{ + char log_buf[1024]; + int prog_fd, map_fd; + int ret; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, + .token_fd = token_fd, + .map_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + .log_buf = log_buf, + .log_size = sizeof(log_buf), + ); + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 1UL << 30), + BPF_EXIT_INSN(), + }; + int insn_cnt = ARRAY_SIZE(insns); + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr", sizeof(int), 1, 1, &map_opts); + if (map_fd < 0) { + ret = -errno; + pr_warn("Error in %s(): %s. 
Couldn't create simple array map.\n", + __func__, errstr(ret)); + return ret; + } + insns[0].imm = map_fd; + + log_buf[0] = '\0'; + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "global_reloc", "GPL", insns, insn_cnt, &prog_opts); + ret = -errno; + + close(map_fd); + + if (prog_fd >= 0) { + pr_warn("Error in %s(): Program loading unexpectedly succeeded.\n", __func__); + close(prog_fd); + return -EINVAL; + } + + /* + * Feature is allowed if we're not failing with the error message + * "direct value offset of %u is not allowed" removed in + * 12a1fe6e12db ("bpf/verifier: Do not limit maximum direct offset into arena map"). + * We should instead fail with "invalid access to map value pointer". + * Ensure we match with one of the two and we're not failing with a + * different, unexpected message. + */ + if (strstr(log_buf, "direct value offset of")) + return 0; + + if (!strstr(log_buf, "invalid access to map value pointer")) { + pr_warn("Error in %s(): Program unexpectedly failed with message: %s.\n", + __func__, log_buf); + return ret; + } + + return 1; +} + typedef int (*feature_probe_fn)(int /* token_fd */); static struct kern_feature_cache feature_cache; @@ -581,6 +643,9 @@ static struct kern_feature_desc { [FEAT_BTF_QMARK_DATASEC] = { "BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec, }, + [FEAT_LDIMM64_FULL_RANGE_OFF] = { + "full range LDIMM64 support", probe_ldimm64_full_range_off, + }, }; bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0c8bf0b5cce4..0be7017800fe 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3009,9 +3009,6 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, memcpy(obj->arena_data, data, data_sz); obj->arena_data_sz = data_sz; - /* place globals at the end of the arena */ - obj->arena_data_off = mmap_sz - data_alloc_sz; - /* make bpf_map__init_value() work for ARENA maps */ map->mmaped = obj->arena_data; @@ -4669,7 +4666,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = obj->arena_map_idx; - reloc_desc->sym_off = sym->st_value + obj->arena_data_off; + reloc_desc->sym_off = sym->st_value; map = &obj->maps[obj->arena_map_idx]; pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n", @@ -6383,6 +6380,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_DATA: map = &obj->maps[relo->map_idx]; insn[1].imm = insn[0].imm + relo->sym_off; + + if (relo->map_idx == obj->arena_map_idx) + insn[1].imm += obj->arena_data_off; + if (obj->gen_loader) { insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; insn[0].imm = relo->map_idx; @@ -7384,6 +7385,14 @@ static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_pat bpf_object__sort_relos(obj); } + /* place globals at the end of the arena (if supported) */ + if (obj->arena_map_idx >= 0 && kernel_supports(obj, FEAT_LDIMM64_FULL_RANGE_OFF)) { + struct bpf_map *arena_map = &obj->maps[obj->arena_map_idx]; + + obj->arena_data_off = bpf_map_mmap_sz(arena_map) - + roundup(obj->arena_data_sz, sysconf(_SC_PAGE_SIZE)); + } + /* Before relocating calls pre-process relocations and mark * few ld_imm64 instructions that points to subprogs. 
* Otherwise bpf_object__reloc_code() later would have to consider diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index fc59b21b51b5..974147e8a8aa 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -392,6 +392,8 @@ enum kern_feature_id { FEAT_ARG_CTX_TAG, /* Kernel supports '?' at the front of datasec names */ FEAT_BTF_QMARK_DATASEC, + /* Kernel supports LDIMM64 imm offsets past 512 MiB. */ + FEAT_LDIMM64_FULL_RANGE_OFF, __FEAT_CNT, }; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index f4403e3cf994..78f92c39290a 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -581,7 +581,7 @@ int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz, written = 0; while (written < buf_sz) { - ret = write(fd, buf, buf_sz); + ret = write(fd, buf + written, buf_sz - written); if (ret < 0) { ret = -errno; pr_warn("failed to write '%s': %s\n", filename, errstr(ret)); diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index c997e69d507f..c9a78fb16f11 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -143,7 +143,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, struct nlmsghdr *nh; int len, ret; - ret = alloc_iov(&iov, 4096); + ret = alloc_iov(&iov, 8192); if (ret) goto done; @@ -212,6 +212,8 @@ start: } } } + if (len) + pr_warn("Invalid message or trailing data in Netlink response: %d bytes left\n", len); } ret = 0; done: diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index fd57944ae907..ca00695b47b3 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -175,6 +175,7 @@ function_xforms = [ (KernRe(r"^__FORTIFY_INLINE +"), ""), (KernRe(r"__init +"), ""), (KernRe(r"__init_or_module +"), ""), + (KernRe(r"__exit +"), ""), (KernRe(r"__deprecated +"), ""), (KernRe(r"__flatten +"), ""), (KernRe(r"__meminit +"), ""), diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index a40f30232929..6964175abdfd 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -29,6 +29,8 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif +RM ?= rm -f + LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/ ifneq ($(OUTPUT),) LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 1551fcdbfd8a..344ede2f8546 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -111,10 +111,14 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--no-perf\fP Disable all the uses of the perf API. .PP +\fB--force\fP Force turbostat to run on an unsupported platform (minimal defaults). +.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. .PP +\fB--header_iterations num\fP print header every num iterations. +.PP \fB--out output_file\fP turbostat output is written to the specified output_file. The file is truncated if it already exists, and it is created if it does not exist. .PP @@ -159,15 +163,19 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. 
While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. .PP -\fBLLCkRPS\fP Last Level Cache Thousands of References Per Second. For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to it's L2). For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1). The system summary row shows the sum for all CPUs. In both cases, the value displayed is the actual value divided by 1000 in the interest of usually fitting into 8 columns. +\fBLLCMRPS\fP Last Level Cache Millions of References Per Second. For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to its L2). For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1). The system summary row shows the sum for all CPUs. In both cases, the value displayed is the actual value divided by 1,000,000. If this value is large, then the LLC%hit column is significant. If this value is small, then the LLC%hit column is not significant. +.PP +\fBLLC%hit\fP Last Level Cache Hit Rate %. Hit Rate Percent = 100.0 * Hits/References. The system summary row shows the weighted average for all CPUs (100.0 * Sum_Hits/Sum_References). +.PP +\fBL2MRPS\fP Level-2 Cache Millions of References Per Second. For CPUs with an L2 LLC, this is the same as LLC references. The system summary row shows the sum for all CPUs. In both cases, the value displayed is the actual value divided by 1,000,000. If this value is large, then the L2%hit column is significant. If this value is small, then the L2%hit column is not significant. .PP -\fBLLC%hit\fP Last Level Cache Hit Rate %. Hit Rate Percent = 100.0 * (References - Misses)/References. The system summary row shows the weighted average for all CPUs (100.0 * (Sum_References - Sum_Misses)/Sum_References). +\fBL2%hit\fP Level-2 Cache Hit Rate %. Hit Rate Percent = 100.0 * Hits/References. The system summary row shows the weighted average for all CPUs (100.0 * (Sum_Hits)/Sum_References). .PP \fBC1, C2, C3...\fP The number of times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. The system description section of output shows what MWAIT sub-states they are mapped to on each system. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default. +\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Indicates the number of times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default.
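The LLC%hit and L2%hit columns described above rely on the guarded percentage arithmetic that the turbostat.c hunk further down introduces as pct(numerator, denominator): a negative numerator, a non-positive denominator, or a ratio far above 100% renders as nan rather than a bogus percentage. A minimal standalone sketch of that computation follows; the helper name mirrors the patch, while the main() driver and the sample counts are purely illustrative:

#include <math.h>
#include <stdio.h>

/* Sanity-checked percentage: nan for a negative numerator, a
 * non-positive denominator, or a ratio above 110% (an obviously
 * bogus counter reading). */
static double pct(double numerator, double denominator)
{
        double retval;

        if (numerator < 0 || denominator <= 0)
                return nan("");

        retval = 100.0 * numerator / denominator;

        return retval > 110.0 ? nan("") : retval;
}

int main(void)
{
        /* LLC%hit = 100.0 * Hits/References: 900/1200 -> 75.00 */
        printf("LLC%%hit: %.2f\n", pct(900, 1200));
        /* an interval with zero references prints nan, not garbage */
        printf("LLC%%hit: %.2f\n", pct(0, 0));
        return 0;
}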
.PP -\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. These counters are in the "cpuidle" group, which is disabled, by default. +\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Indicates the number of times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. These counters are in the "cpuidle" group, which is disabled, by default. .PP \fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. These counters are in the "pct_idle" group, which is enabled by default. .PP @@ -197,7 +205,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBGFX%C0\fP Percentage of time that at least one GFX compute engine is busy. .PP -\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute enginer is busy. +\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute engine is busy. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP @@ -559,6 +567,8 @@ If the upstream version isn't new enough, the development tree can be found here If the development tree doesn't work, please contact the author via chat, or via email with the word "turbostat" on the Subject line. +An old turbostat binary may run on unknown hardware by using "--force", +but results are unsupported. .SH FILES .ta .nf diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5ad45c2ac5bd..1a2671c28209 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2025 Intel Corporation. + * Copyright (c) 2010 - 2026 Intel Corporation * Len Brown <len.brown@intel.com> */ @@ -210,8 +210,10 @@ struct msr_counter bic[] = { { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, - { 0x0, "LLCkRPS", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "LLCMRPS", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "LLC%hit", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "L2MRPS", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "L2%hit", NULL, 0, 0, 0, NULL, 0 }, }; /* n.b.
bic_names must match the order in bic[], above */ @@ -281,8 +283,10 @@ enum bic_names { BIC_NMI, BIC_CPU_c1e, BIC_pct_idle, - BIC_LLC_RPS, + BIC_LLC_MRPS, BIC_LLC_HIT, + BIC_L2_MRPS, + BIC_L2_HIT, MAX_BIC }; @@ -294,12 +298,10 @@ void print_bic_set(char *s, cpu_set_t *set) printf("%s:", s); - for (i = 0; i <= MAX_BIC; ++i) { + for (i = 0; i < MAX_BIC; ++i) { - if (CPU_ISSET(i, set)) { - assert(i < MAX_BIC); + if (CPU_ISSET(i, set)) printf(" %s", bic[i].name); - } } putchar('\n'); } @@ -424,8 +426,10 @@ static void bic_groups_init(void) SET_BIC(BIC_pct_idle, &bic_group_idle); BIC_INIT(&bic_group_cache); - SET_BIC(BIC_LLC_RPS, &bic_group_cache); + SET_BIC(BIC_LLC_MRPS, &bic_group_cache); SET_BIC(BIC_LLC_HIT, &bic_group_cache); + SET_BIC(BIC_L2_MRPS, &bic_group_cache); + SET_BIC(BIC_L2_HIT, &bic_group_cache); BIC_INIT(&bic_group_other); SET_BIC(BIC_IRQ, &bic_group_other); @@ -482,6 +486,7 @@ FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; int *fd_llc_percpu; +int *fd_l2_percpu; struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -498,6 +503,7 @@ unsigned int list_header_only; unsigned int dump_only; unsigned int force_load; unsigned int cpuid_has_aperf_mperf; +unsigned int cpuid_has_hv; unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; @@ -528,7 +534,7 @@ double rapl_dram_energy_units, rapl_energy_units, rapl_psys_energy_units; double rapl_joule_counter_range; unsigned int crystal_hz; unsigned long long tsc_hz; -int base_cpu; +int master_cpu; unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ @@ -620,7 +626,7 @@ double slm_bclk(void) unsigned int i; double freq; - if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) + if (get_msr(master_cpu, MSR_FSB_FREQ, &msr)) fprintf(outf, "SLM BCLK: unknown\n"); i = msr & 0xf; @@ -1248,6 +1254,84 @@ static const struct platform_data turbostat_pdata[] = { { 0, NULL }, }; +struct { + unsigned int uniform; + unsigned int pcore; + unsigned int ecore; + unsigned int lcore; +} perf_pmu_types; + +/* + * Events are enumerated in https://github.com/intel/perfmon + * and tools/perf/pmu-events/arch/x86/.../cache.json + */ +struct perf_l2_events { + unsigned long long refs; /* L2_REQUEST.ALL */ + unsigned long long hits; /* L2_REQUEST.HIT */ +}; + +struct perf_model_support { + unsigned int vfm; + struct perf_l2_events first; + struct perf_l2_events second; + struct perf_l2_events third; +} *perf_model_support; + +/* Perf Cache Events */ +#define PCE(ext_umask, umask) (((unsigned long long) ext_umask) << 40 | umask << 8 | 0x24) + +/* + * Enumerate up to three perf CPU PMU's in a system. + * The first, second, and third columns are populated without skipping, describing + * pcore, ecore, lcore PMUs, in order, if present. (The associated PMU "type" field is + * read from sysfs in all cases.) Eg. + * + * non-hybrid: + * GNR: pcore, {}, {} + * ADL-N: ecore, {}, {} + * hybrid: + * MTL: pcore, ecore, {}% + * ARL-H: pcore, ecore, lcore + * LNL: ecore, ecore%%, {} + * + * % MTL physical lcore share architecture and PMU with ecore, and are thus not enumerated separately. 
+ * %% LNL physical lcore is enumerated by perf as ecore + */ +static struct perf_model_support turbostat_perf_model_support[] = { + { INTEL_SAPPHIRERAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} }, + { INTEL_EMERALDRAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} }, + { INTEL_GRANITERAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} }, + { INTEL_GRANITERAPIDS_D, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} }, + { INTEL_DIAMONDRAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, {}, {} }, + + { INTEL_ATOM_GRACEMONT, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} }, /* ADL-N */ + { INTEL_ATOM_CRESTMONT_X, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} }, /* SRF */ + { INTEL_ATOM_CRESTMONT, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} }, /* GRR */ + { INTEL_ATOM_DARKMONT_X, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {}, {} }, /* CWF */ + + { INTEL_ALDERLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_ALDERLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_ALDERLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_RAPTORLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_RAPTORLAKE_P, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_RAPTORLAKE_S, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_METEORLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_METEORLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + { INTEL_ARROWLAKE_U, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} }, + + { INTEL_LUNARLAKE_M, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, {} }, + { INTEL_ARROWLAKE_H, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)} }, + { INTEL_ARROWLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, {} }, + + { INTEL_PANTHERLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} }, + { INTEL_WILDCATLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} }, + + { INTEL_NOVALAKE, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} }, + { INTEL_NOVALAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} }, + + { 0, {}, {}, {} } +}; + static const struct platform_features *platform; void probe_platform_features(unsigned int family, unsigned int model) @@ -1291,6 +1375,21 @@ end: exit(1); } +void init_perf_model_support(unsigned int family, unsigned int model) +{ + int i; + + if (!genuine_intel) + return; + + for (i = 0; turbostat_perf_model_support[i].vfm; i++) { + if (VFM_FAMILY(turbostat_perf_model_support[i].vfm) == family && VFM_MODEL(turbostat_perf_model_support[i].vfm) == model) { + perf_model_support = &turbostat_perf_model_support[i]; + return; + } + } +} + /* Model specific support End */ #define TJMAX_DEFAULT 100 @@ -1307,6 +1406,7 @@ char *progname; #define CPU_SUBSET_MAXCPUS 8192 /* need to use before probe... 
*/ cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +cpu_set_t *perf_pcore_set, *perf_ecore_set, *perf_lcore_set; size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 @@ -2007,6 +2107,10 @@ struct llc_stats { unsigned long long references; unsigned long long misses; }; +struct l2_stats { + unsigned long long references; + unsigned long long hits; +}; struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -2020,6 +2124,7 @@ struct thread_data { unsigned long long nmi_count; unsigned int smi_count; struct llc_stats llc; + struct l2_stats l2; unsigned int cpu_id; unsigned int apic_id; unsigned int x2apic_id; @@ -2028,25 +2133,24 @@ struct thread_data { unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS]; -} *thread_even, *thread_odd; +}; struct core_data { - int base_cpu; + int first_cpu; unsigned long long c3; unsigned long long c6; unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ - unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS]; -} *core_even, *core_odd; +}; struct pkg_data { - int base_cpu; + int first_cpu; unsigned long long pc2; unsigned long long pc3; unsigned long long pc6; @@ -2066,7 +2170,6 @@ struct pkg_data { long long sam_mc6_ms; unsigned int sam_mhz; unsigned int sam_act_mhz; - unsigned int package_id; struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ @@ -2079,24 +2182,10 @@ struct pkg_data { unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; -} *package_even, *package_odd; - -#define ODD_COUNTERS thread_odd, core_odd, package_odd -#define EVEN_COUNTERS thread_even, core_even, package_even - -#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no) \ - ((thread_base) + \ - ((pkg_no) * \ - topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \ - ((node_no) * topo.cores_per_node * topo.threads_per_core) + \ - ((core_no) * topo.threads_per_core) + \ - (thread_no)) +}; -#define GET_CORE(core_base, core_no, node_no, pkg_no) \ - ((core_base) + \ - ((pkg_no) * topo.nodes_per_pkg * topo.cores_per_node) + \ - ((node_no) * topo.cores_per_node) + \ - (core_no)) +#define ODD_COUNTERS odd.threads, odd.cores, odd.packages +#define EVEN_COUNTERS even.threads, even.cores, even.packages /* * The accumulated sum of MSR is defined as a monotonic @@ -2135,7 +2224,7 @@ off_t idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - if (valid_rapl_msrs & RAPL_AMD_F17H) + if (platform->plat_rapl_msrs & RAPL_AMD_F17H) offset = MSR_PKG_ENERGY_STAT; else offset = MSR_PKG_ENERGY_STATUS; @@ -2279,25 +2368,28 @@ static void free_sys_msr_counters(void) sys.added_package_counters -= free_msr_counters_(&sys.pp); } -struct system_summary { - struct 
thread_data threads; - struct core_data cores; - struct pkg_data packages; -} average; +struct counters { + struct thread_data *threads; + struct core_data *cores; + struct pkg_data *packages; +} average, even, odd; struct platform_counters { struct rapl_counter energy_psys; /* MSR_PLATFORM_ENERGY_STATUS */ } platform_counters_odd, platform_counters_even; +#define MAX_HT_ID 3 /* support SMT-4 */ + struct cpu_topology { - int physical_package_id; + int cpu_id; + int core_id; /* unique within a package */ + int package_id; int die_id; int l3_id; - int logical_cpu_id; int physical_node_id; int logical_node_id; /* 0-based count within the package */ - int physical_core_id; - int thread_id; + int ht_id; /* unique within a core */ + int ht_sibling_cpu_id[MAX_HT_ID + 1]; int type; cpu_set_t *put_ids; /* Processing Unit/Thread IDs */ } *cpus; @@ -2306,12 +2398,12 @@ struct topo_params { int num_packages; int num_die; int num_cpus; - int num_cores; + int num_cores; /* system wide */ int allowed_packages; int allowed_cpus; int allowed_cores; int max_cpu_num; - int max_core_id; + int max_core_id; /* within a package */ int max_package_id; int max_die_id; int max_l3_id; @@ -2343,6 +2435,7 @@ int cpu_is_not_allowed(int cpu) return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set); } +#define GLOBAL_CORE_ID(core_id, pkg_id) (core_id + pkg_id * (topo.max_core_id + 1)) /* * run func(thread, core, package) in topology order * skip non-present cpus @@ -2353,27 +2446,38 @@ int cpu_is_not_allowed(int cpu) int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *), struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) { - int retval, pkg_no, core_no, thread_no, node_no; + int cpu, retval; retval = 0; - for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { - for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) { - for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { - for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { - struct thread_data *t; - struct core_data *c; + for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { + struct thread_data *t; + struct core_data *c; + struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); + int pkg_id = cpus[cpu].package_id; - if (cpu_is_not_allowed(t->cpu_id)) - continue; + if (cpu_is_not_allowed(cpu)) + continue; - c = GET_CORE(core_base, core_no, node_no, pkg_no); + if (cpus[cpu].ht_id > 0) /* skip HT sibling */ + continue; - retval |= func(t, c, &pkg_base[pkg_no]); - } - } + t = &thread_base[cpu]; + c = &core_base[GLOBAL_CORE_ID(cpus[cpu].core_id, pkg_id)]; + p = &pkg_base[pkg_id]; + + retval |= func(t, c, p); + + /* Handle HT sibling now */ + int i; + + for (i = MAX_HT_ID; i > 0; --i) { /* ht_id 0 is self */ + if (cpus[cpu].ht_sibling_cpu_id[i] <= 0) + continue; + t = &thread_base[cpus[cpu].ht_sibling_cpu_id[i]]; + + retval |= func(t, c, p); } } return retval; @@ -2381,12 +2485,12 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c) { - return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); + return ((int)t->cpu_id == c->first_cpu || c->first_cpu < 0); } int is_cpu_first_core_in_package(struct thread_data *t, struct pkg_data *p) { - return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); + return ((int)t->cpu_id == p->first_cpu || p->first_cpu < 0); } int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data 
*c, struct pkg_data *p) @@ -2439,8 +2543,10 @@ static void bic_disable_msr_access(void) static void bic_disable_perf_access(void) { CLR_BIC(BIC_IPC, &bic_enabled); - CLR_BIC(BIC_LLC_RPS, &bic_enabled); + CLR_BIC(BIC_LLC_MRPS, &bic_enabled); CLR_BIC(BIC_LLC_HIT, &bic_enabled); + CLR_BIC(BIC_L2_MRPS, &bic_enabled); + CLR_BIC(BIC_L2_HIT, &bic_enabled); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -2552,10 +2658,10 @@ unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu) return cpu; case SCOPE_CORE: - return cpus[cpu].physical_core_id; + return cpus[cpu].core_id; case SCOPE_PACKAGE: - return cpus[cpu].physical_package_id; + return cpus[cpu].package_id; } __builtin_unreachable(); @@ -2629,8 +2735,7 @@ void help(void) " sets the Thermal Control Circuit temperature in\n" " degrees Celsius\n" " -h, --help\n" - " print this help message\n" - " -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n"); + " print this help message\n -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n"); } /* @@ -2813,12 +2918,18 @@ void print_header(char *delim) if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "%sSMI", (printed++ ? delim : "")); - if (DO_BIC(BIC_LLC_RPS)) - outp += sprintf(outp, "%sLLCkRPS", (printed++ ? delim : "")); + if (DO_BIC(BIC_LLC_MRPS)) + outp += sprintf(outp, "%sLLCMRPS", (printed++ ? delim : "")); if (DO_BIC(BIC_LLC_HIT)) outp += sprintf(outp, "%sLLC%%hit", (printed++ ? delim : "")); + if (DO_BIC(BIC_L2_MRPS)) + outp += sprintf(outp, "%sL2MRPS", (printed++ ? delim : "")); + + if (DO_BIC(BIC_L2_HIT)) + outp += sprintf(outp, "%sL2%%hit", (printed++ ? delim : "")); + for (mp = sys.tp; mp; mp = mp->next) outp += print_name(mp->width, &printed, delim, mp->name, mp->type, mp->format); @@ -3001,29 +3112,37 @@ } /* - * pct() + * pct(numerator, denominator) * - * If absolute value is < 1.1, return percentage - * otherwise, return nan + * Return sanity checked percentage (100.0 * numerator/denominator) * - * return value is appropriate for printing percentages with %f - * while flagging some obvious erroneous values. + * n < 0: nan + * d <= 0: nan + * n/d > 1.1: nan */ -double pct(double d) +double pct(double numerator, double denominator) { + double retval; + + if (numerator < 0) + return nan(""); - double abs = fabs(d); + if (denominator <= 0) + return nan(""); - if (abs < 1.10) - return (100.0 * d); - return nan(""); + retval = 100.0 * numerator / denominator; + + if (retval > 110.0) + return nan(""); + + return retval; } int dump_counters(PER_THREAD_PARAMS) { int i; struct msr_counter *mp; - struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even; + struct platform_counters *pplat_cnt = p == odd.packages ?
&platform_counters_odd : &platform_counters_even; outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p); @@ -3046,7 +3165,11 @@ int dump_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "LLC refs: %lld", t->llc.references); outp += sprintf(outp, "LLC miss: %lld", t->llc.misses); - outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses) / t->llc.references)); + outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses), t->llc.references)); + + outp += sprintf(outp, "L2 refs: %lld", t->l2.references); + outp += sprintf(outp, "L2 hits: %lld", t->l2.hits); + outp += sprintf(outp, "L2 Hit%%: %.2f", pct(t->l2.hits, t->l2.references)); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, t->counter[i], mp->sp->path); @@ -3054,7 +3177,7 @@ int dump_counters(PER_THREAD_PARAMS) } if (c && is_cpu_first_thread_in_core(t, c)) { - outp += sprintf(outp, "core: %d\n", c->core_id); + outp += sprintf(outp, "core: %d\n", cpus[t->cpu_id].core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); outp += sprintf(outp, "c7: %016llX\n", c->c7); @@ -3074,8 +3197,6 @@ int dump_counters(PER_THREAD_PARAMS) } if (p && is_cpu_first_core_in_package(t, p)) { - outp += sprintf(outp, "package: %d\n", p->package_id); - outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0); outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0); @@ -3141,7 +3262,7 @@ void get_perf_llc_stats(int cpu, struct llc_stats *llc) actual_read_size = read(fd_llc_percpu[cpu], &r, expected_read_size); if (actual_read_size == -1) - err(-1, "%s(cpu%d,) %d,,%ld\n", __func__, cpu, fd_llc_percpu[cpu], expected_read_size); + err(-1, "%s(cpu%d,) %d,,%ld", __func__, cpu, fd_llc_percpu[cpu], expected_read_size); llc->references = r.llc.references; llc->misses = r.llc.misses; @@ -3149,6 +3270,26 @@ void get_perf_llc_stats(int cpu, struct llc_stats *llc) warn("%s: failed to read perf_data (req %zu act %zu)", __func__, expected_read_size, actual_read_size); } +void get_perf_l2_stats(int cpu, struct l2_stats *l2) +{ + struct read_format { + unsigned long long num_read; + struct l2_stats l2; + } r; + const ssize_t expected_read_size = sizeof(r); + ssize_t actual_read_size; + + actual_read_size = read(fd_l2_percpu[cpu], &r, expected_read_size); + + if (actual_read_size == -1) + err(-1, "%s(cpu%d,) %d,,%ld", __func__, cpu, fd_l2_percpu[cpu], expected_read_size); + + l2->references = r.l2.references; + l2->hits = r.l2.hits; + if (actual_read_size != expected_read_size) + warn("%s: cpu%d: failed to read(%d) perf_data (req %zu act %zu)", __func__, cpu, fd_l2_percpu[cpu], expected_read_size, actual_read_size); +} + /* * column formatting convention & formats */ @@ -3167,7 +3308,7 @@ int format_counters(PER_THREAD_PARAMS) char *delim = "\t"; int printed = 0; - if (t == &average.threads) { + if (t == average.threads) { pplat_cnt = count & 1 ? 
&platform_counters_odd : &platform_counters_even; ++count; } @@ -3181,7 +3322,7 @@ int format_counters(PER_THREAD_PARAMS) return 0; /*if not summary line and --cpu is used */ - if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) + if ((t != average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) return 0; if (DO_BIC(BIC_USEC)) { @@ -3201,7 +3342,7 @@ int format_counters(PER_THREAD_PARAMS) tsc = t->tsc * tsc_tweak; /* topo columns, print blanks on 1st (average) line */ - if (t == &average.threads) { + if (t == average.threads) { if (DO_BIC(BIC_Package)) outp += sprintf(outp, "%s-", (printed++ ? delim : "")); if (DO_BIC(BIC_Die)) @@ -3221,7 +3362,7 @@ int format_counters(PER_THREAD_PARAMS) } else { if (DO_BIC(BIC_Package)) { if (p) - outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id); + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].package_id); else outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } @@ -3245,7 +3386,7 @@ int format_counters(PER_THREAD_PARAMS) } if (DO_BIC(BIC_Core)) { if (c) - outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id); + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].core_id); else outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } @@ -3261,7 +3402,7 @@ int format_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float); if (DO_BIC(BIC_Busy)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->mperf / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->mperf, tsc)); if (DO_BIC(BIC_Bzy_MHz)) { if (has_base_hz) @@ -3297,13 +3438,18 @@ int format_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count); /* LLC Stats */ - if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT)) { - if (DO_BIC(BIC_LLC_RPS)) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000); + if (DO_BIC(BIC_LLC_MRPS)) + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000000); - if (DO_BIC(BIC_LLC_HIT)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses) / t->llc.references)); - } + if (DO_BIC(BIC_LLC_HIT)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses), t->llc.references)); + + /* L2 Stats */ + if (DO_BIC(BIC_L2_MRPS)) + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->l2.references / interval_float / 1000000); + + if (DO_BIC(BIC_L2_HIT)) + outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), pct(t->l2.hits, t->l2.references)); /* Added Thread Counters */ for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { @@ -3315,7 +3461,7 @@ int format_counters(PER_THREAD_PARAMS) if (mp->type == COUNTER_USEC) outp += print_float_value(&printed, delim, t->counter[i] / interval_float / 10000); else - outp += print_float_value(&printed, delim, pct(t->counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(t->counter[i], tsc)); } } @@ -3329,7 +3475,7 @@ int format_counters(PER_THREAD_PARAMS) if (pp->type == COUNTER_USEC) outp += print_float_value(&printed, delim, t->perf_counter[i] / interval_float / 10000); else - outp += print_float_value(&printed, delim, pct(t->perf_counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(t->perf_counter[i], tsc)); } } @@ -3343,34 +3489,34 @@ int format_counters(PER_THREAD_PARAMS) break; case PMT_TYPE_XTAL_TIME: - value_converted = pct(value_raw / crystal_hz / interval_float); + value_converted = pct(value_raw / crystal_hz, interval_float); outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); + value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float); outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); } } /* C1 */ if (DO_BIC(BIC_CPU_c1)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->c1 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->c1, tsc)); /* print per-core data only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c)) goto done; if (DO_BIC(BIC_CPU_c3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c3 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c3, tsc)); if (DO_BIC(BIC_CPU_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c6 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c6, tsc)); if (DO_BIC(BIC_CPU_c7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c7 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c7, tsc)); /* Mod%c6 */ if (DO_BIC(BIC_Mod_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->mc6_us / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->mc6_us, tsc)); if (DO_BIC(BIC_CoreTmp)) outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), c->core_temp_c); @@ -3386,7 +3532,7 @@ int format_counters(PER_THREAD_PARAMS) else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) outp += print_decimal_value(mp->width, &printed, delim, c->counter[i]); else if (mp->format == FORMAT_PERCENT) - outp += print_float_value(&printed, delim, pct(c->counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(c->counter[i], tsc)); } /* Added perf Core counters */ @@ -3396,7 +3542,7 @@ int format_counters(PER_THREAD_PARAMS) else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) outp += print_decimal_value(pp->width, &printed, delim, c->perf_counter[i]); else if (pp->format == FORMAT_PERCENT) - outp += print_float_value(&printed, delim, pct(c->perf_counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(c->perf_counter[i], tsc)); } /* Added PMT Core counters */ @@ -3409,12 +3555,12 @@ int format_counters(PER_THREAD_PARAMS) break; case PMT_TYPE_XTAL_TIME: - value_converted = pct(value_raw / crystal_hz / interval_float); + value_converted = pct(value_raw / crystal_hz, interval_float); outp += print_float_value(&printed, delim, value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); + value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float); outp += print_float_value(&printed, delim, value_converted); } } @@ -3470,39 +3616,39 @@ int format_counters(PER_THREAD_PARAMS) if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100 * p->pkg_wtd_core_c0 / tsc); /* can exceed 100% */ if (DO_BIC(BIC_Any_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_core_c0 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_core_c0, tsc)); if (DO_BIC(BIC_GFX_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_gfxe_c0 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_gfxe_c0, tsc)); if (DO_BIC(BIC_CPUGFX)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_both_core_gfxe_c0 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_both_core_gfxe_c0, tsc)); if (DO_BIC(BIC_Pkgpc2)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc2 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc2, tsc)); if (DO_BIC(BIC_Pkgpc3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc3 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc3, tsc)); if (DO_BIC(BIC_Pkgpc6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc6 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc6, tsc)); if (DO_BIC(BIC_Pkgpc7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc7 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc7, tsc)); if (DO_BIC(BIC_Pkgpc8)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc8 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc8, tsc)); if (DO_BIC(BIC_Pkgpc9)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc9 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc9, tsc)); if (DO_BIC(BIC_Pkgpc10)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc10 / tsc)); + outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), pct(p->pc10, tsc)); if (DO_BIC(BIC_Diec6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->die_c6 / crystal_hz / interval_float)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->die_c6 / crystal_hz, interval_float)); if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->cpu_lpi / 1000000.0 / interval_float)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->cpu_lpi / 1000000.0, interval_float)); else outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); } if (DO_BIC(BIC_SYS_LPI)) { if (p->sys_lpi >= 0) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->sys_lpi / 1000000.0 / interval_float)); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->sys_lpi / 1000000.0, interval_float)); else outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); } @@ -3524,11 +3670,9 @@ int format_counters(PER_THREAD_PARAMS) if (DO_BIC(BIC_RAM_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); @@ -3542,7 +3686,7 @@ int format_counters(PER_THREAD_PARAMS) else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) outp += print_decimal_value(mp->width, &printed, delim, p->counter[i]); else if (mp->format == FORMAT_PERCENT) - outp += print_float_value(&printed, delim, pct(p->counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(p->counter[i], tsc)); } /* Added perf Package Counters */ @@ -3554,7 +3698,7 @@ int format_counters(PER_THREAD_PARAMS) else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) outp += print_decimal_value(pp->width, &printed, delim, p->perf_counter[i]); else if (pp->format == FORMAT_PERCENT) - outp += print_float_value(&printed, delim, pct(p->perf_counter[i] / tsc)); + outp += print_float_value(&printed, delim, pct(p->perf_counter[i], tsc)); } /* Added PMT Package Counters */ @@ -3567,22 +3711,20 @@ int format_counters(PER_THREAD_PARAMS) break; case PMT_TYPE_XTAL_TIME: - value_converted = pct(value_raw / crystal_hz / interval_float); + value_converted = pct(value_raw / crystal_hz, interval_float); outp += print_float_value(&printed, delim, value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); + value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float); outp += print_float_value(&printed, delim, value_converted); } } - if (DO_BIC(BIC_SysWatt) && (t == &average.threads)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float)); - if (DO_BIC(BIC_Sys_J) && (t == &average.threads)) - outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), - rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float)); + if (DO_BIC(BIC_SysWatt) && (t == average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float)); + if (DO_BIC(BIC_Sys_J) && (t == average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float)); done: if (*(outp - 1) != '\n') @@ -3620,7 +3762,7 @@ void format_all_counters(PER_THREAD_PARAMS) if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only) print_header("\t"); - format_counters(&average.threads, &average.cores, &average.packages); + format_counters(average.threads, average.cores, average.packages); count++; @@ -3795,7 +3937,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d /* check for TSC < 1 Mcycles over interval */ if (old->tsc < (1000 * 1000)) errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n" - "You can disable all c-states by booting with \"idle=poll\"\n" "or just the deep ones with \"processor.max_cstate=1\""); + "You can disable all c-states by booting with \"idle=poll\"\nor just the deep ones with \"processor.max_cstate=1\""); old->c1 = new->c1 - old->c1; @@ -3846,12 +3988,18 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d if (DO_BIC(BIC_SMI)) old->smi_count = new->smi_count - old->smi_count; - if (DO_BIC(BIC_LLC_RPS)) + if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT)) old->llc.references = new->llc.references - old->llc.references; if (DO_BIC(BIC_LLC_HIT)) old->llc.misses = new->llc.misses - old->llc.misses; + if (DO_BIC(BIC_L2_MRPS) || DO_BIC(BIC_L2_HIT)) + old->l2.references = new->l2.references - old->l2.references; + + if (DO_BIC(BIC_L2_HIT)) + old->l2.hits = new->l2.hits - old->l2.hits; + for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; @@ -3932,6 +4080,9 @@ void clear_counters(PER_THREAD_PARAMS) t->llc.references = 0; t->llc.misses = 0; + t->l2.references = 0; + t->l2.hits = 0; + c->c3 = 0; c->c6 = 0; c->c7 = 0; @@ -3940,9 +4091,6 @@ void clear_counters(PER_THREAD_PARAMS) rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; - t->llc.references = 0; - t->llc.misses = 0; - p->pkg_wtd_core_c0 = 0; p->pkg_any_core_c0 = 0; p->pkg_any_gfxe_c0 = 0; @@ -4018,75 +4166,78 @@ int sum_counters(PER_THREAD_PARAMS) /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) - average.threads.apic_id = t->apic_id; + average.threads->apic_id = t->apic_id; if (DO_BIC(BIC_X2APIC)) - average.threads.x2apic_id = t->x2apic_id; + average.threads->x2apic_id = t->x2apic_id; /* remember first tv_begin */ - if (average.threads.tv_begin.tv_sec == 0) - average.threads.tv_begin = procsysfs_tv_begin; + if (average.threads->tv_begin.tv_sec == 0) + average.threads->tv_begin = procsysfs_tv_begin; /* remember last tv_end */ - average.threads.tv_end = t->tv_end; + average.threads->tv_end = t->tv_end; + + average.threads->tsc += t->tsc; + average.threads->aperf += t->aperf; + average.threads->mperf += t->mperf; + average.threads->c1 += t->c1; - average.threads.tsc += t->tsc; - average.threads.aperf += t->aperf; - average.threads.mperf += t->mperf; - average.threads.c1 += t->c1; + average.threads->instr_count += t->instr_count; - average.threads.instr_count += t->instr_count; + average.threads->irq_count += t->irq_count; 
+ average.threads->nmi_count += t->nmi_count; + average.threads->smi_count += t->smi_count; - average.threads.irq_count += t->irq_count; - average.threads.nmi_count += t->nmi_count; - average.threads.smi_count += t->smi_count; + average.threads->llc.references += t->llc.references; + average.threads->llc.misses += t->llc.misses; - average.threads.llc.references += t->llc.references; - average.threads.llc.misses += t->llc.misses; + average.threads->l2.references += t->l2.references; + average.threads->l2.hits += t->l2.hits; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; - average.threads.counter[i] += t->counter[i]; + average.threads->counter[i] += t->counter[i]; } for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { if (pp->format == FORMAT_RAW) continue; - average.threads.perf_counter[i] += t->perf_counter[i]; + average.threads->perf_counter[i] += t->perf_counter[i]; } for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { - average.threads.pmt_counter[i] += t->pmt_counter[i]; + average.threads->pmt_counter[i] += t->pmt_counter[i]; } /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c)) return 0; - average.cores.c3 += c->c3; - average.cores.c6 += c->c6; - average.cores.c7 += c->c7; - average.cores.mc6_us += c->mc6_us; + average.cores->c3 += c->c3; + average.cores->c6 += c->c6; + average.cores->c7 += c->c7; + average.cores->mc6_us += c->mc6_us; - average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); - average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); + average.cores->core_temp_c = MAX(average.cores->core_temp_c, c->core_temp_c); + average.cores->core_throt_cnt = MAX(average.cores->core_throt_cnt, c->core_throt_cnt); - rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); + rapl_counter_accumulate(&average.cores->core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; - average.cores.counter[i] += c->counter[i]; + average.cores->counter[i] += c->counter[i]; } for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { if (pp->format == FORMAT_RAW) continue; - average.cores.perf_counter[i] += c->perf_counter[i]; + average.cores->perf_counter[i] += c->perf_counter[i]; } for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { - average.cores.pmt_counter[i] += c->pmt_counter[i]; + average.cores->pmt_counter[i] += c->pmt_counter[i]; } /* sum per-pkg values only for 1st core in pkg */ @@ -4094,63 +4245,63 @@ int sum_counters(PER_THREAD_PARAMS) return 0; if (DO_BIC(BIC_Totl_c0)) - average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0; + average.packages->pkg_wtd_core_c0 += p->pkg_wtd_core_c0; if (DO_BIC(BIC_Any_c0)) - average.packages.pkg_any_core_c0 += p->pkg_any_core_c0; + average.packages->pkg_any_core_c0 += p->pkg_any_core_c0; if (DO_BIC(BIC_GFX_c0)) - average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0; + average.packages->pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0; if (DO_BIC(BIC_CPUGFX)) - average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0; + average.packages->pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0; - average.packages.pc2 += p->pc2; + average.packages->pc2 += p->pc2; if (DO_BIC(BIC_Pkgpc3)) - average.packages.pc3 += p->pc3; + average.packages->pc3 += p->pc3; if (DO_BIC(BIC_Pkgpc6)) - average.packages.pc6 += p->pc6; + average.packages->pc6 += p->pc6; if (DO_BIC(BIC_Pkgpc7)) - average.packages.pc7 += p->pc7; - average.packages.pc8 += 
p->pc8; - average.packages.pc9 += p->pc9; - average.packages.pc10 += p->pc10; - average.packages.die_c6 += p->die_c6; + average.packages->pc7 += p->pc7; + average.packages->pc8 += p->pc8; + average.packages->pc9 += p->pc9; + average.packages->pc10 += p->pc10; + average.packages->die_c6 += p->die_c6; - average.packages.cpu_lpi = p->cpu_lpi; - average.packages.sys_lpi = p->sys_lpi; + average.packages->cpu_lpi = p->cpu_lpi; + average.packages->sys_lpi = p->sys_lpi; - rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); - rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); - rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); - rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); + rapl_counter_accumulate(&average.packages->energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages->energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages->energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages->energy_gfx, &p->energy_gfx); - average.packages.gfx_rc6_ms = p->gfx_rc6_ms; - average.packages.uncore_mhz = p->uncore_mhz; - average.packages.gfx_mhz = p->gfx_mhz; - average.packages.gfx_act_mhz = p->gfx_act_mhz; - average.packages.sam_mc6_ms = p->sam_mc6_ms; - average.packages.sam_mhz = p->sam_mhz; - average.packages.sam_act_mhz = p->sam_act_mhz; + average.packages->gfx_rc6_ms = p->gfx_rc6_ms; + average.packages->uncore_mhz = p->uncore_mhz; + average.packages->gfx_mhz = p->gfx_mhz; + average.packages->gfx_act_mhz = p->gfx_act_mhz; + average.packages->sam_mc6_ms = p->sam_mc6_ms; + average.packages->sam_mhz = p->sam_mhz; + average.packages->sam_act_mhz = p->sam_act_mhz; - average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); + average.packages->pkg_temp_c = MAX(average.packages->pkg_temp_c, p->pkg_temp_c); - rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); - rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); + rapl_counter_accumulate(&average.packages->rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages->rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) - average.packages.counter[i] = p->counter[i]; + average.packages->counter[i] = p->counter[i]; else - average.packages.counter[i] += p->counter[i]; + average.packages->counter[i] += p->counter[i]; } for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0)) - average.packages.perf_counter[i] = p->perf_counter[i]; + average.packages->perf_counter[i] = p->perf_counter[i]; else - average.packages.perf_counter[i] += p->perf_counter[i]; + average.packages->perf_counter[i] += p->perf_counter[i]; } for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { - average.packages.pmt_counter[i] += p->pmt_counter[i]; + average.packages->pmt_counter[i] += p->pmt_counter[i]; } return 0; @@ -4167,117 +4318,117 @@ void compute_average(PER_THREAD_PARAMS) struct perf_counter_info *pp; struct pmt_counter *ppmt; - clear_counters(&average.threads, &average.cores, &average.packages); + clear_counters(average.threads, average.cores, average.packages); for_all_cpus(sum_counters, t, c, p); /* Use the global time delta for the average. 
*/ - average.threads.tv_delta = tv_delta; + average.threads->tv_delta = tv_delta; - average.threads.tsc /= topo.allowed_cpus; - average.threads.aperf /= topo.allowed_cpus; - average.threads.mperf /= topo.allowed_cpus; - average.threads.instr_count /= topo.allowed_cpus; - average.threads.c1 /= topo.allowed_cpus; + average.threads->tsc /= topo.allowed_cpus; + average.threads->aperf /= topo.allowed_cpus; + average.threads->mperf /= topo.allowed_cpus; + average.threads->instr_count /= topo.allowed_cpus; + average.threads->c1 /= topo.allowed_cpus; - if (average.threads.irq_count > 9999999) + if (average.threads->irq_count > 9999999) sums_need_wide_columns = 1; - if (average.threads.nmi_count > 9999999) + if (average.threads->nmi_count > 9999999) sums_need_wide_columns = 1; - average.cores.c3 /= topo.allowed_cores; - average.cores.c6 /= topo.allowed_cores; - average.cores.c7 /= topo.allowed_cores; - average.cores.mc6_us /= topo.allowed_cores; + average.cores->c3 /= topo.allowed_cores; + average.cores->c6 /= topo.allowed_cores; + average.cores->c7 /= topo.allowed_cores; + average.cores->mc6_us /= topo.allowed_cores; if (DO_BIC(BIC_Totl_c0)) - average.packages.pkg_wtd_core_c0 /= topo.allowed_packages; + average.packages->pkg_wtd_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_Any_c0)) - average.packages.pkg_any_core_c0 /= topo.allowed_packages; + average.packages->pkg_any_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_GFX_c0)) - average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages; + average.packages->pkg_any_gfxe_c0 /= topo.allowed_packages; if (DO_BIC(BIC_CPUGFX)) - average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages; + average.packages->pkg_both_core_gfxe_c0 /= topo.allowed_packages; - average.packages.pc2 /= topo.allowed_packages; + average.packages->pc2 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc3)) - average.packages.pc3 /= topo.allowed_packages; + average.packages->pc3 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc6)) - average.packages.pc6 /= topo.allowed_packages; + average.packages->pc6 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc7)) - average.packages.pc7 /= topo.allowed_packages; + average.packages->pc7 /= topo.allowed_packages; - average.packages.pc8 /= topo.allowed_packages; - average.packages.pc9 /= topo.allowed_packages; - average.packages.pc10 /= topo.allowed_packages; - average.packages.die_c6 /= topo.allowed_packages; + average.packages->pc8 /= topo.allowed_packages; + average.packages->pc9 /= topo.allowed_packages; + average.packages->pc10 /= topo.allowed_packages; + average.packages->die_c6 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; if (mp->type == COUNTER_ITEMS) { - if (average.threads.counter[i] > 9999999) + if (average.threads->counter[i] > 9999999) sums_need_wide_columns = 1; continue; } - average.threads.counter[i] /= topo.allowed_cpus; + average.threads->counter[i] /= topo.allowed_cpus; } for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; if (mp->type == COUNTER_ITEMS) { - if (average.cores.counter[i] > 9999999) + if (average.cores->counter[i] > 9999999) sums_need_wide_columns = 1; } - average.cores.counter[i] /= topo.allowed_cores; + average.cores->counter[i] /= topo.allowed_cores; } for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; if (mp->type == COUNTER_ITEMS) { - if (average.packages.counter[i] > 9999999) + if (average.packages->counter[i] > 9999999) sums_need_wide_columns = 1; } - 
average.packages.counter[i] /= topo.allowed_packages; + average.packages->counter[i] /= topo.allowed_packages; } for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { if (pp->format == FORMAT_RAW) continue; if (pp->type == COUNTER_ITEMS) { - if (average.threads.perf_counter[i] > 9999999) + if (average.threads->perf_counter[i] > 9999999) sums_need_wide_columns = 1; continue; } - average.threads.perf_counter[i] /= topo.allowed_cpus; + average.threads->perf_counter[i] /= topo.allowed_cpus; } for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { if (pp->format == FORMAT_RAW) continue; if (pp->type == COUNTER_ITEMS) { - if (average.cores.perf_counter[i] > 9999999) + if (average.cores->perf_counter[i] > 9999999) sums_need_wide_columns = 1; } - average.cores.perf_counter[i] /= topo.allowed_cores; + average.cores->perf_counter[i] /= topo.allowed_cores; } for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { if (pp->format == FORMAT_RAW) continue; if (pp->type == COUNTER_ITEMS) { - if (average.packages.perf_counter[i] > 9999999) + if (average.packages->perf_counter[i] > 9999999) sums_need_wide_columns = 1; } - average.packages.perf_counter[i] /= topo.allowed_packages; + average.packages->perf_counter[i] /= topo.allowed_packages; } for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { - average.threads.pmt_counter[i] /= topo.allowed_cpus; + average.threads->pmt_counter[i] /= topo.allowed_cpus; } for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { - average.cores.pmt_counter[i] /= topo.allowed_cores; + average.cores->pmt_counter[i] /= topo.allowed_cores; } for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { - average.packages.pmt_counter[i] /= topo.allowed_packages; + average.packages->pmt_counter[i] /= topo.allowed_packages; } } @@ -4645,7 +4796,7 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { - struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even; + struct platform_counters *pplat_cnt = p == odd.packages ? 
&platform_counters_odd : &platform_counters_even; unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; @@ -5002,32 +5153,18 @@ unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id) /* Rapl domain enumeration helpers */ static inline int get_rapl_num_domains(void) { - int num_packages = topo.max_package_id + 1; - int num_cores_per_package; - int num_cores; - if (!platform->has_per_core_rapl) - return num_packages; - - num_cores_per_package = topo.max_core_id + 1; - num_cores = num_cores_per_package * num_packages; + return topo.num_packages; - return num_cores; + return topo.num_cores; } static inline int get_rapl_domain_id(int cpu) { - int nr_cores_per_package = topo.max_core_id + 1; - int rapl_core_id; - if (!platform->has_per_core_rapl) - return cpus[cpu].physical_package_id; - - /* Compute the system-wide unique core-id for @cpu */ - rapl_core_id = cpus[cpu].physical_core_id; - rapl_core_id += cpus[cpu].physical_package_id * nr_cores_per_package; + return cpus[cpu].package_id; - return rapl_core_id; + return GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id); } /* @@ -5058,9 +5195,12 @@ int get_counters(PER_THREAD_PARAMS) get_smi_aperf_mperf(cpu, t); - if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT)) + if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT)) get_perf_llc_stats(cpu, &t->llc); + if (DO_BIC(BIC_L2_MRPS) || DO_BIC(BIC_L2_HIT)) + get_perf_l2_stats(cpu, &t->l2); + if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) return -4; @@ -5125,7 +5265,7 @@ int get_counters(PER_THREAD_PARAMS) return -10; for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next) - c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); + c->pmt_counter[i] = pmt_read_counter(pp, cpus[t->cpu_id].core_id); /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, p)) @@ -5166,7 +5306,7 @@ int get_counters(PER_THREAD_PARAMS) } if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); + p->uncore_mhz = get_legacy_uncore_mhz(cpus[t->cpu_id].package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -5190,9 +5330,9 @@ int get_counters(PER_THREAD_PARAMS) char *path = NULL; if (mp->msr_num == 0) { - path = find_sysfs_path_by_id(mp->sp, p->package_id); + path = find_sysfs_path_by_id(mp->sp, cpus[t->cpu_id].package_id); if (path == NULL) { - warnx("%s: package_id %d not found", __func__, p->package_id); + warnx("%s: package_id %d not found", __func__, cpus[t->cpu_id].package_id); return -10; } } @@ -5204,7 +5344,7 @@ int get_counters(PER_THREAD_PARAMS) return -10; for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next) - p->pmt_counter[i] = pmt_read_counter(pp, p->package_id); + p->pmt_counter[i] = pmt_read_counter(pp, cpus[t->cpu_id].package_id); done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -5293,7 +5433,7 @@ void probe_cst_limit(void) return; } - get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); + get_msr(master_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } @@ -5305,9 +5445,9 @@ static void dump_platform_info(void) if (!platform->has_nhm_msrs || no_msr) return; - get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); + get_msr(master_cpu, MSR_PLATFORM_INFO, &msr); - fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", master_cpu, msr); ratio = (msr >> 40) & 0xFF; fprintf(outf, "%d * %.1f = %.1f 
MHz max efficiency frequency\n", ratio, bclk, ratio * bclk); @@ -5323,8 +5463,8 @@ static void dump_power_ctl(void) if (!platform->has_nhm_msrs || no_msr) return; - get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); - fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); + get_msr(master_cpu, MSR_IA32_POWER_CTL, &msr); + fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", master_cpu, msr, msr & 0x2 ? "EN" : "DIS"); /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ if (platform->has_cst_prewake_bit) @@ -5338,9 +5478,9 @@ static void dump_turbo_ratio_limit2(void) unsigned long long msr; unsigned int ratio; - get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr); + get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT2, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", master_cpu, msr); ratio = (msr >> 8) & 0xFF; if (ratio) @@ -5357,9 +5497,9 @@ static void dump_turbo_ratio_limit1(void) unsigned long long msr; unsigned int ratio; - get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr); + get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT1, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", master_cpu, msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -5400,13 +5540,12 @@ static void dump_turbo_ratio_limits(int trl_msr_offset) unsigned long long msr, core_counts; int shift; - get_msr(base_cpu, trl_msr_offset, &msr); - fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", - base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr); + get_msr(master_cpu, trl_msr_offset, &msr); + fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", master_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? 
"SECONDARY_" : "", msr); if (platform->trl_msrs & TRL_CORECOUNT) { - get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts); + get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", master_cpu, core_counts); } else { core_counts = 0x0807060504030201; } @@ -5428,8 +5567,8 @@ static void dump_atom_turbo_ratio_limits(void) unsigned long long msr; unsigned int ratio; - get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr); - fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF); + get_msr(master_cpu, MSR_ATOM_CORE_RATIOS, &msr); + fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", master_cpu, msr & 0xFFFFFFFF); ratio = (msr >> 0) & 0x3F; if (ratio) @@ -5443,8 +5582,8 @@ static void dump_atom_turbo_ratio_limits(void) if (ratio) fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); - get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr); - fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF); + get_msr(master_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr); + fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", master_cpu, msr & 0xFFFFFFFF); ratio = (msr >> 24) & 0x3F; if (ratio) @@ -5473,9 +5612,9 @@ static void dump_knl_turbo_ratio_limits(void) unsigned int cores[buckets_no]; unsigned int ratio[buckets_no]; - get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr); + get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", master_cpu, msr); /* * Turbo encoding in KNL is as follows: @@ -5525,9 +5664,9 @@ static void dump_cst_cfg(void) if (!platform->has_nhm_msrs || no_msr) return; - get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); + get_msr(master_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); - fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", master_cpu, msr); fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)", (msr & SNB_C3_AUTO_UNDEMOTE) ? 
"UNdemote-C3, " : "", @@ -5550,12 +5689,12 @@ static void dump_config_tdp(void) { unsigned long long msr; - get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr); - fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr); + get_msr(master_cpu, MSR_CONFIG_TDP_NOMINAL, &msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", master_cpu, msr); fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF); - get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr); - fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", master_cpu, msr); if (msr) { fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF); fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF); @@ -5564,8 +5703,8 @@ static void dump_config_tdp(void) } fprintf(outf, ")\n"); - get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr); - fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", master_cpu, msr); if (msr) { fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF); fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF); @@ -5574,15 +5713,15 @@ static void dump_config_tdp(void) } fprintf(outf, ")\n"); - get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr); - fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_CONFIG_TDP_CONTROL, &msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", master_cpu, msr); if ((msr) & 0x3) fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3); fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); - get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr); + fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", master_cpu, msr); fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF); fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); @@ -5598,38 +5737,38 @@ void print_irtl(void) return; if (platform->supported_cstates & PC3) { - get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC3_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC6) { - get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC6_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC7) { - get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC7_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? 
"" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC8) { - get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC8_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC9) { - get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC9_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC10) { - get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); + get_msr(master_cpu, MSR_PKGC10_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", master_cpu, msr); fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } } @@ -5676,6 +5815,26 @@ void free_fd_llc_percpu(void) free(fd_llc_percpu); fd_llc_percpu = NULL; + + BIC_NOT_PRESENT(BIC_LLC_MRPS); + BIC_NOT_PRESENT(BIC_LLC_HIT); +} + +void free_fd_l2_percpu(void) +{ + if (!fd_l2_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_l2_percpu[i] != 0) + close(fd_l2_percpu[i]); + } + + free(fd_l2_percpu); + fd_l2_percpu = NULL; + + BIC_NOT_PRESENT(BIC_L2_MRPS); + BIC_NOT_PRESENT(BIC_L2_HIT); } void free_fd_cstate(void) @@ -5780,21 +5939,36 @@ void free_all_buffers(void) cpu_affinity_set = NULL; cpu_affinity_setsize = 0; - free(thread_even); - free(core_even); - free(package_even); + if (perf_pcore_set) { + CPU_FREE(perf_pcore_set); + perf_pcore_set = NULL; + } + + if (perf_ecore_set) { + CPU_FREE(perf_ecore_set); + perf_ecore_set = NULL; + } + + if (perf_lcore_set) { + CPU_FREE(perf_lcore_set); + perf_lcore_set = NULL; + } + + free(even.threads); + free(even.cores); + free(even.packages); - thread_even = NULL; - core_even = NULL; - package_even = NULL; + even.threads = NULL; + even.cores = NULL; + even.packages = NULL; - free(thread_odd); - free(core_odd); - free(package_odd); + free(odd.threads); + free(odd.cores); + free(odd.packages); - thread_odd = NULL; - core_odd = NULL; - package_odd = NULL; + odd.threads = NULL; + odd.cores = NULL; + odd.packages = NULL; free(output_buffer); output_buffer = NULL; @@ -5803,6 +5977,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); free_fd_llc_percpu(); + free_fd_l2_percpu(); free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); @@ -5852,7 +6027,7 @@ int cpu_is_first_core_in_package(int cpu) return cpu == parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu); } -int get_physical_package_id(int cpu) +int get_package_id(int cpu) { return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu); } @@ -5885,7 +6060,7 @@ void set_node_data(void) for (pkg = 0; pkg < topo.num_packages; pkg++) { lnode = 0; for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { - if (cpus[cpu].physical_package_id != pkg) + if (cpus[cpu].package_id != pkg) continue; /* find a cpu with an unset logical_node_id */ if (cpus[cpu].logical_node_id != -1) @@ -5898,7 +6073,7 @@ void set_node_data(void) * the logical_node_id */ 
for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) { - if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) { + if ((cpus[cpux].package_id == pkg) && (cpus[cpux].physical_node_id == node)) { cpus[cpux].logical_node_id = lnode; cpu_count++; } @@ -5917,7 +6092,7 @@ int get_physical_node_id(struct cpu_topology *thiscpu) char path[80]; FILE *filep; int i; - int cpu = thiscpu->logical_cpu_id; + int cpu = thiscpu->cpu_id; for (i = 0; i <= topo.max_cpu_num; i++) { sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i); @@ -5986,20 +6161,20 @@ static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size) return 0; } -int get_thread_siblings(struct cpu_topology *thiscpu) +int set_thread_siblings(struct cpu_topology *thiscpu) { char path[80], character; FILE *filep; unsigned long map; int so, shift, sib_core; - int cpu = thiscpu->logical_cpu_id; + int cpu = thiscpu->cpu_id; int offset = topo.max_cpu_num + 1; size_t size; int thread_id = 0; thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1)); - if (thiscpu->thread_id < 0) - thiscpu->thread_id = thread_id++; + if (thiscpu->ht_id < 0) + thiscpu->ht_id = thread_id++; if (!thiscpu->put_ids) return -1; @@ -6021,10 +6196,15 @@ int get_thread_siblings(struct cpu_topology *thiscpu) if ((map >> shift) & 0x1) { so = shift + offset; sib_core = get_core_id(so); - if (sib_core == thiscpu->physical_core_id) { + if (sib_core == thiscpu->core_id) { CPU_SET_S(so, size, thiscpu->put_ids); - if ((so != cpu) && (cpus[so].thread_id < 0)) - cpus[so].thread_id = thread_id++; + if ((so != cpu) && (cpus[so].ht_id < 0)) { + cpus[so].ht_id = thread_id; + cpus[cpu].ht_sibling_cpu_id[thread_id] = so; + if (debug) + fprintf(stderr, "%s: cpu%d.ht_sibling_cpu_id[%d] = %d\n", __func__, cpu, thread_id, so); + thread_id += 1; + } } } } @@ -6045,30 +6225,40 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, struct core_data *core_base, struct pkg_data *pkg_base, struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2) { - int retval, pkg_no, node_no, core_no, thread_no; + int cpu, retval; retval = 0; - for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { - for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) { - for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { - for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { - struct thread_data *t, *t2; - struct core_data *c, *c2; + for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { + struct thread_data *t, *t2; + struct core_data *c, *c2; + struct pkg_data *p, *p2; + + if (cpu_is_not_allowed(cpu)) + continue; + + if (cpus[cpu].ht_id > 0) /* skip HT sibling */ + continue; - t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); + t = &thread_base[cpu]; + t2 = &thread_base2[cpu]; + c = &core_base[GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id)]; + c2 = &core_base2[GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id)]; + p = &pkg_base[cpus[cpu].package_id]; + p2 = &pkg_base2[cpus[cpu].package_id]; - if (cpu_is_not_allowed(t->cpu_id)) - continue; + retval |= func(t, c, p, t2, c2, p2); - t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); + /* Handle HT sibling now */ + int i; - c = GET_CORE(core_base, core_no, node_no, pkg_no); - c2 = GET_CORE(core_base2, core_no, node_no, pkg_no); + for (i = MAX_HT_ID; i > 0; --i) { /* ht_id 0 is self */ + if (cpus[cpu].ht_sibling_cpu_id[i] <= 0) + continue; + t = &thread_base[cpus[cpu].ht_sibling_cpu_id[i]]; + t2 = 
&thread_base2[cpus[cpu].ht_sibling_cpu_id[i]]; - retval |= func(t, c, &pkg_base[pkg_no], t2, c2, &pkg_base2[pkg_no]); - } - } + retval |= func(t, c, p, t2, c2, p2); } } return retval; @@ -6125,7 +6315,7 @@ static int update_effective_str(bool startup) pos = fgets(buf, 1024, fp); if (!pos) - err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS); + err(1, "%s: file read failed", PATH_EFFECTIVE_CPUS); fclose(fp); @@ -6142,7 +6332,7 @@ static void update_effective_set(bool startup) update_effective_str(startup); if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize)) - err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); + err(1, "%s: cpu str malformat %s", PATH_EFFECTIVE_CPUS, cpu_effective_str); } void linux_perf_init(void); @@ -6150,6 +6340,7 @@ void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); void perf_llc_init(void); +void perf_l2_init(void); void added_perf_counters_init(void); void pmt_init(void); @@ -6162,6 +6353,7 @@ void re_initialize(void) rapl_perf_init(); cstate_perf_init(); perf_llc_init(); + perf_l2_init(); added_perf_counters_init(); pmt_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); @@ -6170,14 +6362,14 @@ void re_initialize(void) void set_max_cpu_num(void) { FILE *filep; - int base_cpu; + int current_cpu; unsigned long dummy; char pathname[64]; - base_cpu = sched_getcpu(); - if (base_cpu < 0) + current_cpu = sched_getcpu(); + if (current_cpu < 0) err(1, "cannot find calling cpu ID"); - sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu); + sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", current_cpu); filep = fopen_or_die(pathname, "r"); topo.max_cpu_num = 0; @@ -6205,9 +6397,13 @@ int mark_cpu_present(int cpu) return 0; } -int init_thread_id(int cpu) +int clear_ht_id(int cpu) { - cpus[cpu].thread_id = -1; + int i; + + cpus[cpu].ht_id = -1; + for (i = 0; i <= MAX_HT_ID; ++i) + cpus[cpu].ht_sibling_cpu_id[i] = -1; return 0; } @@ -6740,7 +6936,7 @@ int probe_dev_msr(void) struct stat sb; char pathname[32]; - sprintf(pathname, "/dev/msr%d", base_cpu); + sprintf(pathname, "/dev/msr%d", master_cpu); return !stat(pathname, &sb); } @@ -6749,7 +6945,7 @@ int probe_dev_cpu_msr(void) struct stat sb; char pathname[32]; - sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); + sprintf(pathname, "/dev/cpu/%d/msr", master_cpu); return !stat(pathname, &sb); } @@ -6809,7 +7005,7 @@ int check_for_cap_sys_rawio(void) free_and_exit: if (cap_free(caps) == -1) - err(-6, "cap_free\n"); + err(-6, "cap_free"); return ret; } @@ -6826,7 +7022,7 @@ void check_msr_permission(void) failed += check_for_cap_sys_rawio(); /* test file permissions */ - sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", base_cpu); + sprintf(pathname, use_android_msr_path ? 
"/dev/msr%d" : "/dev/cpu/%d/msr", master_cpu); if (euidaccess(pathname, R_OK)) { failed++; } @@ -6855,7 +7051,7 @@ void probe_bclk(void) else return; - get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); + get_msr(master_cpu, MSR_PLATFORM_INFO, &msr); base_ratio = (msr >> 8) & 0xFF; base_hz = base_ratio * bclk * 1000000; @@ -7006,16 +7202,16 @@ static void probe_intel_uncore_frequency_cluster(void) } for (i = uncore_max_id; i >= 0; --i) { int k, l; - int package_id, domain_id, cluster_id; + int unc_pkg_id, domain_id, cluster_id; char name_buf[16]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); if (access(path_base, R_OK)) - err(1, "%s: %s\n", __func__, path_base); + err(1, "%s: %s", __func__, path_base); sprintf(path, "%s/package_id", path_base); - package_id = read_sysfs_int(path); + unc_pkg_id = read_sysfs_int(path); sprintf(path, "%s/domain_id", path_base); domain_id = read_sysfs_int(path); @@ -7038,7 +7234,7 @@ static void probe_intel_uncore_frequency_cluster(void) */ if BIC_IS_ENABLED (BIC_UNCORE_MHZ) - add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, unc_pkg_id); if (quiet) continue; @@ -7047,7 +7243,7 @@ static void probe_intel_uncore_frequency_cluster(void) k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, cluster_id, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", unc_pkg_id, domain_id, cluster_id, k / 1000, l / 1000); sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); @@ -7202,7 +7398,7 @@ static void dump_sysfs_cstate_config(void) for (state = 0; state < 10; ++state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; @@ -7218,14 +7414,14 @@ static void dump_sysfs_cstate_config(void) remove_underbar(name_buf); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", master_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(desc, sizeof(desc), input)) err(1, "%s: failed to read file", path); - fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc); + fprintf(outf, "cpu%d: %s: %s", master_cpu, name_buf, desc); fclose(input); } } @@ -7238,7 +7434,7 @@ static void dump_sysfs_pstate_config(void) FILE *input; int turbo; - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", master_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -7248,7 +7444,7 @@ static void dump_sysfs_pstate_config(void) err(1, "%s: failed to read file", path); fclose(input); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", master_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -7258,8 +7454,8 @@ static void dump_sysfs_pstate_config(void) err(1, "%s: failed to read file", path); fclose(input); - fprintf(outf, "cpu%d: cpufreq driver: %s", 
base_cpu, driver_buf); - fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf); + fprintf(outf, "cpu%d: cpufreq driver: %s", master_cpu, driver_buf); + fprintf(outf, "cpu%d: cpufreq governor: %s", master_cpu, governor_buf); sprintf(path, "/sys/devices/system/cpu/cpufreq/boost"); input = fopen(path, "r"); @@ -7521,7 +7717,7 @@ double get_tdp_intel(void) unsigned long long msr; if (valid_rapl_msrs & RAPL_PKG_POWER_INFO) - if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) + if (!get_msr(master_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; return get_quirk_tdp(); } @@ -7560,7 +7756,7 @@ void rapl_probe_intel(void) CLR_BIC(BIC_RAM__, &bic_enabled); /* units on package 0, verify later other packages match */ - if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) + if (get_msr(master_cpu, MSR_RAPL_POWER_UNIT, &msr)) return; rapl_power_units = 1.0 / (1 << (msr & 0xF)); @@ -7608,7 +7804,7 @@ void rapl_probe_amd(void) if (!valid_rapl_msrs || no_msr) return; - if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) + if (get_msr(master_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf)); @@ -7817,8 +8013,7 @@ int print_rapl(PER_THREAD_PARAMS) return -1; } - fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, - rapl_power_units, rapl_energy_units, rapl_time_units); + fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units); if (valid_rapl_msrs & RAPL_PKG_POWER_INFO) { @@ -7850,8 +8045,7 @@ int print_rapl(PER_THREAD_PARAMS) return -9; fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr); - fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n", - cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); + fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n", cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } if (valid_rapl_msrs & RAPL_DRAM_POWER_INFO) { @@ -7919,7 +8113,7 @@ void probe_rapl_msrs(void) if (offset < 0) return; - ret = get_msr(base_cpu, offset, &msr_value); + ret = get_msr(master_cpu, offset, &msr_value); if (ret) { if (debug) fprintf(outf, "Can not read RAPL_PKG_ENERGY MSR(0x%llx)\n", (unsigned long long)offset); @@ -8004,7 +8198,7 @@ int set_temperature_target(PER_THREAD_PARAMS) if (!platform->has_nhm_msrs || no_msr) goto guess; - if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) + if (get_msr(master_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) goto guess; tcc_default = (msr >> 16) & 0xFF; @@ -8013,7 +8207,7 @@ int set_temperature_target(PER_THREAD_PARAMS) int bits = platform->tcc_offset_bits; unsigned long long enabled = 0; - if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled)) + if (bits && !get_msr(master_cpu, MSR_PLATFORM_INFO, &enabled)) enabled = (enabled >> 30) & 1; if (bits && enabled) { @@ -8148,9 +8342,12 @@ void decode_feature_control_msr(void) if (no_msr) return; - if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) + if (quiet) + return; + + if (!get_msr(master_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", - base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); + master_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? 
"SGX" : ""); } void decode_misc_enable_msr(void) @@ -8163,9 +8360,9 @@ void decode_misc_enable_msr(void) if (!genuine_intel) return; - if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr)) + if (!get_msr(master_cpu, MSR_IA32_MISC_ENABLE, &msr)) fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n", - base_cpu, msr, + master_cpu, msr, msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-", msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-", msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-", @@ -8182,11 +8379,10 @@ void decode_misc_feature_control(void) if (!platform->has_msr_misc_feature_control) return; - if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) + if (!get_msr(master_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) fprintf(outf, "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n", - base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "", - msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : ""); + master_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "", msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : ""); } /* @@ -8206,9 +8402,9 @@ void decode_misc_pwr_mgmt_msr(void) if (!platform->has_msr_misc_pwr_mgmt) return; - if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) + if (!get_msr(master_cpu, MSR_MISC_PWR_MGMT, &msr)) fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n", - base_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS"); + master_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS"); } /* @@ -8227,13 +8423,11 @@ void decode_c6_demotion_policy_msr(void) if (!platform->has_msr_c6_demotion_policy_config) return; - if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr)) - fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", - base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); + if (!get_msr(master_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr)) + fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); - if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr)) - fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n", - base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); + if (!get_msr(master_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr)) + fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n", master_cpu, msr, msr & (1 << 0) ? 
"EN" : "DIS"); } void print_dev_latency(void) @@ -8268,7 +8462,7 @@ static int has_perf_instr_count_access(void) if (no_perf) return 0; - fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + fd = open_perf_counter(master_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); if (fd != -1) close(fd); @@ -8321,25 +8515,126 @@ end: return ret; } +char cpuset_buf[1024]; +int initialize_cpu_set_from_sysfs(cpu_set_t *cpu_set, char *sysfs_path, char *sysfs_file) +{ + FILE *fp; + char path[128]; + + if (snprintf(path, 128, "%s/%s", sysfs_path, sysfs_file) > 128) + err(-1, "%s %s", sysfs_path, sysfs_file); + + fp = fopen(path, "r"); + if (!fp) { + warn("open %s", path); + return -1; + } + if (fread(cpuset_buf, sizeof(char), 1024, fp) == 0) { + warn("read %s", sysfs_path); + goto err; + } + if (parse_cpu_str(cpuset_buf, cpu_set, cpu_possible_setsize)) { + warnx("%s: cpu str malformat %s\n", sysfs_path, cpu_effective_str); + goto err; + } + return 0; + +err: + fclose(fp); + return -1; +} + +void print_cpu_set(char *s, cpu_set_t *set) +{ + int i; + + assert(MAX_BIC < CPU_SETSIZE); + + printf("%s:", s); + + for (i = 0; i <= topo.max_cpu_num; ++i) + if (CPU_ISSET(i, set)) + printf(" %d", i); + putchar('\n'); +} + +void linux_perf_init_hybrid_cpus(void) +{ + char *perf_cpu_pcore_path = "/sys/devices/cpu_core"; + char *perf_cpu_ecore_path = "/sys/devices/cpu_atom"; + char *perf_cpu_lcore_path = "/sys/devices/cpu_lowpower"; + char path[128]; + + if (!access(perf_cpu_pcore_path, F_OK)) { + perf_pcore_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (perf_pcore_set == NULL) + err(3, "CPU_ALLOC"); + CPU_ZERO_S(cpu_possible_setsize, perf_pcore_set); + initialize_cpu_set_from_sysfs(perf_pcore_set, perf_cpu_pcore_path, "cpus"); + if (debug) + print_cpu_set("perf pcores", perf_pcore_set); + sprintf(path, "%s/%s", perf_cpu_pcore_path, "type"); + perf_pmu_types.pcore = snapshot_sysfs_counter(path); + } + + if (!access(perf_cpu_ecore_path, F_OK)) { + perf_ecore_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (perf_ecore_set == NULL) + err(3, "CPU_ALLOC"); + CPU_ZERO_S(cpu_possible_setsize, perf_ecore_set); + initialize_cpu_set_from_sysfs(perf_ecore_set, perf_cpu_ecore_path, "cpus"); + if (debug) + print_cpu_set("perf ecores", perf_ecore_set); + sprintf(path, "%s/%s", perf_cpu_ecore_path, "type"); + perf_pmu_types.ecore = snapshot_sysfs_counter(path); + } + + if (!access(perf_cpu_lcore_path, F_OK)) { + perf_lcore_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (perf_lcore_set == NULL) + err(3, "CPU_ALLOC"); + CPU_ZERO_S(cpu_possible_setsize, perf_lcore_set); + initialize_cpu_set_from_sysfs(perf_lcore_set, perf_cpu_lcore_path, "cpus"); + if (debug) + print_cpu_set("perf lcores", perf_lcore_set); + sprintf(path, "%s/%s", perf_cpu_lcore_path, "type"); + perf_pmu_types.lcore = snapshot_sysfs_counter(path); + } +} + /* - * Linux-perf manages the HW instructions-retired counter - * by enabling when requested, and hiding rollover + * Linux-perf related initialization */ void linux_perf_init(void) { + char path[128]; + char *perf_cpu_path = "/sys/devices/cpu"; + if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; + if (!access(perf_cpu_path, F_OK)) { + sprintf(path, "%s/%s", perf_cpu_path, "type"); + perf_pmu_types.uniform = snapshot_sysfs_counter(path); + } else { + linux_perf_init_hybrid_cpus(); + } + if (BIC_IS_ENABLED(BIC_IPC) && cpuid_has_aperf_mperf) { fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (fd_instr_count_percpu == NULL) err(-1, "calloc 
fd_instr_count_percpu"); } - if (BIC_IS_ENABLED(BIC_LLC_RPS)) { + if (BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT)) { fd_llc_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (fd_llc_percpu == NULL) err(-1, "calloc fd_llc_percpu"); } + if (BIC_IS_ENABLED(BIC_L2_MRPS) || BIC_IS_ENABLED(BIC_L2_HIT)) { + fd_l2_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_l2_percpu == NULL) + err(-1, "calloc fd_l2_percpu"); + } } void rapl_perf_init(void) @@ -8397,7 +8692,7 @@ void rapl_perf_init(void) domain_visited[next_domain] = 1; - if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != base_cpu)) + if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != master_cpu)) continue; struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; @@ -8450,8 +8745,7 @@ void rapl_perf_init(void) /* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - return cpuid_has_aperf_mperf && msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && - msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; + return cpuid_has_aperf_mperf && msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) @@ -8647,8 +8941,8 @@ void cstate_perf_init_(bool soft_c1) if (cpu_is_not_allowed(cpu)) continue; - const int core_id = cpus[cpu].physical_core_id; - const int pkg_id = cpus[cpu].physical_package_id; + const int core_id = cpus[cpu].core_id; + const int pkg_id = cpus[cpu].package_id; assert(core_id < cores_visited_elems); assert(pkg_id < pkg_visited_elems); @@ -8662,8 +8956,7 @@ void cstate_perf_init_(bool soft_c1) if (!per_core && pkg_visited[pkg_id]) continue; - const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) || - (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); const bool counter_supported = (platform->supported_cstates & cai->feature_mask); if (counter_needed && counter_supported) { @@ -8772,6 +9065,29 @@ void probe_pstates(void) for_all_cpus(print_perf_limit, ODD_COUNTERS); } +void dump_word_chars(unsigned int word) +{ + int i; + + for (i = 0; i < 4; ++i) + fprintf(outf, "%c", (word >> (i * 8)) & 0xFF); +} + +void dump_cpuid_hypervisor(void) +{ + unsigned int ebx = 0; + unsigned int ecx = 0; + unsigned int edx = 0; + + __cpuid(0x40000000, max_extended_level, ebx, ecx, edx); + + fprintf(outf, "Hypervisor: "); + dump_word_chars(ebx); + dump_word_chars(ecx); + dump_word_chars(edx); + fprintf(outf, "\n"); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -8803,6 +9119,7 @@ void process_cpuid() model += ((fms >> 16) & 0xf) << 4; ecx_flags = ecx; edx_flags = edx; + cpuid_has_hv = ecx_flags & (1 << 31); if (!no_msr) { if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) @@ -8826,18 +9143,22 @@ void process_cpuid() fputc('\n', outf); fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); - fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", - ecx_flags & (1 << 0) ? "SSE3" : "-", - ecx_flags & (1 << 3) ? "MONITOR" : "-", - ecx_flags & (1 << 6) ? "SMX" : "-", - ecx_flags & (1 << 7) ? "EIST" : "-", - ecx_flags & (1 << 8) ? "TM2" : "-", - edx_flags & (1 << 4) ? "TSC" : "-", - edx_flags & (1 << 5) ? "MSR" : "-", - edx_flags & (1 << 22) ? 
"ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); - } + fprintf(outf, "CPUID(1): %sSSE3 %sMONITOR %sSMX %sEIST %sTM2 %sHV %sTSC %sMSR %sACPI-TM %sHT %sTM\n", + ecx_flags & (1 << 0) ? "" : "No-", + ecx_flags & (1 << 3) ? "" : "No-", + ecx_flags & (1 << 6) ? "" : "No-", + ecx_flags & (1 << 7) ? "" : "No-", + ecx_flags & (1 << 8) ? "" : "No-", + cpuid_has_hv ? "" : "No-", + edx_flags & (1 << 4) ? "" : "No-", + edx_flags & (1 << 5) ? "" : "No-", + edx_flags & (1 << 22) ? "" : "No-", edx_flags & (1 << 28) ? "" : "No-", edx_flags & (1 << 29) ? "" : "No-"); + } + if (!quiet && cpuid_has_hv) + dump_cpuid_hypervisor(); probe_platform_features(family, model); + init_perf_model_support(family, model); if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -8887,7 +9208,7 @@ void process_cpuid() if (!quiet) decode_misc_enable_msr(); - if (max_level >= 0x7 && !quiet) { + if (max_level >= 0x7) { int has_sgx; ecx = 0; @@ -8896,9 +9217,10 @@ void process_cpuid() has_sgx = ebx & (1 << 2); - is_hybrid = edx & (1 << 15); + is_hybrid = !!(edx & (1 << 15)); - fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-"); + if (!quiet) + fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-"); if (has_sgx) decode_feature_control_msr(); @@ -8924,8 +9246,7 @@ void process_cpuid() if (crystal_hz) { tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; if (!quiet) - fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", - tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); + fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); } } } @@ -9003,7 +9324,8 @@ void probe_pm_features(void) decode_misc_feature_control(); } -/* perf_llc_probe +/* + * has_perf_llc_access() * * return 1 on success, else 0 */ @@ -9014,7 +9336,7 @@ int has_perf_llc_access(void) if (no_perf) return 0; - fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP); + fd = open_perf_counter(master_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP); if (fd != -1) close(fd); @@ -9032,22 +9354,22 @@ void perf_llc_init(void) if (no_perf) return; - if (!(BIC_IS_ENABLED(BIC_LLC_RPS) && BIC_IS_ENABLED(BIC_LLC_HIT))) + if (!(BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT))) return; + assert(fd_llc_percpu != 0); + for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { if (cpu_is_not_allowed(cpu)) continue; - assert(fd_llc_percpu != 0); fd_llc_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP); if (fd_llc_percpu[cpu] == -1) { warnx("%s: perf REFS: failed to open counter on cpu%d", __func__, cpu); free_fd_llc_percpu(); return; } - assert(fd_llc_percpu != 0); retval = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, fd_llc_percpu[cpu], PERF_FORMAT_GROUP); if (retval == -1) { warnx("%s: perf MISS: failed to open counter on cpu%d", __func__, cpu); @@ -9055,10 +9377,90 @@ void perf_llc_init(void) return; } } - BIC_PRESENT(BIC_LLC_RPS); + BIC_PRESENT(BIC_LLC_MRPS); BIC_PRESENT(BIC_LLC_HIT); } +void perf_l2_init(void) +{ + int cpu; + int retval; + + if (no_perf) + return; + if (!(BIC_IS_ENABLED(BIC_L2_MRPS) || BIC_IS_ENABLED(BIC_L2_HIT))) + return; + if (perf_model_support == NULL) + return; + + assert(fd_l2_percpu != 0); + + for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + if 
(!is_hybrid) { + fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.uniform, perf_model_support->first.refs, -1, PERF_FORMAT_GROUP); + if (fd_l2_percpu[cpu] == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.uniform, perf_model_support->first.refs); + free_fd_l2_percpu(); + return; + } + retval = open_perf_counter(cpu, perf_pmu_types.uniform, perf_model_support->first.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP); + if (retval == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.uniform, perf_model_support->first.hits); + free_fd_l2_percpu(); + return; + } + continue; + } + if (perf_pcore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_pcore_set)) { + fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.pcore, perf_model_support->first.refs, -1, PERF_FORMAT_GROUP); + if (fd_l2_percpu[cpu] == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->first.refs); + free_fd_l2_percpu(); + return; + } + retval = open_perf_counter(cpu, perf_pmu_types.pcore, perf_model_support->first.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP); + if (retval == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->first.hits); + free_fd_l2_percpu(); + return; + } + } else if (perf_ecore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_ecore_set)) { + fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.ecore, perf_model_support->second.refs, -1, PERF_FORMAT_GROUP); + if (fd_l2_percpu[cpu] == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.ecore, perf_model_support->second.refs); + free_fd_l2_percpu(); + return; + } + retval = open_perf_counter(cpu, perf_pmu_types.ecore, perf_model_support->second.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP); + if (retval == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.ecore, perf_model_support->second.hits); + free_fd_l2_percpu(); + return; + } + } else if (perf_lcore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_lcore_set)) { + fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.lcore, perf_model_support->third.refs, -1, PERF_FORMAT_GROUP); + if (fd_l2_percpu[cpu] == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.lcore, perf_model_support->third.refs); + free_fd_l2_percpu(); + return; + } + retval = open_perf_counter(cpu, perf_pmu_types.lcore, perf_model_support->third.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP); + if (retval == -1) { + err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.lcore, perf_model_support->third.hits); + free_fd_l2_percpu(); + return; + } + } else + err(-1, "%s: cpu%d: type %d", __func__, cpu, cpus[cpu].type); + } + BIC_PRESENT(BIC_L2_MRPS); + BIC_PRESENT(BIC_L2_HIT); +} + /* * in /dev/cpu/ return success for names that are numbers * ie. filter out ".", "..", "microcode".
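A note on the grouping pattern above: perf_l2_init() opens the REFS event first as a group leader (group_fd of -1) and then opens the HITS event into that group by passing the leader's fd, so the PMU schedules both counters together and one read() on the leader returns both values (PERF_FORMAT_GROUP). The sketch below is a minimal standalone illustration of that pattern, not turbostat code: it uses the generic PERF_COUNT_HW_CACHE_* events instead of the model-specific configs turbostat resolves via init_perf_model_support(), and it assumes a hand-rolled perf_event_open() wrapper since glibc provides none.

/*
 * Minimal sketch: open a two-event perf group (leader + sibling) and read
 * both counts with a single read() via PERF_FORMAT_GROUP.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
			   int group_fd, unsigned long flags)
{
	/* glibc has no wrapper; invoke the raw syscall. */
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	struct {
		uint64_t nr;		/* number of events in the group */
		uint64_t val[2];	/* one value per event, leader first */
	} group;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CACHE_REFERENCES;
	attr.read_format = PERF_FORMAT_GROUP;

	/* Leader: all tasks (pid == -1) on CPU 0, no group yet (-1). */
	leader = perf_event_open(&attr, -1, 0, -1, 0);
	if (leader < 0)
		return 1;	/* may need root or perf_event_paranoid <= 0 */

	/* Sibling joins the leader, so both counters start/stop together. */
	attr.config = PERF_COUNT_HW_CACHE_MISSES;
	sibling = perf_event_open(&attr, -1, 0, leader, 0);
	if (sibling < 0)
		return 1;

	sleep(1);	/* let the counters accumulate */
	if (read(leader, &group, sizeof(group)) < 0)
		return 1;
	printf("references %llu misses %llu\n",
	       (unsigned long long)group.val[0],
	       (unsigned long long)group.val[1]);
	return 0;
}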
@@ -9071,33 +9473,6 @@ int dir_filter(const struct dirent *dirp) return 0; } -char *possible_file = "/sys/devices/system/cpu/possible"; -char possible_buf[1024]; - -int initialize_cpu_possible_set(void) -{ - FILE *fp; - - fp = fopen(possible_file, "r"); - if (!fp) { - warn("open %s", possible_file); - return -1; - } - if (fread(possible_buf, sizeof(char), 1024, fp) == 0) { - warn("read %s", possible_file); - goto err; - } - if (parse_cpu_str(possible_buf, cpu_possible_set, cpu_possible_setsize)) { - warnx("%s: cpu str malformat %s\n", possible_file, cpu_effective_str); - goto err; - } - return 0; - -err: - fclose(fp); - return -1; -} - void topology_probe(bool startup) { int i; @@ -9137,7 +9512,7 @@ void topology_probe(bool startup) err(3, "CPU_ALLOC"); cpu_possible_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); CPU_ZERO_S(cpu_possible_setsize, cpu_possible_set); - initialize_cpu_possible_set(); + initialize_cpu_set_from_sysfs(cpu_possible_set, "/sys/devices/system/cpu", "possible"); /* * Allocate and initialize cpu_effective_set @@ -9205,13 +9580,13 @@ void topology_probe(bool startup) cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set); - for_all_proc_cpus(init_thread_id); + for_all_proc_cpus(clear_ht_id); for_all_proc_cpus(set_cpu_hybrid_type); /* * For online cpus - * find max_core_id, max_package_id + * find max_core_id, max_package_id, num_cores (per system) */ for (i = 0; i <= topo.max_cpu_num; ++i) { int siblings; @@ -9222,12 +9597,12 @@ void topology_probe(bool startup) continue; } - cpus[i].logical_cpu_id = i; + cpus[i].cpu_id = i; /* get package information */ - cpus[i].physical_package_id = get_physical_package_id(i); - if (cpus[i].physical_package_id > max_package_id) - max_package_id = cpus[i].physical_package_id; + cpus[i].package_id = get_package_id(i); + if (cpus[i].package_id > max_package_id) + max_package_id = cpus[i].package_id; /* get die information */ cpus[i].die_id = get_die_id(i); @@ -9245,18 +9620,18 @@ void topology_probe(bool startup) topo.max_node_num = cpus[i].physical_node_id; /* get core information */ - cpus[i].physical_core_id = get_core_id(i); - if (cpus[i].physical_core_id > max_core_id) - max_core_id = cpus[i].physical_core_id; + cpus[i].core_id = get_core_id(i); + if (cpus[i].core_id > max_core_id) + max_core_id = cpus[i].core_id; /* get thread information */ - siblings = get_thread_siblings(&cpus[i]); + siblings = set_thread_siblings(&cpus[i]); if (siblings > max_siblings) max_siblings = siblings; - if (cpus[i].thread_id == 0) + if (cpus[i].ht_id == 0) topo.num_cores++; } - topo.max_core_id = max_core_id; + topo.max_core_id = max_core_id; /* within a package */ topo.max_package_id = max_package_id; topo.cores_per_node = max_core_id + 1; @@ -9298,42 +9673,57 @@ void topology_probe(bool startup) continue; fprintf(outf, "cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n", - i, cpus[i].physical_package_id, cpus[i].die_id, cpus[i].l3_id, - cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id); + i, cpus[i].package_id, cpus[i].die_id, cpus[i].l3_id, + cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].core_id, cpus[i].ht_id); } } -void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p) +void allocate_counters_1(struct counters *counters) +{ + counters->threads = calloc(1, sizeof(struct thread_data)); + if (counters->threads == NULL) + goto error; + + counters->cores = calloc(1, sizeof(struct 
core_data)); + if (counters->cores == NULL) + goto error; + + counters->packages = calloc(1, sizeof(struct pkg_data)); + if (counters->packages == NULL) + goto error; + + return; +error: + err(1, "calloc counters_1"); +} + +void allocate_counters(struct counters *counters) { int i; int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages; int num_threads = topo.threads_per_core * num_cores; - *t = calloc(num_threads, sizeof(struct thread_data)); - if (*t == NULL) + counters->threads = calloc(num_threads, sizeof(struct thread_data)); + if (counters->threads == NULL) goto error; for (i = 0; i < num_threads; i++) - (*t)[i].cpu_id = -1; + (counters->threads)[i].cpu_id = -1; - *c = calloc(num_cores, sizeof(struct core_data)); - if (*c == NULL) + counters->cores = calloc(num_cores, sizeof(struct core_data)); + if (counters->cores == NULL) goto error; - for (i = 0; i < num_cores; i++) { - (*c)[i].core_id = -1; - (*c)[i].base_cpu = -1; - } + for (i = 0; i < num_cores; i++) + (counters->cores)[i].first_cpu = -1; - *p = calloc(topo.num_packages, sizeof(struct pkg_data)); - if (*p == NULL) + counters->packages = calloc(topo.num_packages, sizeof(struct pkg_data)); + if (counters->packages == NULL) goto error; - for (i = 0; i < topo.num_packages; i++) { - (*p)[i].package_id = i; - (*p)[i].base_cpu = -1; - } + for (i = 0; i < topo.num_packages; i++) + (counters->packages)[i].first_cpu = -1; return; error: @@ -9343,14 +9733,13 @@ error: /* * init_counter() * - * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE + * set t->cpu_id, FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE */ void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id) { - int pkg_id = cpus[cpu_id].physical_package_id; + int pkg_id = cpus[cpu_id].package_id; int node_id = cpus[cpu_id].logical_node_id; - int core_id = cpus[cpu_id].physical_core_id; - int thread_id = cpus[cpu_id].thread_id; + int core_id = cpus[cpu_id].core_id; struct thread_data *t; struct core_data *c; @@ -9360,20 +9749,17 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, if (node_id < 0) node_id = 0; - t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id); - c = GET_CORE(core_base, core_id, node_id, pkg_id); + t = &thread_base[cpu_id]; + c = &core_base[GLOBAL_CORE_ID(core_id, pkg_id)]; t->cpu_id = cpu_id; if (!cpu_is_not_allowed(cpu_id)) { - if (c->base_cpu < 0) - c->base_cpu = t->cpu_id; - if (pkg_base[pkg_id].base_cpu < 0) - pkg_base[pkg_id].base_cpu = t->cpu_id; + if (c->first_cpu < 0) + c->first_cpu = t->cpu_id; + if (pkg_base[pkg_id].first_cpu < 0) + pkg_base[pkg_id].first_cpu = t->cpu_id; } - - c->core_id = core_id; - pkg_base[pkg_id].package_id = pkg_id; } int initialize_counters(int cpu_id) @@ -9416,9 +9802,9 @@ void allocate_irq_buffers(void) int update_topo(PER_THREAD_PARAMS) { topo.allowed_cpus++; - if ((int)t->cpu_id == c->base_cpu) + if ((int)t->cpu_id == c->first_cpu) topo.allowed_cores++; - if ((int)t->cpu_id == p->base_cpu) + if ((int)t->cpu_id == p->first_cpu) topo.allowed_packages++; return 0; @@ -9437,23 +9823,24 @@ void setup_all_buffers(bool startup) topology_probe(startup); allocate_irq_buffers(); allocate_fd_percpu(); - allocate_counters(&thread_even, &core_even, &package_even); - allocate_counters(&thread_odd, &core_odd, &package_odd); + allocate_counters_1(&average); + allocate_counters(&even); + allocate_counters(&odd); allocate_output_buffer(); for_all_proc_cpus(initialize_counters); topology_update(); } -void 
set_base_cpu(void) +void set_master_cpu(void) { int i; for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (cpu_is_not_allowed(i)) continue; - base_cpu = i; + master_cpu = i; if (debug > 1) - fprintf(outf, "base_cpu = %d\n", base_cpu); + fprintf(outf, "master_cpu = %d\n", master_cpu); return; } err(-ENODEV, "No valid cpus found"); @@ -9484,7 +9871,7 @@ void check_perf_access(void) if (!has_perf_instr_count_access()) no_perf = 1; - if (BIC_IS_ENABLED(BIC_LLC_RPS) || BIC_IS_ENABLED(BIC_LLC_HIT)) + if (BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT)) if (!has_perf_llc_access()) no_perf = 1; @@ -9967,8 +10354,7 @@ void pmt_init(void) if (BIC_IS_ENABLED(BIC_Diec6)) { pmt_add_counter(PMT_MTL_DC6_GUID, PMT_MTL_DC6_SEQ, "Die%c6", PMT_TYPE_XTAL_TIME, - PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, - SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY); + PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY); } if (BIC_IS_ENABLED(BIC_CPU_c1e)) { @@ -10029,7 +10415,7 @@ next_loop_iter: void turbostat_init() { setup_all_buffers(true); - set_base_cpu(); + set_master_cpu(); check_msr_access(); check_perf_access(); process_cpuid(); @@ -10040,13 +10426,14 @@ void turbostat_init() rapl_perf_init(); cstate_perf_init(); perf_llc_init(); + perf_l2_init(); added_perf_counters_init(); pmt_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1) + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(master_cpu) != -1) BIC_PRESENT(BIC_IPC); /* @@ -10145,7 +10532,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.12.02 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2026.02.14 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -10767,8 +11154,7 @@ next: } if (direct_path && has_guid) { - printf("%s: path and guid+seq parameters are mutually exclusive\n" - "notice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path); + printf("%s: path and guid+seq parameters are mutually exclusive\nnotice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path); exit(1); } @@ -10863,7 +11249,7 @@ void probe_cpuidle_residency(void) for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; @@ -10912,7 +11298,7 @@ void probe_cpuidle_counts(void) for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; @@ -11041,7 +11427,7 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. 
*/ - while ((opt = getopt_long_only(argc, argv, "+MPn:", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+:MP", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; @@ -11055,7 +11441,7 @@ void cmdline(int argc, char **argv) } optind = 0; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:N:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -11098,7 +11484,6 @@ void cmdline(int argc, char **argv) } break; case 'h': - default: help(); exit(1); case 'i': @@ -11134,20 +11519,18 @@ void cmdline(int argc, char **argv) /* Parsed earlier */ break; case 'n': - num_iterations = strtod(optarg, NULL); + errno = 0; + num_iterations = strtoul(optarg, NULL, 0); - if (num_iterations <= 0) { - fprintf(outf, "iterations %d should be positive number\n", num_iterations); - exit(2); - } + if (errno || num_iterations == 0) + errx(-1, "invalid iteration count: %s", optarg); break; case 'N': - header_iterations = strtod(optarg, NULL); + errno = 0; + header_iterations = strtoul(optarg, NULL, 0); - if (header_iterations <= 0) { - fprintf(outf, "iterations %d should be positive number\n", header_iterations); - exit(2); - } + if (errno || header_iterations == 0) + errx(-1, "invalid header iteration count: %s", optarg); break; case 's': /* @@ -11170,6 +11553,9 @@ void cmdline(int argc, char **argv) print_version(); exit(0); break; + default: + help(); + exit(1); } } } diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 0ab4b53bb4f3..009b97bbdd22 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -15,8 +15,7 @@ bool mirrored_kernelcore = false; struct page {}; -void memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order) +void memblock_free_pages(unsigned long pfn, unsigned int order) { } diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c6bf4dfb1495..6776158f1f3e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -723,7 +723,7 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) # Generates a header with C array declaration, containing test_progs_verification_cert bytes $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) $(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \ - hexdump -v -e '12/1 " 0x%02x," "\n"' $< | sed 's/0x ,//g; $$s/,$$//'; \ + od -v -t 'xC' -w12 $< | sed 's/ \(\S\+\)/ 0x\1,/g;s/^\S\+/ /;$$d'; \ echo "};"; \ echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index f372162c0280..03b46f17cf53 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -118,15 +118,16 @@ exit: static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu) { - long gp_seq = READ_ONCE(rcu->bss->gp_seq); LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; - if (!ASSERT_OK(bpf_prog_test_run_opts(bpf_program__fd(rcu->progs.do_call_rcu_tasks_trace), - &opts), "do_call_rcu_tasks_trace")) + WRITE_ONCE(rcu->bss->done, 0); + ret = bpf_prog_test_run_opts(bpf_program__fd(rcu->progs.call_rcu_tasks_trace), &opts); + if (!ASSERT_OK(ret, "call_rcu_tasks_trace")) return -EFAULT; - if (!ASSERT_OK(opts.retval, 
"opts.retval == 0")) + if (!ASSERT_OK(opts.retval, "call_rcu_tasks_trace retval")) return -EFAULT; - while (gp_seq == READ_ONCE(rcu->bss->gp_seq)) + while (!READ_ONCE(rcu->bss->done)) sched_yield(); return 0; } @@ -159,8 +160,6 @@ void serial_test_map_kptr(void) skel = rcu_tasks_trace_gp__open_and_load(); if (!ASSERT_OK_PTR(skel, "rcu_tasks_trace_gp__open_and_load")) return; - if (!ASSERT_OK(rcu_tasks_trace_gp__attach(skel), "rcu_tasks_trace_gp__attach")) - goto end; if (test__start_subtest("success-map")) { test_map_kptr_success(true); @@ -180,7 +179,5 @@ void serial_test_map_kptr(void) test_map_kptr_success(true); } -end: rcu_tasks_trace_gp__destroy(skel); - return; } diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 180ba5098ca1..075a1180ec26 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -167,7 +167,7 @@ int BPF_PROG(tp_test2) } __u64 test7_result = 0; -#if defined(bpf_target_x86) || defined(bpf_target_arm64) +#if defined(bpf_target_x86) || defined(bpf_target_arm64) || defined(bpf_target_riscv) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test7) { diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 43ff836a8ed8..45eaa54d1ac7 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret) __u64 test9_entry_result = 0; __u64 test9_exit_result = 0; -#if defined(bpf_target_x86) || defined(bpf_target_arm64) +#if defined(bpf_target_x86) || defined(bpf_target_arm64) || defined(bpf_target_riscv) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test9, int a) { diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h index 3bac4fdd4bdf..637fbf2c2652 100644 --- a/tools/testing/selftests/bpf/progs/profiler.h +++ b/tools/testing/selftests/bpf/progs/profiler.h @@ -169,7 +169,7 @@ enum bpf_function_id { profiler_bpf_sched_process_exec, profiler_bpf_sched_process_exit, profiler_bpf_sys_enter_kill, - profiler_bpf_do_filp_open_ret, + profiler_bpf_do_file_open_ret, profiler_bpf_sched_process_fork, profiler_bpf_vfs_link, profiler_bpf_vfs_symlink, diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 813143b4985d..9044dd8aff11 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -751,11 +751,11 @@ out: return 0; } -SEC("kretprobe/do_filp_open") -int kprobe_ret__do_filp_open(struct pt_regs* ctx) +SEC("kretprobe/do_file_open") +int kprobe_ret__do_file_open(struct pt_regs *ctx) { struct bpf_func_stats_ctx stats_ctx; - bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret); + bpf_stats_enter(&stats_ctx, profiler_bpf_do_file_open_ret); struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx); diff --git a/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c b/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c index df4873558634..189c05c6abcc 100644 --- a/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c +++ b/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c @@ -1,36 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include <vmlinux.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> +#include "../test_kmods/bpf_testmod_kfunc.h" -struct task_ls_map { - 
__uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, int); -} task_ls_map SEC(".maps"); - -long gp_seq; +int done; SEC("syscall") -int do_call_rcu_tasks_trace(void *ctx) -{ - struct task_struct *current; - int *v; - - current = bpf_get_current_task_btf(); - v = bpf_task_storage_get(&task_ls_map, current, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!v) - return 1; - /* Invoke call_rcu_tasks_trace */ - return bpf_task_storage_delete(&task_ls_map, current); -} - -SEC("kprobe/rcu_tasks_trace_postgp") -int rcu_tasks_trace_postgp(void *ctx) +int call_rcu_tasks_trace(void *ctx) { - __sync_add_and_fetch(&gp_seq, 1); - return 0; + return bpf_kfunc_call_test_call_rcu_tasks_trace(&done); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index 0a0f371a2dec..fa73b17cb999 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -1,12 +1,12 @@ -#include <stdbool.h> -#include <linux/bpf.h> -#include <linux/errno.h> -#include <linux/if_ether.h> -#include <linux/pkt_cls.h> +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> #include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> +#include <errno.h> + #include "bpf_kfuncs.h" +#include "bpf_tracing_net.h" #define META_SIZE 32 @@ -42,7 +42,7 @@ static bool check_metadata(const char *file, int line, __u8 *meta_have) if (!__builtin_memcmp(meta_have, meta_want, META_SIZE)) return true; - bpf_stream_printk(BPF_STREAM_STDERR, + bpf_stream_printk(BPF_STDERR, "FAIL:%s:%d: metadata mismatch\n" " have:\n %pI6\n %pI6\n" " want:\n %pI6\n %pI6\n", diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 186a25ab429a..e62c6b78657f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -18,6 +18,7 @@ #include <linux/in6.h> #include <linux/un.h> #include <linux/filter.h> +#include <linux/rcupdate_trace.h> #include <net/sock.h> #include <linux/namei.h> #include "bpf_testmod.h" @@ -885,6 +886,32 @@ __bpf_kfunc void bpf_kfunc_call_test_sleepable(void) { } +struct bpf_kfunc_rcu_tasks_trace_data { + struct rcu_head rcu; + int *done; +}; + +static void bpf_kfunc_rcu_tasks_trace_cb(struct rcu_head *rhp) +{ + struct bpf_kfunc_rcu_tasks_trace_data *data; + + data = container_of(rhp, struct bpf_kfunc_rcu_tasks_trace_data, rcu); + WRITE_ONCE(*data->done, 1); + kfree(data); +} + +__bpf_kfunc int bpf_kfunc_call_test_call_rcu_tasks_trace(int *done) +{ + struct bpf_kfunc_rcu_tasks_trace_data *data; + + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (!data) + return -ENOMEM; + data->done = done; + call_rcu_tasks_trace(&data->rcu, bpf_kfunc_rcu_tasks_trace_cb); + return 0; +} + __bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) { int proto; @@ -1222,6 +1249,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_call_rcu_tasks_trace) BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h 
b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index d5c5454e257e..b393bf771131 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -118,6 +118,7 @@ void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; void bpf_kfunc_call_test_sleepable(void) __ksym; +int bpf_kfunc_call_test_call_rcu_tasks_trace(int *done) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 45c2d49d55b6..ee863e90d1e0 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -63,12 +63,29 @@ def check_tx_chunks(cfg) -> None: ksft_eq(socat.stdout.strip(), "hello\nworld") +def check_rx_hds(cfg) -> None: + """Test HDS splitting across payload sizes.""" + require_devmem(cfg) + + for size in [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + port = rand_port() + listen_cmd = f"{cfg.bin_local} -L -l -f {cfg.ifname} -s {cfg.addr} -p {port}" + + with bkg(listen_cmd, exit_wait=True) as ncdevmem: + wait_port_listen(port) + cmd(f"dd if=/dev/zero bs={size} count=1 2>/dev/null | " + + f"socat -b {size} -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},nodelay", + host=cfg.remote, shell=True) + + ksft_eq(ncdevmem.ret, 0, f"HDS failed for payload size {size}") + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - ksft_run([check_rx, check_tx, check_tx_chunks], + ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 16864c844108..e098d6534c3c 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -98,6 +98,7 @@ static unsigned int ifindex; static unsigned int dmabuf_id; static uint32_t tx_dmabuf_id; static int waittime_ms = 500; +static bool fail_on_linear; /* System state loaded by current_config_load() */ #define MAX_FLOWS 8 @@ -975,6 +976,11 @@ static int do_server(struct memory_buffer *mem) "SCM_DEVMEM_LINEAR. 
dmabuf_cmsg->frag_size=%u\n", dmabuf_cmsg->frag_size); + if (fail_on_linear) { + pr_err("received SCM_DEVMEM_LINEAR but --fail-on-linear (-L) set"); + goto err_close_client; + } + continue; } @@ -1398,8 +1404,11 @@ int main(int argc, char *argv[]) int is_server = 0, opt; int ret, err = 1; - while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:z:")) != -1) { + while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:")) != -1) { switch (opt) { + case 'L': + fail_on_linear = true; + break; case 'l': is_server = 1; break; diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py index d288c57894f6..cd7e080e6f84 100755 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.py +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -19,6 +19,8 @@ from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx, KsftFailEx # "define" for the ID of the Toeplitz hash function ETH_RSS_HASH_TOP = 1 +# Must match RPS_MAX_CPUS in toeplitz.c +RPS_MAX_CPUS = 16 def _check_rps_and_rfs_not_configured(cfg): @@ -67,23 +69,24 @@ def _get_irq_cpus(cfg): return cpus -def _get_unused_cpus(cfg, count=2): +def _get_unused_rps_cpus(cfg, count=2): """ - Get CPUs that are not used by Rx queues. - Returns a list of at least 'count' CPU numbers. + Get CPUs that are not used by Rx queues for RPS. + Returns a list of at least 'count' CPU numbers within + the RPS_MAX_CPUS supported range. """ # Get CPUs used by Rx queues rx_cpus = set(_get_irq_cpus(cfg)) - # Get total number of CPUs - num_cpus = os.cpu_count() + # Get total number of CPUs, capped by RPS_MAX_CPUS + num_cpus = min(os.cpu_count(), RPS_MAX_CPUS) # Find unused CPUs unused_cpus = [cpu for cpu in range(num_cpus) if cpu not in rx_cpus] if len(unused_cpus) < count: - raise KsftSkipEx(f"Need at {count} CPUs not used by Rx queues, found {len(unused_cpus)}") + raise KsftSkipEx(f"Need at least {count} CPUs in range 0..{num_cpus - 1} not used by Rx queues, found {len(unused_cpus)}") return unused_cpus[:count] @@ -181,7 +184,7 @@ def test(cfg, proto_flag, ipver, grp): ksft_pr(f"RSS using CPUs: {irq_cpus}") elif grp == "rps": # Get CPUs not used by Rx queues and configure them for RPS - rps_cpus = _get_unused_cpus(cfg, count=2) + rps_cpus = _get_unused_rps_cpus(cfg, count=2) rps_mask = _configure_rps(cfg, rps_cpus) defer(_configure_rps, cfg, []) rx_cmd += ["-r", rps_mask] diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index b6093bcf2b06..02dcdeb723be 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -249,8 +249,8 @@ function listen_port_and_save_to() { SOCAT_MODE="UDP6-LISTEN" fi - # Just wait for 2 seconds - timeout 2 ip netns exec "${NAMESPACE}" \ + # Just wait for 3 seconds + timeout 3 ip netns exec "${NAMESPACE}" \ socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null } diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 0441a18f098b..aac8ef490feb 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -317,7 +317,7 @@ police_limits_test() tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ flower skip_sw \ - action police rate 0.5kbit burst 1m conform-exceed drop/ok + action police rate 0.5kbit burst 2k conform-exceed drop/ok check_fail $? 
"Incorrect success to add police action with too low rate" tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ @@ -327,7 +327,7 @@ police_limits_test() tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ flower skip_sw \ - action police rate 1.5kbit burst 1m conform-exceed drop/ok + action police rate 1.5kbit burst 2k conform-exceed drop/ok check_err $? "Failed to add police action with low rate" tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 5b993924cc3f..2ca07ea7202a 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -18,6 +18,9 @@ #include <sys/stat.h> #include <sys/syscall.h> #include <sys/wait.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <sys/sem.h> #include <unistd.h> #include <ctype.h> @@ -39,6 +42,20 @@ F_SEAL_EXEC) #define MFD_NOEXEC_SEAL 0x0008U +union semun { + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; +}; + +/* + * we use semaphores on nested wait tasks due the use of CLONE_NEWPID: the + * child will be PID 1 and can't send SIGSTOP to themselves due special + * treatment of the init task, so the SIGSTOP/SIGCONT synchronization + * approach can't be used here. + */ +#define SEM_KEY 0xdeadbeef /* * Default is not to test hugetlbfs @@ -1333,8 +1350,22 @@ static int sysctl_nested(void *arg) static int sysctl_nested_wait(void *arg) { - /* Wait for a SIGCONT. */ - kill(getpid(), SIGSTOP); + int sem = semget(SEM_KEY, 1, 0600); + struct sembuf sembuf; + + if (sem < 0) { + perror("semget:"); + abort(); + } + sembuf.sem_num = 0; + sembuf.sem_flg = 0; + sembuf.sem_op = 0; + + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + abort(); + } + return sysctl_nested(arg); } @@ -1355,7 +1386,9 @@ static void test_sysctl_sysctl2_failset(void) static int sysctl_nested_child(void *arg) { - int pid; + int pid, sem; + union semun semun; + struct sembuf sembuf; printf("%s nested sysctl 0\n", memfd_str); sysctl_assert_write("0"); @@ -1389,23 +1422,53 @@ static int sysctl_nested_child(void *arg) test_sysctl_sysctl2_failset); join_thread(pid); + sem = semget(SEM_KEY, 1, IPC_CREAT | 0600); + if (sem < 0) { + perror("semget:"); + return 1; + } + semun.val = 1; + sembuf.sem_op = -1; + sembuf.sem_flg = 0; + sembuf.sem_num = 0; + /* Verify that the rules are actually inherited after fork. 
*/ printf("%s nested sysctl 0 -> 1 after fork\n", memfd_str); sysctl_assert_write("0"); + if (semctl(sem, 0, SETVAL, semun) < 0) { + perror("semctl:"); + return 1; + } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, test_sysctl_sysctl1_failset); sysctl_assert_write("1"); - kill(pid, SIGCONT); + + /* Allow child to continue */ + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + return 1; + } join_thread(pid); printf("%s nested sysctl 0 -> 2 after fork\n", memfd_str); sysctl_assert_write("0"); + if (semctl(sem, 0, SETVAL, semun) < 0) { + perror("semctl:"); + return 1; + } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, test_sysctl_sysctl2_failset); sysctl_assert_write("2"); - kill(pid, SIGCONT); + + /* Allow child to continue */ + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + return 1; + } join_thread(pid); /* @@ -1415,28 +1478,62 @@ static int sysctl_nested_child(void *arg) */ printf("%s nested sysctl 2 -> 1 after fork\n", memfd_str); sysctl_assert_write("2"); + + if (semctl(sem, 0, SETVAL, semun) < 0) { + perror("semctl:"); + return 1; + } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, test_sysctl_sysctl2); sysctl_assert_write("1"); - kill(pid, SIGCONT); + + /* Allow child to continue */ + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + return 1; + } join_thread(pid); printf("%s nested sysctl 2 -> 0 after fork\n", memfd_str); sysctl_assert_write("2"); + + if (semctl(sem, 0, SETVAL, semun) < 0) { + perror("semctl:"); + return 1; + } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, test_sysctl_sysctl2); sysctl_assert_write("0"); - kill(pid, SIGCONT); + + /* Allow child to continue */ + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + return 1; + } join_thread(pid); printf("%s nested sysctl 1 -> 0 after fork\n", memfd_str); sysctl_assert_write("1"); + + if (semctl(sem, 0, SETVAL, semun) < 0) { + perror("semctl:"); + return 1; + } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, test_sysctl_sysctl1); sysctl_assert_write("0"); - kill(pid, SIGCONT); + /* Allow child to continue */ + if (semop(sem, &sembuf, 1) < 0) { + perror("semop:"); + return 1; + } join_thread(pid); + semctl(sem, 0, IPC_RMID); + return 0; } diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..83ad9454dd9d 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -12,6 +12,7 @@ map_hugetlb map_populate thuge-gen compaction_test +memory-failure migration mlock2-tests mrelease_test diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 905f1e034963..7a5de4e9bf52 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -72,9 +72,10 @@ TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate -ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64)) +ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64 loongarch32 loongarch64)) TEST_GEN_FILES += memfd_secret endif +TEST_GEN_FILES += memory-failure TEST_GEN_FILES += migration TEST_GEN_FILES += mkdirty TEST_GEN_FILES += mlock-random-test @@ -154,6 +155,7 @@ TEST_PROGS += ksft_ksm_numa.sh TEST_PROGS += ksft_madv_guard.sh TEST_PROGS += ksft_madv_populate.sh TEST_PROGS += ksft_memfd_secret.sh +TEST_PROGS += ksft_memory_failure.sh TEST_PROGS += ksft_migration.sh TEST_PROGS += ksft_mkdirty.sh TEST_PROGS += ksft_mlock.sh diff --git a/tools/testing/selftests/mm/config 
b/tools/testing/selftests/mm/config index deba93379c80..1dbe2b4558ab 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -11,3 +11,5 @@ CONFIG_ANON_VMA_NAME=y CONFIG_FTRACE=y CONFIG_PROFILING=y CONFIG_UPROBES=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m diff --git a/tools/testing/selftests/mm/ksft_memory_failure.sh b/tools/testing/selftests/mm/ksft_memory_failure.sh new file mode 100755 index 000000000000..ae1614d4d49b --- /dev/null +++ b/tools/testing/selftests/mm/ksft_memory_failure.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t memory-failure diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c new file mode 100644 index 000000000000..3d9e0b9ffb41 --- /dev/null +++ b/tools/testing/selftests/mm/memory-failure.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory-failure functional tests. + * + * Author(s): Miaohe Lin <linmiaohe@huawei.com> + */ + +#include "../kselftest_harness.h" + +#include <sys/mman.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <unistd.h> +#include <signal.h> +#include <setjmp.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/vfs.h> +#include <linux/magic.h> +#include <errno.h> + +#include "vm_util.h" + +enum inject_type { + MADV_HARD, + MADV_SOFT, +}; + +enum result_type { + MADV_HARD_ANON, + MADV_HARD_CLEAN_PAGECACHE, + MADV_HARD_DIRTY_PAGECACHE, + MADV_SOFT_ANON, + MADV_SOFT_CLEAN_PAGECACHE, + MADV_SOFT_DIRTY_PAGECACHE, +}; + +static jmp_buf signal_jmp_buf; +static siginfo_t siginfo; +const char *pagemap_proc = "/proc/self/pagemap"; +const char *kpageflags_proc = "/proc/kpageflags"; + +FIXTURE(memory_failure) +{ + unsigned long page_size; + unsigned long corrupted_size; + unsigned long pfn; + int pagemap_fd; + int kpageflags_fd; + bool triggered; +}; + +FIXTURE_VARIANT(memory_failure) +{ + enum inject_type type; + int (*inject)(FIXTURE_DATA(memory_failure) * self, void *vaddr); +}; + +static int madv_hard_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr) +{ + return madvise(vaddr, self->page_size, MADV_HWPOISON); +} + +FIXTURE_VARIANT_ADD(memory_failure, madv_hard) +{ + .type = MADV_HARD, + .inject = madv_hard_inject, +}; + +static int madv_soft_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr) +{ + return madvise(vaddr, self->page_size, MADV_SOFT_OFFLINE); +} + +FIXTURE_VARIANT_ADD(memory_failure, madv_soft) +{ + .type = MADV_SOFT, + .inject = madv_soft_inject, +}; + +static void sigbus_action(int signo, siginfo_t *si, void *args) +{ + memcpy(&siginfo, si, sizeof(siginfo_t)); + siglongjmp(signal_jmp_buf, 1); +} + +static int setup_sighandler(void) +{ + struct sigaction sa = { + .sa_sigaction = sigbus_action, + .sa_flags = SA_SIGINFO, + }; + + return sigaction(SIGBUS, &sa, NULL); +} + +FIXTURE_SETUP(memory_failure) +{ + memset(self, 0, sizeof(*self)); + + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + + memset(&siginfo, 0, sizeof(siginfo)); + if (setup_sighandler()) + SKIP(return, "setup sighandler failed.\n"); + + self->pagemap_fd = open(pagemap_proc, O_RDONLY); + if (self->pagemap_fd == -1) + SKIP(return, "open %s failed.\n", pagemap_proc); + + self->kpageflags_fd = open(kpageflags_proc, O_RDONLY); + if (self->kpageflags_fd == -1) + SKIP(return, "open %s failed.\n", kpageflags_proc); +} + +static void teardown_sighandler(void) +{ + struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_SIGINFO, + }; + + sigaction(SIGBUS, &sa, NULL); +} + 
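+/*
+ * Common flow for the TEST_F() cases below: sigsetjmp() returns 0 on the
+ * first pass, so the test injects the error (MADV_HWPOISON or
+ * MADV_SOFT_OFFLINE, per variant->inject) and touches the page with
+ * FORCE_READ(). Hard-poisoning a mapped anonymous or dirty pagecache
+ * page makes the kernel deliver SIGBUS (BUS_MCEERR_AR); sigbus_action()
+ * saves the siginfo and siglongjmp()s back, so sigsetjmp() returns 1,
+ * with self->triggered guarding against a second injection. check()
+ * asserts the outcome expected for the inject/result combination, and
+ * cleanup() unpoisons the pfn via debugfs so the page can be reused.
+ */
+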
+FIXTURE_TEARDOWN(memory_failure) +{ + close(self->kpageflags_fd); + close(self->pagemap_fd); + teardown_sighandler(); +} + +static void prepare(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr) +{ + self->pfn = pagemap_get_pfn(self->pagemap_fd, vaddr); + ASSERT_NE(self->pfn, -1UL); + + ASSERT_EQ(get_hardware_corrupted_size(&self->corrupted_size), 0); +} + +static bool check_memory(void *vaddr, unsigned long size) +{ + char buf[64]; + + memset(buf, 0xce, sizeof(buf)); + while (size >= sizeof(buf)) { + if (memcmp(vaddr, buf, sizeof(buf))) + return false; + size -= sizeof(buf); + vaddr += sizeof(buf); + } + + return true; +} + +static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr, enum result_type type, int setjmp) +{ + unsigned long size; + uint64_t pfn_flags; + + switch (type) { + case MADV_SOFT_ANON: + case MADV_HARD_CLEAN_PAGECACHE: + case MADV_SOFT_CLEAN_PAGECACHE: + case MADV_SOFT_DIRTY_PAGECACHE: + /* It is not expected to receive a SIGBUS signal. */ + ASSERT_EQ(setjmp, 0); + + /* The page content should remain unchanged. */ + ASSERT_TRUE(check_memory(vaddr, self->page_size)); + + /* The backing pfn of addr should have changed. */ + ASSERT_NE(pagemap_get_pfn(self->pagemap_fd, vaddr), self->pfn); + break; + case MADV_HARD_ANON: + case MADV_HARD_DIRTY_PAGECACHE: + /* The SIGBUS signal should have been received. */ + ASSERT_EQ(setjmp, 1); + + /* Check if siginfo contains correct SIGBUS context. */ + ASSERT_EQ(siginfo.si_signo, SIGBUS); + ASSERT_EQ(siginfo.si_code, BUS_MCEERR_AR); + ASSERT_EQ(1UL << siginfo.si_addr_lsb, self->page_size); + ASSERT_EQ(siginfo.si_addr, vaddr); + + /* XXX Check backing pte is hwpoison entry when supported. */ + ASSERT_TRUE(pagemap_is_swapped(self->pagemap_fd, vaddr)); + break; + default: + SKIP(return, "unexpected inject type %d.\n", type); + } + + /* Check if the value of HardwareCorrupted has increased. */ + ASSERT_EQ(get_hardware_corrupted_size(&size), 0); + ASSERT_EQ(size, self->corrupted_size + self->page_size / 1024); + + /* Check if HWPoison flag is set. */ + ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0); + ASSERT_EQ(pfn_flags & KPF_HWPOISON, KPF_HWPOISON); +} + +static void cleanup(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr) +{ + unsigned long size; + uint64_t pfn_flags; + + ASSERT_EQ(unpoison_memory(self->pfn), 0); + + /* Check if HWPoison flag is cleared. */ + ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0); + ASSERT_NE(pfn_flags & KPF_HWPOISON, KPF_HWPOISON); + + /* Check if the value of HardwareCorrupted has decreased. 
*/ + ASSERT_EQ(get_hardware_corrupted_size(&size), 0); + ASSERT_EQ(size, self->corrupted_size); +} + +TEST_F(memory_failure, anon) +{ + char *addr; + int ret; + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_ANON, ret); + else + check(_metadata, self, addr, MADV_SOFT_ANON, ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); +} + +static int prepare_file(const char *fname, unsigned long size) +{ + int fd; + + fd = open(fname, O_RDWR | O_CREAT, 0664); + if (fd >= 0) { + unlink(fname); + ftruncate(fd, size); + } + return fd; +} + +/* Borrowed from mm/gup_longterm.c. */ +static int get_fs_type(int fd) +{ + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret && errno == EINTR); + + return ret ? 0 : (int)fs.f_type; +} + +TEST_F(memory_failure, clean_pagecache) +{ + int fd; + char *addr; + int ret; + int fs_type; + + fd = prepare_file("./clean-page-cache-test-file", self->page_size); + if (fd < 0) + SKIP(return, "failed to open test file.\n"); + fs_type = get_fs_type(fd); + if (!fs_type || fs_type == TMPFS_MAGIC) + SKIP(return, "unsupported filesystem :%x\n", fs_type); + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + fsync(fd); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_CLEAN_PAGECACHE, ret); + else + check(_metadata, self, addr, MADV_SOFT_CLEAN_PAGECACHE, ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); + + ASSERT_EQ(close(fd), 0); +} + +TEST_F(memory_failure, dirty_pagecache) +{ + int fd; + char *addr; + int ret; + int fs_type; + + fd = prepare_file("./dirty-page-cache-test-file", self->page_size); + if (fd < 0) + SKIP(return, "failed to open test file.\n"); + fs_type = get_fs_type(fd); + if (!fs_type || fs_type == TMPFS_MAGIC) + SKIP(return, "unsupported filesystem :%x\n", fs_type); + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_DIRTY_PAGECACHE, ret); + else + check(_metadata, self, addr, MADV_SOFT_DIRTY_PAGECACHE, ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); + + ASSERT_EQ(close(fd), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 29be9038bfb0..afdcfd0d7cef 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ 
b/tools/testing/selftests/mm/run_vmtests.sh @@ -91,6 +91,8 @@ separated by spaces: test VMA merge cases behave as expected - rmap test rmap behaves as expected +- memory-failure + test memory-failure behaves as expected example: ./run_vmtests.sh -t "hmm mmap ksm" EOF @@ -527,6 +529,25 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned CATEGORY="rmap" run_test ./rmap +# Try to load hwpoison_inject if not present. +HWPOISON_DIR=/sys/kernel/debug/hwpoison/ +if [ ! -d "$HWPOISON_DIR" ]; then + if ! modprobe -q -R hwpoison_inject; then + echo "Module hwpoison_inject not found, skipping..." + else + modprobe hwpoison_inject > /dev/null 2>&1 + LOADED_MOD=1 + fi +fi + +if [ -d "$HWPOISON_DIR" ]; then + CATEGORY="memory-failure" run_test ./memory-failure +fi + +if [ -n "${LOADED_MOD}" ]; then + modprobe -r hwpoison_inject > /dev/null 2>&1 +fi + if [ "${HAVE_HUGEPAGES}" = 1 ]; then echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages fi diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index d954bf91afd5..a6d4ff7dfdc0 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -723,3 +723,44 @@ int ksm_stop(void) close(ksm_fd); return ret == 1 ? 0 : -errno; } + +int get_hardware_corrupted_size(unsigned long *val) +{ + unsigned long size; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + int ret = -1; + + if (!f) + return ret; + + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HardwareCorrupted: %12lu kB", &size) == 1) { + *val = size; + ret = 0; + break; + } + } + + free(line); + fclose(f); + return ret; +} + +int unpoison_memory(unsigned long pfn) +{ + int unpoison_fd, len; + char buf[32]; + ssize_t ret; + + unpoison_fd = open("/sys/kernel/debug/hwpoison/unpoison-pfn", O_WRONLY); + if (unpoison_fd < 0) + return -errno; + + len = sprintf(buf, "0x%lx\n", pfn); + ret = write(unpoison_fd, buf, len); + close(unpoison_fd); + + return ret > 0 ? 
0 : -errno; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 522f7f9050f5..e9c4e24769c1 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -20,6 +20,7 @@ #define KPF_COMPOUND_HEAD BIT_ULL(15) #define KPF_COMPOUND_TAIL BIT_ULL(16) +#define KPF_HWPOISON BIT_ULL(19) #define KPF_THP BIT_ULL(22) /* * Ignore the checkpatch warning, we must read from x but don't want to do @@ -154,6 +155,8 @@ long ksm_get_full_scans(void); int ksm_use_zero_pages(void); int ksm_start(void); int ksm_stop(void); +int get_hardware_corrupted_size(unsigned long *val); +int unpoison_memory(unsigned long pfn); /* * On ppc64 this will only work with radix 2M hugepage size diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh index 3da9d93ab36f..625162fd7e8b 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh @@ -28,6 +28,7 @@ ALL_TESTS=" test_8021d test_8021q test_8021qvs + test_mdb_count_warning " NUM_NETIFS=4 @@ -83,8 +84,6 @@ switch_create_8021q() { local br_flags=$1; shift - log_info "802.1q $br_flags${br_flags:+ }tests" - ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \ mcast_snooping 1 $br_flags \ mcast_igmp_version 3 mcast_mld_version 2 @@ -106,6 +105,7 @@ switch_create_8021q() switch_create_8021qvs() { + log_info "802.1q mcast_vlan_snooping 1 tests" switch_create_8021q "mcast_vlan_snooping 1" bridge vlan global set dev br0 vid 10 mcast_igmp_version 3 bridge vlan global set dev br0 vid 10 mcast_mld_version 2 @@ -1272,6 +1272,76 @@ test_8021qvs_toggle_vlan_snooping() test_toggle_vlan_snooping_permanent } +mdb_count_check_warn() +{ + local msg=$1; shift + + dmesg | grep -q "WARNING:.*br_multicast_port_ngroups_dec.*" + check_fail $? 
"$msg" +} + +test_mdb_count_mcast_vlan_snooping_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 down + ip link set dev br0 type bridge mcast_vlan_snooping 1 + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + ip link set dev br0 type bridge mcast_vlan_snooping 0 + ip link set dev br0 up + + log_test "MDB count warning: mcast_vlan_snooping and MDB flush" +} + +test_mdb_count_mcast_snooping_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 type bridge mcast_snooping 0 + ip link set dev br0 type bridge mcast_vlan_snooping 1 + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + ip link set dev br0 type bridge mcast_vlan_snooping 0 + ip link set dev br0 type bridge mcast_snooping 1 + + log_test "MDB count warning: mcast_snooping and MDB flush" +} + +test_mdb_count_vlan_state_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 down + bridge vlan set vid 10 dev "$swp1" state blocking + ip link set dev br0 type bridge mcast_vlan_snooping 1 + ip link set dev br0 up + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + bridge vlan set vid 10 dev "$swp1" state forwarding + ip link set dev br0 type bridge mcast_vlan_snooping 0 + + log_test "MDB count warning: disabled vlan state and MDB flush" +} + # test groups test_8021d() @@ -1297,6 +1367,7 @@ test_8021q() { # Tests for vlan_filtering 1 mcast_vlan_snooping 0. + log_info "802.1q tests" switch_create_8021q setup_wait @@ -1334,6 +1405,21 @@ test_8021qvs() switch_destroy } +test_mdb_count_warning() +{ + # Tests for mdb_n_entries warning + + log_info "MDB count warning tests" + switch_create_8021q + setup_wait + + test_mdb_count_mcast_vlan_snooping_flush + test_mdb_count_mcast_snooping_flush + test_mdb_count_vlan_state_flush + + switch_destroy +} + if ! 
bridge link help 2>&1 | grep -q "mcast_max_groups"; then echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support" exit $ksft_skip diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index af008fbf2725..eb2d8034de9c 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -98,12 +98,20 @@ setup_prepare() h1_create h2_create switch_create + + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_set net.bridge.bridge-nf-call-iptables 0 + fi } cleanup() { pre_cleanup + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_restore net.bridge.bridge-nf-call-iptables + fi + switch_destroy h2_destroy h1_destroy diff --git a/tools/testing/selftests/net/forwarding/pedit_ip.sh b/tools/testing/selftests/net/forwarding/pedit_ip.sh index d14efb2d23b2..9235674627ab 100755 --- a/tools/testing/selftests/net/forwarding/pedit_ip.sh +++ b/tools/testing/selftests/net/forwarding/pedit_ip.sh @@ -91,12 +91,20 @@ setup_prepare() h1_create h2_create switch_create + + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_set net.bridge.bridge-nf-call-iptables 0 + fi } cleanup() { pre_cleanup + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_restore net.bridge.bridge-nf-call-iptables + fi + switch_destroy h2_destroy h1_destroy diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index ea89e558672d..86edbc7e2489 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -223,7 +223,7 @@ mirred_egress_to_ingress_tcp_test() ip_proto icmp \ action drop - ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & + ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 > $mirred_e2i_tf2 & local rpid=$! 
ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 wait -n $rpid diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index b43816dd998c..457f41d5e584 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -567,6 +567,21 @@ vxlan_encapped_ping_do() local inner_tos=$1; shift local outer_tos=$1; shift + local ipv4hdr=$(: + )"45:"$( : IP version + IHL + )"$inner_tos:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"CHECKSUM:"$( : IP header csum + )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 + )"c0:00:02:01"$( : IP daddr: 192.0.2.1 + ) + local checksum=$(payload_template_calc_checksum "$ipv4hdr") + ipv4hdr=$(payload_template_expand_checksum "$ipv4hdr" $checksum) + $MZ $dev -c $count -d 100msec -q \ -b $next_hop_mac -B $dest_ip \ -t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(: @@ -577,16 +592,7 @@ vxlan_encapped_ping_do() )"$dest_mac:"$( : ETH daddr )"$(mac_get w2):"$( : ETH saddr )"08:00:"$( : ETH type - )"45:"$( : IP version + IHL - )"$inner_tos:"$( : IP TOS - )"00:54:"$( : IP total length - )"99:83:"$( : IP identification - )"40:00:"$( : IP flags + frag off - )"40:"$( : IP TTL - )"01:"$( : IP proto - )"00:00:"$( : IP header csum - )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 - )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + )"$ipv4hdr:"$( : IPv4 header )"08:"$( : ICMP type )"00:"$( : ICMP code )"8b:f2:"$( : ICMP csum diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh index a603f7b0a08f..e642feeada0e 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh @@ -695,7 +695,7 @@ vxlan_encapped_ping_do() )"6"$( : IP version )"$inner_tos"$( : Traffic class )"0:00:00:"$( : Flow label - )"00:08:"$( : Payload length + )"00:03:"$( : Payload length )"3a:"$( : Next header )"04:"$( : Hop limit )"$saddr:"$( : IP saddr diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 0ec131b339bc..b40694573f4c 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -577,7 +577,7 @@ ip_link_has_flag() local flag=$1; shift local state=$(ip -j link show "$name" | - jq --arg flag "$flag" 'any(.[].flags.[]; . == $flag)') + jq --arg flag "$flag" 'any(.[].flags[]; . 
== $flag)') [[ $state == true ]] } diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index b34e5cf0112e..0a97d5ae3469 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -13,6 +13,15 @@ declare -A ip_args=( -D TFO_COOKIE_ZERO=b7c12350a90dc8f5 -D CMSG_LEVEL_IP=SOL_IP -D CMSG_TYPE_RECVERR=IP_RECVERR" + [ipv4-mapped-ipv6]="--ip_version=ipv4-mapped-ipv6 + --local_ip=192.168.0.1 + --gateway_ip=192.168.0.1 + --netmask_ip=255.255.0.0 + --remote_ip=192.0.2.1 + -D TFO_COOKIE=3021b9d889017eeb + -D TFO_COOKIE_ZERO=b7c12350a90dc8f5 + -D CMSG_LEVEL_IP=SOL_IPV6 + -D CMSG_TYPE_RECVERR=IPV6_RECVERR" [ipv6]="--ip_version=ipv6 --mtu=1520 --local_ip=fd3d:0a0b:17d6::1 @@ -45,7 +54,7 @@ fi ip_versions=$(grep -E '^--ip_version=' $script | cut -d '=' -f 2) if [[ -z $ip_versions ]]; then - ip_versions="ipv4 ipv6" + ip_versions="ipv4 ipv6 ipv4-mapped-ipv6" elif [[ ! "$ip_versions" =~ ^ipv[46]$ ]]; then ktap_exit_fail_msg "Too many or unsupported --ip_version: $ip_versions" exit "$KSFT_FAIL" diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index 66f3831a668f..e72b45dedda5 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -6,10 +6,13 @@ default: vma include ../shared/shared.mk -OFILES = $(SHARED_OFILES) vma.o maple-shim.o +OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o TARGETS = vma -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h +# These can be varied to test different sizes. +CFLAGS += -DNUM_VMA_FLAG_BITS=128 -DNUM_MM_FLAG_BITS=128 + +main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h include/custom.h include/dup.h include/stubs.h vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h new file mode 100644 index 000000000000..802a76317245 --- /dev/null +++ b/tools/testing/vma/include/custom.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#pragma once + +/* + * Contains declarations that exist in the kernel which have been CUSTOMISED for + * testing purposes to facilitate userland VMA testing. + */ + +#ifdef CONFIG_MMU +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +#else +#define mmap_min_addr 0UL +#define dac_mmap_min_addr 0UL +#endif + +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + +/* We hardcode this for now. */ +#define sysctl_max_map_count 0x1000000UL + +#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) + +/* + * The shared stubs do not implement this; it amounts to an fprintf(STDERR,...) + * either way :) + */ +#define pr_warn_once pr_err + +#define pgtable_supports_soft_dirty() 1 + +struct anon_vma { + struct anon_vma *root; + struct rb_root_cached rb_root; + + /* Test fields. */ + bool was_cloned; + bool was_unlinked; +}; + +static inline void unlink_anon_vmas(struct vm_area_struct *vma) +{ + /* For testing purposes, indicate that the anon_vma was unlinked. 
*/ + vma->anon_vma->was_unlinked = true; +} + +static inline void vma_start_write(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; +} + +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) +{ + /* For testing purposes. We indicate that an anon_vma has been cloned. */ + if (src->anon_vma != NULL) { + dst->anon_vma = src->anon_vma; + dst->anon_vma->was_cloned = true; + } + + return 0; +} + +static inline int __anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma)); + + if (!anon_vma) + return -ENOMEM; + + anon_vma->root = anon_vma; + vma->anon_vma = anon_vma; + + return 0; +} + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; + + /* + * For testing purposes: allow invalid bit specification so we can + * easily test. + */ + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + if (bits[i] < NUM_VMA_FLAG_BITS) + vma_flag_set(&flags, bits[i]); + return flags; +} diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h new file mode 100644 index 000000000000..3078ff1487d3 --- /dev/null +++ b/tools/testing/vma/include/dup.h @@ -0,0 +1,1320 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#pragma once + +/* Forward declarations to avoid header cycle. */ +struct vm_area_struct; +static inline void vma_start_write(struct vm_area_struct *vma); + +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long stack_guard_gap; +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long rlimit(unsigned int limit); +struct task_struct *get_current(void); + +#define MMF_HAS_MDWE 28 +#define current get_current() + +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. + */ +enum { + TASK_COMM_LEN = 16, +}; + +/* PARTIALLY implemented types. 
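+ *
+ * Only the fields which the tested mm/vma.c code actually touches are
+ * carried over, so tests can declare these directly on the stack, e.g.
+ * (hypothetical sketch):
+ *
+ *	struct mm_struct mm = {};
+ *	VMA_ITERATOR(vmi, &mm, 0x1000);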
*/ +struct mm_struct { + struct maple_tree mm_mt; + int map_count; /* number of VMAs */ + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ + + unsigned long def_flags; + + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ +}; +struct address_space { + struct rb_root_cached i_mmap; + unsigned long flags; + atomic_t i_mmap_writable; +}; +struct file_operations { + int (*mmap)(struct file *, struct vm_area_struct *); + int (*mmap_prepare)(struct vm_area_desc *); +}; +struct file { + struct address_space *f_mapping; + const struct file_operations *f_op; +}; +struct anon_vma_chain { + struct anon_vma *anon_vma; + struct list_head same_vma; +}; +struct task_struct { + char comm[TASK_COMM_LEN]; + pid_t pid; + struct mm_struct *mm; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; +}; + +struct kref { + refcount_t refcount; +}; + +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + +/* + * Contains declarations that are DUPLICATED from kernel source in order to + * faciliate userland VMA testing. + * + * These must be kept in sync with kernel source. + */ + +#define VMA_LOCK_OFFSET 0x40000000 + +typedef struct { unsigned long v; } freeptr_t; + +#define VM_NONE 0x00000000 + +typedef int __bitwise vma_flag_t; + +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. 
*/ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. 
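+ *
+ * Each expands via INIT_VM_FLAG() to a mask value, e.g. (illustrative,
+ * 64-bit build): VM_PKEY_BIT0 == BIT((int)VMA_HIGH_ARCH_0_BIT) == 1UL << 32.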
*/ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW +#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) + +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define RLIMIT_STACK 3 /* max stack size */ +#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ + +#define CAP_IPC_LOCK 14 + +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +#define VM_IGNORE_MERGE VM_STICKY + +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) + +#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) +#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) + +#define AS_MM_ALL_LOCKS 2 + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +/* + * Flags for bug emulation. + * + * These occupy the top three bytes. + */ +enum { + READ_IMPLIES_EXEC = 0x0400000, +}; + +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = NULL, \ + .status = ma_start, \ + }, \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = {} + +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define EMPTY_VMA_FLAGS ((vma_flags_t){ }) + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +/* + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to + * manipulate mutable fields which will cause those fields to be updated in the + * resultant VMA. + * + * Helper functions are not required for manipulating any field. 
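+ *
+ * A driver-style mmap_prepare hook might use it like so (hypothetical
+ * sketch; my_vm_ops and my_data are illustrative names):
+ *
+ *	static int my_mmap_prepare(struct vm_area_desc *desc)
+ *	{
+ *		desc->vm_ops = &my_vm_ops;
+ *		desc->private_data = my_data;
+ *		desc->action.type = MMAP_NOTHING;
+ *		return 0;
+ *	}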
+ */ +struct vm_area_desc { + /* Immutable state. */ + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. */ + pgoff_t pgoff; + struct file *vm_file; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; + pgprot_t page_prot; + + /* Write-only fields. */ + const struct vm_operations_struct *vm_ops; + void *private_data; + + /* Take further action? */ + struct mmap_action action; +}; + +struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ + }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vma_flags_t flags; + }; + +#ifdef CONFIG_PER_VMA_LOCK + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ + unsigned int vm_lock_seq; +#endif + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_chain; /* Serialized by mmap_lock & + * page_table_lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. */ + const struct vm_operations_struct *vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_SWAP + atomic_long_t swap_readahead_info; +#endif +#ifndef CONFIG_MMU + struct vm_region *vm_region; /* NOMMU mapping region */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt; +#endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. 
+ */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +} __randomize_layout; + +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*close)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + +#ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy + * to hold the policy upon return. Caller should pass NULL @new to + * remove a policy and fall back to surrounding context--i.e. do not + * install a MPOL_DEFAULT policy, nor the task or system default + * mempolicy. + */ + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + + /* + * get_policy() op must add reference [mpol_get()] to any policy at + * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure + * in mm/mempolicy.c will do this automatically. + * get_policy() must NOT add a ref if the policy at (vma,addr) is not + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. + * If no [shared/vma] mempolicy exists at the addr, get_policy() op + * must return NULL--i.e., do not "fallback" to task or system default + * policy. + */ + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx); +#endif +#ifdef CONFIG_FIND_NORMAL_PAGE + /* + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. 
+ */ + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ +}; + +struct vm_unmapped_area_info { +#define VM_UNMAPPED_AREA_TOPDOWN 1 + unsigned long flags; + unsigned long length; + unsigned long low_limit; + unsigned long high_limit; + unsigned long align_mask; + unsigned long align_offset; + unsigned long start_gap; +}; + +struct pagetable_move_control { + struct vm_area_struct *old; /* Source VMA. */ + struct vm_area_struct *new; /* Destination VMA. */ + unsigned long old_addr; /* Address from which the move begins. */ + unsigned long old_end; /* Exclusive address at which old range ends. */ + unsigned long new_addr; /* Address to move page tables to. */ + unsigned long len_in; /* Bytes to remap specified by user. */ + + bool need_rmap_locks; /* Do rmap locks need to be taken? */ + bool for_stack; /* Is this an early temp stack being moved? */ +}; + +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ + struct pagetable_move_control name = { \ + .old = old_, \ + .new = new_, \ + .old_addr = old_addr_, \ + .old_end = (old_addr_) + (len_), \ + .new_addr = new_addr_, \ + .len_in = len_, \ + } + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); +} + +static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) +{ + return __pgprot(vm_flags); +} + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. 
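+ * This is the primitive the vm_flags_*() wrappers below rely on, e.g.
+ * (illustrative): vma_flags_clear_word(&vma->flags, VM_READ) clears only
+ * a first-word bit, leaving the remainder of the NUM_VMA_FLAG_BITS-sized
+ * bitmap untouched.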
*/ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + __set_bit((__force int)bit, bitmap); +} + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); + +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + +static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test(flags, ...) \ + vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test_all(flags, ...) \ + vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_set = to_set.__vma_flags; + + bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_set(flags, ...) \ + vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_clear = to_clear.__vma_flags; + + bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_clear(flags, ...) 
\ + vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&vma->flags, flags); +} + +#define vma_test_all_flags(vma, ...) \ + vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_set_mask(&vma->flags, flags); +} + +#define vma_set_flags(vma, ...) \ + vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_flags(desc, ...) \ + vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_set_mask(&desc->vma_flags, flags); +} + +#define vma_desc_set_flags(desc, ...) \ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_clear_mask(&desc->vma_flags, flags); +} + +#define vma_desc_clear_flags(desc, ...) \ + vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline bool is_shared_maywrite(const vma_flags_t *flags) +{ + return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(&vma->flags); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_assert_write_locked(struct vm_area_struct *); +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set_release(&vma->vm_refcnt, 1); +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + /* We are the only writer, so no need to use vma_refcount_put(). */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* + * Reader must have temporarily raised vm_refcnt but it will + * drop it without using the vma since vma is write-locked. 
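+		 * Put differently (illustrative): attached VMAs hold
+		 * vm_refcnt >= 1 and detached VMAs hold 0, so a failed
+		 * dec-and-test here only means a transient reader still
+		 * holds a reference that it will drop shortly.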
+	 */
+	}
+}
+
+static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
+{
+	memset(vma, 0, sizeof(*vma));
+	vma->vm_mm = mm;
+	vma->vm_ops = &vma_dummy_vm_ops;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma->vm_lock_seq = UINT_MAX;
+}
+
+/*
+ * These are defined in vma.h, but sadly vm_stat_account() is referenced by
+ * kernel/fork.c, so we have to make these broadly available there, and
+ * temporarily define them here to resolve the dependency cycle.
+ */
+#define is_exec_mapping(flags) \
+	((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
+
+#define is_stack_mapping(flags) \
+	(((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
+
+#define is_data_mapping(flags) \
+	((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
+
+static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
+				   long npages)
+{
+	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
+
+	if (is_exec_mapping(flags))
+		mm->exec_vm += npages;
+	else if (is_stack_mapping(flags))
+		mm->stack_vm += npages;
+	else if (is_data_mapping(flags))
+		mm->data_vm += npages;
+}
+
+#undef is_exec_mapping
+#undef is_stack_mapping
+#undef is_data_mapping
+
+static inline void vm_unacct_memory(long pages)
+{
+	vm_acct_memory(-pages);
+}
+
+static inline void mapping_allow_writable(struct address_space *mapping)
+{
+	atomic_inc(&mapping->i_mmap_writable);
+}
+
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+	return mas_find(&vmi->mas, max - 1);
+}
+
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+	vma->vm_ops = NULL;
+}
+
+/* Declared in vma.h.
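+ * set_vma_from_desc() copies the mutable desc fields back into the VMA;
+ * __compat_vma_mmap() below emulates the kernel's calling sequence
+ * (illustrative):
+ *
+ *	f_op->mmap_prepare(&desc);
+ *	mmap_action_prepare(&desc.action, &desc);
+ *	set_vma_from_desc(vma, &desc);
+ *	mmap_action_complete(&desc.action, vma);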
*/ +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc); + +static inline int __compat_vma_mmap(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); +} + +static inline int compat_vma_mmap(struct file *file, + struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file->f_op, file, vma); +} + + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + mas_init(&vmi->mas, &mm->mm_mt, addr); +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline void mmap_assert_locked(struct mm_struct *); +static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} + +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ +#define vma_iter_load(vmi) \ + mas_walk(&(vmi)->mas) + +static inline struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); + + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); + if (!vma) + vma = vma_next(&vmi); + return vma; +} + +#undef vma_iter_load + +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +static inline void vma_set_page_prot(struct vm_area_struct *vma) +{ + vm_flags_t vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; + + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ + vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags)); + + if (vma_wants_writenotify(vma, vm_page_prot)) { + vm_flags &= ~VM_SHARED; + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. 
*/
+		vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
+	}
+	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_GROWSDOWN)
+		return stack_guard_gap;
+
+	/* See reasoning around the VM_SHADOW_STACK definition */
+	if (vma->vm_flags & VM_SHADOW_STACK)
+		return PAGE_SIZE;
+
+	return 0;
+}
+
+static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+	unsigned long gap = stack_guard_start_gap(vma);
+	unsigned long vm_start = vma->vm_start;
+
+	vm_start -= gap;
+	if (vm_start > vma->vm_start)
+		vm_start = 0;
+	return vm_start;
+}
+
+static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_end = vma->vm_end;
+
+	if (vma->vm_flags & VM_GROWSUP) {
+		vm_end += stack_guard_gap;
+		if (vm_end < vma->vm_end)
+			vm_end = -PAGE_SIZE;
+	}
+	return vm_end;
+}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
+static inline bool mlock_future_ok(const struct mm_struct *mm,
+		vm_flags_t vm_flags, unsigned long bytes)
+{
+	unsigned long locked_pages, limit_pages;
+
+	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+		return true;
+
+	locked_pages = bytes >> PAGE_SHIFT;
+	locked_pages += mm->locked_vm;
+
+	limit_pages = rlimit(RLIMIT_MEMLOCK);
+	limit_pages >>= PAGE_SHIFT;
+
+	return locked_pages <= limit_pages;
+}
+
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
+{
+	/* If MDWE is disabled, we have nothing to deny. */
+	if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
+		return false;
+
+	/* If the new VMA is not executable, we have nothing to deny. */
+	if (!(new & VM_EXEC))
+		return false;
+
+	/* Under MDWE we do not accept newly writably executable VMAs... */
+	if (new & VM_WRITE)
+		return true;
+
+	/* ...nor previously non-executable VMAs becoming executable. */
+	if (!(old & VM_EXEC))
+		return true;
+
+	return false;
+}
+
+static inline int mapping_map_writable(struct address_space *mapping)
+{
+	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
+		0 : -EPERM;
+}
+
+/* Did the driver provide valid mmap hook configuration? */
+static inline bool can_mmap_file(struct file *file)
+{
+	bool has_mmap = file->f_op->mmap;
+	bool has_mmap_prepare = file->f_op->mmap_prepare;
+
+	/* Hooks are mutually exclusive. */
+	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
+		return false;
+	if (!has_mmap && !has_mmap_prepare)
+		return false;
+
+	return true;
+}
+
+static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (file->f_op->mmap_prepare)
+		return compat_vma_mmap(file, vma);
+
+	return file->f_op->mmap(file, vma);
+}
+
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
+{
+	return file->f_op->mmap_prepare(desc);
+}
+
+static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+	/* Changing an anonymous vma with this is illegal */
+	get_file(file);
+	swap(vma->vm_file, file);
+	fput(file);
+}
diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
new file mode 100644
index 000000000000..947a3a0c2566
--- /dev/null
+++ b/tools/testing/vma/include/stubs.h
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/*
+ * Contains declarations that are STUBBED, that is, rendered no-ops, in
+ * order to facilitate userland VMA testing.
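+ *
+ * Most stubs reduce to empty bodies or fixed success returns so that
+ * mm/vma.c links and runs deterministically single-threaded; e.g.
+ * (illustrative) the lock stubs mean that a sequence such as:
+ *
+ *	mmap_write_lock_killable(mm);
+ *	...
+ *	mmap_write_unlock(mm);
+ *
+ * performs no real synchronisation.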
+ */ + +/* Forward declarations. */ +struct mm_struct; +struct vm_area_struct; +struct vm_area_desc; +struct pagetable_move_control; +struct mmap_action; +struct file; +struct anon_vma; +struct anon_vma_chain; +struct address_space; +struct unmap_desc; + +#define __bitwise +#define __randomize_layout + +#define FIRST_USER_ADDRESS 0UL +#define USER_PGTABLES_CEILING 0UL + +#define vma_policy(vma) NULL + +#define down_write_nest_lock(sem, nest_lock) + +#define data_race(expr) expr + +#define ASSERT_EXCLUSIVE_WRITER(x) + +struct vm_userfaultfd_ctx {}; +struct mempolicy {}; +struct mmu_gather {}; +struct mutex {}; +struct vm_fault {}; + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} + +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) +{ + return 0; +} + +static inline void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ +} + +static inline int ksm_execve(struct mm_struct *mm) +{ + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ +} + +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ +} + +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ +} + +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ +} + +static inline bool shmem_file(struct file *file) +{ + return false; +} + +static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, + const struct file *file, vm_flags_t vm_flags) +{ + return vm_flags; +} + +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + +/* Currently stubbed but we may later wish to un-stub. 
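+ * vm_acct_memory() is only forward-declared at this point; the empty
+ * definition further below makes it, and therefore vm_unacct_memory()
+ * in dup.h, a no-op, so no committed-memory accounting is modelled yet.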
*/ +static inline void vm_acct_memory(long pages); + +static inline void mmap_assert_locked(struct mm_struct *mm) +{ +} + + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ +} + +static inline void i_mmap_unlock_write(struct address_space *mapping) +{ +} + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct list_head *unmaps) +{ + return 0; +} + +static inline void mmap_write_downgrade(struct mm_struct *mm) +{ +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + return 0; +} + +static inline bool can_modify_mm(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + return true; +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ +} + +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + return true; +} + +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ +} + +static inline bool mapping_can_writeback(struct address_space *mapping) +{ + return true; +} + +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return false; +} + +static inline void mmap_assert_write_locked(struct mm_struct *mm) +{ +} + +static inline void mutex_lock(struct mutex *lock) +{ +} + +static inline void mutex_unlock(struct mutex *lock) +{ +} + +static inline bool mutex_is_locked(struct mutex *lock) +{ + return true; +} + +static inline bool signal_pending(void *p) +{ + return false; +} + +static inline bool is_file_hugepages(struct file *file) +{ + return false; +} + +static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) +{ + return 0; +} + +static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, + unsigned long npages) +{ + return true; +} + +static inline int shmem_zero_setup(struct vm_area_struct *vma) +{ + return 0; +} + + +static inline void vm_acct_memory(long pages) +{ +} + +static inline void vma_interval_tree_insert(struct vm_area_struct *vma, + struct rb_root_cached *rb) +{ +} + +static inline void vma_interval_tree_remove(struct vm_area_struct *vma, + struct rb_root_cached *rb) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) +{ +} + +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc, + struct rb_root_cached *rb) +{ +} + +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc, + struct rb_root_cached *rb) +{ +} + +static inline void uprobe_mmap(struct vm_area_struct *vma) +{ +} + +static inline void uprobe_munmap(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} + +static inline void i_mmap_lock_write(struct address_space *mapping) +{ +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ +} + +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ +} + +static inline void ksm_add_vma(struct vm_area_struct *vma) +{ +} + +static inline void perf_event_mmap(struct vm_area_struct *vma) +{ +} + +static inline bool vma_is_dax(struct vm_area_struct *vma) +{ + return false; +} + +static inline struct vm_area_struct *get_gate_vma(struct mm_struct 
*mm) +{ + return NULL; +} + +static inline bool arch_validate_flags(vm_flags_t flags) +{ + return true; +} + +static inline void vma_close(struct vm_area_struct *vma) +{ +} + +static inline int mmap_file(struct file *file, struct vm_area_struct *vma) +{ + return 0; +} + +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} + +static inline bool capable(int cap) +{ + return true; +} + +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +static inline void might_sleep(void) +{ +} + +static inline void fput(struct file *file) +{ +} + +static inline void mpol_put(struct mempolicy *pol) +{ +} + +static inline void lru_add_drain(void) +{ +} + +static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) +{ +} + +static inline void update_hiwater_rss(struct mm_struct *mm) +{ +} + +static inline void update_hiwater_vm(struct mm_struct *mm) +{ +} + +static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) +{ +} + +static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap) +{ +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *mapping) +{ +} + +static inline void tlb_finish_mmu(struct mmu_gather *tlb) +{ +} + +static inline struct file *get_file(struct file *f) +{ + return f; +} + +static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} + +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct vm_area_struct *next) +{ +} + +static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c new file mode 100644 index 000000000000..49b09e97a51f --- /dev/null +++ b/tools/testing/vma/main.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shared.h" +/* + * Directly import the VMA implementation here. Our vma_internal.h wrapper + * provides userland-equivalent functionality for everything vma.c uses. + */ +#include "../../../mm/vma_init.c" +#include "../../../mm/vma_exec.c" +#include "../../../mm/vma.c" + +/* Tests are included directly so they can test static functions in mm/vma.c. */ +#include "tests/merge.c" +#include "tests/mmap.c" +#include "tests/vma.c" + +/* Helper functions which utilise static kernel functions. */ + +struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma; + + vma = vma_merge_existing_range(vmg); + if (vma) + vma_assert_attached(vma); + return vma; +} + +int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + int res; + + res = vma_link(mm, vma); + if (!res) + vma_assert_attached(vma); + return res; +} + +/* Main test running which invokes tests/ *.c runners. 
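+ * Each run_*_tests() helper bumps the counters through the TEST() macro
+ * in shared.h, e.g. (illustrative test name): TEST(simple_merge)
+ * increments *num_tests and, on failure, *num_fail.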
*/ +int main(void) +{ + int num_tests = 0, num_fail = 0; + + maple_tree_init(); + vma_state_init(); + + run_merge_tests(&num_tests, &num_fail); + run_mmap_tests(&num_tests, &num_fail); + run_vma_tests(&num_tests, &num_fail); + + printf("%d tests run, %d passed, %d failed.\n", + num_tests, num_tests - num_fail, num_fail); + + return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/testing/vma/shared.c b/tools/testing/vma/shared.c new file mode 100644 index 000000000000..bda578cc3304 --- /dev/null +++ b/tools/testing/vma/shared.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shared.h" + + +bool fail_prealloc; +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +const struct vm_operations_struct vma_dummy_vm_ops; +struct anon_vma dummy_anon_vma; +struct task_struct __current; + +struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags) +{ + struct vm_area_struct *vma = vm_area_alloc(mm); + + if (vma == NULL) + return NULL; + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); + + return vma; +} + +void detach_free_vma(struct vm_area_struct *vma) +{ + vma_mark_detached(vma); + vm_area_free(vma); +} + +struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags) +{ + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); + + if (vma == NULL) + return NULL; + + if (attach_vma(mm, vma)) { + detach_free_vma(vma); + return NULL; + } + + /* + * Reset this counter which we use to track whether writes have + * begun. Linking to the tree will have caused this to be incremented, + * which means we will get a false positive otherwise. + */ + vma->vm_lock_seq = UINT_MAX; + + return vma; +} + +void reset_dummy_anon_vma(void) +{ + dummy_anon_vma.was_cloned = false; + dummy_anon_vma.was_unlinked = false; +} + +int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) +{ + struct vm_area_struct *vma; + int count = 0; + + fail_prealloc = false; + reset_dummy_anon_vma(); + + vma_iter_set(vmi, 0); + for_each_vma(*vmi, vma) { + detach_free_vma(vma); + count++; + } + + mtree_destroy(&mm->mm_mt); + mm->map_count = 0; + return count; +} + +bool vma_write_started(struct vm_area_struct *vma) +{ + int seq = vma->vm_lock_seq; + + /* We reset after each check. */ + vma->vm_lock_seq = UINT_MAX; + + /* The vma_start_write() stub simply increments this value. 
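+	 * Worked example: vm_lock_seq is reset to UINT_MAX, which the
+	 * signed local above reads as -1; the first vma_start_write()
+	 * wraps it to 0, so seq > -1 holds exactly when a write began.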
*/ + return seq > -1; +} + +void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, struct anon_vma *anon_vma) +{ + vma->anon_vma = anon_vma; + INIT_LIST_HEAD(&vma->anon_vma_chain); + list_add(&avc->same_vma, &vma->anon_vma_chain); + avc->anon_vma = vma->anon_vma; +} + +void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc) +{ + __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); +} + +struct task_struct *get_current(void) +{ + return &__current; +} + +unsigned long rlimit(unsigned int limit) +{ + return (unsigned long)-1; +} + +void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h new file mode 100644 index 000000000000..6c64211cfa22 --- /dev/null +++ b/tools/testing/vma/shared.h @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include "generated/bit-length.h" +#include "maple-shared.h" +#include "vma_internal.h" +#include "../../../mm/vma.h" + +/* Simple test runner. Assumes local num_[fail, tests] counters. */ +#define TEST(name) \ + do { \ + (*num_tests)++; \ + if (!test_##name()) { \ + (*num_fail)++; \ + fprintf(stderr, "Test " #name " FAILED\n"); \ + } \ + } while (0) + +#define ASSERT_TRUE(_expr) \ + do { \ + if (!(_expr)) { \ + fprintf(stderr, \ + "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ + __FILE__, __LINE__, __FUNCTION__, #_expr); \ + return false; \ + } \ + } while (0) + +#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) +#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) +#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) + +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + +extern bool fail_prealloc; + +/* Override vma_iter_prealloc() so we can choose to fail it. */ +#define vma_iter_prealloc(vmi, vma) \ + (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) + +#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 + +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +extern unsigned long stack_guard_gap; + +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern struct anon_vma dummy_anon_vma; +extern struct task_struct __current; + +/* + * Helper function which provides a wrapper around a merge existing VMA + * operation. + * + * Declared in main.c as uses static VMA function. + */ +struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg); + +/* + * Helper function to allocate a VMA and link it to the tree. + * + * Declared in main.c as uses static VMA function. + */ +int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma); + +/* Helper function providing a dummy vm_ops->close() method.*/ +static inline void dummy_close(struct vm_area_struct *) +{ +} + +/* Helper function to simply allocate a VMA. */ +struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags); + +/* Helper function to detach and free a VMA. */ +void detach_free_vma(struct vm_area_struct *vma); + +/* Helper function to allocate a VMA and link it to the tree. 
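+ *
+ * Typical test usage (hypothetical sketch):
+ *
+ *	struct mm_struct mm = {};
+ *	VMA_ITERATOR(vmi, &mm, 0);
+ *	struct vm_area_struct *vma;
+ *
+ *	vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, VM_READ);
+ *	ASSERT_NE(vma, NULL);
+ *	...
+ *	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);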
*/ +struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags); + +/* + * Helper function to reset the dummy anon_vma to indicate it has not been + * duplicated. + */ +void reset_dummy_anon_vma(void); + +/* + * Helper function to remove all VMAs and destroy the maple tree associated with + * a virtual address space. Returns a count of VMAs in the tree. + */ +int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi); + +/* Helper function to determine if VMA has had vma_start_write() performed. */ +bool vma_write_started(struct vm_area_struct *vma); + +void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, struct anon_vma *anon_vma); + +/* Provide a simple dummy VMA/anon_vma setup for testing. */ +void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc); + +/* Helper function to specify a VMA's range. */ +void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/tests/merge.c index 93d21bc7e112..3708dc6945b0 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/tests/merge.c @@ -1,132 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> - -#include "generated/bit-length.h" - -#include "maple-shared.h" -#include "vma_internal.h" - -/* Include so header guard set. */ -#include "../../../mm/vma.h" - -static bool fail_prealloc; - -/* Then override vma_iter_prealloc() so we can choose to fail it. */ -#define vma_iter_prealloc(vmi, vma) \ - (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) - -#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 - -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; -unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; -unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; - -/* - * Directly import the VMA implementation here. Our vma_internal.h wrapper - * provides userland-equivalent functionality for everything vma.c uses. - */ -#include "../../../mm/vma_init.c" -#include "../../../mm/vma_exec.c" -#include "../../../mm/vma.c" - -const struct vm_operations_struct vma_dummy_vm_ops; -static struct anon_vma dummy_anon_vma; - -#define ASSERT_TRUE(_expr) \ - do { \ - if (!(_expr)) { \ - fprintf(stderr, \ - "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ - __FILE__, __LINE__, __FUNCTION__, #_expr); \ - return false; \ - } \ - } while (0) -#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) -#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) -#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) - -#define IS_SET(_val, _flags) ((_val & _flags) == _flags) - -static struct task_struct __current; - -struct task_struct *get_current(void) -{ - return &__current; -} - -unsigned long rlimit(unsigned int limit) -{ - return (unsigned long)-1; -} - -/* Helper function to simply allocate a VMA. */ -static struct vm_area_struct *alloc_vma(struct mm_struct *mm, - unsigned long start, - unsigned long end, - pgoff_t pgoff, - vm_flags_t vm_flags) -{ - struct vm_area_struct *vma = vm_area_alloc(mm); - - if (vma == NULL) - return NULL; - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - vm_flags_reset(vma, vm_flags); - vma_assert_detached(vma); - - return vma; -} - -/* Helper function to allocate a VMA and link it to the tree.
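[Editor's note: illustrative sketch, not part of the patch. The vma_iter_prealloc() override in shared.h is the fault-injection hook: while fail_prealloc is true, the next maple-tree preallocation reports -ENOMEM, letting tests walk the error paths in mm/vma.c deterministically; cleanup_mm() clears the flag again. A hypothetical use, assuming the failed merge reports NULL:]

static bool example_oom_injection(struct vma_merge_struct *vmg)
{
	fail_prealloc = true;
	/* The merge should fail its preallocation and back out cleanly. */
	ASSERT_EQ(merge_existing(vmg), NULL);
	fail_prealloc = false;
	return true;
}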
*/ -static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma) -{ - int res; - - res = vma_link(mm, vma); - if (!res) - vma_assert_attached(vma); - return res; -} - -static void detach_free_vma(struct vm_area_struct *vma) -{ - vma_mark_detached(vma); - vm_area_free(vma); -} - -/* Helper function to allocate a VMA and link it to the tree. */ -static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, - unsigned long start, - unsigned long end, - pgoff_t pgoff, - vm_flags_t vm_flags) -{ - struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); - - if (vma == NULL) - return NULL; - - if (attach_vma(mm, vma)) { - detach_free_vma(vma); - return NULL; - } - - /* - * Reset this counter which we use to track whether writes have - * begun. Linking to the tree will have caused this to be incremented, - * which means we will get a false positive otherwise. - */ - vma->vm_lock_seq = UINT_MAX; - - return vma; -} - /* Helper function which provides a wrapper around a merge new VMA operation. */ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) { @@ -147,20 +20,6 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) } /* - * Helper function which provides a wrapper around a merge existing VMA - * operation. - */ -static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) -{ - struct vm_area_struct *vma; - - vma = vma_merge_existing_range(vmg); - if (vma) - vma_assert_attached(vma); - return vma; -} - -/* * Helper function which provides a wrapper around the expansion of an existing * VMA. */ @@ -173,8 +32,8 @@ static int expand_existing(struct vma_merge_struct *vmg) * Helper function to reset merge state the associated VMA iterator to a * specified new range. */ -static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) +void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) { vma_iter_set(vmg->vmi, start); @@ -197,8 +56,8 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, /* Helper function to set both the VMG range and its anon_vma. */ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, - struct anon_vma *anon_vma) + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + struct anon_vma *anon_vma) { vmg_set_range(vmg, start, end, pgoff, vm_flags); vmg->anon_vma = anon_vma; @@ -211,10 +70,9 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s * VMA, link it to the maple tree and return it. */ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, - struct vma_merge_struct *vmg, - unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags, - bool *was_merged) + struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + bool *was_merged) { struct vm_area_struct *merged; @@ -234,72 +92,6 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, return alloc_and_link_vma(mm, start, end, pgoff, vm_flags); } -/* - * Helper function to reset the dummy anon_vma to indicate it has not been - * duplicated. 
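[Editor's note: illustrative sketch, not part of the patch. try_merge_new_vma() bundles the common pattern: vmg_set_range() repositions the iterator and resets the merge state, merge_new() attempts the merge, and on failure a fresh VMA is allocated and linked instead. Roughly how a caller exercises it (hypothetical fragment, assuming vmg is already wired to the mm's iterator):]

static bool example_adjacent_merge(struct mm_struct *mm,
				   struct vma_merge_struct *vmg)
{
	struct vm_area_struct *vma;
	bool merged;

	/* Existing [0, 0x2000), then propose the adjacent [0x2000, 0x3000). */
	ASSERT_NE(alloc_and_link_vma(mm, 0, 0x2000, 0, VM_READ), NULL);
	vma = try_merge_new_vma(mm, vmg, 0x2000, 0x3000, 2, VM_READ, &merged);
	ASSERT_NE(vma, NULL);
	ASSERT_TRUE(merged);		/* Compatible neighbour, so expanded. */
	ASSERT_EQ(vma->vm_end, 0x3000);
	return true;
}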
- */ -static void reset_dummy_anon_vma(void) -{ - dummy_anon_vma.was_cloned = false; - dummy_anon_vma.was_unlinked = false; -} - -/* - * Helper function to remove all VMAs and destroy the maple tree associated with - * a virtual address space. Returns a count of VMAs in the tree. - */ -static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) -{ - struct vm_area_struct *vma; - int count = 0; - - fail_prealloc = false; - reset_dummy_anon_vma(); - - vma_iter_set(vmi, 0); - for_each_vma(*vmi, vma) { - detach_free_vma(vma); - count++; - } - - mtree_destroy(&mm->mm_mt); - mm->map_count = 0; - return count; -} - -/* Helper function to determine if VMA has had vma_start_write() performed. */ -static bool vma_write_started(struct vm_area_struct *vma) -{ - int seq = vma->vm_lock_seq; - - /* We reset after each check. */ - vma->vm_lock_seq = UINT_MAX; - - /* The vma_start_write() stub simply increments this value. */ - return seq > -1; -} - -/* Helper function providing a dummy vm_ops->close() method.*/ -static void dummy_close(struct vm_area_struct *) -{ -} - -static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) -{ - vma->anon_vma = anon_vma; - INIT_LIST_HEAD(&vma->anon_vma_chain); - list_add(&avc->same_vma, &vma->anon_vma_chain); - avc->anon_vma = vma->anon_vma; -} - -static void vma_set_dummy_anon_vma(struct vm_area_struct *vma, - struct anon_vma_chain *avc) -{ - __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); -} - static bool test_simple_merge(void) { struct vm_area_struct *vma; @@ -1616,39 +1408,6 @@ static bool test_merge_extend(void) return true; } -static bool test_copy_vma(void) -{ - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; - struct mm_struct mm = {}; - bool need_locks = false; - VMA_ITERATOR(vmi, &mm, 0); - struct vm_area_struct *vma, *vma_new, *vma_next; - - /* Move backwards and do not merge. */ - - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); - ASSERT_NE(vma_new, vma); - ASSERT_EQ(vma_new->vm_start, 0); - ASSERT_EQ(vma_new->vm_end, 0x2000); - ASSERT_EQ(vma_new->vm_pgoff, 0); - vma_assert_attached(vma_new); - - cleanup_mm(&mm, &vmi); - - /* Move a VMA into position next to another and merge the two. */ - - vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); - vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); - vma_assert_attached(vma_new); - - ASSERT_EQ(vma_new, vma_next); - - cleanup_mm(&mm, &vmi); - return true; -} - static bool test_expand_only_mode(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -1689,73 +1448,8 @@ static bool test_expand_only_mode(void) return true; } -static bool test_mmap_region_basic(void) -{ - struct mm_struct mm = {}; - unsigned long addr; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, &mm, 0); - - current->mm = &mm; - - /* Map at 0x300000, length 0x3000. */ - addr = __mmap_region(NULL, 0x300000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x300, NULL); - ASSERT_EQ(addr, 0x300000); - - /* Map at 0x250000, length 0x3000. */ - addr = __mmap_region(NULL, 0x250000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x250, NULL); - ASSERT_EQ(addr, 0x250000); - - /* Map at 0x303000, merging to 0x300000 of length 0x6000. 
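[Editor's note on the arithmetic, assuming 4KiB pages: the first mapping spans [0x300000, 0x303000) with pgoff 0x300, and 0x303000 >> PAGE_SHIFT == 0x303, so the second request is both virtually adjacent and file-offset contiguous; the two therefore merge into a single [0x300000, 0x306000) VMA, which is what the test asserts. As a compile-time sanity check:]

static_assert((0x300000 >> 12) == 0x300 && (0x303000 >> 12) == 0x303,
	      "pgoffs line up with addresses, so the ranges can merge");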
*/ - addr = __mmap_region(NULL, 0x303000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x303, NULL); - ASSERT_EQ(addr, 0x303000); - - /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x24d000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x24d, NULL); - ASSERT_EQ(addr, 0x24d000); - - ASSERT_EQ(mm.map_count, 2); - - for_each_vma(vmi, vma) { - if (vma->vm_start == 0x300000) { - ASSERT_EQ(vma->vm_end, 0x306000); - ASSERT_EQ(vma->vm_pgoff, 0x300); - } else if (vma->vm_start == 0x24d000) { - ASSERT_EQ(vma->vm_end, 0x253000); - ASSERT_EQ(vma->vm_pgoff, 0x24d); - } else { - ASSERT_FALSE(true); - } - } - - cleanup_mm(&mm, &vmi); - return true; -} - -int main(void) +static void run_merge_tests(int *num_tests, int *num_fail) { - int num_tests = 0, num_fail = 0; - - maple_tree_init(); - vma_state_init(); - -#define TEST(name) \ - do { \ - num_tests++; \ - if (!test_##name()) { \ - num_fail++; \ - fprintf(stderr, "Test " #name " FAILED\n"); \ - } \ - } while (0) - /* Very simple tests to kick the tyres. */ TEST(simple_merge); TEST(simple_modify); @@ -1771,15 +1465,5 @@ int main(void) TEST(dup_anon_vma); TEST(vmi_prealloc_fail); TEST(merge_extend); - TEST(copy_vma); TEST(expand_only_mode); - - TEST(mmap_region_basic); - -#undef TEST - - printf("%d tests run, %d passed, %d failed.\n", - num_tests, num_tests - num_fail, num_fail); - - return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c new file mode 100644 index 000000000000..bded4ecbe5db --- /dev/null +++ b/tools/testing/vma/tests/mmap.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +static bool test_mmap_region_basic(void) +{ + struct mm_struct mm = {}; + unsigned long addr; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, &mm, 0); + + current->mm = &mm; + + /* Map at 0x300000, length 0x3000. */ + addr = __mmap_region(NULL, 0x300000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x300, NULL); + ASSERT_EQ(addr, 0x300000); + + /* Map at 0x250000, length 0x3000. */ + addr = __mmap_region(NULL, 0x250000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x250, NULL); + ASSERT_EQ(addr, 0x250000); + + /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x303000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x303, NULL); + ASSERT_EQ(addr, 0x303000); + + /* Map at 0x24d000, merging to 0x250000 of length 0x6000. 
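[Editor's note, the same arithmetic in the other direction: the new [0x24d000, 0x250000) range sits immediately below the existing [0x250000, 0x253000) VMA, and 0x24d plus 0x3 pages equals 0x250, so file offsets stay contiguous; the merged VMA becomes [0x24d000, 0x253000) and takes the lower pgoff 0x24d, matching the verification loop below.]

static_assert((0x24d + 0x3) == 0x250,
	      "downward-adjacent mapping remains file-offset contiguous");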
*/ + addr = __mmap_region(NULL, 0x24d000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x24d, NULL); + ASSERT_EQ(addr, 0x24d000); + + ASSERT_EQ(mm.map_count, 2); + + for_each_vma(vmi, vma) { + if (vma->vm_start == 0x300000) { + ASSERT_EQ(vma->vm_end, 0x306000); + ASSERT_EQ(vma->vm_pgoff, 0x300); + } else if (vma->vm_start == 0x24d000) { + ASSERT_EQ(vma->vm_end, 0x253000); + ASSERT_EQ(vma->vm_pgoff, 0x24d); + } else { + ASSERT_FALSE(true); + } + } + + cleanup_mm(&mm, &vmi); + return true; +} + +static void run_mmap_tests(int *num_tests, int *num_fail) +{ + TEST(mmap_region_basic); +} diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c new file mode 100644 index 000000000000..c54ffc954f11 --- /dev/null +++ b/tools/testing/vma/tests/vma.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) +{ + const unsigned long legacy_val = legacy_flags; + /* The lower word should contain exactly the same value. */ + const unsigned long flags_lower = flags.__vma_flags[0]; +#if NUM_VMA_FLAG_BITS > BITS_PER_LONG + int i; + + /* All bits in higher flag values should be zero. */ + for (i = 1; i < NUM_VMA_FLAG_BITS / BITS_PER_LONG; i++) { + if (flags.__vma_flags[i] != 0) + return false; + } +#endif + + static_assert(sizeof(legacy_flags) == sizeof(unsigned long)); + + return legacy_val == flags_lower; +} + +static bool test_copy_vma(void) +{ + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + bool need_locks = false; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_new, *vma_next; + + /* Move backwards and do not merge. */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); + ASSERT_NE(vma_new, vma); + ASSERT_EQ(vma_new->vm_start, 0); + ASSERT_EQ(vma_new->vm_end, 0x2000); + ASSERT_EQ(vma_new->vm_pgoff, 0); + vma_assert_attached(vma_new); + + cleanup_mm(&mm, &vmi); + + /* Move a VMA into position next to another and merge the two. */ + + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); + vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); + vma_assert_attached(vma_new); + + ASSERT_EQ(vma_new, vma_next); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_flags_unchanged(void) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + vm_flags_t legacy_flags = 0; + int bit; + struct vm_area_struct vma; + struct vm_area_desc desc; + + vma.flags = EMPTY_VMA_FLAGS; + desc.vma_flags = EMPTY_VMA_FLAGS; + + for (bit = 0; bit < BITS_PER_LONG; bit++) { + vma_flags_t mask = mk_vma_flags(bit); + + legacy_flags |= (1UL << bit); + + /* Individual flags. */ + vma_flags_set(&flags, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags)); + + /* Via mask. */ + vma_flags_set_mask(&flags, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags)); + + /* Same for VMA. */ + vma_set_flags(&vma, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags)); + vma_set_flags_mask(&vma, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags)); + + /* Same for VMA descriptor.
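[Editor's note: illustrative sketch, not part of the patch. compare_legacy_flags() can get away with checking only word zero (plus requiring the high words to be clear) because the VMA keeps the legacy vm_flags_t in a union with the vma_flags_t bitmap, so for legacy values the two must stay bit-for-bit identical. The invariant, spelled out:]

static bool example_legacy_alias(void)
{
	vma_flags_t flags = EMPTY_VMA_FLAGS;

	vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT);
	/* Word 0 of the bitmap mirrors the legacy bit pattern exactly. */
	ASSERT_EQ(flags.__vma_flags[0], VM_READ | VM_WRITE);
	return true;
}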
*/ + vma_desc_set_flags(&desc, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags)); + vma_desc_set_flags_mask(&desc, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags)); + } + + return true; +} + +static bool test_vma_flags_cleared(void) +{ + const vma_flags_t empty = EMPTY_VMA_FLAGS; + vma_flags_t flags; + int i; + + /* Set all bits high. */ + memset(&flags, 0xff, sizeof(flags)); + /* Try to clear. */ + vma_flags_clear_all(&flags); + /* Equal to EMPTY_VMA_FLAGS? */ + ASSERT_EQ(memcmp(&empty, &flags, sizeof(flags)), 0); + /* Make sure every unsigned long entry in the bitmap array is zero. */ + for (i = 0; i < sizeof(flags) / sizeof(unsigned long); i++) { + const unsigned long val = flags.__vma_flags[i]; + + ASSERT_EQ(val, 0); + } + + return true; +} + +/* + * Assert that VMA flag functions that operate at the system word level behave + * correctly. + */ +static bool test_vma_flags_word(void) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + const vma_flags_t comparison = + mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, 64, 65); + + /* Set some custom high flags. */ + vma_flags_set(&flags, 64, 65); + /* Now overwrite the first word. */ + vma_flags_overwrite_word(&flags, VM_READ | VM_WRITE); + /* Ensure they are equal. */ + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Do the same with the _once() equivalent. */ + vma_flags_overwrite_word_once(&flags, VM_READ | VM_WRITE); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Make sure we can set a word without disturbing other bits. */ + vma_flags_set(&flags, VMA_WRITE_BIT); + vma_flags_set_word(&flags, VM_READ); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Make sure we can clear a word without disturbing other bits. */ + vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + vma_flags_clear_word(&flags, VM_EXEC); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + return true; +} + +/* Ensure that vma_flags_test() and friends work correctly. */ +static bool test_vma_flags_test(void) +{ + const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + struct vm_area_struct vma; + struct vm_area_desc desc; + + vma.flags = flags; + desc.vma_flags = flags; + +#define do_test(...) \ + ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__)) + +#define do_test_all_true(...) \ + ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__)) + +#define do_test_all_false(...) \ + ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__)) + + /* + * Testing for some flags that are present, some that are not - should + * pass. ANY flags matching should work. + */ + do_test(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); + /* However, the ...test_all() variant should NOT pass. */ + do_test_all_false(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); + /* But should pass for flags present. */ + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); + /* Also subsets...
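[Editor's note: illustrative sketch, not part of the patch. The distinction these macros pin down: vma_flags_test() is an ANY-match (true if at least one listed bit is set), while vma_flags_test_all() requires every listed bit. Reduced to a minimal case:]

static bool example_any_vs_all(void)
{
	const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT);

	/* ANY-match: passes, VMA_READ_BIT is set. */
	ASSERT_TRUE(vma_flags_test(&flags, VMA_READ_BIT, VMA_WRITE_BIT));
	/* ALL-match: fails, VMA_WRITE_BIT is not. */
	ASSERT_FALSE(vma_flags_test_all(&flags, VMA_READ_BIT, VMA_WRITE_BIT));
	return true;
}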
*/ + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT); + do_test_all_true(VMA_READ_BIT); + /* + * Check _mask variant. We don't need to test extensively as macro + * helper is the equivalent. + */ + ASSERT_TRUE(vma_flags_test_mask(&flags, flags)); + ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags)); + + /* Single bits. */ + do_test(VMA_READ_BIT); + do_test(VMA_WRITE_BIT); + do_test(VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test(64); + do_test(65); +#endif + + /* Two bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT); + do_test(VMA_READ_BIT, VMA_EXEC_BIT); + do_test(VMA_WRITE_BIT, VMA_EXEC_BIT); + /* Ordering shouldn't matter. */ + do_test(VMA_WRITE_BIT, VMA_READ_BIT); + do_test(VMA_EXEC_BIT, VMA_READ_BIT); + do_test(VMA_EXEC_BIT, VMA_WRITE_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test(VMA_READ_BIT, 64); + do_test(VMA_WRITE_BIT, 64); + do_test(64, VMA_READ_BIT); + do_test(64, VMA_WRITE_BIT); + do_test(VMA_READ_BIT, 65); + do_test(VMA_WRITE_BIT, 65); + do_test(65, VMA_READ_BIT); + do_test(65, VMA_WRITE_BIT); +#endif + /* Three bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + /* No need to consider every single permutation. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, 64); + do_test(VMA_READ_BIT, VMA_WRITE_BIT, 65); + + /* Four bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 65); + + /* Five bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); +#endif + +#undef do_test +#undef do_test_all_true +#undef do_test_all_false + + return true; +} + +/* Ensure that vma_flags_clear() and friends work correctly. */ +static bool test_vma_flags_clear(void) +{ + vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT, 64); + struct vm_area_struct vma; + struct vm_area_desc desc; + + vma.flags = flags; + desc.vma_flags = flags; + + /* Cursory check of _mask() variant, as the helper macros imply. */ + vma_flags_clear_mask(&flags, mask); + vma_flags_clear_mask(&vma.flags, mask); + vma_desc_clear_flags_mask(&desc, mask); + ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64)); + /* Reset. */ + vma_flags_set(&flags, VMA_EXEC_BIT, 64); + vma_set_flags(&vma, VMA_EXEC_BIT, 64); + vma_desc_set_flags(&desc, VMA_EXEC_BIT, 64); + + /* + * Clear the flags and assert clear worked, then reset flags back to + * include specified flags. + */ +#define do_test_and_reset(...) \ + vma_flags_clear(&flags, __VA_ARGS__); \ + vma_flags_clear(&vma.flags, __VA_ARGS__); \ + vma_desc_clear_flags(&desc, __VA_ARGS__); \ + ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__)); \ + vma_flags_set(&flags, __VA_ARGS__); \ + vma_set_flags(&vma, __VA_ARGS__); \ + vma_desc_set_flags(&desc, __VA_ARGS__) + + /* Single flags. */ + do_test_and_reset(VMA_READ_BIT); + do_test_and_reset(VMA_WRITE_BIT); + do_test_and_reset(VMA_EXEC_BIT); + do_test_and_reset(64); + do_test_and_reset(65); + + /* Two flags, in different orders.
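[Editor's note: each do_test_and_reset() below is a clear/verify/restore cycle applied to all three representations at once (raw vma_flags_t, the VMA, the descriptor). For do_test_and_reset(VMA_READ_BIT, 64), the expansion is equivalent to this hypothetical helper:]

static bool example_clear_cycle(vma_flags_t *flags, struct vm_area_struct *vma,
				struct vm_area_desc *desc)
{
	vma_flags_clear(flags, VMA_READ_BIT, 64);
	vma_flags_clear(&vma->flags, VMA_READ_BIT, 64);
	vma_desc_clear_flags(desc, VMA_READ_BIT, 64);
	ASSERT_FALSE(vma_flags_test(flags, VMA_READ_BIT, 64));
	ASSERT_FALSE(vma_flags_test(&vma->flags, VMA_READ_BIT, 64));
	ASSERT_FALSE(vma_desc_test_flags(desc, VMA_READ_BIT, 64));
	/* Restore, so the next case starts from the full flag set. */
	vma_flags_set(flags, VMA_READ_BIT, 64);
	vma_set_flags(vma, VMA_READ_BIT, 64);
	vma_desc_set_flags(desc, VMA_READ_BIT, 64);
	return true;
}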
*/ + do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT); + do_test_and_reset(VMA_READ_BIT, VMA_EXEC_BIT); + do_test_and_reset(VMA_READ_BIT, 64); + do_test_and_reset(VMA_READ_BIT, 65); + do_test_and_reset(VMA_WRITE_BIT, VMA_READ_BIT); + do_test_and_reset(VMA_WRITE_BIT, VMA_EXEC_BIT); + do_test_and_reset(VMA_WRITE_BIT, 64); + do_test_and_reset(VMA_WRITE_BIT, 65); + do_test_and_reset(VMA_EXEC_BIT, VMA_READ_BIT); + do_test_and_reset(VMA_EXEC_BIT, VMA_WRITE_BIT); + do_test_and_reset(VMA_EXEC_BIT, 64); + do_test_and_reset(VMA_EXEC_BIT, 65); + do_test_and_reset(64, VMA_READ_BIT); + do_test_and_reset(64, VMA_WRITE_BIT); + do_test_and_reset(64, VMA_EXEC_BIT); + do_test_and_reset(64, 65); + do_test_and_reset(65, VMA_READ_BIT); + do_test_and_reset(65, VMA_WRITE_BIT); + do_test_and_reset(65, VMA_EXEC_BIT); + do_test_and_reset(65, 64); + + /* Three flags. */ + +#undef do_test_some_missing +#undef do_test_and_reset + + return true; +} + +static void run_vma_tests(int *num_tests, int *num_fail) +{ + TEST(copy_vma); + TEST(vma_flags_unchanged); + TEST(vma_flags_cleared); + TEST(vma_flags_word); + TEST(vma_flags_test); + TEST(vma_flags_clear); +} diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 7fa56dcc53a6..0e1121e2ef23 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -12,16 +12,18 @@ #ifndef __MM_VMA_INTERNAL_H #define __MM_VMA_INTERNAL_H -#define __private -#define __bitwise -#define __randomize_layout +#include <stdlib.h> #define CONFIG_MMU #define CONFIG_PER_VMA_LOCK -#include <stdlib.h> +#ifdef __CONCAT +#undef __CONCAT +#endif +#include <linux/args.h> #include <linux/atomic.h> +#include <linux/bitmap.h> #include <linux/list.h> #include <linux/maple_tree.h> #include <linux/mm.h> @@ -29,1839 +31,28 @@ #include <linux/refcount.h> #include <linux/slab.h> -extern unsigned long stack_guard_gap; -#ifdef CONFIG_MMU -extern unsigned long mmap_min_addr; -extern unsigned long dac_mmap_min_addr; -#else -#define mmap_min_addr 0UL -#define dac_mmap_min_addr 0UL -#endif - -#define VM_WARN_ON(_expr) (WARN_ON(_expr)) -#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) -#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) -#define VM_BUG_ON(_expr) (BUG_ON(_expr)) -#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) - -#define MMF_HAS_MDWE 28 - -/* - * vm_flags in vm_area_struct, see mm_types.h. - * When changing, update also include/trace/events/mmflags.h - */ - -#define VM_NONE 0x00000000 - -/** - * typedef vma_flag_t - specifies an individual VMA flag by bit number. - * - * This value is made type safe by sparse to avoid passing invalid flag values - * around. - */ -typedef int __bitwise vma_flag_t; - -#define DECLARE_VMA_BIT(name, bitnum) \ - VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) -#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ - VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT -enum { - DECLARE_VMA_BIT(READ, 0), - DECLARE_VMA_BIT(WRITE, 1), - DECLARE_VMA_BIT(EXEC, 2), - DECLARE_VMA_BIT(SHARED, 3), - /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ - DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
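[Editor's note on the deleted comment above: the "hardcodes" remark refers to mprotect() deriving each permission bit from its VM_MAY* counterpart by a right shift of 4, which only works because READ/WRITE/EXEC/SHARED occupy bits 0-3 while MAYREAD/MAYWRITE/MAYEXEC/MAYSHARE occupy bits 4-7. As a compile-time statement of that invariant:]

static_assert((VM_MAYREAD >> 4) == VM_READ &&
	      (VM_MAYWRITE >> 4) == VM_WRITE &&
	      (VM_MAYEXEC >> 4) == VM_EXEC,
	      "VM_MAY* bits sit exactly 4 above their VM_* counterparts");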
*/ - DECLARE_VMA_BIT(MAYWRITE, 5), - DECLARE_VMA_BIT(MAYEXEC, 6), - DECLARE_VMA_BIT(MAYSHARE, 7), - DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ -#ifdef CONFIG_MMU - DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ -#else - /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ - DECLARE_VMA_BIT(MAYOVERLAY, 9), -#endif /* CONFIG_MMU */ - /* Page-ranges managed without "struct page", just pure PFN */ - DECLARE_VMA_BIT(PFNMAP, 10), - DECLARE_VMA_BIT(MAYBE_GUARD, 11), - DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ - DECLARE_VMA_BIT(LOCKED, 13), - DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ - DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ - DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ - DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ - DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ - DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ - DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ - DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ - DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ - DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ - DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ - DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ - DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ - DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ - DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ - DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ - DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ - DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ - /* These bits are reused, we define specific uses below. */ - DECLARE_VMA_BIT(HIGH_ARCH_0, 32), - DECLARE_VMA_BIT(HIGH_ARCH_1, 33), - DECLARE_VMA_BIT(HIGH_ARCH_2, 34), - DECLARE_VMA_BIT(HIGH_ARCH_3, 35), - DECLARE_VMA_BIT(HIGH_ARCH_4, 36), - DECLARE_VMA_BIT(HIGH_ARCH_5, 37), - DECLARE_VMA_BIT(HIGH_ARCH_6, 38), - /* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ - DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 - DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else - DECLARE_VMA_BIT(DROPPABLE, 40), -#endif - DECLARE_VMA_BIT(UFFD_MINOR, 41), - DECLARE_VMA_BIT(SEALED, 42), - /* Flags that reuse flags above. */ - DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), -#if defined(CONFIG_X86_USER_SHADOW_STACK) - /* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace - * protect itself from attacks. A single page is enough for current - * shadow stack archs (x86). See the comments near alloc_shstk() in - * arch/x86/kernel/shstk.c for more details on the guard size. 
- */ - DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), -#elif defined(CONFIG_ARM64_GCS) - /* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. - */ - DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), -#endif - DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ - DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ - DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ - DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ - DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ - DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ - DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ - DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ -#ifdef CONFIG_STACK_GROWSUP - DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), - DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), -#else - DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), -#endif -}; - -#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) -#define VM_READ INIT_VM_FLAG(READ) -#define VM_WRITE INIT_VM_FLAG(WRITE) -#define VM_EXEC INIT_VM_FLAG(EXEC) -#define VM_SHARED INIT_VM_FLAG(SHARED) -#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) -#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) -#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) -#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) -#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) -#ifdef CONFIG_MMU -#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) -#else -#define VM_UFFD_MISSING VM_NONE -#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) -#endif -#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) -#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) -#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) -#define VM_LOCKED INIT_VM_FLAG(LOCKED) -#define VM_IO INIT_VM_FLAG(IO) -#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) -#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) -#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) -#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) -#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) -#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) -#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) -#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) -#define VM_SYNC INIT_VM_FLAG(SYNC) -#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) -#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) -#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) -#ifdef CONFIG_MEM_SOFT_DIRTY -#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) -#else -#define VM_SOFTDIRTY VM_NONE -#endif -#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) -#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) -#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) -#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) -#define VM_STACK INIT_VM_FLAG(STACK) -#ifdef CONFIG_STACK_GROWS_UP -#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) -#else -#define VM_STACK_EARLY VM_NONE -#endif -#ifdef CONFIG_ARCH_HAS_PKEYS -#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) -/* Despite the naming, these are FLAGS not bits. 
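[Editor's note on the deleted "FLAGS not bits" comment above: INIT_VM_FLAG(name) expands to BIT(VMA_<name>_BIT), turning a bit *number* into a single-bit *mask*, so the VM_PKEY_BIT0..4 names denote masks despite looking like bit indices. Two representative values from the deleted enum:]

static_assert(VM_READ == (1UL << 0) && VM_WRITE == (1UL << 1),
	      "INIT_VM_FLAG() yields masks, not bit numbers");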
*/ -#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) -#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) -#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) -#if CONFIG_ARCH_PKEY_BITS > 3 -#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) -#else -#define VM_PKEY_BIT3 VM_NONE -#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ -#if CONFIG_ARCH_PKEY_BITS > 4 -#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) -#else -#define VM_PKEY_BIT4 VM_NONE -#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ -#endif /* CONFIG_ARCH_HAS_PKEYS */ -#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) -#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) -#else -#define VM_SHADOW_STACK VM_NONE -#endif -#if defined(CONFIG_PPC64) -#define VM_SAO INIT_VM_FLAG(SAO) -#elif defined(CONFIG_PARISC) -#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) -#elif defined(CONFIG_SPARC64) -#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) -#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) -#elif defined(CONFIG_ARM64) -#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) -#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) -#elif !defined(CONFIG_MMU) -#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) -#endif -#ifndef VM_GROWSUP -#define VM_GROWSUP VM_NONE -#endif -#ifdef CONFIG_ARM64_MTE -#define VM_MTE INIT_VM_FLAG(MTE) -#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) -#else -#define VM_MTE VM_NONE -#define VM_MTE_ALLOWED VM_NONE -#endif -#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) -#else -#define VM_UFFD_MINOR VM_NONE -#endif -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) -#define VM_SEALED INIT_VM_FLAG(SEALED) -#else -#define VM_ALLOW_ANY_UNCACHED VM_NONE -#define VM_SEALED VM_NONE -#endif -#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) -#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) -#else -#define VM_DROPPABLE VM_NONE -#endif - -/* Bits set in the VMA until the stack is in its final location */ -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) - -/* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC -#endif - -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#endif - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - -/* VMA basic access permission flags */ -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - -/* - * Special vmas that are non-mergable, non-mlock()able. - */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) - -#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) -#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW -#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW -#define STACK_TOP TASK_SIZE_LOW -#define STACK_TOP_MAX TASK_SIZE_MAX - -/* This mask represents all the VMA flag bits used by mlock */ -#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) - -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) - -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#define RLIMIT_STACK 3 /* max stack size */ -#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ - -#define CAP_IPC_LOCK 14 - -/* - * Flags which should be 'sticky' on merge - that is, flags which, when one VMA - * possesses it but the other does not, the merged VMA should nonetheless have - * applied to it: - * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. - */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) - -/* - * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one - * of these flags and the other not does not preclude a merge. - * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. - */ -#define VM_IGNORE_MERGE VM_STICKY - -/* - * Flags which should result in page tables being copied on fork. These are - * flags which indicate that the VMA maps page tables which cannot be - * reconsistuted upon page fault, so necessitate page table copying upon - * - * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be - * reasonably reconstructed on page fault. - * - * VM_UFFD_WP - Encodes metadata about an installed uffd - * write protect handler, which cannot be - * reconstructed on page fault. - * - * We always copy pgtables when dst_vma has uffd-wp - * enabled even if it's file-backed - * (e.g. shmem). Because when uffd-wp is enabled, - * pgtable contains uffd-wp protection information, - * that's something we can't retrieve from page cache, - * and skip copying will lose those info. - * - * VM_MAYBE_GUARD - Could contain page guard region markers which - * by design are a property of the page tables - * only and thus cannot be reconstructed on page - * fault. - */ -#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) - -#define FIRST_USER_ADDRESS 0UL -#define USER_PGTABLES_CEILING 0UL - -#define vma_policy(vma) NULL - -#define down_write_nest_lock(sem, nest_lock) - -#define pgprot_val(x) ((x).pgprot) -#define __pgprot(x) ((pgprot_t) { (x) } ) - -#define for_each_vma(__vmi, __vma) \ - while (((__vma) = vma_next(&(__vmi))) != NULL) - -/* The MM code likes to work with exclusive end addresses */ -#define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) - -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) - -#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) - -#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) -#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) - -#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) - -#define AS_MM_ALL_LOCKS 2 - -/* We hardcode this for now. */ -#define sysctl_max_map_count 0x1000000UL - -#define pgoff_t unsigned long -typedef unsigned long pgprotval_t; -typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; -typedef unsigned long vm_flags_t; -typedef __bitwise unsigned int vm_fault_t; - -/* - * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) 
- * either way :) - */ -#define pr_warn_once pr_err - -#define data_race(expr) expr - -#define ASSERT_EXCLUSIVE_WRITER(x) - -#define pgtable_supports_soft_dirty() 1 - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -struct kref { - refcount_t refcount; -}; - -/* - * Define the task command name length as enum, then it can be visible to - * BPF programs. - */ -enum { - TASK_COMM_LEN = 16, -}; - /* - * Flags for bug emulation. - * - * These occupy the top three bytes. + * DUPLICATE typedef definitions from kernel source that have to be declared + * ahead of all other headers. */ -enum { - READ_IMPLIES_EXEC = 0x0400000, -}; - -struct task_struct { - char comm[TASK_COMM_LEN]; - pid_t pid; - struct mm_struct *mm; - - /* Used for emulating ABI behavior of previous Linux versions: */ - unsigned int personality; -}; - -struct task_struct *get_current(void); -#define current get_current() - -struct anon_vma { - struct anon_vma *root; - struct rb_root_cached rb_root; - - /* Test fields. */ - bool was_cloned; - bool was_unlinked; -}; - -struct anon_vma_chain { - struct anon_vma *anon_vma; - struct list_head same_vma; -}; - -struct anon_vma_name { - struct kref kref; - /* The name needs to be at the end because it is dynamically sized. */ - char name[]; -}; - -struct vma_iterator { - struct ma_state mas; -}; - -#define VMA_ITERATOR(name, __mm, __addr) \ - struct vma_iterator name = { \ - .mas = { \ - .tree = &(__mm)->mm_mt, \ - .index = __addr, \ - .node = NULL, \ - .status = ma_start, \ - }, \ - } - -struct address_space { - struct rb_root_cached i_mmap; - unsigned long flags; - atomic_t i_mmap_writable; -}; - -struct vm_userfaultfd_ctx {}; -struct mempolicy {}; -struct mmu_gather {}; -struct mutex {}; -#define DEFINE_MUTEX(mutexname) \ - struct mutex mutexname = {} - -#define DECLARE_BITMAP(name, bits) \ - unsigned long name[BITS_TO_LONGS(bits)] - -#define NUM_MM_FLAG_BITS (64) +#define __private +/* NUM_MM_FLAG_BITS defined by test code. */ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; - -/* - * Opaque type representing current VMA (vm_area_struct) flag state. Must be - * accessed via vma_flags_xxx() helper functions. - */ -#define NUM_VMA_FLAG_BITS BITS_PER_LONG +/* NUM_VMA_FLAG_BITS defined by test code. */ typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); } __private vma_flags_t; -struct mm_struct { - struct maple_tree mm_mt; - int map_count; /* number of VMAs */ - unsigned long total_vm; /* Total pages mapped */ - unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ - unsigned long stack_vm; /* VM_STACK */ - - unsigned long def_flags; - - mm_flags_t flags; /* Must use mm_flags_* helpers to access */ -}; - -struct vm_area_struct; - - -/* What action should be taken after an .mmap_prepare call is complete? */ -enum mmap_action_type { - MMAP_NOTHING, /* Mapping is complete, no further action. */ - MMAP_REMAP_PFN, /* Remap PFN range. */ - MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ -}; - -/* - * Describes an action an mmap_prepare hook can instruct to be taken to complete - * the mapping of a VMA. Specified in vm_area_desc. - */ -struct mmap_action { - union { - /* Remap range. 
*/ - struct { - unsigned long start; - unsigned long start_pfn; - unsigned long size; - pgprot_t pgprot; - } remap; - }; - enum mmap_action_type type; - - /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - - /* - * If specified, this hook is invoked when an error occurred when - * attempting the selection action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. - */ - int (*error_hook)(int err); - - /* - * This should be set in rare instances where the operation required - * that the rmap should not be able to access the VMA until - * completely set up. - */ - bool hide_from_rmap_until_complete :1; -}; - -/* Operations which modify VMAs. */ -enum vma_operation { - VMA_OP_SPLIT, - VMA_OP_MERGE_UNFAULTED, - VMA_OP_REMAP, - VMA_OP_FORK, -}; - -/* - * Describes a VMA that is about to be mmap()'ed. Drivers may choose to - * manipulate mutable fields which will cause those fields to be updated in the - * resultant VMA. - * - * Helper functions are not required for manipulating any field. - */ -struct vm_area_desc { - /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ - unsigned long start; - unsigned long end; - - /* Mutable fields. Populated with initial state. */ - pgoff_t pgoff; - struct file *vm_file; - union { - vm_flags_t vm_flags; - vma_flags_t vma_flags; - }; - pgprot_t page_prot; - - /* Write-only fields. */ - const struct vm_operations_struct *vm_ops; - void *private_data; - - /* Take further action? */ - struct mmap_action action; -}; - -struct file_operations { - int (*mmap)(struct file *, struct vm_area_struct *); - int (*mmap_prepare)(struct vm_area_desc *); -}; - -struct file { - struct address_space *f_mapping; - const struct file_operations *f_op; -}; - -#define VMA_LOCK_OFFSET 0x40000000 - -typedef struct { unsigned long v; } freeptr_t; - -struct vm_area_struct { - /* The first cache line has the info for VMA tree walking. */ - - union { - struct { - /* VMA covers [vm_start; vm_end) addresses within mm */ - unsigned long vm_start; - unsigned long vm_end; - }; - freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ - }; - - struct mm_struct *vm_mm; /* The address space we belong to. */ - pgprot_t vm_page_prot; /* Access permissions of this VMA. */ - - /* - * Flags, see mm.h. - * To modify use vm_flags_{init|reset|set|clear|mod} functions. - */ - union { - const vm_flags_t vm_flags; - vma_flags_t flags; - }; - -#ifdef CONFIG_PER_VMA_LOCK - /* - * Can only be written (using WRITE_ONCE()) while holding both: - * - mmap_lock (in write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set - * Can be read reliably while holding one of: - * - mmap_lock (in read or write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 - * Can be read unreliably (using READ_ONCE()) for pessimistic bailout - * while holding nothing (except RCU to keep the VMA struct allocated). - * - * This sequence counter is explicitly allowed to overflow; sequence - * counter reuse can only lead to occasional unnecessary use of the - * slowpath. 
- */ - unsigned int vm_lock_seq; -#endif - - /* - * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma - * list, after a COW of one of the file pages. A MAP_SHARED vma - * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack - * or brk vma (with NULL file) can only be in an anon_vma list. - */ - struct list_head anon_vma_chain; /* Serialized by mmap_lock & - * page_table_lock */ - struct anon_vma *anon_vma; /* Serialized by page_table_lock */ - - /* Function pointers to deal with this struct. */ - const struct vm_operations_struct *vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE - units */ - struct file * vm_file; /* File we map to (can be NULL). */ - void * vm_private_data; /* was vm_pte (shared mem) */ - -#ifdef CONFIG_SWAP - atomic_long_t swap_readahead_info; -#endif -#ifndef CONFIG_MMU - struct vm_region *vm_region; /* NOMMU mapping region */ -#endif -#ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ -#endif -#ifdef CONFIG_NUMA_BALANCING - struct vma_numab_state *numab_state; /* NUMA Balancing state */ -#endif -#ifdef CONFIG_PER_VMA_LOCK - /* Unstable RCU readers are allowed to read this. */ - refcount_t vm_refcnt; -#endif - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -} __randomize_layout; - -struct vm_fault {}; - -struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); - /** - * @close: Called when the VMA is being removed from the MM. - * Context: User context. May sleep. Caller holds mmap_lock. - */ - void (*close)(struct vm_area_struct * area); - /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); - /* - * Called by mprotect() to make driver-specific permission - * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if mprotect() can proceed. - */ - int (*mprotect)(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long newflags); - vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); - vm_fault_t (*map_pages)(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); - - /* notification that a previously read-only page is about to become - * writable, if an error is returned it will cause a SIGBUS */ - vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); - - /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); - - /* called by access_process_vm when get_user_pages() fails, typically - * for use by special VMAs. See also generic_access_phys() for a generic - * implementation useful for any iomem mapping. 
- */ - int (*access)(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write); - - /* Called by the /proc/PID/maps code to ask the vma whether it - * has a special name. Returning non-NULL will also cause this - * vma to be dumped unconditionally. */ - const char *(*name)(struct vm_area_struct *vma); - -#ifdef CONFIG_NUMA - /* - * set_policy() op must add a reference to any non-NULL @new mempolicy - * to hold the policy upon return. Caller should pass NULL @new to - * remove a policy and fall back to surrounding context--i.e. do not - * install a MPOL_DEFAULT policy, nor the task or system default - * mempolicy. - */ - int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); - - /* - * get_policy() op must add reference [mpol_get()] to any policy at - * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure - * in mm/mempolicy.c will do this automatically. - * get_policy() must NOT add a ref if the policy at (vma,addr) is not - * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. - * If no [shared/vma] mempolicy exists at the addr, get_policy() op - * must return NULL--i.e., do not "fallback" to task or system default - * policy. - */ - struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr, pgoff_t *ilx); -#endif -#ifdef CONFIG_FIND_NORMAL_PAGE - /* - * Called by vm_normal_page() for special PTEs in @vma at @addr. This - * allows for returning a "normal" page from vm_normal_page() even - * though the PTE indicates that the "struct page" either does not exist - * or should not be touched: "special". - * - * Do not add new users: this really only works when a "normal" page - * was mapped, but then the PTE got changed to something weird (+ - * marked special) that would not make pte_pfn() identify the originally - * inserted page. - */ - struct page *(*find_normal_page)(struct vm_area_struct *vma, - unsigned long addr); -#endif /* CONFIG_FIND_NORMAL_PAGE */ -}; - -struct vm_unmapped_area_info { -#define VM_UNMAPPED_AREA_TOPDOWN 1 - unsigned long flags; - unsigned long length; - unsigned long low_limit; - unsigned long high_limit; - unsigned long align_mask; - unsigned long align_offset; - unsigned long start_gap; -}; - -struct pagetable_move_control { - struct vm_area_struct *old; /* Source VMA. */ - struct vm_area_struct *new; /* Destination VMA. */ - unsigned long old_addr; /* Address from which the move begins. */ - unsigned long old_end; /* Exclusive address at which old range ends. */ - unsigned long new_addr; /* Address to move page tables to. */ - unsigned long len_in; /* Bytes to remap specified by user. */ - - bool need_rmap_locks; /* Do rmap locks need to be taken? */ - bool for_stack; /* Is this an early temp stack being moved? 
*/ -}; - -#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ - struct pagetable_move_control name = { \ - .old = old_, \ - .new = new_, \ - .old_addr = old_addr_, \ - .old_end = (old_addr_) + (len_), \ - .new_addr = new_addr_, \ - .len_in = len_, \ - } - -static inline void vma_iter_invalidate(struct vma_iterator *vmi) -{ - mas_pause(&vmi->mas); -} - -static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -{ - return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); -} - -static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) -{ - return __pgprot(vm_flags); -} - -static inline bool is_shared_maywrite(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - -static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) -{ - return is_shared_maywrite(vma->vm_flags); -} - -static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) -{ - /* - * Uses mas_find() to get the first VMA when the iterator starts. - * Calling mas_next() could skip the first entry. - */ - return mas_find(&vmi->mas, ULONG_MAX); -} - -/* - * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these - * assertions should be made either under mmap_write_lock or when the object - * has been isolated under mmap_write_lock, ensuring no competing writers. - */ -static inline void vma_assert_attached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_detached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_write_locked(struct vm_area_struct *); -static inline void vma_mark_attached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_detached(vma); - refcount_set_release(&vma->vm_refcnt, 1); -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_attached(vma); - /* We are the only writer, so no need to use vma_refcount_put(). */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* - * Reader must have temporarily raised vm_refcnt but it will - * drop it without using the vma since vma is write-locked. - */ - } -} - -extern const struct vm_operations_struct vma_dummy_vm_ops; - -extern unsigned long rlimit(unsigned int limit); - -static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) -{ - memset(vma, 0, sizeof(*vma)); - vma->vm_mm = mm; - vma->vm_ops = &vma_dummy_vm_ops; - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_lock_seq = UINT_MAX; -} - -/* - * These are defined in vma.h, but sadly vm_stat_account() is referenced by - * kernel/fork.c, so we have to these broadly available there, and temporarily - * define them here to resolve the dependency cycle. 
- */ - -#define is_exec_mapping(flags) \ - ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) - -#define is_stack_mapping(flags) \ - (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) - -#define is_data_mapping(flags) \ - ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) - -static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, - long npages) -{ - WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); - - if (is_exec_mapping(flags)) - mm->exec_vm += npages; - else if (is_stack_mapping(flags)) - mm->stack_vm += npages; - else if (is_data_mapping(flags)) - mm->data_vm += npages; -} - -#undef is_exec_mapping -#undef is_stack_mapping -#undef is_data_mapping - -/* Currently stubbed but we may later wish to un-stub. */ -static inline void vm_acct_memory(long pages); -static inline void vm_unacct_memory(long pages) -{ - vm_acct_memory(-pages); -} - -static inline void mapping_allow_writable(struct address_space *mapping) -{ - atomic_inc(&mapping->i_mmap_writable); -} - -static inline void vma_set_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - pgoff_t pgoff) -{ - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; -} - -static inline -struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) -{ - return mas_find(&vmi->mas, max - 1); -} - -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - __mas_set_range(&vmi->mas, start, end - 1); - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - -static inline void mmap_assert_locked(struct mm_struct *); -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - unsigned long index = start_addr; - - mmap_assert_locked(mm); - return mt_find(&mm->mm_mt, &index, end_addr - 1); -} - -static inline -struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) -{ - return mtree_load(&mm->mm_mt, addr); -} - -static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) -{ - return mas_prev(&vmi->mas, 0); -} - -static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) -{ - mas_set(&vmi->mas, addr); -} - -static inline bool vma_is_anonymous(struct vm_area_struct *vma) -{ - return !vma->vm_ops; -} - -/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ -#define vma_iter_load(vmi) \ - mas_walk(&(vmi)->mas) - -static inline struct vm_area_struct * -find_vma_prev(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, addr); - - vma = vma_iter_load(&vmi); - *pprev = vma_prev(&vmi); - if (!vma) - vma = vma_next(&vmi); - return vma; -} - -#undef vma_iter_load - -static inline void vma_iter_init(struct vma_iterator *vmi, - struct mm_struct *mm, unsigned long addr) -{ - mas_init(&vmi->mas, &mm->mm_mt, addr); -} - -/* Stubbed functions. 
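[Editor's note on the deleted accounting helpers above: vm_stat_account() walks the three classifications in an else-if chain, so at most one of exec_vm/stack_vm/data_vm is bumped per call, while total_vm always is. Worked through for a plain private read/write mapping:

	flags = VM_READ | VM_WRITE
	exec:  (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC          -> false
	stack: ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK) -> false
	data:  (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE       -> true

so such a mapping is accounted under data_vm.]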
-
-/* Currently stubbed but we may later wish to un-stub. */
-static inline void vm_acct_memory(long pages);
-static inline void vm_unacct_memory(long pages)
-{
-	vm_acct_memory(-pages);
-}
-
-static inline void mapping_allow_writable(struct address_space *mapping)
-{
-	atomic_inc(&mapping->i_mmap_writable);
-}
-
-static inline void vma_set_range(struct vm_area_struct *vma,
-				 unsigned long start, unsigned long end,
-				 pgoff_t pgoff)
-{
-	vma->vm_start = start;
-	vma->vm_end = end;
-	vma->vm_pgoff = pgoff;
-}
-
-static inline
-struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
-{
-	return mas_find(&vmi->mas, max - 1);
-}
-
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
-			unsigned long start, unsigned long end, gfp_t gfp)
-{
-	__mas_set_range(&vmi->mas, start, end - 1);
-	mas_store_gfp(&vmi->mas, NULL, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
-static inline void mmap_assert_locked(struct mm_struct *);
-static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
-						unsigned long start_addr,
-						unsigned long end_addr)
-{
-	unsigned long index = start_addr;
-
-	mmap_assert_locked(mm);
-	return mt_find(&mm->mm_mt, &index, end_addr - 1);
-}
-
-static inline
-struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
-{
-	return mtree_load(&mm->mm_mt, addr);
-}
-
-static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
-{
-	return mas_prev(&vmi->mas, 0);
-}
-
-static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
-{
-	mas_set(&vmi->mas, addr);
-}
-
-static inline bool vma_is_anonymous(struct vm_area_struct *vma)
-{
-	return !vma->vm_ops;
-}
-
-/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
-#define vma_iter_load(vmi) \
-	mas_walk(&(vmi)->mas)
-
-static inline struct vm_area_struct *
-find_vma_prev(struct mm_struct *mm, unsigned long addr,
-	      struct vm_area_struct **pprev)
-{
-	struct vm_area_struct *vma;
-	VMA_ITERATOR(vmi, mm, addr);
-
-	vma = vma_iter_load(&vmi);
-	*pprev = vma_prev(&vmi);
-	if (!vma)
-		vma = vma_next(&vmi);
-	return vma;
-}
-
-#undef vma_iter_load
-
-static inline void vma_iter_init(struct vma_iterator *vmi,
-				 struct mm_struct *mm, unsigned long addr)
-{
-	mas_init(&vmi->mas, &mm->mm_mt, addr);
-}
-
-/* Stubbed functions.
- */
-
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
-static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
-						   struct vm_userfaultfd_ctx vm_ctx)
-{
-	return true;
-}
-
-static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
-				    struct anon_vma_name *anon_name2)
-{
-	return true;
-}
-
-static inline void might_sleep(void)
-{
-}
-
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
-{
-	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-}
-
-static inline void fput(struct file *file)
-{
-}
-
-static inline void mpol_put(struct mempolicy *pol)
-{
-}
-
-static inline void lru_add_drain(void)
-{
-}
-
-static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_rss(struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_vm(struct mm_struct *mm)
-{
-}
-
-static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end)
-{
-}
-
-static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long floor,
-		unsigned long ceiling, bool mm_wr_locked)
-{
-}
-
-static inline void mapping_unmap_writable(struct address_space *mapping)
-{
-}
-
-static inline void flush_dcache_mmap_lock(struct address_space *mapping)
-{
-}
-
-static inline void tlb_finish_mmu(struct mmu_gather *tlb)
-{
-}
-
-static inline struct file *get_file(struct file *f)
-{
-	return f;
-}
-
-static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
-{
-	return 0;
-}
-
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
-				 enum vma_operation operation)
-{
-	/* For testing purposes. We indicate that an anon_vma has been cloned. */
-	if (src->anon_vma != NULL) {
-		dst->anon_vma = src->anon_vma;
-		dst->anon_vma->was_cloned = true;
-	}
-
-	return 0;
-}
-
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
-	/* Used to indicate to tests that a write operation has begun. */
-	vma->vm_lock_seq++;
-}
-
-static inline __must_check
-int vma_start_write_killable(struct vm_area_struct *vma)
-{
-	/* Used to indicate to tests that a write operation has begun. */
-	vma->vm_lock_seq++;
-	return 0;
-}
-
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
-					 unsigned long start,
-					 unsigned long end,
-					 struct vm_area_struct *next)
-{
-}
-
-static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
-
-static inline void vma_iter_free(struct vma_iterator *vmi)
-{
-	mas_destroy(&vmi->mas);
-}
-
-static inline
-struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
-{
-	return mas_next_range(&vmi->mas, ULONG_MAX);
-}
-
-static inline void vm_acct_memory(long pages)
-{
-}
-
-static inline void vma_interval_tree_insert(struct vm_area_struct *vma,
-					    struct rb_root_cached *rb)
-{
-}
-
-static inline void vma_interval_tree_remove(struct vm_area_struct *vma,
-					    struct rb_root_cached *rb)
-{
-}
-
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc,
-						 struct rb_root_cached *rb)
-{
-}
-
-static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc,
-						 struct rb_root_cached *rb)
-{
-}
-
-static inline void uprobe_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline void uprobe_munmap(struct vm_area_struct *vma,
-				 unsigned long start, unsigned long end)
-{
-}
-
-static inline void i_mmap_lock_write(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
-}
-
-static inline void unlink_anon_vmas(struct vm_area_struct *vma)
-{
-	/* For testing purposes, indicate that the anon_vma was unlinked. */
-	vma->anon_vma->was_unlinked = true;
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void i_mmap_unlock_write(struct address_space *mapping)
-{
-}
-
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
-					 unsigned long start,
-					 unsigned long end,
-					 struct list_head *unmaps)
-{
-	return 0;
-}
-
-static inline void mmap_write_downgrade(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_read_unlock(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_write_unlock(struct mm_struct *mm)
-{
-}
-
-static inline int mmap_write_lock_killable(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline bool can_modify_mm(struct mm_struct *mm,
-				 unsigned long start,
-				 unsigned long end)
-{
-	return true;
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
-			      unsigned long start,
-			      unsigned long end)
-{
-}
-
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-}
-
-static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
-{
-	return true;
-}
-
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
-{
-}
-
-static inline bool mapping_can_writeback(struct address_space *mapping)
-{
-	return true;
-}
-
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline bool userfaultfd_wp(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-}
-
-static inline void mutex_lock(struct mutex *lock)
-{
-}
-
-static inline void mutex_unlock(struct mutex *lock)
-{
-}
-
-static inline bool mutex_is_locked(struct mutex *lock)
-{
-	return true;
-}
-
-static inline bool signal_pending(void *p)
-{
-	return false;
-}
-
-static inline bool is_file_hugepages(struct file *file)
-{
-	return false;
-}
-
-static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
-{
-	return 0;
-}
-
-static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
-				 unsigned long npages)
-{
-	return true;
-}
-
-static inline int shmem_zero_setup(struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline void vma_set_anonymous(struct vm_area_struct *vma)
-{
-	vma->vm_ops = NULL;
-}
-
-static inline void ksm_add_vma(struct vm_area_struct *vma)
-{
-}
-
-static inline void perf_event_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline bool vma_is_dax(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-	return NULL;
-}
-
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-
-/* Update vma->vm_page_prot to reflect vma->vm_flags. */
-static inline void vma_set_page_prot(struct vm_area_struct *vma)
-{
-	vm_flags_t vm_flags = vma->vm_flags;
-	pgprot_t vm_page_prot;
-
-	/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
-	vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
-
-	if (vma_wants_writenotify(vma, vm_page_prot)) {
-		vm_flags &= ~VM_SHARED;
-		/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
-		vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
-	}
-	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
-	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
-}
-
-static inline bool arch_validate_flags(vm_flags_t flags)
-{
-	return true;
-}
-
-static inline void vma_close(struct vm_area_struct *vma)
-{
-}
-
-static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
-{
-	if (vma->vm_flags & VM_GROWSDOWN)
-		return stack_guard_gap;
-
-	/* See reasoning around the VM_SHADOW_STACK definition */
-	if (vma->vm_flags & VM_SHADOW_STACK)
-		return PAGE_SIZE;
-
-	return 0;
-}
-
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
-{
-	unsigned long gap = stack_guard_start_gap(vma);
-	unsigned long vm_start = vma->vm_start;
-
-	vm_start -= gap;
-	if (vm_start > vma->vm_start)
-		vm_start = 0;
-	return vm_start;
-}
-
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
-{
-	unsigned long vm_end = vma->vm_end;
-
-	if (vma->vm_flags & VM_GROWSUP) {
-		vm_end += stack_guard_gap;
-		if (vm_end < vma->vm_end)
-			vm_end = -PAGE_SIZE;
-	}
-	return vm_end;
-}
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
-					 unsigned long addr, unsigned long len)
-{
-	return 0;
-}
-
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
-	return vma->vm_flags & VM_ACCESS_FLAGS;
-}
-
-static inline bool capable(int cap)
-{
-	return true;
-}
-
-static inline bool mlock_future_ok(const struct mm_struct *mm,
-				   vm_flags_t vm_flags, unsigned long bytes)
-{
-	unsigned long locked_pages, limit_pages;
-
-	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
-		return true;
-
-	locked_pages = bytes >> PAGE_SHIFT;
-	locked_pages += mm->locked_vm;
-
-	limit_pages = rlimit(RLIMIT_MEMLOCK);
-	limit_pages >>= PAGE_SHIFT;
-
-	return locked_pages <= limit_pages;
-}
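For concreteness, the arithmetic in mlock_future_ok() with sample numbers (hypothetical values, assuming 4 KiB pages, PAGE_SHIFT = 12):

	/* Request: mlock 1 MiB with RLIMIT_MEMLOCK = 64 MiB. */
	locked_pages  = (1UL << 20) >> 12;	/*   256 pages         */
	locked_pages += mm->locked_vm;		/* + already locked    */
	limit_pages   = (64UL << 20) >> 12;	/* 16384 pages         */
	/* Passes as long as mm->locked_vm was at most 16128 pages.  */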
-
-static inline int __anon_vma_prepare(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
-
-	if (!anon_vma)
-		return -ENOMEM;
-
-	anon_vma->root = anon_vma;
-	vma->anon_vma = anon_vma;
-
-	return 0;
-}
-
-static inline
-int anon_vma_prepare(struct vm_area_struct *vma)
-{
-	if (likely(vma->anon_vma))
-		return 0;
-
-	return __anon_vma_prepare(vma);
-}
-
-static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
-					      struct list_head *uf)
-{
-}
-
-#define ACCESS_PRIVATE(p, member) ((p)->member)
-
-#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
-
-static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
-{
-	unsigned int len = bitmap_size(nbits);
-
-	if (small_const_nbits(nbits))
-		*dst = 0;
-	else
-		memset(dst, 0, len);
-}
-
-static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
-{
-	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
-}
-
-/* Clears all bits in the VMA flags bitmap, non-atomically. */
-static inline void vma_flags_clear_all(vma_flags_t *flags)
-{
-	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
-}
-
-/*
- * Copy value to the first system word of VMA flags, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
-{
-	*ACCESS_PRIVATE(flags, __vma_flags) = value;
-}
-
-/*
- * Copy value to the first system word of VMA flags ONCE, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	WRITE_ONCE(*bitmap, value);
-}
-
-/* Update the first system word of VMA flags setting bits, non-atomically. */
-static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	*bitmap |= value;
-}
-
-/* Update the first system word of VMA flags clearing bits, non-atomically. */
-static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	*bitmap &= ~value;
-}
-
-
-/* Use when VMA is not part of the VMA tree and needs no locking */
-static inline void vm_flags_init(struct vm_area_struct *vma,
-				 vm_flags_t flags)
-{
-	vma_flags_clear_all(&vma->flags);
-	vma_flags_overwrite_word(&vma->flags, flags);
-}
-
-/*
- * Use when VMA is part of the VMA tree and modifications need coordination.
- * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
- * it should be locked explicitly beforehand.
- */
-static inline void vm_flags_reset(struct vm_area_struct *vma,
-				  vm_flags_t flags)
-{
-	vma_assert_write_locked(vma);
-	vm_flags_init(vma, flags);
-}
-
-static inline void vm_flags_reset_once(struct vm_area_struct *vma,
-					vm_flags_t flags)
-{
-	vma_assert_write_locked(vma);
-	/*
-	 * The user should only be interested in avoiding reordering of
-	 * assignment to the first word.
-	 */
-	vma_flags_clear_all(&vma->flags);
-	vma_flags_overwrite_word_once(&vma->flags, flags);
-}
-
-static inline void vm_flags_set(struct vm_area_struct *vma,
-				vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma_flags_set_word(&vma->flags, flags);
-}
-
-static inline void vm_flags_clear(struct vm_area_struct *vma,
-				  vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma_flags_clear_word(&vma->flags, flags);
-}
-
-/*
- * Denies creating a writable executable mapping or gaining executable permissions.
- *
- * This denies the following:
- *
- *     a)      mmap(PROT_WRITE | PROT_EXEC)
- *
- *     b)      mmap(PROT_WRITE)
- *             mprotect(PROT_EXEC)
- *
- *     c)      mmap(PROT_WRITE)
- *             mprotect(PROT_READ)
- *             mprotect(PROT_EXEC)
- *
- * But allows the following:
- *
- *     d)      mmap(PROT_READ | PROT_EXEC)
- *             mmap(PROT_READ | PROT_EXEC | PROT_BTI)
- *
- * This is only applicable if the user has set the Memory-Deny-Write-Execute
- * (MDWE) protection mask for the current process.
- *
- * @old specifies the VMA flags the VMA originally possessed, and @new the ones
- * we propose to set.
- *
- * Return: false if proposed change is OK, true if not ok and should be denied.
- */
-static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
-{
-	/* If MDWE is disabled, we have nothing to deny. */
-	if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
-		return false;
-
-	/* If the new VMA is not executable, we have nothing to deny. */
-	if (!(new & VM_EXEC))
-		return false;
-
-	/* Under MDWE we do not accept newly writably executable VMAs... */
-	if (new & VM_WRITE)
-		return true;
-
-	/* ...nor previously non-executable VMAs becoming executable. */
-	if (!(old & VM_EXEC))
-		return true;
-
-	return false;
-}
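The a)-d) cases above map directly onto observable userspace behaviour. A small standalone demonstration, assuming a kernel with MDWE support (PR_SET_MDWE, Linux 6.3+); the fallback defines only cover older libc headers, and error handling is elided:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_MDWE
#define PR_SET_MDWE			65
#define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
#endif

int main(void)
{
	prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L);

	/* Case a): W|X mapping is denied with EACCES once MDWE is on. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	printf("W|X mmap:             %s\n", p == MAP_FAILED ? "denied" : "allowed");

	/* Case b): a writable mapping may not later gain PROT_EXEC. */
	void *q = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int rc = mprotect(q, 4096, PROT_READ | PROT_EXEC);
	printf("mprotect(PROT_EXEC):  %s\n", rc ? "denied" : "allowed");

	return 0;
}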
-
-static inline int mapping_map_writable(struct address_space *mapping)
-{
-	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
-		0 : -EPERM;
-}
-
-static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
-{
-	return 0;
-}
-
-static inline void free_pgd_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
-{
-}
-
-static inline int ksm_execve(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void ksm_exit(struct mm_struct *mm)
-{
-}
-
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
-{
-	if (reset_refcnt)
-		refcount_set(&vma->vm_refcnt, 0);
-}
-
-static inline void vma_numab_state_init(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_numab_state_free(struct vm_area_struct *vma)
-{
-}
-
-static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
-				     struct vm_area_struct *new_vma)
-{
-}
-
-static inline void free_anon_vma_name(struct vm_area_struct *vma)
-{
-}
-
-/* Declared in vma.h. */
-static inline void set_vma_from_desc(struct vm_area_struct *vma,
-				     struct vm_area_desc *desc);
-
-static inline void mmap_action_prepare(struct mmap_action *action,
-				       struct vm_area_desc *desc)
-{
-}
-
-static inline int mmap_action_complete(struct mmap_action *action,
-				       struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline int __compat_vma_mmap(const struct file_operations *f_op,
-				    struct file *file, struct vm_area_struct *vma)
-{
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
-
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vm_flags = vma->vm_flags,
-		.page_prot = vma->vm_page_prot,
-
-		.action.type = MMAP_NOTHING, /* Default */
-	};
-	int err;
-
-	err = f_op->mmap_prepare(&desc);
-	if (err)
-		return err;
-
-	mmap_action_prepare(&desc.action, &desc);
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(&desc.action, vma);
-}
-
-static inline int compat_vma_mmap(struct file *file,
-				  struct vm_area_struct *vma)
-{
-	return __compat_vma_mmap(file->f_op, file, vma);
-}
-
-/* Did the driver provide valid mmap hook configuration? */
-static inline bool can_mmap_file(struct file *file)
-{
-	bool has_mmap = file->f_op->mmap;
-	bool has_mmap_prepare = file->f_op->mmap_prepare;
-
-	/* Hooks are mutually exclusive. */
-	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
-		return false;
-	if (!has_mmap && !has_mmap_prepare)
-		return false;
-
-	return true;
-}
-
-static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	if (file->f_op->mmap_prepare)
-		return compat_vma_mmap(file, vma);
-
-	return file->f_op->mmap(file, vma);
-}
-
-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
-	return file->f_op->mmap_prepare(desc);
-}
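Taken together, __compat_vma_mmap() and can_mmap_file() define the contract for the newer hook: a driver implements either .mmap or .mmap_prepare, never both, and .mmap_prepare operates on a vm_area_desc rather than a live VMA. A minimal sketch of a driver using the new hook, assuming only the desc fields visible in this patch (the mydrv_* names and MYDRV_MAX_MAP_SIZE are hypothetical):

static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	/* Validate the request before any VMA exists. */
	if (desc->end - desc->start > MYDRV_MAX_MAP_SIZE)
		return -EINVAL;

	/* Adjust the proposed flags in the descriptor. */
	desc->vm_flags |= VM_DONTEXPAND;
	return 0;
}

static const struct file_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.mmap_prepare	= mydrv_mmap_prepare,	/* mutually exclusive with .mmap */
};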
-
-static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
-{
-	/* Changing an anonymous vma with this is illegal */
-	get_file(file);
-	swap(vma->vm_file, file);
-	fput(file);
-}
-
-static inline bool shmem_file(struct file *file)
-{
-	return false;
-}
-
-static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
-				       const struct file *file, vm_flags_t vm_flags)
-{
-	return vm_flags;
-}
-
-static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
-{
-}
-
-static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t pgprot)
-{
-	return 0;
-}
+typedef unsigned long vm_flags_t;
+#define pgoff_t unsigned long
+typedef unsigned long pgprotval_t;
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+typedef __bitwise unsigned int vm_fault_t;
 
-static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
-			    struct list_head *uf)
-{
-	return 0;
-}
+#include "include/stubs.h"
+#include "include/dup.h"
+#include "include/custom.h"
 
 #endif	/* __MM_VMA_INTERNAL_H */
diff --git a/tools/usb/usbip/README b/tools/usb/usbip/README
index 2fc021c0eae1..11971538f03e 100644
--- a/tools/usb/usbip/README
+++ b/tools/usb/usbip/README
@@ -241,8 +241,6 @@ Detach the imported device:
 
 [Checklist]
-	- See 'Debug Tips' on the project wiki.
-	- http://usbip.wiki.sourceforge.net/how-to-debug-usbip
 	- usbip-host.ko must be bound to the target device.
 	  See /sys/kernel/debug/usb/devices and find "Driver=..." lines of the device.
 	- Target USB gadget must be bound to vudc
