From ca3cf049d2430ac83eff5c32af7b86bb2d1285bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 13 Jul 2017 17:15:24 -0300 Subject: tools: Update include/uapi/linux/fcntl.h copy from the kernel To get the changes in the commit c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints"). Silencing this perf build warning: Warning: include/uapi/linux/fcntl.h differs from kernel We already beautify the fcntl cmd argument, so an upcoming cset will update the 'cmd' strarray to cover these new commands. The hints are in the 3rd arg, a pointer, so not yet supported in 'perf trace', for that we need to copy it somehow, probably using eBPF, a new attempt at doing that is planned. Cc: Adrian Hunter Cc: David Ahern Cc: Jens Axboe Cc: Jiri Olsa Cc: Martin K. Petersen Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-al471wzs3x48alql0tm3mnfa@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fcntl.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h index 813afd6eee71..ec69d55bcec7 100644 --- a/tools/include/uapi/linux/fcntl.h +++ b/tools/include/uapi/linux/fcntl.h @@ -42,6 +42,27 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ /* (1U << 31) is reserved for signed error codes */ +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From eb0baf8a0d9259d168523b8e7c436b55ade7c546 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 18 Jul 2017 20:13:09 +0800 Subject: perf/core: Define the common branch type classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is often useful to know the branch types while analyzing branch data. For example, a call is very different from a conditional branch. Currently we have to look it up in binary while the binary may later not be available and even the binary is available but user has to take some time. It is very useful for user to check it directly in perf report. Perf already has support for disassembling the branch instruction to get the x86 branch type. To keep consistent on kernel and userspace and make the classification more common, the patch adds the common branch type classification in perf_event.h. The patch only defines a minimum but most common set of branch types. PERF_BR_UNKNOWN : unknown PERF_BR_COND :conditional PERF_BR_UNCOND : unconditional PERF_BR_IND : indirect PERF_BR_CALL : function call PERF_BR_IND_CALL : indirect function call PERF_BR_RET : function return PERF_BR_SYSCALL : syscall PERF_BR_SYSRET : syscall return PERF_BR_COND_CALL : conditional function call PERF_BR_COND_RET : conditional function return The patch also adds a new field type (4 bits) in perf_branch_entry to record the branch type. Since the disassembling of branch instruction needs some overhead, a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate if it needs to disassemble the branch instruction and record the branch type. Change log: v10: Not changed. v9: Not changed. v8: Change PERF_BR_NONE to PERF_BR_UNKNOWN. No other change. v7: Just keep the most common branch types. Others are removed. v6: Not changed. v5: Not changed. The v5 patch series just change the userspace. v4: Comparing to previous version, the major changes are: 1. Remove the PERF_BR_JCC_FWD/PERF_BR_JCC_BWD, they will be computed later in userspace. 2. Remove the "cross" field in perf_branch_entry. The cross page computing will be done later in userspace. Signed-off-by: Yao Jin Acked-by: Jiri Olsa Acked-by: Michael Ellerman Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Andi Kleen Cc: Kan Liang Link: http://lkml.kernel.org/r/1500379995-6449-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++- tools/include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b1c0b187acfe..642db5fa3286 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -198,9 +200,30 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1015,6 +1038,7 @@ union perf_mem_data_src { * in_tx: running in a hardware transaction * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) + * type: branch type */ struct perf_branch_entry { __u64 from; @@ -1024,7 +1048,8 @@ struct perf_branch_entry { in_tx:1, /* in transaction */ abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ - reserved:44; + type:4, /* branch type */ + reserved:40; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index b1c0b187acfe..642db5fa3286 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -198,9 +200,30 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1015,6 +1038,7 @@ union perf_mem_data_src { * in_tx: running in a hardware transaction * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) + * type: branch type */ struct perf_branch_entry { __u64 from; @@ -1024,7 +1048,8 @@ struct perf_branch_entry { in_tx:1, /* in transaction */ abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ - reserved:44; + type:4, /* branch type */ + reserved:40; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 450c86c9a3301db5165172bcaced73d036f9c582 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 20 Jul 2017 10:46:34 -0300 Subject: tools include uapi: Grab a copy of linux/sched.h So that we make sure we have recent enough defines for things such as 'perf trace' system call argument beautifiers. For instance, the 'clone' syscall argument 'flag' needs to use CLONE_NEWCGROUP, and that is not available in RHEL7. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-81sln0ng4a2lcxrth14vcov4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/sched.h | 52 ++++++++++++++++++++++++++++++++++++++++ tools/perf/MANIFEST | 1 + tools/perf/check-headers.sh | 1 + 3 files changed, 54 insertions(+) create mode 100644 tools/include/uapi/linux/sched.h (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h new file mode 100644 index 000000000000..e2a6c7b3510b --- /dev/null +++ b/tools/include/uapi/linux/sched.h @@ -0,0 +1,52 @@ +#ifndef _UAPI_LINUX_SCHED_H +#define _UAPI_LINUX_SCHED_H + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#define CLONE_NEWNET 0x40000000 /* New network namespace */ +#define CLONE_IO 0x80000000 /* Clone io context */ + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 +#define SCHED_BATCH 3 +/* SCHED_ISO: reserved but not implemented yet */ +#define SCHED_IDLE 5 +#define SCHED_DEADLINE 6 + +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 + +/* + * For the sched_{set,get}attr() calls + */ +#define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 + +#endif /* _UAPI_LINUX_SCHED_H */ diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index a29da46d180f..9f2267b82eb2 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -78,6 +78,7 @@ tools/include/uapi/linux/fcntl.h tools/include/uapi/linux/hw_breakpoint.h tools/include/uapi/linux/mman.h tools/include/uapi/linux/perf_event.h +tools/include/uapi/linux/sched.h tools/include/uapi/linux/stat.h tools/include/linux/poison.h tools/include/linux/rbtree.h diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 47abd3325190..9b6295809a3b 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -3,6 +3,7 @@ HEADERS=' include/uapi/linux/fcntl.h include/uapi/linux/perf_event.h +include/uapi/linux/sched.h include/uapi/linux/stat.h include/linux/hash.h include/uapi/linux/hw_breakpoint.h -- cgit v1.2.3 From 81f6bf81270ce1052b5cd4d60b9edc40cd5ceefa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 26 Jul 2017 17:32:07 -0700 Subject: bpf: testing: fix devmap tests Apparently through one of my revisions of the initial patches series I lost the devmap test. We can add more testing later but for now lets fix the simple one we have. Fixes: 546ac1ffb70d "bpf: add devmap, a map for storing net device references" Reported-by: Jakub Kicinski Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/test_maps.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ce2988be4f0e..1579cab49717 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -104,6 +104,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, }; enum bpf_prog_type { diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 36d6ac3f0c1c..c991ab69a720 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -440,7 +440,7 @@ static void test_arraymap_percpu_many_keys(void) static void test_devmap(int task, void *data) { - int next_key, fd; + int fd; __u32 key, value; fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), @@ -620,6 +620,8 @@ static void run_all_tests(void) test_arraymap_percpu_many_keys(); + test_devmap(0, NULL); + test_map_large(); test_map_parallel(); test_map_stress(); -- cgit v1.2.3 From d62c1d7213b77eb4952a873ad57727e991ac3306 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jul 2017 10:45:07 -0300 Subject: tools headers: Fixup tools/include/uapi/linux/bpf.h copy of kernel ABI header In 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") BPF_ADJ_ROOM_NET was added to include/uapi/linux/bpf.h but BPF_ADJ_ROOM_NET_OPS was added to tools/include/uapi/linux/bpf.h, making these files differ, fix it by using the same name in both, BPF_ADJ_ROOM_NET, the one in the kernel headers copy. Cc: Adrian Hunter Cc: Daniel Borkmann Cc: David Ahern Cc: David S. Miller Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Link: http://lkml.kernel.org/n/tip-2bmwovi9lymplyz6wsszppyf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ce2988be4f0e..8ff155ffcd84 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -643,7 +643,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { - BPF_ADJ_ROOM_NET_OPTS, + BPF_ADJ_ROOM_NET, }; /* user accessible mirror of in-kernel sk_buff. -- cgit v1.2.3 From f1d6cb2d8cdfb210f9b4a0f59e9168ac0f5db774 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jul 2017 10:45:07 -0300 Subject: tools headers: Fixup tools/include/uapi/linux/bpf.h copy of kernel ABI header In 04df41e343db ("bpf: update tools/include/uapi/linux/bpf.h") the files added in 40304b2a1567 ("bpf: BPF support for sock_ops") were added to tools/include, but not in a verbatim way, missing the comments, which ends up triggering this warning when build tools/perf/: make: Entering directory '/home/acme/git/linux/tools/perf' BUILD: Doing 'make -j4' parallel build Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h' Make sure the the lines are equal, to fix the simple header copy drift detector in tools/perf/. Cc: Adrian Hunter Cc: Daniel Borkmann Cc: David Ahern Cc: David S. Miller Cc: Jiri Olsa Cc: Lawrence Brakmo Cc: Namhyung Kim Cc: Wang Nan Fixes: 04df41e343db ("bpf: update tools/include/uapi/linux/bpf.h") Link: http://lkml.kernel.org/n/tip-z9qyyqht9qq3yyxu76sfy0dh@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/bpf.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8ff155ffcd84..e99e3e6f8b37 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -750,6 +750,8 @@ struct bpf_map_info { /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). * New fields can only be added at the end of this structure */ struct bpf_sock_ops { @@ -759,12 +761,12 @@ struct bpf_sock_ops { __u32 replylong[4]; }; __u32 family; - __u32 remote_ip4; - __u32 local_ip4; - __u32 remote_ip6[4]; - __u32 local_ip6[4]; - __u32 remote_port; - __u32 local_port; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 3ce97513f960b62fc9b2eee892162e225cd0f3a8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jul 2017 16:45:32 -0300 Subject: tools include uapi: Grab a copy of linux/kvm.h We will use it to generate tables for beautifying ioctl's 'cmd' arg. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-nxwpq34hu6te1m2ra5m7o8n9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 1419 ++++++++++++++++++++++++++++++++++++++++ tools/perf/MANIFEST | 1 + tools/perf/check-headers.sh | 1 + 3 files changed, 1421 insertions(+) create mode 100644 tools/include/uapi/linux/kvm.h (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h new file mode 100644 index 000000000000..6cd63c18708a --- /dev/null +++ b/tools/include/uapi/linux/kvm.h @@ -0,0 +1,1419 @@ +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: you must update KVM_API_VERSION if you change this interface. + */ + +#include +#include +#include +#include + +#define KVM_API_VERSION 12 + +/* *** Deprecated interfaces *** */ + +#define KVM_TRC_SHIFT 16 + +#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT) +#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) + +#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01) +#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02) +#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01) + +#define KVM_TRC_HEAD_SIZE 12 +#define KVM_TRC_CYCLE_SIZE 8 +#define KVM_TRC_EXTRA_MAX 7 + +#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) +#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) +#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) +#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) +#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) +#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) +#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) +#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) +#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) +#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) +#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) +#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) +#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) +#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) +#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) +#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) +#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) +#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) +#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) +#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) +#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) +#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) +#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) +#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) + +struct kvm_user_trace_setup { + __u32 buf_size; + __u32 buf_nr; +}; + +#define __KVM_DEPRECATED_MAIN_W_0x06 \ + _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) +#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07) +#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08) + +#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq) + +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +struct kvm_debug_guest { + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest) + +/* *** End of deprecated interfaces *** */ + + +/* for KVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for KVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* + * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, + * other bits are reserved for kvm internal use which are defined in + * include/linux/kvm_host.h. + */ +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) + +/* for KVM_IRQ_LINE */ +struct kvm_irq_level { + /* + * ACPI gsi notion of irq. + * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. + * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. + * For ARM: See Documentation/virtual/kvm/api.txt + */ + union { + __u32 irq; + __s32 status; + }; + __u32 level; +}; + + +struct kvm_irqchip { + __u32 chip_id; + __u32 pad; + union { + char dummy[512]; /* reserving space */ +#ifdef __KVM_HAVE_PIT + struct kvm_pic_state pic; +#endif +#ifdef __KVM_HAVE_IOAPIC + struct kvm_ioapic_state ioapic; +#endif + } chip; +}; + +/* for KVM_CREATE_PIT2 */ +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +#define KVM_PIT_SPEAKER_DUMMY 1 + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 +#define KVM_EXIT_HYPERV_HCALL 2 + __u32 type; + union { + struct { + __u32 msr; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + } u; +}; + +#define KVM_S390_GET_SKEYS_NONE 1 +#define KVM_S390_SKEYS_MAX 1048576 + +#define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 +#define KVM_EXIT_S390_SIEIC 13 +#define KVM_EXIT_S390_RESET 14 +#define KVM_EXIT_DCR 15 /* deprecated */ +#define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 +#define KVM_EXIT_OSI 18 +#define KVM_EXIT_PAPR_HCALL 19 +#define KVM_EXIT_S390_UCONTROL 20 +#define KVM_EXIT_WATCHDOG 21 +#define KVM_EXIT_S390_TSCH 22 +#define KVM_EXIT_EPR 23 +#define KVM_EXIT_SYSTEM_EVENT 24 +#define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 +#define KVM_EXIT_HYPERV 27 + +/* For KVM_EXIT_INTERNAL_ERROR */ +/* Emulate instruction failed. */ +#define KVM_INTERNAL_ERROR_EMULATION 1 +/* Encounter unexpected simultaneous exceptions. */ +#define KVM_INTERNAL_ERROR_SIMUL_EX 2 +/* Encounter unexpected vm-exit due to delivery event. */ +#define KVM_INTERNAL_ERROR_DELIVERY_EV 3 + +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + __u8 immediate_exit; + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u16 flags; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + +#ifdef __KVM_S390 + /* the processor status word for s390 */ + __u64 psw_mask; /* psw upper half */ + __u64 psw_addr; /* psw lower half */ +#endif + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u16 ipa; + __u32 ipb; + } s390_sieic; + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + /* KVM_EXIT_DCR (deprecated) */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ + struct { + __u32 suberror; + /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ + __u32 ndata; + __u64 data[16]; + } internal; + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + /* KVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + /* KVM_EXIT_S390_TSCH */ + struct { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; + __u32 ipb; + __u8 dequeued; + } s390_tsch; + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + /* KVM_EXIT_SYSTEM_EVENT */ + struct { +#define KVM_SYSTEM_EVENT_SHUTDOWN 1 +#define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + /* KVM_EXIT_S390_STSI */ + struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; + } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; + /* Fix the size of the union. */ + char padding[256]; + }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[2048]; + } s; +}; + +/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ + +struct kvm_coalesced_mmio_zone { + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct kvm_coalesced_mmio { + __u64 phys_addr; + __u32 len; + __u32 pad; + __u8 data[8]; +}; + +struct kvm_coalesced_mmio_ring { + __u32 first, last; + struct kvm_coalesced_mmio coalesced_mmio[0]; +}; + +#define KVM_COALESCED_MMIO_MAX \ + ((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \ + sizeof(struct kvm_coalesced_mmio)) + +/* for KVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +/* for KVM_TPR_ACCESS_REPORTING */ +struct kvm_tpr_access_ctl { + __u32 enabled; + __u32 flags; + __u32 reserved[8]; +}; + +/* for KVM_SET_VAPIC_ADDR */ +struct kvm_vapic_addr { + __u64 vapic_addr; +}; + +/* for KVM_SET_MP_STATE */ + +/* not all states are valid on all architectures */ +#define KVM_MP_STATE_RUNNABLE 0 +#define KVM_MP_STATE_UNINITIALIZED 1 +#define KVM_MP_STATE_INIT_RECEIVED 2 +#define KVM_MP_STATE_HALTED 3 +#define KVM_MP_STATE_SIPI_RECEIVED 4 +#define KVM_MP_STATE_STOPPED 5 +#define KVM_MP_STATE_CHECK_STOP 6 +#define KVM_MP_STATE_OPERATING 7 +#define KVM_MP_STATE_LOAD 8 + +struct kvm_mp_state { + __u32 mp_state; +}; + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; + __u32 len; + __u32 reserved[4]; +}; + +/* for KVM_SET_GUEST_DEBUG */ + +#define KVM_GUESTDBG_ENABLE 0x00000001 +#define KVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +enum { + kvm_ioeventfd_flag_nr_datamatch, + kvm_ioeventfd_flag_nr_pio, + kvm_ioeventfd_flag_nr_deassign, + kvm_ioeventfd_flag_nr_virtio_ccw_notify, + kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioeventfd_flag_nr_max, +}; + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) +#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) + +#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + +/* for KVM_ENABLE_CAP */ +struct kvm_enable_cap { + /* in */ + __u32 cap; + __u32 flags; + __u64 args[4]; + __u8 pad[64]; +}; + +/* for KVM_PPC_GET_PVINFO */ + +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +#define KVMIO 0xAE + +/* machine type bits, to be used as argument to KVM_CREATE_VM */ +#define KVM_VM_S390_UCONTROL 1 + +/* on ppc, 0 indicate default, 1 should force HV and 2 PR */ +#define KVM_VM_PPC_HV 1 +#define KVM_VM_PPC_PR 2 + +/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ +#define KVM_VM_MIPS_TE 0 +#define KVM_VM_MIPS_VZ 1 + +#define KVM_S390_SIE_PAGE_OFFSET 1 + +/* + * ioctls for /dev/kvm fds: + */ +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) + +#define KVM_S390_ENABLE_SIE _IO(KVMIO, 0x06) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ +#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) +#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 +#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 +#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 +#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) + +/* + * Extension capability list. + */ +#define KVM_CAP_IRQCHIP 0 +#define KVM_CAP_HLT 1 +#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 +#define KVM_CAP_USER_MEMORY 3 +#define KVM_CAP_SET_TSS_ADDR 4 +#define KVM_CAP_VAPIC 6 +#define KVM_CAP_EXT_CPUID 7 +#define KVM_CAP_CLOCKSOURCE 8 +#define KVM_CAP_NR_VCPUS 9 /* returns recommended max vcpus per vm */ +#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ +#define KVM_CAP_PIT 11 +#define KVM_CAP_NOP_IO_DELAY 12 +#define KVM_CAP_PV_MMU 13 +#define KVM_CAP_MP_STATE 14 +#define KVM_CAP_COALESCED_MMIO 15 +#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#define KVM_CAP_IOMMU 18 +/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 +#define KVM_CAP_USER_NMI 22 +#ifdef __KVM_HAVE_GUEST_DEBUG +#define KVM_CAP_SET_GUEST_DEBUG 23 +#endif +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_REINJECT_CONTROL 24 +#endif +#define KVM_CAP_IRQ_ROUTING 25 +#define KVM_CAP_IRQ_INJECT_STATUS 26 +#define KVM_CAP_ASSIGN_DEV_IRQ 29 +/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ +#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 +#ifdef __KVM_HAVE_MCE +#define KVM_CAP_MCE 31 +#endif +#define KVM_CAP_IRQFD 32 +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_PIT2 33 +#endif +#define KVM_CAP_SET_BOOT_CPU_ID 34 +#ifdef __KVM_HAVE_PIT_STATE2 +#define KVM_CAP_PIT_STATE2 35 +#endif +#define KVM_CAP_IOEVENTFD 36 +#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 +#ifdef __KVM_HAVE_XEN_HVM +#define KVM_CAP_XEN_HVM 38 +#endif +#define KVM_CAP_ADJUST_CLOCK 39 +#define KVM_CAP_INTERNAL_ERROR_DATA 40 +#ifdef __KVM_HAVE_VCPU_EVENTS +#define KVM_CAP_VCPU_EVENTS 41 +#endif +#define KVM_CAP_S390_PSW 42 +#define KVM_CAP_PPC_SEGSTATE 43 +#define KVM_CAP_HYPERV 44 +#define KVM_CAP_HYPERV_VAPIC 45 +#define KVM_CAP_HYPERV_SPIN 46 +#define KVM_CAP_PCI_SEGMENT 47 +#define KVM_CAP_PPC_PAIRED_SINGLES 48 +#define KVM_CAP_INTR_SHADOW 49 +#ifdef __KVM_HAVE_DEBUGREGS +#define KVM_CAP_DEBUGREGS 50 +#endif +#define KVM_CAP_X86_ROBUST_SINGLESTEP 51 +#define KVM_CAP_PPC_OSI 52 +#define KVM_CAP_PPC_UNSET_IRQ 53 +#define KVM_CAP_ENABLE_CAP 54 +#ifdef __KVM_HAVE_XSAVE +#define KVM_CAP_XSAVE 55 +#endif +#ifdef __KVM_HAVE_XCRS +#define KVM_CAP_XCRS 56 +#endif +#define KVM_CAP_PPC_GET_PVINFO 57 +#define KVM_CAP_PPC_IRQ_LEVEL 58 +#define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_TSC_CONTROL 60 +#define KVM_CAP_GET_TSC_KHZ 61 +#define KVM_CAP_PPC_BOOKE_SREGS 62 +#define KVM_CAP_SPAPR_TCE 63 +#define KVM_CAP_PPC_SMT 64 +#define KVM_CAP_PPC_RMA 65 +#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 +#define KVM_CAP_PPC_PAPR 68 +#define KVM_CAP_SW_TLB 69 +#define KVM_CAP_ONE_REG 70 +#define KVM_CAP_S390_GMAP 71 +#define KVM_CAP_TSC_DEADLINE_TIMER 72 +#define KVM_CAP_S390_UCONTROL 73 +#define KVM_CAP_SYNC_REGS 74 +#define KVM_CAP_PCI_2_3 75 +#define KVM_CAP_KVMCLOCK_CTRL 76 +#define KVM_CAP_SIGNAL_MSI 77 +#define KVM_CAP_PPC_GET_SMMU_INFO 78 +#define KVM_CAP_S390_COW 79 +#define KVM_CAP_PPC_ALLOC_HTAB 80 +#define KVM_CAP_READONLY_MEM 81 +#define KVM_CAP_IRQFD_RESAMPLE 82 +#define KVM_CAP_PPC_BOOKE_WATCHDOG 83 +#define KVM_CAP_PPC_HTAB_FD 84 +#define KVM_CAP_S390_CSS_SUPPORT 85 +#define KVM_CAP_PPC_EPR 86 +#define KVM_CAP_ARM_PSCI 87 +#define KVM_CAP_ARM_SET_DEVICE_ADDR 88 +#define KVM_CAP_DEVICE_CTRL 89 +#define KVM_CAP_IRQ_MPIC 90 +#define KVM_CAP_PPC_RTAS 91 +#define KVM_CAP_IRQ_XICS 92 +#define KVM_CAP_ARM_EL1_32BIT 93 +#define KVM_CAP_SPAPR_MULTITCE 94 +#define KVM_CAP_EXT_EMUL_CPUID 95 +#define KVM_CAP_HYPERV_TIME 96 +#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 +#define KVM_CAP_ENABLE_CAP_VM 98 +#define KVM_CAP_S390_IRQCHIP 99 +#define KVM_CAP_IOEVENTFD_NO_LENGTH 100 +#define KVM_CAP_VM_ATTRIBUTES 101 +#define KVM_CAP_ARM_PSCI_0_2 102 +#define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_PPC_ENABLE_HCALL 104 +#define KVM_CAP_CHECK_EXTENSION_VM 105 +#define KVM_CAP_S390_USER_SIGP 106 +#define KVM_CAP_S390_VECTOR_REGISTERS 107 +#define KVM_CAP_S390_MEM_OP 108 +#define KVM_CAP_S390_USER_STSI 109 +#define KVM_CAP_S390_SKEYS 110 +#define KVM_CAP_MIPS_FPU 111 +#define KVM_CAP_MIPS_MSA 112 +#define KVM_CAP_S390_INJECT_IRQ 113 +#define KVM_CAP_S390_IRQ_STATE 114 +#define KVM_CAP_PPC_HWRNG 115 +#define KVM_CAP_DISABLE_QUIRKS 116 +#define KVM_CAP_X86_SMM 117 +#define KVM_CAP_MULTI_ADDRESS_SPACE 118 +#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 +#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 +#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122 +#define KVM_CAP_HYPERV_SYNIC 123 +#define KVM_CAP_S390_RI 124 +#define KVM_CAP_SPAPR_TCE_64 125 +#define KVM_CAP_ARM_PMU_V3 126 +#define KVM_CAP_VCPU_ATTRIBUTES 127 +#define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 +#define KVM_CAP_MSI_DEVID 131 +#define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_SPAPR_RESIZE_HPT 133 +#define KVM_CAP_PPC_MMU_RADIX 134 +#define KVM_CAP_PPC_MMU_HASH_V3 135 +#define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_MIPS_VZ 137 +#define KVM_CAP_MIPS_TE 138 +#define KVM_CAP_MIPS_64BIT 139 +#define KVM_CAP_S390_GS 140 +#define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 +#define KVM_CAP_X86_GUEST_MWAIT 143 +#define KVM_CAP_ARM_USER_IRQ 144 +#define KVM_CAP_S390_CMMA_MIGRATION 145 +#define KVM_CAP_PPC_FWNMI 146 +#define KVM_CAP_PPC_SMT_POSSIBLE 147 +#define KVM_CAP_HYPERV_SYNIC2 148 +#define KVM_CAP_HYPERV_VP_INDEX 149 + +#ifdef KVM_CAP_IRQ_ROUTING + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + union { + __u32 pad; + __u32 devid; + }; +}; + +struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; +}; + +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 +#define KVM_IRQ_ROUTING_S390_ADAPTER 3 +#define KVM_IRQ_ROUTING_HV_SINT 4 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +#endif + +#ifdef KVM_CAP_MCE +/* x86 MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; +#endif + +#ifdef KVM_CAP_XEN_HVM +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; +#endif + +#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) +/* + * Available with KVM_CAP_IRQFD_RESAMPLE + * + * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies + * the irqfd to operate in resampling mode for level triggered interrupt + * emulation. See Documentation/virtual/kvm/api.txt. + */ +#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) + +struct kvm_irqfd { + __u32 fd; + __u32 gsi; + __u32 flags; + __u32 resamplefd; + __u8 pad[16]; +}; + +/* For KVM_CAP_ADJUST_CLOCK */ + +/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */ +#define KVM_CLOCK_TSC_STABLE 2 + +struct kvm_clock_data { + __u64 clock; + __u32 flags; + __u32 pad[9]; +}; + +/* For KVM_CAP_SW_TLB */ + +#define KVM_MMU_FSL_BOOKE_NOHV 0 +#define KVM_MMU_FSL_BOOKE_HV 1 + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + +/* Available with KVM_CAP_ONE_REG */ + +#define KVM_REG_ARCH_MASK 0xff00000000000000ULL +#define KVM_REG_GENERIC 0x0000000000000000ULL + +/* + * Architecture specific registers are to be defined in arch headers and + * ORed with the arch identifier. + */ +#define KVM_REG_PPC 0x1000000000000000ULL +#define KVM_REG_X86 0x2000000000000000ULL +#define KVM_REG_IA64 0x3000000000000000ULL +#define KVM_REG_ARM 0x4000000000000000ULL +#define KVM_REG_S390 0x5000000000000000ULL +#define KVM_REG_ARM64 0x6000000000000000ULL +#define KVM_REG_MIPS 0x7000000000000000ULL + +#define KVM_REG_SIZE_SHIFT 52 +#define KVM_REG_SIZE_MASK 0x00f0000000000000ULL +#define KVM_REG_SIZE_U8 0x0000000000000000ULL +#define KVM_REG_SIZE_U16 0x0010000000000000ULL +#define KVM_REG_SIZE_U32 0x0020000000000000ULL +#define KVM_REG_SIZE_U64 0x0030000000000000ULL +#define KVM_REG_SIZE_U128 0x0040000000000000ULL +#define KVM_REG_SIZE_U256 0x0050000000000000ULL +#define KVM_REG_SIZE_U512 0x0060000000000000ULL +#define KVM_REG_SIZE_U1024 0x0070000000000000ULL + +struct kvm_reg_list { + __u64 n; /* number of regs */ + __u64 reg[0]; +}; + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + +#define KVM_MSI_VALID_DEVID (1U << 0) +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u32 devid; + __u8 pad[12]; +}; + +struct kvm_arm_device_addr { + __u64 id; + __u64 addr; +}; + +/* + * Device control API, available with KVM_CAP_DEVICE_CTRL + */ +#define KVM_CREATE_DEVICE_TEST 1 + +struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ +}; + +struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + +#define KVM_DEV_VFIO_GROUP 1 +#define KVM_DEV_VFIO_GROUP_ADD 1 +#define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 + +enum kvm_device_type { + KVM_DEV_TYPE_FSL_MPIC_20 = 1, +#define KVM_DEV_TYPE_FSL_MPIC_20 KVM_DEV_TYPE_FSL_MPIC_20 + KVM_DEV_TYPE_FSL_MPIC_42, +#define KVM_DEV_TYPE_FSL_MPIC_42 KVM_DEV_TYPE_FSL_MPIC_42 + KVM_DEV_TYPE_XICS, +#define KVM_DEV_TYPE_XICS KVM_DEV_TYPE_XICS + KVM_DEV_TYPE_VFIO, +#define KVM_DEV_TYPE_VFIO KVM_DEV_TYPE_VFIO + KVM_DEV_TYPE_ARM_VGIC_V2, +#define KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_ARM_VGIC_V2 + KVM_DEV_TYPE_FLIC, +#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC + KVM_DEV_TYPE_ARM_VGIC_V3, +#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 + KVM_DEV_TYPE_ARM_VGIC_ITS, +#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS + KVM_DEV_TYPE_MAX, +}; + +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + +/* + * ioctls for VM fds + */ +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) +/* + * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns + * a vcpu fd. + */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +/* KVM_SET_MEMORY_ALIAS is obsolete: */ +#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ + struct kvm_userspace_memory_region) +#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) + +/* enable ucontrol for s390 */ +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; +#define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) +#define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) +#define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) + +/* Device model IOC */ +#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) +#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) +#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) +#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) +#define KVM_CREATE_PIT _IO(KVMIO, 0x64) +#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) +#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) +#define KVM_REGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) +#define KVM_UNREGISTER_COALESCED_MMIO \ + _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) +#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ + struct kvm_assigned_pci_dev) +#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ +#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70 +#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) +#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) +#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ + struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \ + struct kvm_assigned_msix_nr) +#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \ + struct kvm_assigned_msix_entry) +#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) +#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) +#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) +#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) +#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) +#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) +#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) +#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_PIT_STATE2 */ +#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) +#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) +/* Available with KVM_CAP_PPC_GET_PVINFO */ +#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) +/* Available with KVM_CAP_TSC_CONTROL */ +#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) +#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) +/* Available with KVM_CAP_PCI_2_3 */ +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ + struct kvm_assigned_pci_dev) +/* Available with KVM_CAP_SIGNAL_MSI */ +#define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) +/* Available with KVM_CAP_PPC_GET_SMMU_INFO */ +#define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) +/* Available with KVM_CAP_PPC_ALLOC_HTAB */ +#define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) +#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \ + struct kvm_create_spapr_tce_64) +/* Available with KVM_CAP_RMA */ +#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) +/* Available with KVM_CAP_PPC_HTAB_FD */ +#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) +/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ +#define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) +/* Available with KVM_CAP_PPC_RTAS */ +#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_SPAPR_RESIZE_HPT */ +#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) +#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) +/* Available with KVM_CAP_PPC_RADIX_MMU */ +#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) + +/* ioctl for vm fd */ +#define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) + +/* ioctls for fds returned by KVM_CREATE_DEVICE */ +#define KVM_SET_DEVICE_ATTR _IOW(KVMIO, 0xe1, struct kvm_device_attr) +#define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) +#define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) + +/* + * ioctls for vcpu fds + */ +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ +#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87 +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) +#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +/* Available with KVM_CAP_VAPIC */ +#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) +/* Available with KVM_CAP_VAPIC */ +#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) +/* valid for virtual machine (for floating interrupt)_and_ vcpu */ +#define KVM_S390_INTERRUPT _IOW(KVMIO, 0x94, struct kvm_s390_interrupt) +/* store status for s390 */ +#define KVM_S390_STORE_STATUS_NOADDR (-1ul) +#define KVM_S390_STORE_STATUS_PREFIXED (-2ul) +#define KVM_S390_STORE_STATUS _IOW(KVMIO, 0x95, unsigned long) +/* initial ipl psw for s390 */ +#define KVM_S390_SET_INITIAL_PSW _IOW(KVMIO, 0x96, struct kvm_s390_psw) +/* initial reset for s390 */ +#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) +#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) +#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) +/* Available with KVM_CAP_USER_NMI */ +#define KVM_NMI _IO(KVMIO, 0x9a) +/* Available with KVM_CAP_SET_GUEST_DEBUG */ +#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* MCE for x86 */ +#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) +#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) +/* Available with KVM_CAP_VCPU_EVENTS */ +#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +/* Available with KVM_CAP_DEBUGREGS */ +#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) +#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) +/* + * vcpu version available with KVM_ENABLE_CAP + * vm version available with KVM_CAP_ENABLE_CAP_VM + */ +#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) +/* Available with KVM_CAP_XSAVE */ +#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) +#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) +/* Available with KVM_CAP_XCRS */ +#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) +#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) +/* Available with KVM_CAP_SW_TLB */ +#define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) +/* Available with KVM_CAP_ONE_REG */ +#define KVM_GET_ONE_REG _IOW(KVMIO, 0xab, struct kvm_one_reg) +#define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) +/* VM is being stopped by host */ +#define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad) +#define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) +#define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) +#define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) +/* Available with KVM_CAP_S390_MEM_OP */ +#define KVM_S390_MEM_OP _IOW(KVMIO, 0xb1, struct kvm_s390_mem_op) +/* Available with KVM_CAP_S390_SKEYS */ +#define KVM_S390_GET_SKEYS _IOW(KVMIO, 0xb2, struct kvm_s390_skeys) +#define KVM_S390_SET_SKEYS _IOW(KVMIO, 0xb3, struct kvm_s390_skeys) +/* Available with KVM_CAP_S390_INJECT_IRQ */ +#define KVM_S390_IRQ _IOW(KVMIO, 0xb4, struct kvm_s390_irq) +/* Available with KVM_CAP_S390_IRQ_STATE */ +#define KVM_S390_SET_IRQ_STATE _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state) +#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) +/* Available with KVM_CAP_X86_SMM */ +#define KVM_SMI _IO(KVMIO, 0xb7) +/* Available with KVM_CAP_S390_CMMA_MIGRATION */ +#define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) +#define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) + +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; + __u32 segnr; + union { + __u32 reserved[11]; + }; +}; + +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +#define KVM_DEV_IRQ_HOST_MASK 0x00ff +#define KVM_DEV_IRQ_GUEST_MASK 0xff00 + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; /* ignored (legacy field) */ + __u32 guest_irq; + __u32 flags; + union { + __u32 reserved[12]; + }; +}; + +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 256 +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +/* Available with KVM_CAP_ARM_USER_IRQ */ + +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + +#endif /* __LINUX_KVM_H */ diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 35a33adefed2..513f52a484fd 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -80,6 +80,7 @@ tools/include/uapi/linux/bpf.h tools/include/uapi/linux/bpf_common.h tools/include/uapi/linux/fcntl.h tools/include/uapi/linux/hw_breakpoint.h +tools/include/uapi/linux/kvm.h tools/include/uapi/linux/mman.h tools/include/uapi/linux/perf_event.h tools/include/uapi/linux/sched.h diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 1e07c9b062de..1f009506ff63 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -4,6 +4,7 @@ HEADERS=' include/uapi/drm/drm.h include/uapi/drm/i915_drm.h include/uapi/linux/fcntl.h +include/uapi/linux/kvm.h include/uapi/linux/perf_event.h include/uapi/linux/sched.h include/uapi/linux/stat.h -- cgit v1.2.3 From d02b395e118d7d4100caeeab553cebd182fdefd5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jul 2017 16:45:32 -0300 Subject: tools include uapi: Grab a copy of linux/vhost.h We will use it to generate tables for beautifying ioctl's 'cmd' arg. Cc: Adrian Hunter Cc: David Ahern Cc: Jason Wang Cc: Jiri Olsa Cc: "Michael S. Tsirkin" Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-nxwpq34hu6te1m2ra5m7o8n9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/vhost.h | 209 +++++++++++++++++++++++++++++++++++++++ tools/perf/MANIFEST | 1 + tools/perf/check-headers.sh | 1 + 3 files changed, 211 insertions(+) create mode 100644 tools/include/uapi/linux/vhost.h (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h new file mode 100644 index 000000000000..60180c0b5dc6 --- /dev/null +++ b/tools/include/uapi/linux/vhost.h @@ -0,0 +1,209 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include +#include +#include +#include +#include + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; /* Pass -1 to unbind from file. */ + +}; + +struct vhost_vring_addr { + unsigned int index; + /* Option flags. */ + unsigned int flags; + /* Flag values: */ + /* Whether log address is valid. If set enables logging. */ +#define VHOST_VRING_F_LOG 0 + + /* Start of array of descriptors (virtually contiguous) */ + __u64 desc_user_addr; + /* Used structure address. Must be 32 bit aligned */ + __u64 used_user_addr; + /* Available structure address. Must be 16 bit aligned */ + __u64 avail_user_addr; + /* Logging support. */ + /* Log writes to used structure, at offset calculated from specified + * address. Address must be 32 bit aligned. */ + __u64 log_guest_addr; +}; + +/* no alignment requirement */ +struct vhost_iotlb_msg { + __u64 iova; + __u64 size; + __u64 uaddr; +#define VHOST_ACCESS_RO 0x1 +#define VHOST_ACCESS_WO 0x2 +#define VHOST_ACCESS_RW 0x3 + __u8 perm; +#define VHOST_IOTLB_MISS 1 +#define VHOST_IOTLB_UPDATE 2 +#define VHOST_IOTLB_INVALIDATE 3 +#define VHOST_IOTLB_ACCESS_FAIL 4 + __u8 type; +}; + +#define VHOST_IOTLB_MSG 0x1 + +struct vhost_msg { + int type; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; + +struct vhost_memory_region { + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; + __u64 flags_padding; /* No flags are currently specified. */ +}; + +/* All region addresses and sizes must be 4K aligned. */ +#define VHOST_PAGE_SIZE 0x1000 + +struct vhost_memory { + __u32 nregions; + __u32 padding; + struct vhost_memory_region regions[0]; +}; + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_OWNER_SET to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Write logging setup. */ +/* Memory writes can optionally be logged by setting bit at an offset + * (calculated from the physical address) from specified log base. + * The bit is set using an atomic 32 bit operation. */ +/* Set base address for logging. */ +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Specify an eventfd file descriptor to signal on log write. */ +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) + +/* Ring setup. */ +/* Set number of descriptors in ring. This parameter can not + * be modified while ring is running (bound to a device). */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Set addresses for the ring. */ +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) + +/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN + * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL). + * The byte order cannot be changed while the device is active: trying to do so + * returns -EBUSY. + * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is + * set. + * Not all kernel configurations support this ioctl, but all configurations that + * support SET also support GET. + */ +#define VHOST_VRING_LITTLE_ENDIAN 0 +#define VHOST_VRING_BIG_ENDIAN 1 +#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) +#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. */ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have beed used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +/* Set busy loop timeout (in us) */ +#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ + struct vhost_vring_state) +/* Get busy loop timeout (in us) */ +#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ + struct vhost_vring_state) + +/* VHOST_NET specific defines */ + +/* Attach virtio net ring to a raw socket, or tap device. + * The socket must be already bound to an ethernet device, this device will be + * used for transmit. Pass fd -1 to unbind from the socket and the transmit + * device. This can be used to stop the ring (e.g. for migration). */ +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* Feature bits */ +/* Log all write descriptors. Can be changed while device is active. */ +#define VHOST_F_LOG_ALL 26 +/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ +#define VHOST_NET_F_VIRTIO_NET_HDR 27 + +/* VHOST_SCSI specific definitions */ + +/* + * Used by QEMU userspace to ensure a consistent vhost-scsi ABI. + * + * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate + + * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage + * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target. + * All the targets under vhost_wwpn can be seen and used by guset. + */ + +#define VHOST_SCSI_ABI_VERSION 1 + +struct vhost_scsi_target { + int abi_version; + char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */ + unsigned short vhost_tpgt; + unsigned short reserved; +}; + +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +/* Changing this breaks userspace. */ +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) +/* Set and get the events missed flag */ +#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) +#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) + +/* VHOST_VSOCK specific defines */ + +#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) +#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) + +#endif diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 513f52a484fd..62072822dc85 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -85,6 +85,7 @@ tools/include/uapi/linux/mman.h tools/include/uapi/linux/perf_event.h tools/include/uapi/linux/sched.h tools/include/uapi/linux/stat.h +tools/include/uapi/linux/vhost.h tools/include/uapi/sound/asound.h tools/include/linux/poison.h tools/include/linux/rbtree.h diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 1f009506ff63..932fda54b8a6 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -8,6 +8,7 @@ include/uapi/linux/kvm.h include/uapi/linux/perf_event.h include/uapi/linux/sched.h include/uapi/linux/stat.h +include/uapi/linux/vhost.h include/uapi/sound/asound.h include/linux/hash.h include/uapi/linux/hw_breakpoint.h -- cgit v1.2.3 From 996139e801fd145bc44b70b4f4bfa621d626f948 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 7 Aug 2017 13:14:42 -0700 Subject: selftests: bpf: add a test for XDP redirect Add test for xdp_redirect by creating two namespaces with two veth peers, then forward packets in-between. Signed-off-by: William Tu Cc: Daniel Borkmann Cc: John Fastabend Acked-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. Miller --- tools/include/uapi/linux/bpf.h | 3 +- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/test_xdp_redirect.c | 28 ++++++++++++ tools/testing/selftests/bpf/test_xdp_redirect.sh | 54 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_xdp_redirect.c create mode 100755 tools/testing/selftests/bpf/test_xdp_redirect.sh (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1579cab49717..8d9bfcca3fe4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -592,7 +592,8 @@ union bpf_attr { FN(get_socket_uid), \ FN(set_hash), \ FN(setsockopt), \ - FN(skb_adjust_room), + FN(skb_adjust_room), \ + FN(redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 153c3a181a4c..3c2e67da4b41 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -15,9 +15,9 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ - test_pkt_md_access.o + test_pkt_md_access.o test_xdp_redirect.o -TEST_PROGS := test_kmod.sh +TEST_PROGS := test_kmod.sh test_xdp_redirect.sh include ../lib.mk diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.c b/tools/testing/selftests/bpf/test_xdp_redirect.c new file mode 100644 index 000000000000..ef9e704be140 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_redirect.c @@ -0,0 +1,28 @@ +/* Copyright (c) 2017 VMware + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include "bpf_helpers.h" + +int _version SEC("version") = 1; + +SEC("redirect_to_111") +int xdp_redirect_to_111(struct xdp_md *xdp) +{ + return bpf_redirect(111, 0); +} +SEC("redirect_to_222") +int xdp_redirect_to_222(struct xdp_md *xdp) +{ + return bpf_redirect(222, 0); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh new file mode 100755 index 000000000000..d8c73ed6e040 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# Create 2 namespaces with two veth peers, and +# forward packets in-between using generic XDP +# +# NS1(veth11) NS2(veth22) +# | | +# | | +# (veth1, ------ (veth2, +# id:111) id:222) +# | xdp forwarding | +# ------------------ + +cleanup() +{ + if [ "$?" = "0" ]; then + echo "selftests: test_xdp_redirect [PASS]"; + else + echo "selftests: test_xdp_redirect [FAILED]"; + fi + + set +e + ip netns del ns1 2> /dev/null + ip netns del ns2 2> /dev/null +} + +set -e + +ip netns add ns1 +ip netns add ns2 + +trap cleanup 0 2 3 6 9 + +ip link add veth1 index 111 type veth peer name veth11 +ip link add veth2 index 222 type veth peer name veth22 + +ip link set veth11 netns ns1 +ip link set veth22 netns ns2 + +ip link set veth1 up +ip link set veth2 up + +ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth11 +ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth22 + +ip netns exec ns1 ip link set dev veth11 up +ip netns exec ns2 ip link set dev veth22 up + +ip link set dev veth1 xdpgeneric obj test_xdp_redirect.o sec redirect_to_222 +ip link set dev veth2 xdpgeneric obj test_xdp_redirect.o sec redirect_to_111 + +ip netns exec ns1 ping -c 1 10.1.1.22 +ip netns exec ns2 ping -c 1 10.1.1.11 + +exit 0 -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. This has the downside of having unnecessarily increased register pressure, meaning complex program would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path to have a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object file reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. Plan is for LLVM to have i) a target specific option to offer a possibility to explicitly enable the extension by the user (as we have with -m target specific extensions today for various CPU insns), and ii) have the kernel checked for presence of the extensions and enable them transparently when the user is selecting more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. ply can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/core.c | 60 ++++++ lib/test_bpf.c | 364 ++++++++++++++++++++++++++++++++++++ net/core/filter.c | 21 ++- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 455 insertions(+), 4 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index d0fdba7d66e2..6a0df8df6c43 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -906,6 +906,10 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JSGE 0x70 /* eBPF only: signed '>=' */ BPF_CALL 0x80 /* eBPF only: function call */ BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF and eBPF. There are only two registers in classic BPF, so it means A += X. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d06be1569b1..91da8371a2d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad5f55922a13..c69e7f5bfde7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; @@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, + [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, + [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, + [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, + [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, /* Program return */ @@ -1073,6 +1085,18 @@ out: CONT_JMP; } CONT; + JMP_JLT_X: + if (DST < SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLT_K: + if (DST < IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JGE_X: if (DST >= SRC) { insn += insn->off; @@ -1085,6 +1109,18 @@ out: CONT_JMP; } CONT; + JMP_JLE_X: + if (DST <= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLE_K: + if (DST <= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGT_X: if (((s64) DST) > ((s64) SRC)) { insn += insn->off; @@ -1097,6 +1133,18 @@ out: CONT_JMP; } CONT; + JMP_JSLT_X: + if (((s64) DST) < ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLT_K: + if (((s64) DST) < ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGE_X: if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; @@ -1109,6 +1157,18 @@ out: CONT_JMP; } CONT; + JMP_JSLE_X: + if (((s64) DST) <= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLE_K: + if (((s64) DST) <= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSET_X: if (DST & SRC) { insn += insn->off; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index d9d5a410955c..aa8812ae6776 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -951,6 +951,32 @@ static struct bpf_test tests[] = { { 4, 4, 4, 3, 3 }, { { 2, 0 }, { 3, 1 }, { 4, MAX_K } }, }, + { + "JGE (jt 0), test 1", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 4, 3, 3 }, + { { 2, 0 }, { 3, 1 }, { 4, 1 } }, + }, + { + "JGE (jt 0), test 2", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 5, 3, 3 }, + { { 4, 1 }, { 5, 1 }, { 6, MAX_K } }, + }, { "JGE", .u.insns = { @@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_K */ + { + "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_K: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGT | BPF_K */ { "JMP_JSGT_K: Signed jump: if (-1 > -2) return 1", @@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_K */ + { + "JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 6), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_K */ { "JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1", @@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K */ + { + "JMP_JLT_K: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLT, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JGT_K: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 1), + BPF_JMP_IMM(BPF_JLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_K */ { "JMP_JGE_K: if (3 >= 2) return 1", @@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_K */ + { + "JMP_JLE_K: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K jump backwards */ { "JMP_JGT_K: if (3 > 2) return 1 (jump backwards)", @@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K jump backwards */ + { + "JMP_JGT_K: if (2 < 3) return 1 (jump backwards)", + .u.insns_int = { + BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */ + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), /* start: */ + BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */ + BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_K: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_K */ { "JMP_JNE_K: if (3 != 2) return 1", @@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_X */ + { + "JMP_JSLT_X: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_X: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLT, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_X */ { "JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1", @@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_X */ + { + "JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_X */ { "JMP_JGT_X: if (3 > 2) return 1", @@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_X */ + { + "JMP_JLT_X: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLT_X: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, 1), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_X */ { "JMP_JGE_X: if (3 >= 2) return 1", @@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_X */ + { + "JMP_JLE_X: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_X: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 3), + BPF_JMP_REG(BPF_JLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, { /* Mainly testing JIT + imm64 here. */ "JMP_JGE_X: ldimm64 test 1", @@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JLE_X: ldimm64 test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 2), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xeeeeeeeeU } }, + }, + { + "JMP_JLE_X: ldimm64 test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 0), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xffffffffU } }, + }, + { + "JMP_JLE_X: ldimm64 test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 4), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_X */ { "JMP_JNE_X: if (3 != 2) return 1", diff --git a/net/core/filter.c b/net/core/filter.c index 78d00933dbe7..5afe3ac191ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -514,14 +514,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8d9bfcca3fe4..bf3b2e230455 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -- cgit v1.2.3 From 69e8cc134bcbf0ccfcf852c400b8e6788d1d0038 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:33:32 -0700 Subject: bpf: sockmap sample program This program binds a program to a cgroup and then matches hard coded IP addresses and adds these to a sockmap. This will receive messages from the backend and send them to the client. client:X <---> frontend:10000 client:X <---> backend:10001 To keep things simple this is only designed for 1:1 connections using hard coded values. A more complete example would allow many backends and clients. To run, # sockmap Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- samples/bpf/bpf_load.c | 8 +- samples/sockmap/Makefile | 78 ++++++++ samples/sockmap/sockmap_kern.c | 110 ++++++++++++ samples/sockmap/sockmap_user.c | 286 ++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 46 ++++- tools/lib/bpf/bpf.c | 14 +- tools/lib/bpf/bpf.h | 4 + tools/testing/selftests/bpf/bpf_helpers.h | 7 + 8 files changed, 547 insertions(+), 6 deletions(-) create mode 100644 samples/sockmap/Makefile create mode 100644 samples/sockmap/sockmap_kern.c create mode 100644 samples/sockmap/sockmap_user.c (limited to 'tools/include/uapi/linux') diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 899f40310bc3..a8552b8a2ab6 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -65,6 +65,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; + bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -92,6 +93,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_CGROUP_SOCK; } else if (is_sockops) { prog_type = BPF_PROG_TYPE_SOCK_OPS; + } else if (is_sk_skb) { + prog_type = BPF_PROG_TYPE_SK_SKB; } else { printf("Unknown event '%s'\n", event); return -1; @@ -109,7 +112,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket || is_sockops) { + if (is_socket || is_sockops || is_sk_skb) { if (is_socket) event += 6; else @@ -567,7 +570,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || - memcmp(shname, "sockops", 7) == 0) { + memcmp(shname, "sockops", 7) == 0 || + memcmp(shname, "sk_skb", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile new file mode 100644 index 000000000000..9291ab8e0f8c --- /dev/null +++ b/samples/sockmap/Makefile @@ -0,0 +1,78 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := sockmap + +# Libbpf dependencies +LIBBPF := ../../tools/lib/bpf/bpf.o + +HOSTCFLAGS += -I$(objtree)/usr/include +HOSTCFLAGS += -I$(srctree)/tools/lib/ +HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ +HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include +HOSTCFLAGS += -I$(srctree)/tools/perf + +sockmap-objs := ../bpf/bpf_load.o $(LIBBPF) sockmap_user.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) +always += sockmap_kern.o + +HOSTLOADLIBES_sockmap += -lelf -lpthread + +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang + +# Trick to allow make to be run from this directory +all: + $(MAKE) -C ../../ $(CURDIR)/ + +clean: + $(MAKE) -C ../../ M=$(CURDIR) clean + @rm -f *~ + +$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c + $(call if_changed_dep,cc_s_c) + +$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE + $(call filechk,offsets,__SYSCALL_NRS_H__) + +clean-files += syscall_nrs.h + +FORCE: + + +# Verify LLVM compiler tools are available and bpf target is supported by llc +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) + +verify_cmds: $(CLANG) $(LLC) + @for TOOL in $^ ; do \ + if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ + echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ + exit 1; \ + else true; fi; \ + done + +verify_target_bpf: verify_cmds + @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ + echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ + echo " NOTICE: LLVM version >= 3.7.1 required" ;\ + exit 2; \ + else true; fi + +$(src)/*.c: verify_target_bpf + +# asm/sysreg.h - inline assembly used by it is incompatible with llvm. +# But, there is no easy way to fix it, so just exclude it since it is +# useless for BPF samples. +$(obj)/%.o: $(src)/%.c + $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ + -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ + -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option \ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c new file mode 100644 index 000000000000..6ff986f7059b --- /dev/null +++ b/samples/sockmap/sockmap_kern.c @@ -0,0 +1,110 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include "../../tools/testing/selftests/bpf/bpf_helpers.h" +#include "../../tools/testing/selftests/bpf/bpf_endian.h" + +/* Sockmap sample program connects a client and a backend together + * using cgroups. + * + * client:X <---> frontend:80 client:X <---> backend:80 + * + * For simplicity we hard code values here and bind 1:1. The hard + * coded values are part of the setup in sockmap.sh script that + * is associated with this BPF program. + * + * The bpf_printk is verbose and prints information as connections + * are established and verdicts are decided. + */ + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +struct bpf_map_def SEC("maps") sock_map = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +SEC("sk_skb1") +int bpf_prog1(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb2") +int bpf_prog2(struct __sk_buff *skb) +{ + __u32 lport = skb->local_port; + __u32 rport = skb->remote_port; + int ret = 0; + + if (lport == 10000) + ret = 10; + else + ret = 1; + + bpf_printk("sockmap: %d -> %d @ %d\n", lport, bpf_ntohl(rport), ret); + return bpf_sk_redirect_map(&sock_map, ret, 0); +} + +SEC("sockops") +int bpf_sockmap(struct bpf_sock_ops *skops) +{ + __u32 lport, rport; + int op, err = 0, index, key, ret; + + + op = (int) skops->op; + + switch (op) { + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (lport == 10000) { + ret = 1; + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST, + BPF_SOCKMAP_STRPARSER); + bpf_printk("passive(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (bpf_ntohl(rport) == 10001) { + ret = 10; + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST, + BPF_SOCKMAP_STRPARSER); + bpf_printk("active(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + default: + break; + } + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c new file mode 100644 index 000000000000..fb78f5abefb4 --- /dev/null +++ b/samples/sockmap/sockmap_user.c @@ -0,0 +1,286 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../bpf/bpf_load.h" +#include "../bpf/bpf_util.h" +#include "../bpf/libbpf.h" + +int running; +void running_handler(int a); + +/* randomly selected ports for testing on lo */ +#define S1_PORT 10000 +#define S2_PORT 10001 + +static int sockmap_test_sockets(int rate, int dot) +{ + int i, sc, err, max_fd, one = 1; + int s1, s2, c1, c2, p1, p2; + struct sockaddr_in addr; + struct timeval timeout; + char buf[1024] = {0}; + int *fds[4] = {&s1, &s2, &c1, &c2}; + fd_set w; + + s1 = s2 = p1 = p2 = c1 = c2 = 0; + + /* Init sockets */ + for (i = 0; i < 4; i++) { + *fds[i] = socket(AF_INET, SOCK_STREAM, 0); + if (*fds[i] < 0) { + perror("socket s1 failed()"); + err = *fds[i]; + goto out; + } + } + + /* Allow reuse */ + for (i = 0; i < 2; i++) { + err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + if (err) { + perror("setsockopt failed()"); + goto out; + } + } + + /* Non-blocking sockets */ + for (i = 0; i < 4; i++) { + err = ioctl(*fds[i], FIONBIO, (char *)&one); + if (err < 0) { + perror("ioctl s1 failed()"); + goto out; + } + } + + /* Bind server sockets */ + memset(&addr, 0, sizeof(struct sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + addr.sin_port = htons(S1_PORT); + err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0) { + perror("bind s1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0) { + perror("bind s2 failed()\n"); + goto out; + } + + /* Listen server sockets */ + addr.sin_port = htons(S1_PORT); + err = listen(s1, 32); + if (err < 0) { + perror("listen s1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = listen(s2, 32); + if (err < 0) { + perror("listen s1 failed()\n"); + goto out; + } + + /* Initiate Connect */ + addr.sin_port = htons(S1_PORT); + err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0 && errno != EINPROGRESS) { + perror("connect c1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0 && errno != EINPROGRESS) { + perror("connect c2 failed()\n"); + goto out; + } + + /* Accept Connecrtions */ + p1 = accept(s1, NULL, NULL); + if (p1 < 0) { + perror("accept s1 failed()\n"); + goto out; + } + + p2 = accept(s2, NULL, NULL); + if (p2 < 0) { + perror("accept s1 failed()\n"); + goto out; + } + + max_fd = p2; + timeout.tv_sec = 10; + timeout.tv_usec = 0; + + printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); + printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", + c1, s1, c2, s2); + + /* Ping/Pong data from client to server */ + sc = send(c1, buf, sizeof(buf), 0); + if (sc < 0) { + perror("send failed()\n"); + goto out; + } + + do { + int s, rc, i; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(c1, &w); + FD_SET(c2, &w); + FD_SET(p1, &w); + FD_SET(p2, &w); + + s = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (s == -1) { + perror("select()"); + break; + } else if (!s) { + fprintf(stderr, "unexpected timeout\n"); + break; + } + + for (i = 0; i <= max_fd && s > 0; ++i) { + if (!FD_ISSET(i, &w)) + continue; + + s--; + + rc = recv(i, buf, sizeof(buf), 0); + if (rc < 0) { + if (errno != EWOULDBLOCK) { + perror("recv failed()\n"); + break; + } + } + + if (rc == 0) { + close(i); + break; + } + + sc = send(i, buf, rc, 0); + if (sc < 0) { + perror("send failed()\n"); + break; + } + } + sleep(rate); + if (dot) { + printf("."); + fflush(stdout); + + } + } while (running); + +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + return err; +} + +int main(int argc, char **argv) +{ + int rate = 1, dot = 1; + char filename[256]; + int err, cg_fd; + char *cg_path; + + cg_path = argv[argc - 1]; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + running = 1; + + /* catch SIGINT */ + signal(SIGINT, running_handler); + + if (load_bpf_file(filename)) { + fprintf(stderr, "load_bpf_file: (%s) %s\n", + filename, strerror(errno)); + return 1; + } + + /* Cgroup configuration */ + cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", + cg_fd, cg_path); + return cg_fd; + } + + /* Attach programs to sockmap */ + err = __bpf_prog_attach(prog_fd[0], prog_fd[1], map_fd[0], + BPF_CGROUP_SMAP_INGRESS, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + /* Attach to cgroups */ + err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = sockmap_test_sockets(rate, dot); + if (err) { + fprintf(stderr, "ERROR: test socket failed: %d\n", err); + return err; + } + return 0; +} + +void running_handler(int a) +{ + running = 0; +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf3b2e230455..2d97dd27c8f6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, }; enum bpf_prog_type { @@ -127,6 +128,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, }; enum bpf_attach_type { @@ -134,11 +136,18 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, + BPF_CGROUP_SMAP_INGRESS, __MAX_BPF_ATTACH_TYPE }; #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_sockmap_flags { + BPF_SOCKMAP_UNSPEC, + BPF_SOCKMAP_STRPARSER, + __MAX_BPF_SOCKMAP_FLAG +}; + /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -210,6 +219,7 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -545,6 +555,23 @@ union bpf_attr { * @mode: operation mode (enum bpf_adj_room_mode) * @flags: reserved for future use * Return: 0 on success or negative error code + * + * int bpf_sk_redirect_map(map, key, flags) + * Redirect skb to a sock in map using key as a lookup key for the + * sock in map. + * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_REDIRECT + * + * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * @skops: pointer to bpf_sock_ops + * @map: pointer to sockmap to update + * @key: key to insert/update sock in map + * @flags: same flags as map update elem + * @map_flags: sock map specific flags + * bit 1: Enable strparser + * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -598,7 +625,9 @@ union bpf_attr { FN(set_hash), \ FN(setsockopt), \ FN(skb_adjust_room), \ - FN(redirect_map), + FN(redirect_map), \ + FN(sk_redirect_map), \ + FN(sock_map_update), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -675,6 +704,15 @@ struct __sk_buff { __u32 data; __u32 data_end; __u32 napi_id; + + /* accessed by BPF_PROG_TYPE_sk_skb types */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; struct bpf_tunnel_key { @@ -734,6 +772,12 @@ struct xdp_md { __u32 data_end; }; +enum sk_action { + SK_ABORTED = 0, + SK_DROP, + SK_REDIRECT, +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index e5bbb090bf88..77660157a684 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -211,20 +211,28 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, - unsigned int flags) +int __bpf_prog_attach(int prog_fd1, int prog_fd2, int target_fd, + enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; bzero(&attr, sizeof(attr)); attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd; + attr.attach_bpf_fd = prog_fd1; + attr.attach_bpf_fd2 = prog_fd2; attr.attach_type = type; attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) +{ + return __bpf_prog_attach(prog_fd, 0, target_fd, type, flags); +} + int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 418c86e69bcb..eaee585c1cea 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -50,6 +50,10 @@ int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); +int __bpf_prog_attach(int prog1, int prog2, + int attachable_fd, + enum bpf_attach_type type, + unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index acbd60519467..73092d4a898e 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -65,6 +65,13 @@ static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; +static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = + (void *) BPF_FUNC_sk_redirect_map; +static int (*bpf_sock_map_update)(void *map, void *key, void *value, + unsigned long long flags, + unsigned long long map_lags) = + (void *) BPF_FUNC_sock_map_update; + /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3 From ad17d0e6c708805bf9e6686eb747cc528b702e67 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:01 -0700 Subject: bpf: Allow numa selection in INNER_LRU_HASH_PREALLOC test of map_perf_test This patch makes the needed changes to allow each process of the INNER_LRU_HASH_PREALLOC test to provide its numa node id when creating the lru map. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/bpf_load.c | 21 ++++++++++++-------- samples/bpf/bpf_load.h | 1 + samples/bpf/map_perf_test_kern.c | 2 ++ samples/bpf/map_perf_test_user.c | 12 +++++++++--- tools/include/uapi/linux/bpf.h | 10 +++++++++- tools/lib/bpf/bpf.c | 32 +++++++++++++++++++++++++++---- tools/lib/bpf/bpf.h | 6 ++++++ tools/testing/selftests/bpf/bpf_helpers.h | 1 + 8 files changed, 69 insertions(+), 16 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index a8552b8a2ab6..6aa50098dfb8 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -201,7 +201,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) static int load_maps(struct bpf_map_data *maps, int nr_maps, fixup_map_cb fixup_map) { - int i; + int i, numa_node; for (i = 0; i < nr_maps; i++) { if (fixup_map) { @@ -213,21 +213,26 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, } } + numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? + maps[i].def.numa_node : -1; + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - map_fd[i] = bpf_create_map_in_map(maps[i].def.type, + map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, - maps[i].def.map_flags); + maps[i].def.map_flags, + numa_node); } else { - map_fd[i] = bpf_create_map(maps[i].def.type, - maps[i].def.key_size, - maps[i].def.value_size, - maps[i].def.max_entries, - maps[i].def.map_flags); + map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags, + numa_node); } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index ca0563d04744..453e3226b4ce 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -13,6 +13,7 @@ struct bpf_map_def { unsigned int max_entries; unsigned int map_flags; unsigned int inner_map_idx; + unsigned int numa_node; }; struct bpf_map_data { diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 245165817fbe..ca3b22ed577a 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -40,6 +40,8 @@ struct bpf_map_def SEC("maps") inner_lru_hash_map = { .key_size = sizeof(u32), .value_size = sizeof(long), .max_entries = MAX_ENTRIES, + .map_flags = BPF_F_NUMA_NODE, + .numa_node = 0, }; struct bpf_map_def SEC("maps") array_of_lru_hashs = { diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 1a8894b5ac51..bccbf8478e43 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -97,14 +97,20 @@ static void do_test_lru(enum test_type test, int cpu) if (test == INNER_LRU_HASH_PREALLOC) { int outer_fd = map_fd[array_of_lru_hashs_idx]; + unsigned int mycpu, mynode; assert(cpu < MAX_NR_CPUS); if (cpu) { + ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); + assert(!ret); + inner_lru_map_fds[cpu] = - bpf_create_map(BPF_MAP_TYPE_LRU_HASH, - sizeof(uint32_t), sizeof(long), - inner_lru_hash_size, 0); + bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, 0, + mynode); if (inner_lru_map_fds[cpu] == -1) { printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", strerror(errno), errno); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d97dd27c8f6..f8f6377fd541 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_sockmap_flags { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list @@ -176,6 +177,8 @@ enum bpf_sockmap_flags { * across different LRU lists. */ #define BPF_F_NO_COMMON_LRU (1U << 1) +/* Specify numa node during map creation */ +#define BPF_F_NUMA_NODE (1U << 2) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ @@ -183,8 +186,13 @@ union bpf_attr { __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* prealloc or not */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 77660157a684..a0717610b116 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -57,8 +57,9 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, return syscall(__NR_bpf, cmd, attr, size); } -int bpf_create_map(enum bpf_map_type map_type, int key_size, - int value_size, int max_entries, __u32 map_flags) +int bpf_create_map_node(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries, __u32 map_flags, + int node) { union bpf_attr attr; @@ -69,12 +70,24 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, attr.value_size = value_size; attr.max_entries = max_entries; attr.map_flags = map_flags; + if (node >= 0) { + attr.map_flags |= BPF_F_NUMA_NODE; + attr.numa_node = node; + } return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } -int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, - int inner_map_fd, int max_entries, __u32 map_flags) +int bpf_create_map(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries, __u32 map_flags) +{ + return bpf_create_map_node(map_type, key_size, value_size, + max_entries, map_flags, -1); +} + +int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags, int node) { union bpf_attr attr; @@ -86,10 +99,21 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, attr.inner_map_fd = inner_map_fd; attr.max_entries = max_entries; attr.map_flags = map_flags; + if (node >= 0) { + attr.map_flags |= BPF_F_NUMA_NODE; + attr.numa_node = node; + } return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } +int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, + int inner_map_fd, int max_entries, __u32 map_flags) +{ + return bpf_create_map_in_map_node(map_type, key_size, inner_map_fd, + max_entries, map_flags, -1); +} + int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, size_t log_buf_sz) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index eaee585c1cea..90e9d4e85d08 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -24,8 +24,14 @@ #include #include +int bpf_create_map_node(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries, __u32 map_flags, + int node); int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags); +int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags, int node); int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, int inner_map_fd, int max_entries, __u32 map_flags); diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 73092d4a898e..98f3be26d390 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -94,6 +94,7 @@ struct bpf_map_def { unsigned int max_entries; unsigned int map_flags; unsigned int inner_map_idx; + unsigned int numa_node; }; static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = -- cgit v1.2.3 From 52839e653b5629bd237ad2ecdd8fdd897fcd5712 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Aug 2017 15:21:55 -0700 Subject: perf tools: Add support for printing new mem_info encodings Add decoding for the new "lvlx" and "snoopx" meminfo fields added earlier to the kernel so that "perf mem report" and other tools can print it properly. v2: Merge with persistent memory patch. Switch to new bit encoding for each combination. v3: Switch to generic lvlnum field. Signed-off-by: Andi Kleen Acked-by: Peter Zijlstra Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170816222156.19953-4-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 30 ++++++++++++++++++++++-- tools/perf/util/mem-events.c | 43 ++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 5 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 642db5fa3286..2a37ae925d85 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -954,14 +954,20 @@ union perf_mem_data_src { mem_snoop:5, /* snoop mode */ mem_lock:2, /* lock instr */ mem_dtlb:7, /* tlb access */ - mem_rsvd:31; + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_rsvd:24; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:31, + __u64 mem_rsvd:24, + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ mem_dtlb:7, /* tlb access */ mem_lock:2, /* lock instr */ mem_snoop:5, /* snoop mode */ @@ -998,6 +1004,22 @@ union perf_mem_data_src { #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ #define PERF_MEM_LVL_SHIFT 5 +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0xa available */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + /* snoop mode */ #define PERF_MEM_SNOOP_NA 0x01 /* not available */ #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ @@ -1006,6 +1028,10 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ #define PERF_MEM_SNOOP_SHIFT 19 +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +/* 1 free */ +#define PERF_MEM_SNOOPX_SHIFT 37 + /* locked instruction */ #define PERF_MEM_LOCK_NA 0x01 /* not available */ #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 06f5a3a4295c..ced4f3fff035 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -166,11 +166,20 @@ static const char * const mem_lvl[] = { "Uncached", }; +static const char * const mem_lvlnum[] = { + [PERF_MEM_LVLNUM_ANY_CACHE] = "Any cache", + [PERF_MEM_LVLNUM_LFB] = "LFB", + [PERF_MEM_LVLNUM_RAM] = "RAM", + [PERF_MEM_LVLNUM_PMEM] = "PMEM", + [PERF_MEM_LVLNUM_NA] = "N/A", +}; + int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { size_t i, l = 0; u64 m = PERF_MEM_LVL_NA; u64 hit, miss; + int printed; if (mem_info) m = mem_info->data_src.mem_lvl; @@ -184,17 +193,37 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) /* already taken care of */ m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS); + + if (mem_info && mem_info->data_src.mem_remote) { + strcat(out, "Remote "); + l += 7; + } + + printed = 0; for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) { if (!(m & 0x1)) continue; - if (l) { + if (printed++) { strcat(out, " or "); l += 4; } l += scnprintf(out + l, sz - l, mem_lvl[i]); } - if (*out == '\0') - l += scnprintf(out, sz - l, "N/A"); + + if (mem_info && mem_info->data_src.mem_lvl_num) { + int lvl = mem_info->data_src.mem_lvl_num; + if (printed++) { + strcat(out, " or "); + l += 4; + } + if (mem_lvlnum[lvl]) + l += scnprintf(out + l, sz - l, mem_lvlnum[lvl]); + else + l += scnprintf(out + l, sz - l, "L%d", lvl); + } + + if (l == 0) + l += scnprintf(out + l, sz - l, "N/A"); if (hit) l += scnprintf(out + l, sz - l, " hit"); if (miss) @@ -231,6 +260,14 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) } l += scnprintf(out + l, sz - l, snoop_access[i]); } + if (mem_info && + (mem_info->data_src.mem_snoopx & PERF_MEM_SNOOPX_FWD)) { + if (l) { + strcat(out, " or "); + l += 4; + } + l += scnprintf(out + l, sz - l, "Fwd"); + } if (*out == '\0') l += scnprintf(out, sz - l, "N/A"); -- cgit v1.2.3 From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:04 -0700 Subject: bpf: convert sockmap field attach_bpf_fd2 to type In the initial sockmap API we provided strparser and verdict programs using a single attach command by extending the attach API with a the attach_bpf_fd2 field. However, if we add other programs in the future we will be adding a field for every new possible type, attach_bpf_fd(3,4,..). This seems a bit clumsy for an API. So lets push the programs using two new type fields. BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT This has the advantage of having a readable name and can easily be extended in the future. Updates to samples and sockmap included here also generalize tests slightly to support upcoming patch for multiple map support. Signed-off-by: John Fastabend Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 10 +- include/uapi/linux/bpf.h | 9 +- kernel/bpf/sockmap.c | 25 ++-- kernel/bpf/syscall.c | 38 ++---- samples/sockmap/sockmap_kern.c | 6 +- samples/sockmap/sockmap_user.c | 12 +- tools/include/uapi/linux/bpf.h | 9 +- tools/lib/bpf/bpf.c | 14 +-- tools/lib/bpf/bpf.h | 4 - tools/testing/selftests/bpf/bpf_helpers.h | 3 +- tools/testing/selftests/bpf/sockmap_parse_prog.c | 2 +- tools/testing/selftests/bpf/sockmap_verdict_prog.c | 2 +- tools/testing/selftests/bpf/test_maps.c | 133 +++++++++------------ 13 files changed, 116 insertions(+), 151 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 830f472d8df5..c2cb1b5c094e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,8 +39,6 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); - int (*map_attach)(struct bpf_map *map, - struct bpf_prog *p1, struct bpf_prog *p2); }; struct bpf_map { @@ -387,11 +385,19 @@ static inline void __dev_map_flush(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { return NULL; } + +static inline int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) +{ + return -EOPNOTSUPP; +} #endif /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 843818dff96d..97227be3690c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -224,7 +225,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -580,14 +580,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 617c239590c2..cf570d108fd5 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -723,20 +723,24 @@ out: return err; } -static int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *parse, - struct bpf_prog *verdict) +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_prog *_parse, *_verdict; + struct bpf_prog *orig; - _parse = xchg(&stab->bpf_parse, parse); - _verdict = xchg(&stab->bpf_verdict, verdict); + switch (type) { + case BPF_SK_SKB_STREAM_PARSER: + orig = xchg(&stab->bpf_parse, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + orig = xchg(&stab->bpf_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } - if (_parse) - bpf_prog_put(_parse); - if (_verdict) - bpf_prog_put(_verdict); + if (orig) + bpf_prog_put(orig); return 0; } @@ -777,7 +781,6 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_attach = sock_map_attach_prog, }; BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9378f3ba2cbf..021a05d9d800 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1093,12 +1093,12 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +static int sockmap_get_from_fd(const union bpf_attr *attr) { - struct bpf_prog *prog1, *prog2; int ufd = attr->target_fd; + struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1108,29 +1108,16 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) if (IS_ERR(map)) return PTR_ERR(map); - if (!map->ops->map_attach) { - fdput(f); - return -EOPNOTSUPP; - } - - prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog1)) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { fdput(f); - return PTR_ERR(prog1); - } - - prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); - if (IS_ERR(prog2)) { - fdput(f); - bpf_prog_put(prog1); - return PTR_ERR(prog2); + return PTR_ERR(prog); } - err = map->ops->map_attach(map, prog1, prog2); + err = sock_map_attach_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog1); - bpf_prog_put(prog2); + bpf_prog_put(prog); return err; } @@ -1165,16 +1152,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_SMAP_INGRESS: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + return sockmap_get_from_fd(attr); default: return -EINVAL; } - if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) - return sockmap_get_from_fd(attr, ptype); - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 6ff986f7059b..f9b38ef82dc2 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -82,8 +82,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (lport == 10000) { ret = 1; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("passive(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } @@ -95,8 +94,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (bpf_ntohl(rport) == 10001) { ret = 10; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("active(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index fb78f5abefb4..7cc9d228216f 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -256,8 +256,16 @@ int main(int argc, char **argv) } /* Attach programs to sockmap */ - err = __bpf_prog_attach(prog_fd[0], prog_fd[1], map_fd[0], - BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[1], map_fd[0], + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", err, strerror(errno)); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f8f6377fd541..09ac590eefb1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -227,7 +228,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -572,14 +572,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a0717610b116..1d6907d379c9 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -235,28 +235,20 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int __bpf_prog_attach(int prog_fd1, int prog_fd2, int target_fd, - enum bpf_attach_type type, - unsigned int flags) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; bzero(&attr, sizeof(attr)); attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd1; - attr.attach_bpf_fd2 = prog_fd2; + attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, - unsigned int flags) -{ - return __bpf_prog_attach(prog_fd, 0, target_fd, type, flags); -} - int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 90e9d4e85d08..b8ea5843c39e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -56,10 +56,6 @@ int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); -int __bpf_prog_attach(int prog1, int prog2, - int attachable_fd, - enum bpf_attach_type type, - unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 98f3be26d390..36fb9161b34a 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -68,8 +68,7 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, - unsigned long long flags, - unsigned long long map_lags) = + unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/sockmap_parse_prog.c index 8b5453158399..710f43f42dc4 100644 --- a/tools/testing/selftests/bpf/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/sockmap_parse_prog.c @@ -30,7 +30,7 @@ int bpf_prog1(struct __sk_buff *skb) */ d[0] = 1; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("parse: data[0] = (%u): local_port %i remote %i\n", d[0], lport, bpf_ntohl(rport)); return skb->len; } diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c index d5f9447b3808..0573c1db2519 100644 --- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c @@ -40,7 +40,7 @@ int bpf_prog2(struct __sk_buff *skb) d[6] = 0xe; d[7] = 0xf; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("verdict: data[0] = (%u): local_port %i remote %i redirect 5\n", d[0], lport, bpf_ntohl(rport)); return bpf_sk_redirect_map(&sock_map, 5, 0); } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 40b2d1faf02b..6df6e6257424 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -547,20 +547,26 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Nothing attached so these should fail */ + /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed invalid update sockmap '%i:%i'\n", + if (err) { + printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; } } /* Test attaching bad fds */ - err = __bpf_prog_attach(-1, -2, fd, BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0); if (!err) { - printf("Failed invalid prog attach\n"); + printf("Failed invalid parser prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!err) { + printf("Failed invalid verdict prog attach\n"); goto out_sockmap; } @@ -591,14 +597,21 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - err = __bpf_prog_attach(parse_prog, verdict_prog, map_fd, - BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(parse_prog, map_fd, + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + printf("Failed bpf prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(verdict_prog, map_fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { printf("Failed bpf prog attach\n"); goto out_sockmap; } - /* Test map update elem */ + /* Test map update elem afterwards fd lives in fd and map_fd */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); if (err) { @@ -649,96 +662,68 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Delete the reset of the elems include some NULL elems */ - for (i = 0; i < 6; i++) { - err = bpf_map_delete_elem(map_fd, &i); - if (err && (i == 0 || i == 1 || i >= 4)) { - printf("Failed delete sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } else if (!err && (i == 2 || i == 3)) { - printf("Failed null delete sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test having multiple SMAPs open and active on same fds */ - err = __bpf_prog_attach(parse_prog, verdict_prog, fd, - BPF_CGROUP_SMAP_INGRESS, 0); - if (err) { - printf("Failed fd bpf prog attach\n"); - goto out_sockmap; - } - - for (i = 0; i < 6; i++) { - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (err) { - printf("Failed fd update sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test duplicate socket add of NOEXIST, ANY and EXIST */ - i = 0; + /* Push fd into same slot */ + i = 2; err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); if (!err) { - printf("Failed BPF_NOEXIST create\n"); + printf("Failed allowed sockmap dup slot BPF_NOEXIST\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed sockmap update new slot BPF_ANY\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed sockmap update new slot BPF_EXIST\n"); goto out_sockmap; } - /* The above were pushing fd into same slot try different slot now */ - i = 2; - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; + /* Delete the elems without programs */ + for (i = 0; i < 6; i++) { + err = bpf_map_delete_elem(fd, &i); + if (err) { + printf("Failed delete sockmap %i '%i:%i'\n", + err, i, sfd[i]); + } } - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + /* Test having multiple maps open and set with programs on same fds */ + err = bpf_prog_attach(parse_prog, fd, + BPF_SK_SKB_STREAM_PARSER, 0); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed fd bpf parse prog attach\n"); goto out_sockmap; } - - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + err = bpf_prog_attach(verdict_prog, fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed fd bpf verdict prog attach\n"); goto out_sockmap; } - /* Try pushing fd into different map, this is not allowed at the - * moment. Which programs would we use? - */ - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed sockmap update BPF_ANY\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_EXIST); - if (!err) { - printf("Failed sockmap update BPF_EXIST\n"); - goto out_sockmap; + for (i = 4; i < 6; i++) { + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + if (!err) { + printf("Failed allowed duplicate programs in update ANY sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); + if (!err) { + printf("Failed allowed duplicate program in update NOEXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + if (!err) { + printf("Failed allowed duplicate program in update EXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } } /* Test map close sockets */ -- cgit v1.2.3 From 3b0a5daa061076b2b75ffc294e74483ad9bf241a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:08 -0400 Subject: perf tools: Support new sample type for physical address Support new sample type PERF_SAMPLE_PHYS_ADDR for physical address. Add new option --phys-data to record sample physical address. Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-2-git-send-email-kan.liang@intel.com [ Added missing printing in evsel.c patch sent by Jiri Olsa ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 4 +++- tools/perf/Documentation/perf-record.txt | 5 ++++- tools/perf/builtin-record.c | 2 ++ tools/perf/perf.h | 1 + tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 19 ++++++++++++++++++- 6 files changed, 29 insertions(+), 3 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 2a37ae925d85..140ae638cfd6 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 9bdea047c5db..e397453e5a46 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -249,7 +249,10 @@ OPTIONS -d:: --data:: - Record the sample addresses. + Record the sample virtual addresses. + +--phys-data:: + Record the sample physical addresses. -T:: --timestamp:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 36d7117a7562..56f8142ff97f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1604,6 +1604,8 @@ static struct option __record_options[] = { OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, "per thread counts"), OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), + OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, + "Record the sample physical addresses"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2c010dd6a79d..dc442ba21bf6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -43,6 +43,7 @@ struct record_opts { bool no_samples; bool raw_samples; bool sample_address; + bool sample_phys_addr; bool sample_weight; bool sample_time; bool sample_time_set; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 423ac82605f3..ee7bcc898d35 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -200,6 +200,7 @@ struct perf_sample { u32 cpu; u32 raw_size; u64 data_src; + u64 phys_addr; u32 flags; u16 insn_len; u8 cpumode; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d9bd632ed7db..4bb89373eb52 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -955,6 +955,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, if (opts->sample_address) perf_evsel__set_sample_bit(evsel, DATA_SRC); + if (opts->sample_phys_addr) + perf_evsel__set_sample_bit(evsel, PHYS_ADDR); + if (opts->no_buffering) { attr->watermark = 0; attr->wakeup_events = 1; @@ -1464,7 +1467,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), - bit_name(WEIGHT), + bit_name(WEIGHT), bit_name(PHYS_ADDR), { .name = NULL, } }; #undef bit_name @@ -2206,6 +2209,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event, } } + data->phys_addr = 0; + if (type & PERF_SAMPLE_PHYS_ADDR) { + data->phys_addr = *array; + array++; + } + return 0; } @@ -2311,6 +2320,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) + result += sizeof(u64); + return result; } @@ -2500,6 +2512,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) { + *array = sample->phys_addr; + array++; + } + return 0; } -- cgit v1.2.3