diff options
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/bpf-cgroup.h | 25 | ||||
| -rw-r--r-- | include/linux/bpf.h | 52 | ||||
| -rw-r--r-- | include/linux/bpf_local_storage.h | 163 | ||||
| -rw-r--r-- | include/linux/bpf_lsm.h | 29 | ||||
| -rw-r--r-- | include/linux/bpf_types.h | 3 | ||||
| -rw-r--r-- | include/linux/btf.h | 3 | ||||
| -rw-r--r-- | include/linux/btf_ids.h | 51 | ||||
| -rw-r--r-- | include/linux/filter.h | 8 | ||||
| -rw-r--r-- | include/linux/if_tun.h | 19 | ||||
| -rw-r--r-- | include/linux/netdevice.h | 10 | ||||
| -rw-r--r-- | include/linux/rcupdate_trace.h | 9 | ||||
| -rw-r--r-- | include/linux/skmsg.h | 17 | ||||
| -rw-r--r-- | include/linux/tcp.h | 20 | ||||
| -rw-r--r-- | include/net/bpf_sk_storage.h | 14 | ||||
| -rw-r--r-- | include/net/inet_connection_sock.h | 2 | ||||
| -rw-r--r-- | include/net/request_sock.h | 9 | ||||
| -rw-r--r-- | include/net/sock.h | 4 | ||||
| -rw-r--r-- | include/net/tcp.h | 59 | ||||
| -rw-r--r-- | include/net/xdp_sock.h | 30 | ||||
| -rw-r--r-- | include/net/xdp_sock_drv.h | 122 | ||||
| -rw-r--r-- | include/net/xsk_buff_pool.h | 53 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 398 |
22 files changed, 967 insertions, 133 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 55f694b63164..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,25 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + + /* map_meta_equal must be implemented for maps that can be + * used as an inner map. It is a runtime check to ensure + * an inner map can be inserted to an outer map. + * + * Some properties of the inner map has been used during the + * verification time. When inserting an inner map at the runtime, + * map_meta_equal has to ensure the inserting map has the same + * properties that the verifier has used earlier. + */ + bool (*map_meta_equal)(const struct bpf_map *meta0, + const struct bpf_map *meta1); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -227,6 +248,9 @@ int map_check_no_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ @@ -309,6 +333,7 @@ struct bpf_func_proto { * for this argument. */ int *ret_btf_id; /* return value btf_id */ + bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -514,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -709,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1218,12 +1246,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; @@ -1250,6 +1284,10 @@ int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); @@ -1340,6 +1378,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id); int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int); @@ -1358,6 +1398,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); +struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ @@ -1637,6 +1678,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -1658,6 +1700,12 @@ static inline int sock_map_prog_detach(const union bpf_attr *attr, { return -EOPNOTSUPP; } + +static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) @@ -1736,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -1850,4 +1899,7 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +struct btf_id_set; +bool btf_id_set_contains(struct btf_id_set *set, u32 id); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h new file mode 100644 index 000000000000..b2c9463f36a1 --- /dev/null +++ b/include/linux/bpf_local_storage.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#ifndef _BPF_LOCAL_STORAGE_H +#define _BPF_LOCAL_STORAGE_H + +#include <linux/bpf.h> +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <uapi/linux/btf.h> + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_map_bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_local_storage_elem. + * Instead, the container object (eg. sk->sk_bpf_storage) is. + * + * The map (bpf_local_storage_map) is for two purposes + * 1. Define the size of the "local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular object, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * the respective bpf_local_storage owned by the object. + * + * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer + * as the searching key. + */ +struct bpf_local_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bpf_local_storage_map_bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_local_storage_data { + /* smap is used as the searching key when looking up + * from the object's bpf_local_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_local_storage_map __rcu *smap; + u8 data[] __aligned(8); +}; + +/* Linked to bpf_local_storage and bpf_local_storage_map */ +struct bpf_local_storage_elem { + struct hlist_node map_node; /* Linked to bpf_local_storage_map */ + struct hlist_node snode; /* Linked to bpf_local_storage */ + struct bpf_local_storage __rcu *local_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_local_storage_data sdata ____cacheline_aligned; +}; + +struct bpf_local_storage { + struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_local_storage_elem */ + void *owner; /* The object that owns the above "list" of + * bpf_local_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - \ + sizeof(struct bpf_local_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_local_storage_elem))) + +#define SELEM(_SDATA) \ + container_of((_SDATA), struct bpf_local_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_cache { + spinlock_t idx_lock; + u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; +}; + +#define DEFINE_BPF_STORAGE_CACHE(name) \ +static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx); + +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + +#endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/btf.h b/include/linux/btf.h index 8b81fbb4497c..a9af5e7a7ece 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -64,8 +64,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems); + u32 *type_size); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 4867d549e3c1..210b086188a3 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -3,6 +3,11 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +struct btf_id_set { + u32 cnt; + u32 ids[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include <linux/compiler.h> /* for __PASTE */ @@ -62,7 +67,7 @@ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " " #name "; \n" \ #name ":; \n" \ -".popsection; \n"); \ +".popsection; \n"); #define BTF_ID_LIST(name) \ __BTF_ID_LIST(name, local) \ @@ -88,12 +93,56 @@ asm( \ ".zero 4 \n" \ ".popsection; \n"); +/* + * The BTF_SET_START/END macros pair defines sorted list of + * BTF IDs plus its members count, with following layout: + * + * BTF_SET_START(list) + * BTF_ID(type1, name1) + * BTF_ID(type2, name2) + * BTF_SET_END(list) + * + * __BTF_ID__set__list: + * .zero 4 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * __BTF_ID__type2__name2__4: + * .zero 4 + * + */ +#define __BTF_SET_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set__" #name "; \n" \ +"__BTF_ID__set__" #name ":; \n" \ +".zero 4 \n" \ +".popsection; \n"); + +#define BTF_SET_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET_START(name, local) + +#define BTF_SET_START_GLOBAL(name) \ +__BTF_ID_LIST(name, globl) \ +__BTF_SET_START(name, globl) + +#define BTF_SET_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set name; + #else #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a355b005bf4..995625950cc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1236,13 +1236,17 @@ struct bpf_sock_addr_kern { struct bpf_sock_ops_kern { struct sock *sk; - u32 op; union { u32 args[4]; u32 reply; u32 replylong[4]; }; - u32 is_fullsock; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; + u8 op; + u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 5bda8cf457b6..2a7660843444 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -27,9 +27,18 @@ struct tun_xdp_hdr { #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(void *ptr); -void *tun_ptr_to_xdp(void *ptr); +static inline bool tun_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & TUN_XDP_FLAG; +} +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) +{ + return (void *)((unsigned long)xdp | TUN_XDP_FLAG); +} +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); +} void tun_ptr_free(void *ptr); #else #include <linux/err.h> @@ -48,11 +57,11 @@ static inline bool tun_is_xdp_frame(void *ptr) { return false; } -static inline void *tun_xdp_to_ptr(void *ptr) +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } -static inline void *tun_ptr_to_xdp(void *ptr) +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0b512e6a02b..7f9fcfd15942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -618,7 +618,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif /* * write-mostly part @@ -755,7 +755,7 @@ struct netdev_rx_queue { struct net_device *dev; struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif } ____cacheline_aligned_in_smp; @@ -883,7 +883,7 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, - XDP_SETUP_XSK_UMEM, + XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; @@ -917,9 +917,9 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_SETUP_XSK_POOL */ struct { - struct xdp_umem *umem; + struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..aaaac8ac927c 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -82,7 +82,14 @@ static inline void rcu_read_unlock_trace(void) void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); - +#else +/* + * The BPF JIT forms these addresses even when it doesn't call these + * functions, so provide definitions that result in runtime errors. + */ +static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } +static inline void rcu_read_lock_trace(void) { BUG(); } +static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ #endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 1e9ed840b9fc..3119928fc103 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,23 +340,6 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - /* Initialize saved callbacks and original proto only once, since this - * function may be called multiple times for a psock, e.g. when - * psock->progs.msg_parser is updated. - * - * Since we've not installed the new proto, psock is not yet in use and - * we can initialize it without synchronization. - */ - if (!psock->sk_proto) { - struct proto *orig = READ_ONCE(sk->sk_prot); - - psock->saved_unhash = orig->unhash; - psock->saved_close = orig->close; - psock->saved_write_space = sk->sk_write_space; - - psock->sk_proto = orig; - } - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 14b62d7df942..56ff2952edaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -92,6 +92,8 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ + u8 saw_unknown:1, /* Received unknown option */ + unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -237,14 +239,13 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 syn_data:1, /* SYN includes data */ + u8 save_syn:2, /* Save headers of SYN packet */ + syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -391,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -406,7 +410,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; - u32 *saved_syn; + struct saved_syn *saved_syn; }; enum tsq_enum { @@ -484,6 +488,12 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) +{ + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 5036c94c0503..119f4c9c3a9c 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,13 +3,27 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_local_storage.h> + struct sock; void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto sk_storage_get_btf_proto; +extern const struct bpf_func_proto sk_storage_delete_btf_proto; +struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index aa8893c68c50..c738abeb3265 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,8 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_rto_min; + __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b2eb8b4ba697..29e41ff3ec93 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -41,6 +41,13 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); +struct saved_syn { + u32 mac_hdrlen; + u32 network_hdrlen; + u32 tcp_hdrlen; + u8 data[]; +}; + /* struct request_sock - mini sock to represent a connection request */ struct request_sock { @@ -60,7 +67,7 @@ struct request_sock { struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; - u32 *saved_syn; + struct saved_syn *saved_syn; u32 secid; u32 peer_secid; }; diff --git a/include/net/sock.h b/include/net/sock.h index b943731fa879..7dd3051551fb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -246,7 +246,7 @@ struct sock_common { /* public: */ }; -struct bpf_sk_storage; +struct bpf_local_storage; /** * struct sock - network layer representation of sockets @@ -517,7 +517,7 @@ struct sock { void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL - struct bpf_sk_storage __rcu *sk_bpf_storage; + struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0d11db6436c8..e85d564446c6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -394,7 +394,7 @@ void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); -void tcp_init_transfer(struct sock *sk, int bpf_op); +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, @@ -455,7 +455,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -699,7 +700,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); @@ -2025,7 +2026,8 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; @@ -2223,6 +2225,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#ifdef CONFIG_CGROUP_BPF +/* Copy the listen sk's HDR_OPT_CB flags to its child. + * + * During 3-Way-HandShake, the synack is usually sent from + * the listen sk with the HDR_OPT_CB flags set so that + * bpf-prog will be called to write the BPF hdr option. + * + * In fastopen, the child sk is used to send synack instead + * of the listen sk. Thus, inheriting the HDR_OPT_CB flags + * from the listen sk gives the bpf-prog a chance to write + * BPF hdr option in the synack pkt during fastopen. + * + * Both fastopen and non-fastopen child will inherit the + * HDR_OPT_CB flags to keep the bpf-prog having a consistent + * behavior when deciding to clear this cb flags (or not) + * during the PASSIVE_ESTABLISHED_CB. + * + * In the future, other cb flags could be inherited here also. + */ +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ + tcp_sk(child)->bpf_sock_ops_cb_flags = + tcp_sk(sk)->bpf_sock_ops_cb_flags & + (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9d87cc40c11..1a9559c0cbdd 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,25 +18,19 @@ struct xsk_queue; struct xdp_buff; struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; - struct xsk_buff_pool *pool; + void *addrs; u64 size; u32 headroom; u32 chunk_size; + u32 chunks; + u32 npgs; struct user_struct *user; refcount_t users; - struct work_struct work; - struct page **pgs; - u32 npgs; - u16 queue_id; - u8 need_wakeup; u8 flags; - int id; - struct net_device *dev; bool zc; - spinlock_t xsk_tx_list_lock; - struct list_head xsk_tx_list; + struct page **pgs; + int id; + struct list_head xsk_dma_list; }; struct xsk_map { @@ -48,10 +42,11 @@ struct xsk_map { struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; - struct xsk_queue *rx; + struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; + struct xsk_buff_pool *pool; u16 queue_id; bool zc; enum { @@ -59,10 +54,9 @@ struct xdp_sock { XSK_BOUND, XSK_UNBOUND, } state; - /* Protects multiple processes in the control path */ - struct mutex mutex; + struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; + struct list_head tx_list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ @@ -77,6 +71,10 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + /* Protects multiple processes in the control path */ + struct mutex mutex; + struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ + struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; #ifdef CONFIG_XDP_SOCKETS diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index ccf848f7efa4..5b1ee8a9976d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,47 +11,50 @@ #ifdef CONFIG_XDP_SOCKETS -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +void xsk_tx_release(struct xsk_buff_pool *pool); +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, + u16 queue_id); +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { - return XDP_PACKET_HEADROOM + umem->headroom; + return XDP_PACKET_HEADROOM + pool->headroom; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { - return umem->chunk_size; + return pool->chunk_size; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { - return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); + return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { - xp_set_rxq_info(umem->pool, rxq); + xp_set_rxq_info(pool, rxq); } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { - xp_dma_unmap(umem->pool, attrs); + xp_dma_unmap(pool, attrs); } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { - return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); + struct xdp_umem *umem = pool->umem; + + return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) @@ -68,14 +71,14 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return xp_get_frame_dma(xskb); } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { - return xp_alloc(umem->pool); + return xp_alloc(pool); } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { - return xp_can_alloc(umem->pool, count); + return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) @@ -85,100 +88,104 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { - return xp_raw_get_dma(umem->pool, addr); + return xp_raw_get_dma(pool, addr); } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { - return xp_raw_get_data(umem->pool, addr); + return xp_raw_get_data(pool, addr); } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + if (!pool->dma_need_sync) + return; + xp_dma_sync_for_cpu(xskb); } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - xp_dma_sync_for_device(umem->pool, dma, size); + xp_dma_sync_for_device(pool, dma, size); } #else -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) +static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { return false; } -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xsk_buff_pool * +xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { return 0; } @@ -193,12 +200,12 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return 0; } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } @@ -207,21 +214,22 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { return 0; } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 6842990e2712..0140d086dc84 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -13,6 +13,8 @@ struct xsk_buff_pool; struct xdp_rxq_info; struct xsk_queue; struct xdp_desc; +struct xdp_umem; +struct xdp_sock; struct device; struct page; @@ -26,34 +28,68 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_dma_map { + dma_addr_t *dma_pages; + struct device *dev; + struct net_device *netdev; + refcount_t users; + struct list_head list; /* Protected by the RTNL_LOCK */ + u32 dma_pages_cnt; + bool dma_need_sync; +}; + struct xsk_buff_pool { - struct xsk_queue *fq; + /* Members only used in the control path first. */ + struct device *dev; + struct net_device *netdev; + struct list_head xsk_tx_list; + /* Protects modifications to the xsk_tx_list */ + spinlock_t xsk_tx_list_lock; + refcount_t users; + struct xdp_umem *umem; + struct work_struct work; struct list_head free_list; + u32 heads_cnt; + u16 queue_id; + + /* Data path members as close to free_heads at the end as possible. */ + struct xsk_queue *fq ____cacheline_aligned_in_smp; + struct xsk_queue *cq; + /* For performance reasons, each buff pool has its own array of dma_pages + * even when they are identical. + */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; u32 dma_pages_cnt; - u32 heads_cnt; u32 free_heads_cnt; u32 headroom; u32 chunk_size; u32 frame_len; + u8 cached_need_wakeup; + bool uses_need_wakeup; bool dma_need_sync; bool unaligned; void *addrs; - struct device *dev; struct xdp_buff_xsk *free_heads[]; }; /* AF_XDP core. */ -struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, - u32 chunk_size, u32 headroom, u64 size, - bool unaligned); -void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, + struct xdp_umem *umem); +int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, + u16 queue_id, u16 flags); +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, + struct net_device *dev, u16 queue_id); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); +void xp_get_pool(struct xsk_buff_pool *pool); +void xp_put_pool(struct xsk_buff_pool *pool); +void xp_clear_dev(struct xsk_buff_pool *pool); +void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); +void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); @@ -80,9 +116,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (!xskb->pool->dma_need_sync) - return; - xp_dma_sync_for_cpu_slow(xskb); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6238b2209b7..8dda13880957 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -345,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * @@ -2807,7 +2816,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2823,6 +2832,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2835,7 +2847,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -3395,6 +3407,175 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3720,13 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ + FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -3648,9 +3836,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC_<kernel_obj>_storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ @@ -4071,6 +4263,15 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + struct { + __u32 map_id; + } map; + }; + } iter; struct { __u32 netns_ino; __u32 attach_type; @@ -4158,6 +4359,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4166,8 +4397,51 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. @@ -4223,6 +4497,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4250,6 +4581,63 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { |
