From 136f282028dae7d9dd68469b197aa2b36b410992 Mon Sep 17 00:00:00 2001 From: Miguel Bernal Marin Date: Wed, 6 Oct 2021 00:13:18 -0500 Subject: ACPI: tools: fix compilation error When ACPI tools are compiled, the following error is showed: $ cd tools/power/acpi $ make DESCEND tools/acpidbg MKDIR include CP include CC tools/acpidbg/acpidbg.o In file included from /home/linux/tools/power/acpi/include/acpi/platform/acenv.h:152, from /home/linux/tools/power/acpi/include/acpi/acpi.h:22, from acpidbg.c:9: /home/linux/tools/power/acpi/include/acpi/platform/acgcc.h:25:10: fatal error: linux/stdarg.h: No such file or directory 29 | #include | ^~~~~~~~~~~~~~~~ compilation terminated. Use the ACPICA logic: just identify when it is used inside the kernel or by an ACPI tool. Fixes: c0891ac15f04 ("isystem: ship and use stdarg.h") Signed-off-by: Miguel Bernal Marin [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/acpi/platform/acgcc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index fb172a03a753..20ecb004f5a4 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -22,9 +22,14 @@ typedef __builtin_va_list va_list; #define va_arg(v, l) __builtin_va_arg(v, l) #define va_copy(d, s) __builtin_va_copy(d, s) #else +#ifdef __KERNEL__ #include -#endif -#endif +#else +/* Used to build acpi tools */ +#include +#endif /* __KERNEL__ */ +#endif /* ACPI_USE_BUILTIN_STDARG */ +#endif /* ! va_arg */ #define ACPI_INLINE __inline__ -- cgit v1.2.3 From fadb7ff1a6c2c565af56b4aacdd086b067eed440 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 14 Oct 2021 15:25:53 +0100 Subject: bpf: Prevent increasing bpf_jit_limit above max Restrict bpf_jit_limit to the maximum supported by the arch's JIT. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211014142554.53120-4-lmb@cloudflare.com --- include/linux/filter.h | 1 + kernel/bpf/core.c | 4 +++- net/core/sysctl_net_core.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4a93c12543ee..ef03ff34234d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1051,6 +1051,7 @@ extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; +extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6b7dfdd8066..c1e7eb3f1876 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -524,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; +long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) @@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void) static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, PAGE_SIZE), LONG_MAX); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8496c1142c9..5f88526ad61c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, - .extra2 = &long_max, + .extra2 = &bpf_jit_limit_max, }, #endif { -- cgit v1.2.3 From 09b1d5dc6ce1c9151777f6c4e128a59457704c97 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 Oct 2021 13:31:12 +0200 Subject: cfg80211: fix management registrations locking The management registrations locking was broken, the list was locked for each wdev, but cfg80211_mgmt_registrations_update() iterated it without holding all the correct spinlocks, causing list corruption. Rather than trying to fix it with fine-grained locking, just move the lock to the wiphy/rdev (still need the list on each wdev), we already need to hold the wdev lock to change it, so there's no contention on the lock in any case. This trivially fixes the bug since we hold one wdev's lock already, and now will hold the lock that protects all lists. Cc: stable@vger.kernel.org Reported-by: Jouni Malinen Fixes: 6cd536fe62ef ("cfg80211: change internal management frame registration API") Link: https://lore.kernel.org/r/20211025133111.5cf733eab0f4.I7b0abb0494ab712f74e2efcd24bb31ac33f7eee9@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 -- net/wireless/core.c | 2 +- net/wireless/core.h | 2 ++ net/wireless/mlme.c | 26 ++++++++++++++------------ 4 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 62dd8422e0dc..27336fc70467 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5376,7 +5376,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * netdev and may otherwise be used by driver read-only, will be update * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames - * @mgmt_registrations_lock: lock for the list * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver * @mtx: mutex used to lock data in this struct, may be used by drivers @@ -5423,7 +5422,6 @@ struct wireless_dev { u32 identifier; struct list_head mgmt_registrations; - spinlock_t mgmt_registrations_lock; u8 mgmt_registrations_need_update:1; struct mutex mtx; diff --git a/net/wireless/core.c b/net/wireless/core.c index 03323121ca50..aaba847d79eb 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -524,6 +524,7 @@ use_default_name: INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); + spin_lock_init(&rdev->mgmt_registrations_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -1279,7 +1280,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); - spin_lock_init(&wdev->mgmt_registrations_lock); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index b35d0db12f1d..1720abf36f92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -100,6 +100,8 @@ struct cfg80211_registered_device { struct work_struct propagate_cac_done_wk; struct work_struct mgmt_registrations_update_wk; + /* lock for all wdev lists */ + spinlock_t mgmt_registrations_lock; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 3aa69b375a10..783acd2c4211 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -452,9 +452,9 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) lockdep_assert_held(&rdev->wiphy.mtx); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } @@ -479,7 +479,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } @@ -503,6 +503,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -548,7 +549,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (!nreg) return -ENOMEM; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); @@ -583,7 +584,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); @@ -591,7 +592,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, out: kfree(nreg); - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } @@ -602,7 +603,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) @@ -615,7 +616,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) schedule_work(&rdev->mgmt_registrations_update_wk); } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; @@ -628,15 +629,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(®->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } @@ -784,7 +786,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, data = buf + ieee80211_hdrlen(mgmt->frame_control); data_len = len - ieee80211_hdrlen(mgmt->frame_control); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) @@ -808,7 +810,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, break; } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; -- cgit v1.2.3 From 9122a70a6333705c0c35614ddc51c274ed1d3637 Mon Sep 17 00:00:00 2001 From: Cyril Strejc Date: Sun, 24 Oct 2021 22:14:25 +0200 Subject: net: multicast: calculate csum of looped-back and forwarded packets During a testing of an user-space application which transmits UDP multicast datagrams and utilizes multicast routing to send the UDP datagrams out of defined network interfaces, I've found a multicast router does not fill-in UDP checksum into locally produced, looped-back and forwarded UDP datagrams, if an original output NIC the datagrams are sent to has UDP TX checksum offload enabled. The datagrams are sent malformed out of the NIC the datagrams have been forwarded to. It is because: 1. If TX checksum offload is enabled on the output NIC, UDP checksum is not calculated by kernel and is not filled into skb data. 2. dev_loopback_xmit(), which is called solely by ip_mc_finish_output(), sets skb->ip_summed = CHECKSUM_UNNECESSARY unconditionally. 3. Since 35fc92a9 ("[NET]: Allow forwarding of ip_summed except CHECKSUM_COMPLETE"), the ip_summed value is preserved during forwarding. 4. If ip_summed != CHECKSUM_PARTIAL, checksum is not calculated during a packet egress. The minimum fix in dev_loopback_xmit(): 1. Preserves skb->ip_summed CHECKSUM_PARTIAL. This is the case when the original output NIC has TX checksum offload enabled. The effects are: a) If the forwarding destination interface supports TX checksum offloading, the NIC driver is responsible to fill-in the checksum. b) If the forwarding destination interface does NOT support TX checksum offloading, checksums are filled-in by kernel before skb is submitted to the NIC driver. c) For local delivery, checksum validation is skipped as in the case of CHECKSUM_UNNECESSARY, thanks to skb_csum_unnecessary(). 2. Translates ip_summed CHECKSUM_NONE to CHECKSUM_UNNECESSARY. It means, for CHECKSUM_NONE, the behavior is unmodified and is there to skip a looped-back packet local delivery checksum validation. Signed-off-by: Cyril Strejc Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 5 +++-- net/core/dev.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 360df454356c..909ecf447e0f 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -494,8 +494,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit - * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this - * specific case, where PARTIAL is both correct and required. + * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. + * Reset in this specific case, where PARTIAL is both correct and + * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/net/core/dev.c b/net/core/dev.c index ea2366497697..eb3a366bf212 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3912,7 +3912,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); netif_rx_ni(skb); -- cgit v1.2.3 From 7b50ecfcc6cdfe87488576bc3ed443dc8d083b90 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 8 Oct 2021 13:33:03 -0700 Subject: net: Rename ->stream_memory_read to ->sock_is_readable The proto ops ->stream_memory_read() is currently only used by TCP to check whether psock queue is empty or not. We need to rename it before reusing it for non-TCP protocols, and adjust the exsiting users accordingly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211008203306.37525-2-xiyou.wangcong@gmail.com --- include/net/sock.h | 8 +++++++- include/net/tls.h | 2 +- net/ipv4/tcp.c | 5 +---- net/ipv4/tcp_bpf.c | 4 ++-- net/tls/tls_main.c | 4 ++-- net/tls/tls_sw.c | 2 +- 6 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ea6fbc88c8f9..463f390d90b3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1208,7 +1208,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk, int wake); - bool (*stream_memory_read)(const struct sock *sk); + bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); @@ -2820,4 +2820,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +static inline bool sk_is_readable(struct sock *sk) +{ + if (sk->sk_prot->sock_is_readable) + return sk->sk_prot->sock_is_readable(sk); + return false; +} #endif /* _SOCK_H */ diff --git a/include/net/tls.h b/include/net/tls.h index be4b3e1cac46..01d2e3744393 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -375,7 +375,7 @@ void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -bool tls_sw_stream_read(const struct sock *sk); +bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8b48df73c85..f5c336f8b0c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; - - if (sk->sk_prot->stream_memory_read) - return sk->sk_prot->stream_memory_read(sk); - return false; + return sk_is_readable(sk); } /* diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 9d068153c316..7e71e9e278cb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -150,7 +150,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL -static bool tcp_bpf_stream_read(const struct sock *sk) +static bool tcp_bpf_sock_is_readable(struct sock *sk) { struct sk_psock *psock; bool empty = true; @@ -491,7 +491,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + prot[TCP_BPF_BASE].sock_is_readable = tcp_bpf_sock_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index fde56ff49163..9ab81db8a654 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4feb95e34b64..d5d09bd817b7 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2026,7 +2026,7 @@ splice_read_end: return copied ? : err; } -bool tls_sw_stream_read(const struct sock *sk) +bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); -- cgit v1.2.3 From fb4e0a5e73d4bb5ab69b7905abd2ec3b580e9b59 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 8 Oct 2021 13:33:04 -0700 Subject: skmsg: Extract and reuse sk_msg_is_readable() tcp_bpf_sock_is_readable() is pretty much generic, we can extract it and reuse it for non-TCP sockets. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211008203306.37525-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 1 + net/core/skmsg.c | 14 ++++++++++++++ net/ipv4/tcp_bpf.c | 15 +-------------- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14ab0c0bc924..1ce9a9eb223b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2d6249b28928..a86ef7e844f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); +bool sk_msg_is_readable(struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} +EXPORT_SYMBOL_GPL(sk_msg_is_readable); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7e71e9e278cb..5f4d6f45d87f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL -static bool tcp_bpf_sock_is_readable(struct sock *sk) -{ - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { @@ -491,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].sock_is_readable = tcp_bpf_sock_is_readable; + prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; -- cgit v1.2.3 From 99d0a3831e3500d945162cdb2310e3a5fce90b60 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Oct 2021 08:46:10 -1000 Subject: bpf: Move BPF_MAP_TYPE for INODE_STORAGE and TASK_STORAGE outside of CONFIG_NET bpf_types.h has BPF_MAP_TYPE_INODE_STORAGE and BPF_MAP_TYPE_TASK_STORAGE declared inside #ifdef CONFIG_NET although they are built regardless of CONFIG_NET. So, when CONFIG_BPF_SYSCALL && !CONFIG_NET, they are built without the declarations leading to spurious build failures and not registered to bpf_map_types making them unavailable. Fix it by moving the BPF_MAP_TYPE for the two map types outside of CONFIG_NET. Reported-by: kernel test robot Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs") Signed-off-by: Tejun Heo Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/YXG1cuuSJDqHQfRY@slm.duckdns.org --- include/linux/bpf_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9c81724e4b98..bbe1eefa4c8a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) -#ifdef CONFIG_NET -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) +#ifdef CONFIG_NET +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) -- cgit v1.2.3 From 54713c85f536048e685258f880bf298a74c3620d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 26 Oct 2021 13:00:19 +0200 Subject: bpf: Fix potential race in tail call compatibility check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lorenzo noticed that the code testing for program type compatibility of tail call maps is potentially racy in that two threads could encounter a map with an unset type simultaneously and both return true even though they are inserting incompatible programs. The race window is quite small, but artificially enlarging it by adding a usleep_range() inside the check in bpf_prog_array_compatible() makes it trivial to trigger from userspace with a program that does, essentially: map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, 4, 4, 2, 0); pid = fork(); if (pid) { key = 0; value = xdp_fd; } else { key = 1; value = tc_fd; } err = bpf_map_update_elem(map_fd, &key, &value, 0); While the race window is small, it has potentially serious ramifications in that triggering it would allow a BPF program to tail call to a program of a different type. So let's get rid of it by protecting the update with a spinlock. The commit in the Fixes tag is the last commit that touches the code in question. v2: - Use a spinlock instead of an atomic variable and cmpxchg() (Alexei) v3: - Put lock and the members it protects into an embedded 'owner' struct (Daniel) Fixes: 3324b584b6f6 ("ebpf: misc core cleanup") Reported-by: Lorenzo Bianconi Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211026110019.363464-1-toke@redhat.com --- include/linux/bpf.h | 7 +++++-- kernel/bpf/arraymap.c | 1 + kernel/bpf/core.c | 20 +++++++++++++------- kernel/bpf/syscall.c | 6 ++++-- 4 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 020a7d5bf470..3db6f6c95489 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -929,8 +929,11 @@ struct bpf_array_aux { * stored in the map to make sure that all callers and callees have * the same prog type and JITed flag. */ - enum bpf_prog_type type; - bool jited; + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cebd4fb06d19..447def540544 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); + spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e7eb3f1876..6e3ae90ad107 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1823,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + bool ret; + if (fp->kprobe_override) return false; - if (!array->aux->type) { + spin_lock(&array->aux->owner.lock); + + if (!array->aux->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->type = fp->type; - array->aux->jited = fp->jited; - return true; + array->aux->owner.type = fp->type; + array->aux->owner.jited = fp->jited; + ret = true; + } else { + ret = array->aux->owner.type == fp->type && + array->aux->owner.jited == fp->jited; } - - return array->aux->type == fp->type && - array->aux->jited == fp->jited; + spin_unlock(&array->aux->owner.lock); + return ret; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9dab49d3f394..1cad6979a0d0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - type = array->aux->type; - jited = array->aux->jited; + spin_lock(&array->aux->owner.lock); + type = array->aux->owner.type; + jited = array->aux->owner.jited; + spin_unlock(&array->aux->owner.lock); } seq_printf(m, -- cgit v1.2.3 From da353fac65fede6b8b4cfe207f0d9408e3121105 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 27 Oct 2021 17:59:20 -0400 Subject: net/tls: Fix flipped sign in tls_err_abort() calls sk->sk_err appears to expect a positive value, a convention that ktls doesn't always follow and that leads to memory corruption in other code. For instance, [kworker] tls_encrypt_done(..., err=) tls_err_abort(.., err) sk->sk_err = err; [task] splice_from_pipe_feed ... tls_sw_do_sendpage if (sk->sk_err) { ret = -sk->sk_err; // ret is positive splice_from_pipe_feed (continued) ret = actor(...) // ret is still positive and interpreted as bytes // written, resulting in underflow of buf->len and // sd->len, leading to huge buf->offset and bogus // addresses computed in later calls to actor() Fix all tls_err_abort() callers to pass a negative error code consistently and centralize the error-prone sign flip there, throwing in a warning to catch future misuse and uninlining the function so it really does only warn once. Cc: stable@vger.kernel.org Fixes: c46234ebb4d1e ("tls: RX path for ktls") Reported-by: syzbot+b187b77c8474f9648fae@syzkaller.appspotmail.com Signed-off-by: Daniel Jordan Signed-off-by: David S. Miller --- include/net/tls.h | 9 ++------- net/tls/tls_sw.c | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 01d2e3744393..1fffb206f09f 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -358,6 +358,7 @@ int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); +void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); @@ -466,12 +467,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) #endif } -static inline void tls_err_abort(struct sock *sk, int err) -{ - sk->sk_err = err; - sk_error_report(sk); -} - static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; @@ -512,7 +507,7 @@ static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d5d09bd817b7..1644f8baea19 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -35,6 +35,7 @@ * SOFTWARE. */ +#include #include #include #include @@ -43,6 +44,14 @@ #include #include +noinline void tls_err_abort(struct sock *sk, int err) +{ + WARN_ON_ONCE(err >= 0); + /* sk->sk_err should contain a positive error code. */ + sk->sk_err = -err; + sk_error_report(sk); +} + static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -419,7 +428,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); return rc; } @@ -763,7 +772,7 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl->sg.size + prot->tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (split) { tls_ctx->pending_open_record_frags = true; tls_merge_open_record(sk, rec, tmp, orig_end); @@ -1827,7 +1836,7 @@ int tls_sw_recvmsg(struct sock *sk, err = decrypt_skb_update(sk, skb, &msg->msg_iter, &chunk, &zc, async_capable); if (err < 0 && err != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto recv_end; } @@ -2007,7 +2016,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (err < 0) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto splice_read_end; } ctx->decrypted = 1; -- cgit v1.2.3 From f7cc8890f30d3ddc785e2b2ddc647da5b4b3c3ec Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 27 Oct 2021 13:38:55 -0700 Subject: mptcp: fix corrupt receiver key in MPC + data + checksum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit using packetdrill it's possible to observe that the receiver key contains random values when clients transmit MP_CAPABLE with data and checksum (as specified in RFC8684 §3.1). Fix the layout of mptcp_out_options, to avoid using the skb extension copy when writing the MP_CAPABLE sub-option. Fixes: d7b269083786 ("mptcp: shrink mptcp_out_options struct") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/233 Reported-by: Poorva Sonparote Signed-off-by: Davide Caratti Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20211027203855.264600-1-mathew.j.martineau@linux.intel.com Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 4 ++++ net/mptcp/options.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 6026bbefbffd..3214848402ec 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -69,6 +69,10 @@ struct mptcp_out_options { struct { u64 sndr_key; u64 rcvr_key; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + __sum16 csum; }; struct { struct mptcp_addr_info addr; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c41273cefc51..f0f22eb4fd5f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -485,11 +485,11 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; - /* we will check ext_copy.data_len in mptcp_write_options() to + /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ - opts->ext_copy.data_len = data_len; + opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; @@ -505,9 +505,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ - opts->ext_copy.data_seq = mpext->data_seq; - opts->ext_copy.subflow_seq = mpext->subflow_seq; - opts->ext_copy.csum = mpext->csum; + opts->data_seq = mpext->data_seq; + opts->subflow_seq = mpext->subflow_seq; + opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); @@ -1227,7 +1227,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum) { struct csum_pseudo_header header; __wsum csum; @@ -1237,15 +1237,21 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext) * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ - header.data_seq = cpu_to_be64(mpext->data_seq); - header.subflow_seq = htonl(mpext->subflow_seq); - header.data_len = htons(mpext->data_len); + header.data_seq = cpu_to_be64(data_seq); + header.subflow_seq = htonl(subflow_seq); + header.data_len = htons(data_len); header.csum = 0; - csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum)); return (__force u16)csum_fold(csum); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { @@ -1337,7 +1343,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - } else if (opts->ext_copy.data_len) { + } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; @@ -1366,14 +1372,17 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; - if (!opts->ext_copy.data_len) + if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->ext_copy.data_len << 16 | - mptcp_make_csum(&opts->ext_copy), ptr); + put_unaligned_be32(opts->data_len << 16 | + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + opts->csum), ptr); } else { - put_unaligned_be32(opts->ext_copy.data_len << 16 | + put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; -- cgit v1.2.3