summary | refs | log | tree | commit | diff
path: root/net/smc/smc_hs_bpf.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2025-12-03 17:24:33 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2025-12-03 17:24:33 -0800
commit: 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88 (patch)
tree: 67f541ef66f1853d09e66dcec29ae6f7eea898d2 /net/smc/smc_hs_bpf.c
parent: 015e7b0b0e8e51f7321ec2aafc1d7fc0a8a5536f (diff)
parent: 4de44542991ed4cb8c9fb2ccd766d6e6015101b0 (diff)
Merge tag 'net-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next (HEAD, torvalds/master, torvalds/HEAD, master)
Pull networking updates from Jakub Kicinski: "Core & protocols: - Replace busylock at the Tx queuing layer with a lockless list. Resulting in a 300% (4x) improvement on heavy TX workloads, sending twice the number of packets per second, for half the cpu cycles. - Allow constantly busy flows to migrate to a more suitable CPU/NIC queue. Normally we perform queue re-selection when flow comes out of idle, but under extreme circumstances the flows may be constantly busy. Add sysctl to allow periodic rehashing even if it'd risk packet reordering. - Optimize the NAPI skb cache, make it larger, use it in more paths. - Attempt returning Tx skbs to the originating CPU (like we already did for Rx skbs). - Various data structure layout and prefetch optimizations from Eric. - Remove ktime_get() from the recvmsg() fast path, ktime_get() is sadly quite expensive on recent AMD machines. - Extend threaded NAPI polling to allow the kthread busy poll for packets. - Make MPTCP use Rx backlog processing. This lowers the lock pressure, improving the Rx performance. - Support memcg accounting of MPTCP socket memory. - Allow admin to opt sockets out of global protocol memory accounting (using a sysctl or BPF-based policy). The global limits are a poor fit for modern container workloads, where limits are imposed using cgroups. - Improve heuristics for when to kick off AF_UNIX garbage collection. - Allow users to control TCP SACK compression, and default to 33% of RTT. - Add tcp_rcvbuf_low_rtt sysctl to let datacenter users avoid unnecessarily aggressive rcvbuf growth and overshot when the connection RTT is low. - Preserve skb metadata space across skb_push / skb_pull operations. - Support for IPIP encapsulation in the nftables flowtable offload. - Support appending IP interface information to ICMP messages (RFC 5837). - Support setting max record size in TLS (RFC 8449). - Remove taking rtnl_lock from RTM_GETNEIGHTBL and RTM_SETNEIGHTBL. 
- Use a dedicated lock (and RCU) in MPLS, instead of rtnl_lock. - Let users configure the number of write buffers in SMC. - Add new struct sockaddr_unsized for sockaddr of unknown length, from Kees. - Some conversions away from the crypto_ahash API, from Eric Biggers. - Some preparations for slimming down struct page. - YAML Netlink protocol spec for WireGuard. - Add a tool on top of YAML Netlink specs/lib for reporting commonly computed derived statistics and summarized system state. Driver API: - Add CAN XL support to the CAN Netlink interface. - Add uAPI for reporting PHY Mean Square Error (MSE) diagnostics, as defined by the OPEN Alliance's "Advanced diagnostic features for 100BASE-T1 automotive Ethernet PHYs" specification. - Add DPLL phase-adjust-gran pin attribute (and implement it in zl3073x). - Refactor xfrm_input lock to reduce contention when NIC offloads IPsec and performs RSS. - Add info to devlink params whether the current setting is the default or a user override. Allow resetting back to default. - Add standard device stats for PSP crypto offload. - Leverage DSA frame broadcast to implement simple HSR frame duplication for a lot of switches without dedicated HSR offload. - Add uAPI defines for 1.6Tbps link modes. Device drivers: - Add Motorcomm YT921x gigabit Ethernet switch support. - Add MUCSE driver for N500/N210 1GbE NIC series. - Convert drivers to support dedicated ops for timestamping control, and away from the direct IOCTL handling. While at it support GET operations for PHY timestamping. - Add (and convert most drivers to) a dedicated ethtool callback for reading the Rx ring count. - Significant refactoring efforts in the STMMAC driver, which supports Synopsys turn-key MAC IP integrated into a ton of SoCs. 
- Ethernet high-speed NICs: - Broadcom (bnxt): - support PPS in/out on all pins - Intel (100G, ice, idpf): - ice: implement standard ethtool and timestamping stats - i40e: support setting the max number of MAC addresses per VF - iavf: support RSS of GTP tunnels for 5G and LTE deployments - nVidia/Mellanox (mlx5): - reduce downtime on interface reconfiguration - disable being an XDP redirect target by default (same as other drivers) to avoid wasting resources if feature is unused - Meta (fbnic): - add support for Linux-managed PCS on 25G, 50G, and 100G links - Wangxun: - support Rx descriptor merge, and Tx head writeback - support Rx coalescing offload - support 25G SPF and 40G QSFP modules - Ethernet virtual: - Google (gve): - allow ethtool to configure rx_buf_len - implement XDP HW RX Timestamping support for DQ descriptor format - Microsoft vNIC (mana): - support HW link state events - handle hardware recovery events when probing the device - Ethernet NICs consumer, and embedded: - usbnet: add support for Byte Queue Limits (BQL) - AMD (amd-xgbe): - add device selftests - NXP (enetc): - add i.MX94 support - Broadcom integrated MACs (bcmgenet, bcmasp): - bcmasp: add support for PHY-based Wake-on-LAN - Broadcom switches (b53): - support port isolation - support BCM5389/97/98 and BCM63XX ARL formats - Lantiq/MaxLinear switches: - support bridge FDB entries on the CPU port - use regmap for register access - allow user to enable/disable learning - support Energy Efficient Ethernet - support configuring RMII clock delays - add tagging driver for MaxLinear GSW1xx switches - Synopsys (stmmac): - support using the HW clock in free running mode - add Eswin EIC7700 support - add Rockchip RK3506 support - add Altera Agilex5 support - Cadence (macb): - cleanup and consolidate descriptor and DMA address handling - add EyeQ5 support - TI: - icssg-prueth: support AF_XDP - Airoha access points: - add missing Ethernet stats and link state callback - add AN7583 support - support 
out-of-order Tx completion processing - Power over Ethernet: - pd692x0: preserve PSE configuration across reboots - add support for TPS23881B devices - Ethernet PHYs: - Open Alliance OATC14 10BASE-T1S PHY cable diagnostic support - Support 50G SerDes and 100G interfaces in Linux-managed PHYs - micrel: - support for non PTP SKUs of lan8814 - enable in-band auto-negotiation on lan8814 - realtek: - cable testing support on RTL8224 - interrupt support on RTL8221B - motorcomm: support for PHY LEDs on YT853 - microchip: support for LAN867X Rev.D0 PHYs w/ SQI and cable diag - mscc: support for PHY LED control - CAN drivers: - m_can: add support for optional reset and system wake up - remove can_change_mtu() obsoleted by core handling - mcp251xfd: support GPIO controller functionality - Bluetooth: - add initial support for PASTa - WiFi: - split ieee80211.h file, it's way too big - improvements in VHT radiotap reporting, S1G, Channel Switch Announcement handling, rate tracking in mesh networks - improve multi-radio monitor mode support, and add a cfg80211 debugfs interface for it - HT action frame handling on 6 GHz - initial chanctx work towards NAN - MU-MIMO sniffer improvements - WiFi drivers: - RealTek (rtw89): - support USB devices RTL8852AU and RTL8852CU - initial work for RTL8922DE - improved injection support - Intel: - iwlwifi: new sniffer API support - MediaTek (mt76): - WED support for >32-bit DMA - airoha NPU support - regdomain improvements - continued WiFi7/MLO work - Qualcomm/Atheros: - ath10k: factory test support - ath11k: TX power insertion support - ath12k: BSS color change support - ath12k: statistics improvements - brcmfmac: Acer A1 840 tablet quirk - rtl8xxxu: 40 MHz connection fixes/support" * tag 'net-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1381 commits) net: page_pool: sanitise allocation order net: page pool: xa init with destroy on pp init net/mlx5e: Support XDP target xmit with dummy program net/mlx5e: Update 
XDP features in switch channels selftests/tc-testing: Test CAKE scheduler when enqueue drops packets net/sched: sch_cake: Fix incorrect qlen reduction in cake_drop wireguard: netlink: generate netlink code wireguard: uapi: generate header with ynl-gen wireguard: uapi: move flag enums wireguard: uapi: move enum wg_cmd wireguard: netlink: add YNL specification selftests: drv-net: Fix tolerance calculation in devlink_rate_tc_bw.py selftests: drv-net: Fix and clarify TC bandwidth split in devlink_rate_tc_bw.py selftests: drv-net: Set shell=True for sysfs writes in devlink_rate_tc_bw.py selftests: drv-net: Use Iperf3Runner in devlink_rate_tc_bw.py selftests: drv-net: introduce Iperf3Runner for measurement use cases selftests: drv-net: Add devlink_rate_tc_bw.py to TEST_PROGS net: ps3_gelic_net: Use napi_alloc_skb() and napi_gro_receive() Documentation: net: dsa: mention simple HSR offload helpers Documentation: net: dsa: mention availability of RedBox ...
Diffstat (limited to 'net/smc/smc_hs_bpf.c')
-rw-r--r-- net/smc/smc_hs_bpf.c 140
1 files changed, 140 insertions, 0 deletions
diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c
new file mode 100644
index 000000000000..063d23d85850
--- /dev/null
+++ b/net/smc/smc_hs_bpf.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic hook for SMC handshake flow.
+ *
+ * Copyright IBM Corp. 2016
+ * Copyright (c) 2025, Alibaba Inc.
+ *
+ * Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/rculist.h>
+
+#include "smc_hs_bpf.h"
+
+static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); /* held around every insert into / removal from smc_hs_ctrl_list */
+static LIST_HEAD(smc_hs_ctrl_list); /* registered smc_hs_ctrl entries; modified only through the _rcu list helpers */
+
+/*
+ * Append @ctrl to smc_hs_ctrl_list unless an entry with the same name is
+ * already on it.
+ *
+ * The name lookup and the insertion both happen under smc_hs_ctrl_list_lock,
+ * so two registrations cannot both pass the duplicate check.
+ *
+ * Return: 0 on success, -EEXIST if smc_hs_ctrl_find_by_name() found a match.
+ */
+static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl)
+{
+	int err = -EEXIST;
+
+	spin_lock(&smc_hs_ctrl_list_lock);
+	/* only link the entry when its name is not taken yet */
+	if (!smc_hs_ctrl_find_by_name(ctrl->name)) {
+		list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list);
+		err = 0;
+	}
+	spin_unlock(&smc_hs_ctrl_list_lock);
+
+	return err;
+}
+
+static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) /* unlink @ctrl from smc_hs_ctrl_list, then wait for an RCU grace period */
+{
+ spin_lock(&smc_hs_ctrl_list_lock); /* same lock smc_hs_ctrl_reg() inserts under */
+ list_del_rcu(&ctrl->list);
+ spin_unlock(&smc_hs_ctrl_list_lock);
+
+ /*
+  * @ctrl is unlinked at this point. Do not return until every RCU reader
+  * that started before the removal has finished; presumably the caller
+  * releases @ctrl afterwards (not visible in this file).
+  */
+ synchronize_rcu();
+}
+
+/*
+ * Look up a registered smc_hs_ctrl by exact name.
+ *
+ * Walks smc_hs_ctrl_list with the RCU list iterator and compares each
+ * entry's name against @name with strcmp().
+ *
+ * smc_hs_ctrl_reg() calls this with smc_hs_ctrl_list_lock held; other
+ * callers presumably need an RCU read-side critical section around the
+ * call and any use of the result (not visible in this file - confirm).
+ *
+ * Return: the first matching entry, or NULL when no name matches.
+ */
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name)
+{
+	struct smc_hs_ctrl *pos, *found = NULL;
+
+	list_for_each_entry_rcu(pos, &smc_hs_ctrl_list, list) {
+		if (!strcmp(pos->name, name)) {
+			found = pos;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/* Stub installed in the syn_option slot of __smc_bpf_hs_ctrl; always returns 1 */
+static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp)
+{
+	return 1;
+}
+/*
+ * Stub installed in the synack_option slot of __smc_bpf_hs_ctrl; ignores
+ * both arguments and always returns 1.
+ */
+static int
+__smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp,
+				   struct inet_request_sock *ireq)
+{
+	return 1;
+}
+
+static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { /* stub instance referenced by bpf_smc_hs_ctrl_ops.cfi_stubs */
+ .syn_option = __smc_bpf_stub_set_tcp_option, /* stub: always returns 1 */
+ .synack_option = __smc_bpf_stub_set_tcp_option_cond, /* stub: always returns 1 */
+};
+
+/* struct_ops .init callback: @btf is unused, nothing to set up, returns 0 */
+static int smc_bpf_hs_ctrl_init(struct btf *btf)
+{
+	return 0;
+}
+
+/*
+ * struct_ops .reg callback.
+ *
+ * Registration through a bpf_link is refused with -EOPNOTSUPP; otherwise
+ * @kdata is handed to smc_hs_ctrl_reg() and its result is returned.
+ */
+static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link)
+{
+	return link ? -EOPNOTSUPP : smc_hs_ctrl_reg(kdata);
+}
+
+/* struct_ops .unreg callback: @link is ignored, @kdata goes to smc_hs_ctrl_unreg() */
+static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link)
+{
+	struct smc_hs_ctrl *ctrl = kdata;
+
+	smc_hs_ctrl_unreg(ctrl);
+}
+
+/*
+ * struct_ops .init_member callback for struct smc_hs_ctrl.
+ *
+ * Identifies @member by its byte offset inside the struct and copies the
+ * name and flags members from @udata into @kdata, validating each first.
+ *
+ * Return: 1 after copying name or flags, 0 for a member at any other
+ * offset, -EINVAL when bpf_obj_name_cpy() yields a non-positive result or
+ * when flags carries bits outside SMC_HS_CTRL_ALL_FLAGS.
+ */
+static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t,
+				       const struct btf_member *member,
+				       void *kdata, const void *udata)
+{
+	const struct smc_hs_ctrl *src = udata;
+	struct smc_hs_ctrl *dst = kdata;
+	u32 off = __btf_member_bit_offset(t, member) / 8;
+
+	if (off == offsetof(struct smc_hs_ctrl, name)) {
+		/* a non-positive result from bpf_obj_name_cpy() is rejected */
+		if (bpf_obj_name_cpy(dst->name, src->name,
+				     sizeof(src->name)) <= 0)
+			return -EINVAL;
+		return 1;
+	}
+
+	if (off == offsetof(struct smc_hs_ctrl, flags)) {
+		/* refuse bits outside SMC_HS_CTRL_ALL_FLAGS */
+		if (src->flags & ~SMC_HS_CTRL_ALL_FLAGS)
+			return -EINVAL;
+		dst->flags = src->flags;
+		return 1;
+	}
+
+	return 0;
+}
+
+/* verifier .get_func_proto hook: defers entirely to bpf_base_func_proto() */
+static const struct bpf_func_proto *
+bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	const struct bpf_func_proto *proto;
+
+	proto = bpf_base_func_proto(func_id, prog);
+	return proto;
+}
+
+static const struct bpf_verifier_ops smc_bpf_verifier_ops = { /* wired into bpf_smc_hs_ctrl_ops.verifier_ops */
+ .get_func_proto = bpf_smc_hs_func_proto, /* forwards to bpf_base_func_proto() */
+ .is_valid_access = bpf_tracing_btf_ctx_access, /* helper defined outside this file */
+};
+
+static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { /* passed to register_bpf_struct_ops() in bpf_smc_hs_ctrl_init() */
+ .name = "smc_hs_ctrl",
+ .init = smc_bpf_hs_ctrl_init, /* no-op, returns 0 */
+ .reg = smc_bpf_hs_ctrl_reg, /* -EOPNOTSUPP when a link is supplied, else smc_hs_ctrl_reg() */
+ .unreg = smc_bpf_hs_ctrl_unreg, /* forwards to smc_hs_ctrl_unreg() */
+ .cfi_stubs = &__smc_bpf_hs_ctrl, /* stub callbacks that just return 1 */
+ .verifier_ops = &smc_bpf_verifier_ops,
+ .init_member = smc_bpf_hs_ctrl_init_member, /* copies and validates name and flags */
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Register bpf_smc_hs_ctrl_ops as the struct_ops description for
+ * struct smc_hs_ctrl.
+ *
+ * Return: the result of register_bpf_struct_ops().
+ */
+int bpf_smc_hs_ctrl_init(void)
+{
+	int rc;
+
+	rc = register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl);
+	return rc;
+}