summaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-11 19:31:52 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-11 19:31:52 -0800
commit37a93dd5c49b5fda807fd204edf2547c3493319c (patch)
treece1ef5a642b9ea3d7242156438eb96dc5607a752 /net/mptcp/protocol.c
parent098b6e44cbaa2d526d06af90c862d13fb414a0ec (diff)
parent83310d613382f74070fc8b402f3f6c2af8439ead (diff)
Merge tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextipvs/mainipvs/HEADipvs-next/mainipvs-next/HEADdavem/net-next/maindavem/net-next/HEAD
Pull networking updates from Paolo Abeni: "Core & protocols: - A significant effort all around the stack to guide the compiler to make the right choice when inlining code, to avoid unneeded calls for small helper and stack canary overhead in the fast-path. This generates better and faster code with very small or no text size increases, as in many cases the call generated more code than the actual inlined helper. - Extend AccECN implementation so that is now functionally complete, also allow the user-space enabling it on a per network namespace basis. - Add support for memory providers with large (above 4K) rx buffer. Paired with hw-gro, larger rx buffer sizes reduce the number of buffers traversing the stack, dincreasing single stream CPU usage by up to ~30%. - Do not add HBH header to Big TCP GSO packets. This simplifies the RX path, the TX path and the NIC drivers, and is possible because user-space taps can now interpret correctly such packets without the HBH hint. - Allow IPv6 routes to be configured with a gateway address that is resolved out of a different interface than the one specified, aligning IPv6 to IPv4 behavior. - Multi-queue aware sch_cake. This makes it possible to scale the rate shaper of sch_cake across multiple CPUs, while still enforcing a single global rate on the interface. - Add support for the nbcon (new buffer console) infrastructure to netconsole, enabling lock-free, priority-based console operations that are safer in crash scenarios. - Improve the TCP ipv6 output path to cache the flow information, saving cpu cycles, reducing cache line misses and stack use. - Improve netfilter packet tracker to resolve clashes for most protocols, avoiding unneeded drops on rare occasions. - Add IP6IP6 tunneling acceleration to the flowtable infrastructure. - Reduce tcp socket size by one cache line. - Notify neighbour changes atomically, avoiding inconsistencies between the notification sequence and the actual states sequence. - Add vsock namespace support, allowing complete isolation of vsocks across different network namespaces. - Improve xsk generic performances with cache-alignment-oriented optimizations. - Support netconsole automatic target recovery, allowing netconsole to reestablish targets when underlying low-level interface comes back online. Driver API: - Support for switching the working mode (automatic vs manual) of a DPLL device via netlink. - Introduce PHY ports representation to expose multiple front-facing media ports over a single MAC. - Introduce "rx-polarity" and "tx-polarity" device tree properties, to generalize polarity inversion requirements for differential signaling. - Add helper to create, prepare and enable managed clocks. Device drivers: - Add Huawei hinic3 PF etherner driver. - Add DWMAC glue driver for Motorcomm YT6801 PCIe ethernet controller. - Add ethernet driver for MaxLinear MxL862xx switches - Remove parallel-port Ethernet driver. - Convert existing driver timestamp configuration reporting to hwtstamp_get and remove legacy ioctl(). - Convert existing drivers to .get_rx_ring_count(), simplifing the RX ring count retrieval. Also remove the legacy fallback path. - Ethernet high-speed NICs: - Broadcom (bnxt, bng): - bnxt: add FW interface update to support FEC stats histogram and NVRAM defragmentation - bng: add TSO and H/W GRO support - nVidia/Mellanox (mlx5): - improve latency of channel restart operations, reducing the used H/W resources - add TSO support for UDP over GRE over VLAN - add flow counters support for hardware steering (HWS) rules - use a static memory area to store headers for H/W GRO, leading to 12% RX tput improvement - Intel (100G, ice, idpf): - ice: reorganizes layout of Tx and Rx rings for cacheline locality and utilizes __cacheline_group* macros on the new layouts - ice: introduces Synchronous Ethernet (SyncE) support - Meta (fbnic): - adds debugfs for firmware mailbox and tx/rx rings vectors - Ethernet virtual: - geneve: introduce GRO/GSO support for double UDP encapsulation - Ethernet NICs consumer, and embedded: - Synopsys (stmmac): - some code refactoring and cleanups - RealTek (r8169): - add support for RTL8127ATF (10G Fiber SFP) - add dash and LTR support - Airoha: - AN8811HB 2.5 Gbps phy support - Freescale (fec): - add XDP zero-copy support - Thunderbolt: - add get link setting support to allow bonding - Renesas: - add support for RZ/G3L GBETH SoC - Ethernet switches: - Maxlinear: - support R(G)MII slow rate configuration - add support for Intel GSW150 - Motorcomm (yt921x): - add DCB/QoS support - TI: - icssm-prueth: support bridging (STP/RSTP) via the switchdev framework - Ethernet PHYs: - Realtek: - enable SGMII and 2500Base-X in-band auto-negotiation - simplify and reunify C22/C45 drivers - Micrel: convert bindings to DT schema - CAN: - move skb headroom content into skb extensions, making CAN metadata access more robust - CAN drivers: - rcar_canfd: - add support for FD-only mode - add support for the RZ/T2H SoC - sja1000: cleanup the CAN state handling - WiFi: - implement EPPKE/802.1X over auth frames support - split up drop reasons better, removing generic RX_DROP - additional FTM capabilities: 6 GHz support, supported number of spatial streams and supported number of LTF repetitions - better mac80211 iterators to enumerate resources - initial UHR (Wi-Fi 8) support for cfg80211/mac80211 - WiFi drivers: - Qualcomm/Atheros: - ath11k: support for Channel Frequency Response measurement - ath12k: a significant driver refactor to support multi-wiphy devices and and pave the way for future device support in the same driver (rather than splitting to ath13k) - ath12k: support for the QCC2072 chipset - Intel: - iwlwifi: partial Neighbor Awareness Networking (NAN) support - iwlwifi: initial support for U-NII-9 and IEEE 802.11bn - RealTek (rtw89): - preparations for RTL8922DE support - Bluetooth: - implement setsockopt(BT_PHY) to set the connection packet type/PHY - set link_policy on incoming ACL connections - Bluetooth drivers: - btusb: add support for MediaTek7920, Realtek RTL8761BU and 8851BE - btqca: add WCN6855 firmware priority selection feature" * tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1254 commits) bnge/bng_re: Add a new HSI net: macb: Fix tx/rx malfunction after phy link down and up af_unix: Fix memleak of newsk in unix_stream_connect(). net: ti: icssg-prueth: Add optional dependency on HSR net: dsa: add basic initial driver for MxL862xx switches net: mdio: add unlocked mdiodev C45 bus accessors net: dsa: add tag format for MxL862xx switches dt-bindings: net: dsa: add MaxLinear MxL862xx selftests: drivers: net: hw: Modify toeplitz.c to poll for packets octeontx2-pf: Unregister devlink on probe failure net: renesas: rswitch: fix forwarding offload statemachine ionic: Rate limit unknown xcvr type messages tcp: inet6_csk_xmit() optimization tcp: populate inet->cork.fl.u.ip6 in tcp_v6_syn_recv_sock() tcp: populate inet->cork.fl.u.ip6 in tcp_v6_connect() ipv6: inet6_csk_xmit() and inet6_csk_update_pmtu() use inet->cork.fl.u.ip6 ipv6: use inet->cork.fl.u.ip6 and np->final in ip6_datagram_dst_update() ipv6: use np->final in inet6_sk_rebuild_header() ipv6: add daddr/final storage in struct ipv6_pinfo net: stmmac: qcom-ethqos: fix qcom_ethqos_serdes_powerup() ...
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c275
1 files changed, 239 insertions, 36 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8d3233667418..cf1852b99963 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -224,9 +226,6 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
do_div(grow, oldval);
rcvwin += grow << 1;
- if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
- rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
-
cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
@@ -350,9 +349,6 @@ merge_right:
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
- /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
- if (sk->sk_socket)
- mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
}
static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
@@ -1164,8 +1160,9 @@ struct mptcp_sendmsg_info {
bool data_lock_held;
};
-static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
- u64 data_seq, int avail_size)
+static size_t mptcp_check_allowed_size(const struct mptcp_sock *msk,
+ struct sock *ssk, u64 data_seq,
+ size_t avail_size)
{
u64 window_end = mptcp_wnd_end(msk);
u64 mptcp_snd_wnd;
@@ -1174,7 +1171,7 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s
return avail_size;
mptcp_snd_wnd = window_end - data_seq;
- avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
+ avail_size = min(mptcp_snd_wnd, avail_size);
if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
@@ -1518,7 +1515,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!ssk || !sk_stream_memory_free(ssk))
return NULL;
- burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
if (!burst)
return ssk;
@@ -1995,6 +1992,17 @@ do_error:
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
+static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ /* avoid the indirect call, we know the destructor is sock_rfree */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, skb->truesize);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ skb_attempt_defer_free(skb);
+}
+
static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
@@ -2049,13 +2057,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
break;
}
- /* avoid the indirect call, we know the destructor is sock_rfree */
- skb->destructor = NULL;
- skb->sk = NULL;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk_mem_uncharge(sk, skb->truesize);
- __skb_unlink(skb, &sk->sk_receive_queue);
- skb_attempt_defer_free(skb);
+ mptcp_eat_recv_skb(sk, skb);
}
if (copied >= len)
@@ -2066,6 +2068,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
return copied;
}
+static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvspace_init = 1;
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
*
* Only difference: Use highest rtt estimate of the subflows in use.
@@ -2088,8 +2105,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.copied += copied;
- mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
- time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+ mstamp = mptcp_stamp();
+ time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
rtt_us = msk->rcvq_space.rtt_us;
if (rtt_us && time < (rtt_us >> 3))
@@ -2119,6 +2136,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
@@ -3040,6 +3058,7 @@ static int mptcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
+ sk->sk_write_space = sk_stream_write_space;
return 0;
}
@@ -3549,6 +3568,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
__mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
+ msk->rcvq_space.time = mptcp_stamp();
if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
__mptcp_subflow_fully_established(msk, subflow, mp_opt);
@@ -3558,23 +3578,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
return nsk;
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
-{
- const struct tcp_sock *tp = tcp_sk(ssk);
-
- msk->rcvspace_init = 1;
- msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
-
- msk->rcvq_space.time = tp->tcp_mstamp;
-
- /* initial rcv_space offering made to peer */
- msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
- TCP_INIT_CWND * tp->advmss);
- if (msk->rcvq_space.space == 0)
- msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-}
-
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -3763,6 +3766,7 @@ void mptcp_finish_connect(struct sock *ssk)
* accessing the field below
*/
WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp());
mptcp_pm_new_connection(msk, ssk, 0);
}
@@ -4312,6 +4316,201 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
+static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ u32 offset;
+
+ if (!list_empty(&msk->backlog_list))
+ mptcp_move_skbs(sk);
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ offset = MPTCP_SKB_CB(skb)->offset;
+ if (offset < skb->len) {
+ *off = offset;
+ return skb;
+ }
+ mptcp_eat_recv_skb(sk, skb);
+ }
+ return NULL;
+}
+
+/*
+ * Note:
+ * - It is assumed that the socket was locked by the caller.
+ */
+static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ int copied = 0;
+ u32 offset;
+
+ msk_owned_by_me(msk);
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) {
+ u32 data_len = skb->len - offset;
+ int count;
+ u32 size;
+
+ size = min_t(size_t, data_len, INT_MAX);
+ count = recv_actor(desc, skb, offset, size);
+ if (count <= 0) {
+ if (!copied)
+ copied = count;
+ break;
+ }
+
+ copied += count;
+
+ msk->bytes_consumed += count;
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ break;
+ }
+
+ mptcp_eat_recv_skb(sk, skb);
+ }
+
+ if (noack)
+ goto out;
+
+ mptcp_rcv_space_adjust(msk, copied);
+
+ if (copied > 0) {
+ mptcp_recv_skb(sk, &offset);
+ mptcp_cleanup_rbuf(msk, copied);
+ }
+out:
+ return copied;
+}
+
+static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __mptcp_read_sock(sk, desc, recv_actor, false);
+}
+
+static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
+
+ return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ * mptcp_splice_read - splice data from MPTCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ * Return:
+ * Amount of bytes that have been spliced.
+ *
+ **/
+static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ struct sock *sk = sock->sk;
+ ssize_t spliced = 0;
+ int ret = 0;
+ long timeo;
+
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ lock_sock(sk);
+
+ mptcp_rps_record_subflows(mptcp_sk(sk));
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __mptcp_splice_read(sk, &tss);
+ if (ret < 0) {
+ break;
+ } else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ /* if __mptcp_splice_read() got nothing while we have
+ * an skb in receive queue, we do not want to loop.
+ * This might happen with URG data.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ ret = sk_wait_data(sk, &timeo, NULL);
+ if (ret < 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!tss.len || !timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -4332,6 +4531,8 @@ static const struct proto_ops mptcp_stream_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct inet_protosw mptcp_protosw = {
@@ -4436,6 +4637,8 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct proto mptcp_v6_prot;