author     Linus Torvalds <torvalds@linux-foundation.org>	2026-02-11 19:31:52 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>	2026-02-11 19:31:52 -0800
commit     37a93dd5c49b5fda807fd204edf2547c3493319c (patch)
tree       ce1ef5a642b9ea3d7242156438eb96dc5607a752 /net/ipv4
parent     098b6e44cbaa2d526d06af90c862d13fb414a0ec (diff)
parent     83310d613382f74070fc8b402f3f6c2af8439ead (diff)

Merge tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
 "Core & protocols:

   - A significant effort all around the stack to guide the compiler to
     make the right choice when inlining code, to avoid unneeded calls
     for small helpers and stack canary overhead in the fast path. This
     generates better and faster code with very small or no text size
     increases, as in many cases the call generated more code than the
     actual inlined helper.

   - Extend the AccECN implementation so that it is now functionally
     complete, and allow user space to enable it on a per-network-
     namespace basis.

   - Add support for memory providers with large (above 4K) rx buffers.
     Paired with hw-gro, larger rx buffer sizes reduce the number of
     buffers traversing the stack, decreasing single stream CPU usage
     by up to ~30%.

   - Do not add the HBH header to Big TCP GSO packets. This simplifies
     the RX path, the TX path and the NIC drivers, and is possible
     because user-space taps can now correctly interpret such packets
     without the HBH hint.

   - Allow IPv6 routes to be configured with a gateway address that is
     resolved out of a different interface than the one specified,
     aligning IPv6 to IPv4 behavior.

   - Multi-queue aware sch_cake. This makes it possible to scale the
     rate shaper of sch_cake across multiple CPUs, while still
     enforcing a single global rate on the interface.

   - Add support for the nbcon (new buffer console) infrastructure to
     netconsole, enabling lock-free, priority-based console operations
     that are safer in crash scenarios.

   - Improve the TCP IPv6 output path to cache the flow information,
     saving CPU cycles and reducing cache line misses and stack use.

   - Improve the netfilter packet tracker to resolve clashes for most
     protocols, avoiding unneeded drops on rare occasions.

   - Add IP6IP6 tunneling acceleration to the flowtable infrastructure.

   - Reduce the TCP socket size by one cache line.

   - Notify neighbour changes atomically, avoiding inconsistencies
     between the notification sequence and the actual state sequence.

   - Add vsock namespace support, allowing complete isolation of vsocks
     across different network namespaces.

   - Improve xsk generic performance with cache-alignment-oriented
     optimizations.

   - Support netconsole automatic target recovery, allowing netconsole
     to reestablish targets when the underlying low-level interface
     comes back online.

  Driver API:

   - Support for switching the working mode (automatic vs manual) of a
     DPLL device via netlink.

   - Introduce PHY ports representation to expose multiple front-facing
     media ports over a single MAC.

   - Introduce "rx-polarity" and "tx-polarity" device tree properties,
     to generalize polarity inversion requirements for differential
     signaling.

   - Add a helper to create, prepare and enable managed clocks.

  Device drivers:

   - Add Huawei hinic3 PF ethernet driver.

   - Add DWMAC glue driver for the Motorcomm YT6801 PCIe ethernet
     controller.

   - Add ethernet driver for MaxLinear MxL862xx switches.

   - Remove the parallel-port Ethernet driver.

   - Convert existing driver timestamp configuration reporting to
     hwtstamp_get and remove the legacy ioctl().

   - Convert existing drivers to .get_rx_ring_count(), simplifying the
     RX ring count retrieval. Also remove the legacy fallback path.

   - Ethernet high-speed NICs:
      - Broadcom (bnxt, bng):
         - bnxt: add FW interface update to support FEC stats histogram
           and NVRAM defragmentation
         - bng: add TSO and H/W GRO support
      - nVidia/Mellanox (mlx5):
         - improve latency of channel restart operations, reducing the
           used H/W resources
         - add TSO support for UDP over GRE over VLAN
         - add flow counters support for hardware steering (HWS) rules
         - use a static memory area to store headers for H/W GRO,
           leading to a 12% RX throughput improvement
      - Intel (100G, ice, idpf):
         - ice: reorganize the layout of Tx and Rx rings for cacheline
           locality and use the __cacheline_group* macros on the new
           layouts
         - ice: introduce Synchronous Ethernet (SyncE) support
      - Meta (fbnic):
         - add debugfs for the firmware mailbox and tx/rx rings vectors

   - Ethernet virtual:
      - geneve: introduce GRO/GSO support for double UDP encapsulation

   - Ethernet NICs consumer and embedded:
      - Synopsys (stmmac):
         - some code refactoring and cleanups
      - RealTek (r8169):
         - add support for RTL8127ATF (10G Fiber SFP)
         - add DASH and LTR support
      - Airoha:
         - AN8811HB 2.5 Gbps PHY support
      - Freescale (fec):
         - add XDP zero-copy support
      - Thunderbolt:
         - add get link setting support to allow bonding
      - Renesas:
         - add support for the RZ/G3L GBETH SoC

   - Ethernet switches:
      - MaxLinear:
         - support R(G)MII slow rate configuration
         - add support for Intel GSW150
      - Motorcomm (yt921x):
         - add DCB/QoS support
      - TI:
         - icssm-prueth: support bridging (STP/RSTP) via the switchdev
           framework

   - Ethernet PHYs:
      - Realtek:
         - enable SGMII and 2500Base-X in-band auto-negotiation
         - simplify and reunify the C22/C45 drivers
      - Micrel: convert bindings to DT schema

   - CAN:
      - move skb headroom content into skb extensions, making CAN
        metadata access more robust
      - CAN drivers:
         - rcar_canfd:
            - add support for FD-only mode
            - add support for the RZ/T2H SoC
         - sja1000: clean up the CAN state handling

   - WiFi:
      - implement EPPKE/802.1X over auth frames support
      - split up drop reasons better, removing the generic RX_DROP
      - additional FTM capabilities: 6 GHz support, supported number of
        spatial streams and supported number of LTF repetitions
      - better mac80211 iterators to enumerate resources
      - initial UHR (Wi-Fi 8) support for cfg80211/mac80211

   - WiFi drivers:
      - Qualcomm/Atheros:
         - ath11k: support for Channel Frequency Response measurement
         - ath12k: a significant driver refactor to support multi-wiphy
           devices and pave the way for future device support in the
           same driver (rather than splitting to ath13k)
         - ath12k: support for the QCC2072 chipset
      - Intel:
         - iwlwifi: partial Neighbor Awareness Networking (NAN) support
         - iwlwifi: initial support for U-NII-9 and IEEE 802.11bn
      - RealTek (rtw89):
         - preparations for RTL8922DE support

   - Bluetooth:
      - implement setsockopt(BT_PHY) to set the connection packet
        type/PHY
      - set link_policy on incoming ACL connections

   - Bluetooth drivers:
      - btusb: add support for MediaTek 7920, Realtek RTL8761BU and
        8851BE
      - btqca: add WCN6855 firmware priority selection feature"

* tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1254 commits)
  bnge/bng_re: Add a new HSI
  net: macb: Fix tx/rx malfunction after phy link down and up
  af_unix: Fix memleak of newsk in unix_stream_connect().
  net: ti: icssg-prueth: Add optional dependency on HSR
  net: dsa: add basic initial driver for MxL862xx switches
  net: mdio: add unlocked mdiodev C45 bus accessors
  net: dsa: add tag format for MxL862xx switches
  dt-bindings: net: dsa: add MaxLinear MxL862xx
  selftests: drivers: net: hw: Modify toeplitz.c to poll for packets
  octeontx2-pf: Unregister devlink on probe failure
  net: renesas: rswitch: fix forwarding offload statemachine
  ionic: Rate limit unknown xcvr type messages
  tcp: inet6_csk_xmit() optimization
  tcp: populate inet->cork.fl.u.ip6 in tcp_v6_syn_recv_sock()
  tcp: populate inet->cork.fl.u.ip6 in tcp_v6_connect()
  ipv6: inet6_csk_xmit() and inet6_csk_update_pmtu() use inet->cork.fl.u.ip6
  ipv6: use inet->cork.fl.u.ip6 and np->final in ip6_datagram_dst_update()
  ipv6: use np->final in inet6_sk_rebuild_header()
  ipv6: add daddr/final storage in struct ipv6_pinfo
  net: stmmac: qcom-ethqos: fix qcom_ethqos_serdes_powerup()
  ...
Diffstat (limited to 'net/ipv4')
 net/ipv4/Makefile                    |   2
 net/ipv4/cipso_ipv4.c                |   3
 net/ipv4/fib_lookup.h                |   6
 net/ipv4/fib_trie.c                  |   4
 net/ipv4/icmp.c                      | 139
 net/ipv4/igmp.c                      |   4
 net/ipv4/inet_connection_sock.c      |  24
 net/ipv4/ip_output.c                 |  17
 net/ipv4/ip_sockglue.c               |   2
 net/ipv4/ipconfig.c                  |  89
 net/ipv4/ipmr.c                      |   2
 net/ipv4/metrics.c                   |   2
 net/ipv4/netfilter/nf_reject_ipv4.c  |   2
 net/ipv4/ping.c                      |   7
 net/ipv4/raw.c                       |   7
 net/ipv4/route.c                     |   8
 net/ipv4/sysctl_net_ipv4.c           |   4
 net/ipv4/tcp.c                       |  84
 net/ipv4/tcp_cong.c                  |   5
 net/ipv4/tcp_fastopen.c              |  86
 net/ipv4/tcp_input.c                 | 305
 net/ipv4/tcp_ipv4.c                  |  37
 net/ipv4/tcp_minisocks.c             |  43
 net/ipv4/tcp_offload.c               |   3
 net/ipv4/tcp_output.c                | 117
 net/ipv4/tcp_rate.c                  | 209
 net/ipv4/tcp_recovery.c              |  75
 net/ipv4/tcp_timer.c                 |   3
 net/ipv4/udp.c                       |  32
 net/ipv4/udp_offload.c               |   6
 30 files changed, 770 insertions(+), 557 deletions(-)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ec36d2ec059e..18108a6f0499 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_rate.o tcp_recovery.o tcp_ulp.o \
+ tcp_recovery.o tcp_ulp.o \
tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 709021197e1c..32b951ebc0c2 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -2196,7 +2196,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
/* if we don't ensure enough headroom we could panic on the skb_push()
* call below so make sure we have enough, we are also "mangling" the
* packet so we should probably do a copy-on-write call anyway */
- ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ ret_val = skb_cow(skb,
+ skb_headroom(skb) + (len_delta > 0 ? len_delta : 0));
if (ret_val < 0)
return ret_val;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index f9b9e26c32c1..0b72796dd1ad 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -28,8 +28,10 @@ struct fib_alias {
/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
- if (!(fa->fa_state & FA_S_ACCESSED))
- fa->fa_state |= FA_S_ACCESSED;
+ u8 fa_state = READ_ONCE(fa->fa_state);
+
+ if (!(fa_state & FA_S_ACCESSED))
+ WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED);
}
/* Exported by fib_semantics.c */
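The hunk above makes fib_alias_accessed() test the flag with READ_ONCE() before writing it with WRITE_ONCE(), so the common case is a plain load and fa_state stays shared across CPUs. A minimal userspace sketch of the same pattern, using C11 relaxed atomics in place of the kernel macros (illustration only, not part of the patch):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FA_S_ACCESSED 0x01u

/* Store only when the bit is not already set, so the cache line is not
 * dirtied on every lookup.
 */
static void fa_mark_accessed(_Atomic uint8_t *fa_state)
{
	uint8_t state = atomic_load_explicit(fa_state, memory_order_relaxed);

	if (!(state & FA_S_ACCESSED))
		atomic_store_explicit(fa_state, state | FA_S_ACCESSED,
				      memory_order_relaxed);
}

int main(void)
{
	_Atomic uint8_t fa_state = 0;

	fa_mark_accessed(&fa_state);	/* first call stores the flag */
	fa_mark_accessed(&fa_state);	/* second call is read-only */
	printf("fa_state = %#x\n", atomic_load(&fa_state));
	return 0;
}
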
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7e2c17fec3fc..1308213791f1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- state = fa->fa_state;
+ state = READ_ONCE(fa->fa_state);
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
@@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fib_remove_alias(t, tp, l, fa_to_delete);
- if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa_to_delete->fa_info);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4abbec2f47ef..e216b6df6331 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -112,7 +112,9 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options_data replyopts;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct ip_options_rcu replyopts;
};
/* An array of errno for error messages from dest unreach. */
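Since replyopts is now the flexible-array-terminated struct ip_options_rcu itself, it has to stay last in icmp_bxm, and the callers below size the on-stack storage with DEFINE_RAW_FLEX(). A small userspace sketch of the underlying idea, with hypothetical names and assuming the 40-byte IPv4 options maximum:

#include <string.h>

struct opts {
	unsigned int optlen;
	unsigned char data[];		/* flexible-array member, must be last */
};

#define IP_OPTS_MAX 40			/* IPv4 options never exceed 40 bytes */

int main(void)
{
	/* Reserve stack storage sized for the struct plus its flex array,
	 * instead of nesting it inside a second fixed-size wrapper struct.
	 */
	union {
		struct opts o;
		unsigned char raw[sizeof(struct opts) + IP_OPTS_MAX];
	} buf;
	struct opts *opt = &buf.o;

	memset(&buf, 0, sizeof(buf));
	opt->optlen = 0;
	return 0;
}
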
@@ -353,9 +355,12 @@ void icmp_out_count(struct net *net, unsigned char type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = from;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
__wsum csum;
+ icmp_param = from;
+
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
to, len);
@@ -413,7 +418,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
- if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
+ if (ip_options_echo(net, &icmp_param->replyopts.opt, skb))
return;
/* Needed by both icmpv4_global_allow and icmp_xmit_lock */
@@ -435,10 +440,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
- if (icmp_param->replyopts.opt.opt.optlen) {
- ipc.opt = &icmp_param->replyopts.opt;
+ if (icmp_param->replyopts.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts;
if (ipc.opt->opt.srr)
- daddr = icmp_param->replyopts.opt.opt.faddr;
+ daddr = icmp_param->replyopts.opt.faddr;
}
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -491,8 +496,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
int err;
memset(fl4, 0, sizeof(*fl4));
- fl4->daddr = (param->replyopts.opt.opt.srr ?
- param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->daddr = (param->replyopts.opt.srr ?
+ param->replyopts.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
@@ -554,6 +559,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
/* steal dst entry from skb_in, don't drop refcnt */
skb_dstref_steal(skb_in);
skb_dstref_restore(skb_in, orefdst);
+
+ /*
+ * At this point, fl4_dec.daddr should NOT be local (we
+ * checked fl4_dec.saddr above). However, a race condition
+ * may occur if the address is added to the interface
+ * concurrently. In that case, ip_route_input() returns a
+ * LOCAL route with dst.output=ip_rt_bug, which must not
+ * be used for output.
+ */
+ if (!err && rt2 && rt2->rt_type == RTN_LOCAL) {
+ net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n",
+ &fl4_dec.daddr, &fl4_dec.saddr);
+ dst_release(&rt2->dst);
+ err = -EINVAL;
+ }
}
if (err)
@@ -775,9 +795,10 @@ free_skb:
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
const struct inet_skb_parm *parm)
{
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
struct sk_buff *ext_skb;
@@ -906,7 +927,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in,
&parm->opt))
goto out_unlock;
@@ -915,21 +936,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
ipcm_init(&ipc);
ipc.tos = tos;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts.opt;
+ ipc.opt = &icmp_param->replyopts;
ipc.sockc.mark = mark;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr,
inet_dsfield_to_dscp(tos), mark, type, code,
- &icmp_param);
+ icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -939,10 +960,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->dst);
+ room = dst4_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen;
room -= sizeof(struct icmphdr);
/* Guard against tiny mtu. We need to include at least one
* IP network header for this message to make any sense.
@@ -950,15 +971,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room,
parm->iif);
if (ext_skb)
- icmp_param.skb = ext_skb;
+ icmp_param->skb = ext_skb;
- icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = icmp_param->skb->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
/* if we don't have a source address at this point, fall back to the
* dummy address instead of sending out a packet with a source address
@@ -969,7 +990,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
- icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
if (ext_skb)
consume_skb(ext_skb);
@@ -1031,16 +1052,22 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
/* Checkin full IP header plus 8 bytes of protocol to
* avoid additional coding at protocol handlers.
*/
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
- return;
- }
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ goto out;
+
+ /* IPPROTO_RAW sockets are not supposed to receive anything. */
+ if (protocol == IPPROTO_RAW)
+ goto out;
raw_icmp_error(skb, protocol, info);
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
+ return;
+
+out:
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
}
static bool icmp_tag_validation(int proto)
@@ -1206,7 +1233,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net;
net = skb_dst_dev_net_rcu(skb);
@@ -1214,18 +1242,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = skb->len;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = skb->len;
+ icmp_param->head_len = sizeof(struct icmphdr);
- if (icmp_param.data.icmph.type == ICMP_ECHO)
- icmp_param.data.icmph.type = ICMP_ECHOREPLY;
- else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+ if (icmp_param->data.icmph.type == ICMP_ECHO)
+ icmp_param->data.icmph.type = ICMP_ECHOREPLY;
+ else if (!icmp_build_probe(skb, &icmp_param->data.icmph))
return SKB_NOT_DROPPED_YET;
- icmp_reply(&icmp_param, skb);
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
}
@@ -1353,7 +1381,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe);
*/
static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
/*
* Too short.
*/
@@ -1363,19 +1392,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- icmp_param.data.times[1] = inet_current_timestamp();
- icmp_param.data.times[2] = icmp_param.data.times[1];
-
- BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
-
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
- icmp_param.data.icmph.code = 0;
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = 0;
- icmp_param.head_len = sizeof(struct icmphdr) + 12;
- icmp_reply(&icmp_param, skb);
+ icmp_param->data.times[1] = inet_current_timestamp();
+ icmp_param->data.times[2] = icmp_param->data.times[1];
+
+ BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4));
+
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param->data.icmph.code = 0;
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = 0;
+ icmp_param->head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
out_err:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7182f1419c2a..0adc993c211d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = get_random_u32_below(in_dev->mr_maxdelay);
+ int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay));
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
@@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
- in_dev->mr_maxdelay = max_delay;
+ WRITE_ONCE(in_dev->mr_maxdelay, max_delay);
/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
* received value was zero, use the default or statically
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97d57c52b9ad..5dfac6ce1110 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -20,6 +20,7 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
@@ -918,6 +919,16 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
}
EXPORT_SYMBOL(inet_reqsk_alloc);
+void __reqsk_free(struct request_sock *req)
+{
+ req->rsk_ops->destructor(req);
+ if (req->rsk_listener)
+ sock_put(req->rsk_listener);
+ kfree(req->saved_syn);
+ kmem_cache_free(req->rsk_ops->slab, req);
+}
+EXPORT_SYMBOL_GPL(__reqsk_free);
+
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
struct sock *sk)
{
@@ -1103,6 +1114,8 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
@@ -1196,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
{
struct sock *newsk = sk_clone_lock(sk, priority);
struct inet_connection_sock *newicsk;
- struct inet_request_sock *ireq;
+ const struct inet_request_sock *ireq;
struct inet_sock *newinet;
if (!newsk)
@@ -1311,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk)
return 0;
}
+static void reqsk_queue_alloc(struct request_sock_queue *queue)
+{
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+
+ queue->rskq_accept_head = NULL;
+}
+
int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ff11d3a85a36..e4790cc7b5c2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
return -EFAULT;
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+ dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
if (!inetdev_valid_mtu(cork->fragsize))
return -ENETUNREACH;
@@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
pmtudisc = READ_ONCE(inet->pmtudisc);
if (pmtudisc == IP_PMTUDISC_DO ||
pmtudisc == IP_PMTUDISC_PROBE ||
- (skb->len <= dst_mtu(&rt->dst) &&
+ (skb->len <= dst4_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
const struct ip_reply_arg *arg,
unsigned int len, u64 transmit_time, u32 txhash)
{
- struct ip_options_data replyopts;
+ DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
@@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
int err;
int oif;
- if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
+ if (__ip_options_echo(net, &replyopts->opt, skb, sopt))
return;
ipcm_init(&ipc);
ipc.addr = daddr;
ipc.sockc.transmit_time = transmit_time;
- if (replyopts.opt.opt.optlen) {
- ipc.opt = &replyopts.opt;
+ if (replyopts->opt.optlen) {
+ ipc.opt = replyopts;
- if (replyopts.opt.opt.srr)
- daddr = replyopts.opt.opt.faddr;
+ if (replyopts->opt.srr)
+ daddr = replyopts->opt.faddr;
}
oif = arg->bound_dev_if;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d9c5c20b1c4..c062d9519818 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = 0;
dst = sk_dst_get(sk);
if (dst) {
- val = dst_mtu(dst);
+ val = dst4_mtu(dst);
dst_release(dst);
}
if (!val)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 019408d3ca2c..b1e1be00ff8b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
static void __init
ic_dhcp_init_options(u8 *options, struct ic_device *d)
{
- u8 mt = ((ic_servaddr == NONE)
- ? DHCPDISCOVER : DHCPREQUEST);
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 26, /* MTU */
+ 40, /* NIS domain name */
+ 42, /* NTP servers */
+ };
+ u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST;
u8 *e = options;
int len;
@@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
e += 4;
}
- /* always? */
- {
- static const u8 ic_req_params[] = {
- 1, /* Subnet mask */
- 3, /* Default gateway */
- 6, /* DNS server */
- 12, /* Host name */
- 15, /* Domain name */
- 17, /* Boot path */
- 26, /* MTU */
- 40, /* NIS domain name */
- 42, /* NTP servers */
- };
-
- *e++ = 55; /* Parameter request list */
- *e++ = sizeof(ic_req_params);
- memcpy(e, ic_req_params, sizeof(ic_req_params));
- e += sizeof(ic_req_params);
-
- if (ic_host_name_set) {
- *e++ = 12; /* host-name */
- len = strlen(utsname()->nodename);
- *e++ = len;
- memcpy(e, utsname()->nodename, len);
- e += len;
- }
- if (*vendor_class_identifier) {
- pr_info("DHCP: sending class identifier \"%s\"\n",
- vendor_class_identifier);
- *e++ = 60; /* Class-identifier */
- len = strlen(vendor_class_identifier);
- *e++ = len;
- memcpy(e, vendor_class_identifier, len);
- e += len;
- }
- len = strlen(dhcp_client_identifier + 1);
- /* the minimum length of identifier is 2, include 1 byte type,
- * and can not be larger than the length of options
- */
- if (len >= 1 && len < 312 - (e - options) - 1) {
- *e++ = 61;
- *e++ = len + 1;
- memcpy(e, dhcp_client_identifier, len + 1);
- e += len + 1;
- }
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
}
*e++ = 255; /* End of the list */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ca9eaee4c2ef..131382c388e9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
* allow to send ICMP, so that packets will disappear
* to blackhole.
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 8ddac1f595ed..82cf8a9e5ded 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
return fib_metrics;
}
-EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
+EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fae4aa4a5f09..fecf6621f679 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
goto free_nskb;
/* "Never happens" */
- if (nskb->len > dst_mtu(skb_dst(nskb)))
+ if (nskb->len > dst4_mtu(skb_dst(nskb)))
goto free_nskb;
nf_ct_attach(nskb, oldskb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cfbd563498e8..ebfc5a3d3ad6 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -690,6 +690,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net = sock_net(sk);
struct flowi4 fl4;
struct inet_sock *inet = inet_sk(sk);
@@ -697,7 +699,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct icmphdr user_icmph;
struct pingfakehdr pfh;
struct rtable *rt = NULL;
- struct ip_options_data opt_copy;
int free = 0;
__be32 saddr, daddr, faddr;
u8 scope;
@@ -746,9 +747,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5998c4cc6f47..e20c41206e29 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
@@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__be32 daddr;
__be32 saddr;
int uc_index, err;
- struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11d990703d31..06aa39ae80d6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- pr_warn("martian source %pI4 from %pI4, on dev %s\n",
- &daddr, &saddr, dev->name);
+ pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
@@ -2475,8 +2475,8 @@ martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (IN_DEV_LOG_MARTIANS(in_dev))
- net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
#endif
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a1a50a5c80dc..643763bc2142 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,7 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
-static int tcp_ecn_mode_max = 2;
+static int tcp_ecn_mode_max = 5;
static u32 icmp_errors_extension_mask_all =
GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
@@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "tcp_ecn_option_beacon",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5319ebe2452..6ce03a9adb4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
- * TCP splice context
- */
-struct tcp_splice_state {
- struct pipe_inode_info *pipe;
- size_t len;
- unsigned int flags;
-};
-
-/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the __sk_mem_schedule() is of this nature: accounting
@@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
struct sk_buff *skb = tcp_write_queue_tail(sk);
u32 tsflags = sockc->tsflags;
+ if (unlikely(!skb))
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
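tcp_stream_memory_free() above gates EPOLLOUT on the unsent backlog; with wake == 1 the count is doubled, so the wakeup path only fires once the backlog has fallen to half of notsent_lowat. A standalone sketch of that arithmetic (userspace illustration with hypothetical values):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
			       uint32_t notsent_lowat, int wake)
{
	uint32_t notsent_bytes = write_seq - snd_nxt;

	return (notsent_bytes << wake) < notsent_lowat;
}

int main(void)
{
	/* 96 KiB unsent with a 128 KiB lowat: writable from poll(), but
	 * not from the wakeup path, which needs the backlog below 64 KiB.
	 */
	printf("%d %d\n",
	       stream_memory_free(196608, 98304, 131072, 0),
	       stream_memory_free(196608, 98304, 131072, 1));
	return 0;
}
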
@@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
__tcp_push_pending_frames(sk, mss_now, nonagle);
}
-static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
{
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
@@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_IPV6_MOD(tcp_splice_read);
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int delta, amt;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
+ return;
+
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
+}
+
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
{
@@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
return err;
}
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
+
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct net_devmem_dmabuf_binding *binding = NULL;
@@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
tp->accecn_opt_tstamp = 0;
+ tp->pkts_acked_ewma = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ if (tcp_ecn_disabled(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED;
+ else if (tcp_ecn_mode_rfc3168(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168;
+ else if (tcp_ecn_mode_accecn(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN;
+ else if (tcp_ecn_mode_pending(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING;
info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
info->tcpi_received_ce = tp->received_ce;
@@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index df758adbb445..e9f6c77e0631 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
}
@@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk,
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7d945a527daf..b30090cff3cf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -5,6 +5,92 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
+ }
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
+
void tcp_fastopen_init_key_once(struct net *net)
{
u8 key[TCP_FASTOPEN_KEY_LENGTH];
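The comment block above also describes the 60-second RST quarantine that keeps a suspicious request counted against fastopen.max_qlen. A rough userspace model of that admission check (assumption: heavily simplified, the real code also expires the oldest quarantined request):

#include <stdbool.h>
#include <stdio.h>

struct fastopenq_model {
	int qlen;	/* live TFO requests plus RST-quarantined ones */
	int max_qlen;	/* per-listener Fast Open backlog limit */
};

/* New Fast Open requests are refused while the quarantine-inflated
 * queue length has reached the limit, which blunts spoofed
 * SYN+cookie floods for up to 60 seconds per offending request.
 */
static bool tfo_admit(const struct fastopenq_model *q)
{
	return q->max_qlen > 0 && q->qlen < q->max_qlen;
}

int main(void)
{
	struct fastopenq_model q = { .qlen = 5, .max_qlen = 5 };

	printf("admit TFO: %d\n", tfo_admit(&q));  /* 0: fall back to plain SYN */
	return 0;
}
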
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 198f8a0d37be..e7b41abb82aa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
tcp_count_delivered_ce(tp, delivered);
}
+#define PKTS_ACKED_WEIGHT 6
+#define PKTS_ACKED_PREC 6
+#define ACK_COMP_THRESH 4
+
/* Returns the ECN CE delta */
static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delivered_pkts, u32 delivered_bytes,
@@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delta, safe_delta, d_ceb;
bool opt_deltas_valid;
u32 corrected_ace;
+ u32 ewma;
/* Reordered ACK or uncertain due to lack of data to send and ts */
if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
@@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
opt_deltas_valid = tcp_accecn_process_option(tp, skb,
delivered_bytes, flag);
+ if (delivered_pkts) {
+ if (!tp->pkts_acked_ewma) {
+ ewma = delivered_pkts << PKTS_ACKED_PREC;
+ } else {
+ ewma = tp->pkts_acked_ewma;
+ ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) +
+ (delivered_pkts << PKTS_ACKED_PREC)) >>
+ PKTS_ACKED_WEIGHT;
+ }
+ tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU);
+ }
+
if (!(flag & FLAG_SLOWPATH)) {
/* AccECN counter might overflow on large ACKs */
if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
@@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
if (d_ceb <
safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
return delta;
- }
+ } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC))
+ return delta;
return safe_delta;
}
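The pkts_acked_ewma update added above uses a 1/64 weight with 6 bits of fixed-point precision, and values above ACK_COMP_THRESH (4) packets per ACK are treated as ACK compression by the AccECN heuristic. A userspace sketch of the same arithmetic:

#include <stdint.h>
#include <stdio.h>

#define PKTS_ACKED_WEIGHT 6
#define PKTS_ACKED_PREC   6

static uint32_t ewma_update(uint32_t ewma, uint32_t delivered_pkts)
{
	if (!ewma)
		return delivered_pkts << PKTS_ACKED_PREC;

	ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) +
		(delivered_pkts << PKTS_ACKED_PREC)) >> PKTS_ACKED_WEIGHT;
	return ewma > 0xFFFFU ? 0xFFFFU : ewma;
}

int main(void)
{
	uint32_t ewma = 0;

	/* Ten ACKs, each covering 8 packets, converge to 8 << 6 = 512. */
	for (int i = 0; i < 10; i++)
		ewma = ewma_update(ewma, 8);
	printf("ewma = %u (%u pkts)\n", ewma, ewma >> PKTS_ACKED_PREC);
	return 0;
}
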
@@ -1558,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+ u32 end_seq, u64 xmit_time)
+{
+ u32 rtt_us;
+
+ rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+ if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ return;
+ }
+ tp->rack.advanced = 1;
+ tp->rack.rtt_us = rtt_us;
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
+ tp->rack.mstamp = xmit_time;
+ tp->rack.end_seq = end_seq;
+ }
+}
+
/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
@@ -1637,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
}
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Update the connection delivery information and generate a rate sample. */
+static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ bool is_sack_reneg, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = tp->tcp_mstamp;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available or
+ * in recovery from loss with SACK reneging. Rate samples taken during
+ * a SACK reneging event may overestimate bw by including packets that
+ * were SACKed before the reneg.
+ */
+ if (!rs->prior_mstamp || is_sack_reneg) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmistted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
+ */
+static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u64 tx_tstamp;
+
+ if (!scb->tx.delivered_mstamp)
+ return;
+
+ tx_tstamp = tcp_skb_timestamp_us(skb);
+ if (!rs->prior_delivered ||
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tx_tstamp;
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
+
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp = 0;
+}
+
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
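The estimator comment above boils down to bw = delivered / max(snd_interval, ack_interval): taking the longer of the send and ACK phases keeps ACK compression from inflating the sample. A minimal userspace sketch of that calculation (illustration only, hypothetical values):

#include <stdint.h>
#include <stdio.h>

static uint64_t delivery_rate_bps(uint64_t delivered_bytes,
				  uint64_t snd_interval_us,
				  uint64_t ack_interval_us)
{
	uint64_t interval_us = snd_interval_us > ack_interval_us ?
			       snd_interval_us : ack_interval_us;

	if (!interval_us)
		return 0;
	return delivered_bytes * 8 * 1000000 / interval_us;
}

int main(void)
{
	/* 64 KiB delivered; the ACKs arrived in a 2 ms burst but the data
	 * took 10 ms to send, so the send phase bounds the estimate
	 * (~52 Mbit/s) instead of the compressed ACK phase.
	 */
	printf("%llu bps\n",
	       (unsigned long long)delivery_rate_bps(65536, 10000, 2000));
	return 0;
}
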
@@ -3995,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
return delivered;
}
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_STATIC_REO_WND) ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
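The reo_wnd sizing described in the comment above (steps of min_rtt/4, capped by srtt) reduces to a couple of lines; a userspace sketch loosely mirroring tcp_rack_reo_wnd() (assumption: ignores the persist counter and sysctl overrides):

#include <stdio.h>

/* Reordering window in microseconds: reo_wnd_steps quarters of the
 * minimum RTT, never larger than the smoothed RTT.
 */
static unsigned int rack_reo_wnd_us(unsigned int reo_wnd_steps,
				    unsigned int min_rtt_us,
				    unsigned int srtt_us)
{
	unsigned int wnd = reo_wnd_steps * (min_rtt_us >> 2);

	return wnd < srtt_us ? wnd : srtt_us;
}

int main(void)
{
	/* min_rtt 20 ms, srtt 50 ms, one step -> 5 ms reordering window */
	printf("%u us\n", rack_reo_wnd_us(1, 20000, 50000));
	return 0;
}
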
@@ -4129,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, flag);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
@@ -4179,7 +4426,7 @@ no_queue:
*/
tcp_ack_probe(sk);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
return 1;
@@ -4799,8 +5046,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+static void tcp_rcv_spurious_retrans(struct sock *sk,
+ const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
* If it seems our ACKs are not reaching the other side,
@@ -4820,6 +5070,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
/* Save last flowlabel after a spurious retrans. */
tcp_save_lrcv_flowlabel(sk, skb);
#endif
+ /* Check DSACK info to detect that the previous ACK carrying the
+ * AccECN option was lost after the second retransmision, and then
+ * stop sending AccECN option in all subsequent ACKs.
+ */
+ if (tcp_ecn_mode_accecn(tp) &&
+ tp->accecn_opt_sent_w_dsack &&
+ TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -5527,25 +5785,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
return next;
}
-/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct sk_buff *skb1;
-
- while (*p) {
- parent = *p;
- skb1 = rb_to_skb(parent);
- if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
- }
- rb_link_node(&skb->rbnode, parent, p);
- rb_insert_color(&skb->rbnode, root);
-}
-
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
@@ -5879,16 +6118,11 @@ static void tcp_new_space(struct sock *sk)
* small enough that tcp_stream_memory_free() decides it
* is time to generate EPOLLOUT.
*/
-void tcp_check_space(struct sock *sk)
+void __tcp_check_space(struct sock *sk)
{
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
static inline void tcp_data_snd_check(struct sock *sk)
@@ -6222,6 +6456,8 @@ step1:
if (th->syn) {
if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
+ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
if (tp->rx_opt.accecn &&
tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
@@ -6843,7 +7079,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th, skb);
+ tcp_ecn_rcv_syn(sk, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -7248,7 +7484,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
u32 ecn_ok_dst;
if (tcp_accecn_syn_requested(th) &&
- READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 ||
+ tcp_ca_needs_accecn(listen_sk))) {
inet_rsk(req)->ecn_ok = 1;
tcp_rsk(req)->accecn_ok = 1;
tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f8a9596e8f4d..6264fc0b2be5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -374,7 +374,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct dst_entry *dst;
- u32 mtu;
+ u32 mtu, dmtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
@@ -386,15 +386,14 @@ void tcp_v4_mtu_reduced(struct sock *sk)
/* Something is about to be wrong... Remember soft error
* for the case this connection will not be able to recover.
*/
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ dmtu = dst4_mtu(dst);
+ if (mtu < dmtu && ip_dont_fragment(sk, dst))
WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
- mtu = dst_mtu(dst);
-
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
ip_sk_accept_pmtu(sk) &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- tcp_sync_mss(sk, mtu);
+ inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
+ tcp_sync_mss(sk, dmtu);
/* Resend the TCP packet because it's
* clear that the old packet has been
@@ -1760,7 +1759,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
- tcp_sync_mss(newsk, dst_mtu(dst));
+ tcp_sync_mss(newsk, dst4_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -2110,14 +2109,6 @@ no_coalesce:
}
EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
-{
- struct tcphdr *th = (struct tcphdr *)skb->data;
-
- return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
-}
-EXPORT_IPV6_MOD(tcp_filter);
-
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
@@ -3418,20 +3409,6 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-/* @wake is one when sk_stream_write_space() calls us.
- * This sends EPOLLOUT only if notsent_bytes is half the limit.
- * This mimics the strategy used in sock_def_write_space().
- */
-bool tcp_stream_memory_free(const struct sock *sk, int wake)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 notsent_bytes = READ_ONCE(tp->write_seq) -
- READ_ONCE(tp->snd_nxt);
-
- return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
-}
-EXPORT_SYMBOL(tcp_stream_memory_free);
-
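The removed helper's test is easy to model in user space (a hedged sketch with illustrative names, not the kernel code): with wake=1 the unsent byte count is doubled, so write space is only signalled once the backlog drops below half of tcp_notsent_lowat.

#include <stdint.h>
#include <stdio.h>

/* Model of the notsent_lowat check: bytes queued but not yet sent,
 * shifted by @wake, compared against the lowat threshold. */
static int stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
                              uint32_t notsent_lowat, int wake)
{
        uint32_t notsent_bytes = write_seq - snd_nxt;

        return (notsent_bytes << wake) < notsent_lowat;
}

int main(void)
{
        /* 48KB unsent vs a 64KB lowat: free for plain checks (wake=0),
         * but not yet worth an EPOLLOUT wakeup (wake=1) */
        printf("%d %d\n",
               stream_memory_free(113152, 64000, 65536, 0),
               stream_memory_free(113152, 64000, 65536, 1));
        return 0;
}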
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
@@ -3474,6 +3451,8 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .freeptr_offset = offsetof(struct tcp_sock,
+ inet_conn.icsk_inet.sk.sk_freeptr),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bd5462154f97..ec128865f5c0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -481,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
tp->saw_accecn_opt = treq->saw_accecn_opt;
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND);
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
tp->prev_ecnfield = treq->syn_ect_rcv;
tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk))
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ else
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
}
@@ -748,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
- &tcp_rsk(req)->last_oow_ack_time) &&
- !tcp_rtx_synack(sk, req)) {
- unsigned long expires = jiffies;
-
- expires += tcp_reqsk_timeout(req);
- if (!fastopen)
- mod_timer_pending(&req->rsk_timer, expires);
- else
- req->rsk_timer.expires = expires;
+ &tcp_rsk(req)->last_oow_ack_time)) {
+ if (tcp_rsk(req)->accecn_ok) {
+ u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+
+ tcp_rsk(req)->syn_ect_rcv = ect_rcv;
+ if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV;
+ }
+ if (!tcp_rtx_synack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
+
+ expires += tcp_reqsk_timeout(req);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer,
+ expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
}
return NULL;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 942a948f1a31..3b1fdcd3cb29 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -304,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
goto out_check_final;
th2 = tcp_hdr(p);
- flush = (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ flush = (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 479afb714bdf..326b58ff1118 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
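A self-contained sketch of the ordering idea, assuming a plain unbalanced binary tree in place of the kernel's rb-tree API; the wrap-safe comparison mirrors before(), everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "seq1 comes earlier than seq2", the same signed-difference
 * trick the kernel's before() uses. */
static int seq_before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;
}

struct seg {
        uint32_t seq;
        struct seg *left, *right;
};

/* Ordered insert keyed by starting sequence number; no rebalancing,
 * unlike the rb-tree version above. */
static void seg_insert(struct seg **root, struct seg *node)
{
        node->left = node->right = NULL;
        while (*root)
                root = seq_before(node->seq, (*root)->seq) ?
                       &(*root)->left : &(*root)->right;
        *root = node;
}

int main(void)
{
        struct seg a = { .seq = 4294967000u }, b = { .seq = 100 };
        struct seg *root = NULL;

        seg_insert(&root, &a);
        seg_insert(&root, &b);  /* 100 sorts after 4294967000 post-wrap */
        printf("root=%u right=%u\n", root->seq, root->right->seq);
        return 0;
}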
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
@@ -334,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
return;
if (tcp_ecn_mode_accecn(tp)) {
- if (!tcp_accecn_ace_fail_recv(tp))
+ if (!tcp_accecn_ace_fail_recv(tp) &&
+ !tcp_accecn_ace_fail_send(tp))
INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
tcp_accecn_set_ace(tp, skb, th);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
} else {
@@ -712,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
if (tp) {
tp->accecn_minlen = 0;
tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack;
if (tp->accecn_opt_demand)
tp->accecn_opt_demand--;
}
+ } else if (tp) {
+ tp->accecn_opt_sent_w_dsack = 0;
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1106,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
if (treq->accecn_ok &&
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
- req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) {
opts->use_synack_ecn_bytes = 1;
remaining -= tcp_options_fit_accecn(opts, 0, remaining);
}
@@ -1186,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (tcp_ecn_mode_accecn(tp)) {
int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
- if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ if (ecn_opt && tp->saw_accecn_opt &&
+ (ecn_opt >= TCP_ACCECN_OPTION_PERSIST ||
+ !tcp_accecn_opt_fail_send(tp)) &&
(ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
tcp_accecn_option_beacon_check(sk))) {
opts->use_synack_ecn_bytes = 0;
@@ -1432,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
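A compact model of what that snapshot buys us (the struct and helper below are invented for illustration; the real fields live in the skb control block): when nothing is in flight, both clocks restart at this transmit, so idle time never inflates a later sample's interval.

#include <stdint.h>

/* Illustrative per-packet snapshot taken at transmit time. */
struct rate_snapshot {
        uint64_t first_tx_mstamp;       /* start of this window's send phase */
        uint64_t delivered_mstamp;      /* time of the latest delivery update */
        uint32_t delivered;             /* cumulative delivered count so far */
        uint8_t  is_app_limited;
};

static struct rate_snapshot snapshot_at_send(uint64_t now_us, uint32_t packets_out,
                                             uint64_t *first_tx, uint64_t *delivered_ts,
                                             uint32_t delivered, int app_limited)
{
        if (!packets_out) {             /* idle: restart both clocks */
                *first_tx = now_us;
                *delivered_ts = now_us;
        }
        return (struct rate_snapshot){
                .first_tx_mstamp  = *first_tx,
                .delivered_mstamp = *delivered_ts,
                .delivered        = delivered,
                .is_app_limited   = app_limited ? 1 : 0,
        };
}

int main(void)
{
        uint64_t first_tx = 0, delivered_ts = 0;
        struct rate_snapshot s = snapshot_at_send(1000000, 0, &first_tx,
                                                  &delivered_ts, 42, 0);

        return s.delivered == 42 ? 0 : 1;
}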
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
@@ -1530,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
*/
skb->pfmemalloc = 0;
- skb_push(skb, tcp_header_size);
+ __skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
@@ -3571,12 +3633,15 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback
- * As AccECN uses the same SYN flags (+ AE), this check covers both
- * cases.
- */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
- tcp_ecn_clear_syn(sk, skb);
+ if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) {
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check
+ * covers both cases.
+ */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) ==
+ TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+ }
/* Update global and local TCP statistics. */
segs = tcp_skb_pcount(skb);
@@ -3732,33 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
inet_csk(sk)->icsk_rto, true);
}
-/* We allow to exceed memory limits for FIN packets to expedite
- * connection tear down and (memory) recovery.
- * Otherwise tcp_send_fin() could be tempted to either delay FIN
- * or even be forced to close flow without any FIN.
- * In general, we want to allow one skb per socket to avoid hangs
- * with edge trigger epoll()
- */
-void sk_forced_mem_schedule(struct sock *sk, int size)
-{
- int delta, amt;
-
- delta = size - sk->sk_forward_alloc;
- if (delta <= 0)
- return;
-
- amt = sk_mem_pages(delta);
- sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-
- if (mem_cgroup_sk_enabled(sk))
- mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
-
- if (sk->sk_bypass_prot_mem)
- return;
-
- sk_memory_allocated_add(sk, amt);
-}
-
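As a rough standalone model of the charge the removed helper performs (4K pages assumed; the real accounting hooks are omitted): only the part of the request not already covered by the socket's forward-alloc is charged, rounded up to whole pages.

#include <stdio.h>

#define PAGE_SHIFT      12              /* assume 4K pages */
#define PAGE_SIZE       (1 << PAGE_SHIFT)

/* Pages needed to cover a byte deficit, rounding up as sk_mem_pages() does. */
static int mem_pages(int bytes)
{
        return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* Model of the forced charge: anything beyond @forward_alloc is charged
 * in whole pages, never refused. */
static int forced_charge_pages(int size, int forward_alloc)
{
        int delta = size - forward_alloc;

        return delta > 0 ? mem_pages(delta) : 0;
}

int main(void)
{
        /* one byte over the reserve still costs a full page */
        printf("%d\n", forced_charge_pages(4097, 4096));
        return 0;
}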
/* Send a FIN. The caller locks the socket for us.
* We should try to send a FIN packet really hard, but eventually give up.
*/
@@ -3918,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
+ case TCP_SYNACK_RETRANS:
skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
@@ -4000,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th);
+ tcp_ecn_make_synack(req, th, synack_type);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
skb->mark = ireq->ir_mark;
@@ -4603,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
/* Paired with WRITE_ONCE() in sock_setsockopt() */
if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS,
NULL);
if (!res) {
TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
deleted file mode 100644
index a8f6d9d06f2e..000000000000
--- a/net/ipv4/tcp_rate.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <net/tcp.h>
-
-/* The bandwidth estimator estimates the rate at which the network
- * can currently deliver outbound data packets for this flow. At a high
- * level, it operates by taking a delivery rate sample for each ACK.
- *
- * A rate sample records the rate at which the network delivered packets
- * for this flow, calculated over the time interval between the transmission
- * of a data packet and the acknowledgment of that packet.
- *
- * Specifically, over the interval between each transmit and corresponding ACK,
- * the estimator generates a delivery rate sample. Typically it uses the rate
- * at which packets were acknowledged. However, the approach of using only the
- * acknowledgment rate faces a challenge under the prevalent ACK decimation or
- * compression: packets can temporarily appear to be delivered much quicker
- * than the bottleneck rate. Since it is physically impossible to do that in a
- * sustained fashion, when the estimator notices that the ACK rate is faster
- * than the transmit rate, it uses the latter:
- *
- * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
- * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
- * bw = min(send_rate, ack_rate)
- *
- * Notice the estimator essentially estimates the goodput, not always the
- * network bottleneck link rate when the sending or receiving is limited by
- * other factors like applications or receiver window limits. The estimator
- * deliberately avoids using the inter-packet spacing approach because that
- * approach requires a large number of samples and sophisticated filtering.
- *
- * TCP flows can often be application-limited in request/response workloads.
- * The estimator marks a bandwidth sample as application-limited if there
- * was some moment during the sampled window of packets when there was no data
- * ready to send in the write queue.
- */
-
-/* Snapshot the current delivery information in the skb, to generate
- * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
- */
-void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* In general we need to start delivery rate samples from the
- * time we received the most recent ACK, to ensure we include
- * the full time the network needs to deliver all in-flight
- * packets. If there are no packets in flight yet, then we
- * know that any ACKs after now indicate that the network was
- * able to deliver those packets completely in the sampling
- * interval between now and the next ACK.
- *
- * Note that we use packets_out instead of tcp_packets_in_flight(tp)
- * because the latter is a guess based on RTO and loss-marking
- * heuristics. We don't want spurious RTOs or loss markings to cause
- * a spuriously small time interval, causing a spuriously high
- * bandwidth estimate.
- */
- if (!tp->packets_out) {
- u64 tstamp_us = tcp_skb_timestamp_us(skb);
-
- tp->first_tx_mstamp = tstamp_us;
- tp->delivered_mstamp = tstamp_us;
- }
-
- TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
- TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
- TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
- TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
- TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
-}
-
-/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
- * delivery information when the skb was last transmitted.
- *
- * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
- * called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the most recently sent time and the highest
- * sequence.
- */
-void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
- struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u64 tx_tstamp;
-
- if (!scb->tx.delivered_mstamp)
- return;
-
- tx_tstamp = tcp_skb_timestamp_us(skb);
- if (!rs->prior_delivered ||
- tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
- scb->end_seq, rs->last_end_seq)) {
- rs->prior_delivered_ce = scb->tx.delivered_ce;
- rs->prior_delivered = scb->tx.delivered;
- rs->prior_mstamp = scb->tx.delivered_mstamp;
- rs->is_app_limited = scb->tx.is_app_limited;
- rs->is_retrans = scb->sacked & TCPCB_RETRANS;
- rs->last_end_seq = scb->end_seq;
-
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = tx_tstamp;
- /* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
-
- }
- /* Mark off the skb delivered once it's sacked to avoid being
- * used again when it's cumulatively acked. For acked packets
- * we don't need to reset since it'll be freed soon.
- */
- if (scb->sacked & TCPCB_SACKED_ACKED)
- scb->tx.delivered_mstamp = 0;
-}
-
-/* Update the connection delivery information and generate a rate sample. */
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- bool is_sack_reneg, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 snd_us, ack_us;
-
- /* Clear app limited if bubble is acked and gone. */
- if (tp->app_limited && after(tp->delivered, tp->app_limited))
- tp->app_limited = 0;
-
- /* TODO: there are multiple places throughout tcp_ack() to get
- * current time. Refactor the code using a new "tcp_acktag_state"
- * to carry current time, flags, stats like "tcp_sacktag_state".
- */
- if (delivered)
- tp->delivered_mstamp = tp->tcp_mstamp;
-
- rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
- rs->losses = lost; /* freshly marked lost */
- /* Return an invalid sample if no timing information is available or
- * in recovery from loss with SACK reneging. Rate samples taken during
- * a SACK reneging event may overestimate bw by including packets that
- * were SACKed before the reneg.
- */
- if (!rs->prior_mstamp || is_sack_reneg) {
- rs->delivered = -1;
- rs->interval_us = -1;
- return;
- }
- rs->delivered = tp->delivered - rs->prior_delivered;
-
- rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
- /* delivered_ce occupies less than 32 bits in the skb control block */
- rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
-
- /* Model sending data and receiving ACKs as separate pipeline phases
- * for a window. Usually the ACK phase is longer, but with ACK
- * compression the send phase can be longer. To be safe we use the
- * longer phase.
- */
- snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
- rs->prior_mstamp); /* ack phase */
- rs->interval_us = max(snd_us, ack_us);
-
- /* Record both segment send and ack receive intervals */
- rs->snd_interval_us = snd_us;
- rs->rcv_interval_us = ack_us;
-
- /* Normally we expect interval_us >= min-rtt.
- * Note that rate may still be over-estimated when a spuriously
- * retransmitted skb was first (s)acked because "interval_us"
- * is under-estimated (up to an RTT). However, continuously
- * measuring the delivery rate during loss recovery is crucial
- * for connections that suffer heavy or prolonged losses.
- */
- if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
- if (!rs->is_retrans)
- pr_debug("tcp rate: %ld %d %u %u %u\n",
- rs->interval_us, rs->delivered,
- inet_csk(sk)->icsk_ca_state,
- tp->rx_opt.sack_ok, tcp_min_rtt(tp));
- rs->interval_us = -1;
- return;
- }
-
- /* Record the last non-app-limited or the highest app-limited bw */
- if (!rs->is_app_limited ||
- ((u64)rs->delivered * tp->rate_interval_us >=
- (u64)tp->rate_delivered * rs->interval_us)) {
- tp->rate_delivered = rs->delivered;
- tp->rate_interval_us = rs->interval_us;
- tp->rate_app_limited = rs->is_app_limited;
- }
-}
-
-/* If a gap is detected between sends, mark the socket application-limited. */
-void tcp_rate_check_app_limited(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (/* We have less than one packet to send. */
- tp->write_seq - tp->snd_nxt < tp->mss_cache &&
- /* Nothing in sending host's qdisc queues or NIC tx queue. */
- sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
- /* We are not limited by CWND. */
- tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
- /* All lost packets have been retransmitted. */
- tp->lost_out <= tp->retrans_out)
- tp->app_limited =
- (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
-}
-EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
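The core formula from the removed file's header comment, bw = min(send_rate, ack_rate), can be modelled in a few lines of user-space C (illustrative names and numbers; this is not the kernel code). Taking the longer of the send and ack intervals is what yields the smaller, safer rate:

#include <stdint.h>
#include <stdio.h>

/* Delivery rate over one sample: packets delivered divided by the longer
 * of the send-phase and ack-phase intervals. */
static uint64_t bw_pkts_per_sec(uint32_t delivered,
                                uint64_t snd_interval_us,
                                uint64_t ack_interval_us)
{
        uint64_t interval_us = snd_interval_us > ack_interval_us ?
                               snd_interval_us : ack_interval_us;

        if (!interval_us)
                return 0;
        return (uint64_t)delivered * 1000000 / interval_us;
}

int main(void)
{
        /* 100 packets sent over 10ms but acked over 12ms -> ~8333 pkt/s */
        printf("%llu pkt/s\n",
               (unsigned long long)bw_pkts_per_sec(100, 10000, 12000));
        return 0;
}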
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c52fd3254b6e..139646751073 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk)
return !!timeout;
}
-/* Record the most recently (re)sent time among the (s)acked packets
- * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
- * draft-cheng-tcpm-rack-00.txt
- */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- u64 xmit_time)
-{
- u32 rtt_us;
-
- rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
- if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
- /* If the sacked packet was retransmitted, it's ambiguous
- * whether the retransmission or the original (or the prior
- * retransmission) was sacked.
- *
- * If the original is lost, there is no ambiguity. Otherwise
- * we assume the original can be delayed up to aRTT + min_rtt.
- * the aRTT term is bounded by the fast recovery or timeout,
- * so it's at least one RTT (i.e., retransmission is at least
- * an RTT later).
- */
- return;
- }
- tp->rack.advanced = 1;
- tp->rack.rtt_us = rtt_us;
- if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq)) {
- tp->rack.mstamp = xmit_time;
- tp->rack.end_seq = end_seq;
- }
-}
-
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them.
*/
@@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk)
tcp_rearm_rto(sk);
}
-/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
- *
- * If a DSACK is received that seems like it may have been due to reordering
- * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
- * by srtt), since there is possibility that spurious retransmission was
- * due to reordering delay longer than reo_wnd.
- *
- * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
- * no. of successful recoveries (accounts for full DSACK-based loss
- * recovery undo). After that, reset it to default (min_rtt/4).
- *
- * At max, reo_wnd is incremented only once per rtt. So that the new
- * DSACK on which we are reacting, is due to the spurious retx (approx)
- * after the reo_wnd has been updated last time.
- *
- * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
- * absolute value to account for change in rtt.
- */
-void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
- TCP_RACK_STATIC_REO_WND) ||
- !rs->prior_delivered)
- return;
-
- /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
- if (before(rs->prior_delivered, tp->rack.last_delivered))
- tp->rack.dsack_seen = 0;
-
- /* Adjust the reo_wnd if update is pending */
- if (tp->rack.dsack_seen) {
- tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
- tp->rack.reo_wnd_steps + 1);
- tp->rack.dsack_seen = 0;
- tp->rack.last_delivered = tp->delivered;
- tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
- } else if (!tp->rack.reo_wnd_persist) {
- tp->rack.reo_wnd_steps = 1;
- }
-}
-
/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
* the next unacked packet upon receiving
* a) three or more DUPACKs to start the fast recovery
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 160080c9021d..5a14a53a3c9e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/rstreason.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
@@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
* it's not good to give up too easily.
*/
tcp_rtx_synack(sk, req);
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
req->num_timeout++;
tcp_update_rto_stats(sk);
if (!tp->retrans_stamp)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee63af0ef42c..b96e47f1c8a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1193,7 +1193,7 @@ csum_partial:
send:
err = ip_send_skb(sock_net(sk), skb);
- if (err) {
+ if (unlikely(err)) {
if (err == -ENOBUFS &&
!inet_test_bit(RECVERR, sk)) {
UDP_INC_STATS(sock_net(sk),
@@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
@@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
- struct ip_options_data opt_copy;
int uc_index;
if (len > 0xFFFF)
@@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
@@ -1793,14 +1794,32 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
}
if (unlikely(to_drop)) {
+ int err_ipv4 = 0;
+ int err_ipv6 = 0;
+
for (nb = 0; to_drop != NULL; nb++) {
skb = to_drop;
+ if (skb->protocol == htons(ETH_P_IP))
+ err_ipv4++;
+ else
+ err_ipv6++;
to_drop = skb->next;
skb_mark_not_on_list(skb);
- /* TODO: update SNMP values. */
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
}
numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ if (err_ipv4 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS,
+ err_ipv4);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS,
+ err_ipv4);
+ }
+ if (err_ipv6 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS,
+ err_ipv6);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS,
+ err_ipv6);
+ }
}
atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
@@ -2429,7 +2448,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) &&
+ UDP_SKB_CB(skb)->partial_cov)) {
u16 pcrlen = READ_ONCE(up->pcrlen);
/*
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 589456bd8b5f..6b1654c1ad4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
struct sk_buff *segs, *seg;
+ __be16 newlen, msslen;
struct udphdr *uh;
unsigned int mss;
bool copy_dtor;
__sum16 check;
- __be16 newlen;
int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
@@ -556,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
return segs;
}
+ msslen = htons(sizeof(*uh) + mss);
+
/* GSO partial and frag_list segmentation only requires splitting
* the frame into an MSS multiple and possibly a remainder, both
* cases return a GSO skb. So update the mss now.
@@ -585,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (!seg->next)
break;
- uh->len = newlen;
+ uh->len = msslen;
uh->check = check;
if (seg->ip_summed == CHECKSUM_PARTIAL)