Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Makefile                    |   2
-rw-r--r--  net/ipv4/cipso_ipv4.c                |   3
-rw-r--r--  net/ipv4/fib_lookup.h                |   6
-rw-r--r--  net/ipv4/fib_trie.c                  |   4
-rw-r--r--  net/ipv4/icmp.c                      | 139
-rw-r--r--  net/ipv4/igmp.c                      |   4
-rw-r--r--  net/ipv4/inet_connection_sock.c      |  24
-rw-r--r--  net/ipv4/ip_output.c                 |  17
-rw-r--r--  net/ipv4/ip_sockglue.c               |   2
-rw-r--r--  net/ipv4/ipconfig.c                  |  89
-rw-r--r--  net/ipv4/ipmr.c                      |   2
-rw-r--r--  net/ipv4/metrics.c                   |   2
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c  |   2
-rw-r--r--  net/ipv4/ping.c                      |   7
-rw-r--r--  net/ipv4/raw.c                       |   7
-rw-r--r--  net/ipv4/route.c                     |   8
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c           |   4
-rw-r--r--  net/ipv4/tcp.c                       |  84
-rw-r--r--  net/ipv4/tcp_cong.c                  |   5
-rw-r--r--  net/ipv4/tcp_fastopen.c              |  86
-rw-r--r--  net/ipv4/tcp_input.c                 | 305
-rw-r--r--  net/ipv4/tcp_ipv4.c                  |  37
-rw-r--r--  net/ipv4/tcp_minisocks.c             |  43
-rw-r--r--  net/ipv4/tcp_offload.c               |   3
-rw-r--r--  net/ipv4/tcp_output.c                | 117
-rw-r--r--  net/ipv4/tcp_rate.c                  | 209
-rw-r--r--  net/ipv4/tcp_recovery.c              |  75
-rw-r--r--  net/ipv4/tcp_timer.c                 |   3
-rw-r--r--  net/ipv4/udp.c                       |  32
-rw-r--r--  net/ipv4/udp_offload.c               |   6
30 files changed, 770 insertions, 557 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ec36d2ec059e..18108a6f0499 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_rate.o tcp_recovery.o tcp_ulp.o \
+ tcp_recovery.o tcp_ulp.o \
tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 709021197e1c..32b951ebc0c2 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -2196,7 +2196,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
/* if we don't ensure enough headroom we could panic on the skb_push()
* call below so make sure we have enough, we are also "mangling" the
* packet so we should probably do a copy-on-write call anyway */
- ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ ret_val = skb_cow(skb,
+ skb_headroom(skb) + (len_delta > 0 ? len_delta : 0));
if (ret_val < 0)
return ret_val;
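
Aside: the cipso_ipv4.c hunk above clamps a possibly negative length delta so the headroom passed to skb_cow() can only grow. A minimal standalone C sketch of just that clamp (function and variable names here are illustrative, not kernel code):

#include <stdio.h>

/* Only account for a growing option block; a shrinking one must not
 * reduce the headroom we ask the copy-on-write helper to guarantee. */
static int wanted_headroom(int current_headroom, int len_delta)
{
    return current_headroom + (len_delta > 0 ? len_delta : 0);
}

int main(void)
{
    printf("grow by 8:   %d\n", wanted_headroom(64, 8));    /* 72 */
    printf("shrink by 8: %d\n", wanted_headroom(64, -8));   /* 64, not 56 */
    return 0;
}
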
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index f9b9e26c32c1..0b72796dd1ad 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -28,8 +28,10 @@ struct fib_alias {
/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
- if (!(fa->fa_state & FA_S_ACCESSED))
- fa->fa_state |= FA_S_ACCESSED;
+ u8 fa_state = READ_ONCE(fa->fa_state);
+
+ if (!(fa_state & FA_S_ACCESSED))
+ WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED);
}
/* Exported by fib_semantics.c */
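
Aside: the fib_lookup.h hunk converts fa_state to a read-once, write-only-if-needed pattern so hot lookups stop dirtying a field shared across CPUs; like the kernel version, it tolerates the benign race where two readers both set the bit. A user-space sketch of the same idea, using C11 relaxed atomics in place of READ_ONCE()/WRITE_ONCE() (the struct name and flag value are made up):

#include <stdatomic.h>
#include <stdio.h>

#define FA_S_ACCESSED 0x01u            /* illustrative flag value */

struct fake_alias {
    _Atomic unsigned char state;       /* stands in for fa->fa_state */
};

/* Write only when the bit is missing, so repeated calls on a hot
 * read path leave the field untouched once the flag is set. */
static void alias_accessed(struct fake_alias *fa)
{
    unsigned char s = atomic_load_explicit(&fa->state, memory_order_relaxed);

    if (!(s & FA_S_ACCESSED))
        atomic_store_explicit(&fa->state, s | FA_S_ACCESSED,
                              memory_order_relaxed);
}

int main(void)
{
    struct fake_alias fa = { .state = 0 };

    alias_accessed(&fa);   /* first call sets the flag */
    alias_accessed(&fa);   /* later calls are read-only */
    printf("state=%#x\n", (unsigned)atomic_load(&fa.state));
    return 0;
}
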
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7e2c17fec3fc..1308213791f1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- state = fa->fa_state;
+ state = READ_ONCE(fa->fa_state);
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
@@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fib_remove_alias(t, tp, l, fa_to_delete);
- if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa_to_delete->fa_info);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4abbec2f47ef..e216b6df6331 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -112,7 +112,9 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options_data replyopts;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct ip_options_rcu replyopts;
};
/* An array of errno for error messages from dest unreach. */
@@ -353,9 +355,12 @@ void icmp_out_count(struct net *net, unsigned char type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = from;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
__wsum csum;
+ icmp_param = from;
+
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
to, len);
@@ -413,7 +418,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
- if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
+ if (ip_options_echo(net, &icmp_param->replyopts.opt, skb))
return;
/* Needed by both icmpv4_global_allow and icmp_xmit_lock */
@@ -435,10 +440,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
- if (icmp_param->replyopts.opt.opt.optlen) {
- ipc.opt = &icmp_param->replyopts.opt;
+ if (icmp_param->replyopts.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts;
if (ipc.opt->opt.srr)
- daddr = icmp_param->replyopts.opt.opt.faddr;
+ daddr = icmp_param->replyopts.opt.faddr;
}
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -491,8 +496,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
int err;
memset(fl4, 0, sizeof(*fl4));
- fl4->daddr = (param->replyopts.opt.opt.srr ?
- param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->daddr = (param->replyopts.opt.srr ?
+ param->replyopts.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
@@ -554,6 +559,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
/* steal dst entry from skb_in, don't drop refcnt */
skb_dstref_steal(skb_in);
skb_dstref_restore(skb_in, orefdst);
+
+ /*
+ * At this point, fl4_dec.daddr should NOT be local (we
+ * checked fl4_dec.saddr above). However, a race condition
+ * may occur if the address is added to the interface
+ * concurrently. In that case, ip_route_input() returns a
+ * LOCAL route with dst.output=ip_rt_bug, which must not
+ * be used for output.
+ */
+ if (!err && rt2 && rt2->rt_type == RTN_LOCAL) {
+ net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n",
+ &fl4_dec.daddr, &fl4_dec.saddr);
+ dst_release(&rt2->dst);
+ err = -EINVAL;
+ }
}
if (err)
@@ -775,9 +795,10 @@ free_skb:
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
const struct inet_skb_parm *parm)
{
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
struct sk_buff *ext_skb;
@@ -906,7 +927,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in,
&parm->opt))
goto out_unlock;
@@ -915,21 +936,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
ipcm_init(&ipc);
ipc.tos = tos;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts.opt;
+ ipc.opt = &icmp_param->replyopts;
ipc.sockc.mark = mark;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr,
inet_dsfield_to_dscp(tos), mark, type, code,
- &icmp_param);
+ icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -939,10 +960,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->dst);
+ room = dst4_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen;
room -= sizeof(struct icmphdr);
/* Guard against tiny mtu. We need to include at least one
* IP network header for this message to make any sense.
@@ -950,15 +971,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room,
parm->iif);
if (ext_skb)
- icmp_param.skb = ext_skb;
+ icmp_param->skb = ext_skb;
- icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = icmp_param->skb->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
/* if we don't have a source address at this point, fall back to the
* dummy address instead of sending out a packet with a source address
@@ -969,7 +990,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
- icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
if (ext_skb)
consume_skb(ext_skb);
@@ -1031,16 +1052,22 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
/* Checkin full IP header plus 8 bytes of protocol to
* avoid additional coding at protocol handlers.
*/
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
- return;
- }
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ goto out;
+
+ /* IPPROTO_RAW sockets are not supposed to receive anything. */
+ if (protocol == IPPROTO_RAW)
+ goto out;
raw_icmp_error(skb, protocol, info);
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
+ return;
+
+out:
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
}
static bool icmp_tag_validation(int proto)
@@ -1206,7 +1233,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net;
net = skb_dst_dev_net_rcu(skb);
@@ -1214,18 +1242,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = skb->len;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = skb->len;
+ icmp_param->head_len = sizeof(struct icmphdr);
- if (icmp_param.data.icmph.type == ICMP_ECHO)
- icmp_param.data.icmph.type = ICMP_ECHOREPLY;
- else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+ if (icmp_param->data.icmph.type == ICMP_ECHO)
+ icmp_param->data.icmph.type = ICMP_ECHOREPLY;
+ else if (!icmp_build_probe(skb, &icmp_param->data.icmph))
return SKB_NOT_DROPPED_YET;
- icmp_reply(&icmp_param, skb);
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
}
@@ -1353,7 +1381,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe);
*/
static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
/*
* Too short.
*/
@@ -1363,19 +1392,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- icmp_param.data.times[1] = inet_current_timestamp();
- icmp_param.data.times[2] = icmp_param.data.times[1];
-
- BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
-
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
- icmp_param.data.icmph.code = 0;
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = 0;
- icmp_param.head_len = sizeof(struct icmphdr) + 12;
- icmp_reply(&icmp_param, skb);
+ icmp_param->data.times[1] = inet_current_timestamp();
+ icmp_param->data.times[2] = icmp_param->data.times[1];
+
+ BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4));
+
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param->data.icmph.code = 0;
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = 0;
+ icmp_param->head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
out_err:
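
Aside: the icmp.c changes replace the embedded struct ip_options_data with a trailing struct ip_options_rcu, whose flexible-array member forces struct icmp_bxm onto the stack via DEFINE_RAW_FLEX(); roughly, storage is reserved for the struct plus a fixed option payload and accessed through a pointer, which is why icmp_param. becomes icmp_param-> throughout. A portable sketch of that on-stack flex idea, with stand-in names and a 40-byte payload assumed for IP_OPTIONS_DATA_FIXED_SIZE:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define OPT_DATA_MAX 40            /* stand-in for IP_OPTIONS_DATA_FIXED_SIZE */

struct fake_opts {                 /* stand-in for struct ip_options_rcu */
    unsigned char optlen;
    unsigned char __data[];        /* flexible-array member, sized at use */
};

/* Rough, portable equivalent of the kernel's on-stack flex helper:
 * reserve storage for the struct plus 'extra' trailing bytes and
 * hand back a typed pointer into that storage. */
#define DEFINE_STACK_FLEX(type, name, extra)                          \
    unsigned char name##_storage[sizeof(type) + (extra)] = { 0 };     \
    type *name = (type *)name##_storage

int main(void)
{
    DEFINE_STACK_FLEX(struct fake_opts, opts, OPT_DATA_MAX);

    opts->optlen = 4;
    memcpy(opts->__data, "\x01\x01\x01\x00", opts->optlen);
    printf("reserved %zu bytes, optlen %u\n",
           sizeof(opts_storage), (unsigned)opts->optlen);
    return 0;
}
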
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7182f1419c2a..0adc993c211d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = get_random_u32_below(in_dev->mr_maxdelay);
+ int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay));
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
@@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
- in_dev->mr_maxdelay = max_delay;
+ WRITE_ONCE(in_dev->mr_maxdelay, max_delay);
/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
* received value was zero, use the default or statically
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97d57c52b9ad..5dfac6ce1110 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -20,6 +20,7 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
@@ -918,6 +919,16 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
}
EXPORT_SYMBOL(inet_reqsk_alloc);
+void __reqsk_free(struct request_sock *req)
+{
+ req->rsk_ops->destructor(req);
+ if (req->rsk_listener)
+ sock_put(req->rsk_listener);
+ kfree(req->saved_syn);
+ kmem_cache_free(req->rsk_ops->slab, req);
+}
+EXPORT_SYMBOL_GPL(__reqsk_free);
+
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
struct sock *sk)
{
@@ -1103,6 +1114,8 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
@@ -1196,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
{
struct sock *newsk = sk_clone_lock(sk, priority);
struct inet_connection_sock *newicsk;
- struct inet_request_sock *ireq;
+ const struct inet_request_sock *ireq;
struct inet_sock *newinet;
if (!newsk)
@@ -1311,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk)
return 0;
}
+static void reqsk_queue_alloc(struct request_sock_queue *queue)
+{
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+
+ queue->rskq_accept_head = NULL;
+}
+
int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ff11d3a85a36..e4790cc7b5c2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
return -EFAULT;
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+ dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
if (!inetdev_valid_mtu(cork->fragsize))
return -ENETUNREACH;
@@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
pmtudisc = READ_ONCE(inet->pmtudisc);
if (pmtudisc == IP_PMTUDISC_DO ||
pmtudisc == IP_PMTUDISC_PROBE ||
- (skb->len <= dst_mtu(&rt->dst) &&
+ (skb->len <= dst4_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
const struct ip_reply_arg *arg,
unsigned int len, u64 transmit_time, u32 txhash)
{
- struct ip_options_data replyopts;
+ DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
@@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
int err;
int oif;
- if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
+ if (__ip_options_echo(net, &replyopts->opt, skb, sopt))
return;
ipcm_init(&ipc);
ipc.addr = daddr;
ipc.sockc.transmit_time = transmit_time;
- if (replyopts.opt.opt.optlen) {
- ipc.opt = &replyopts.opt;
+ if (replyopts->opt.optlen) {
+ ipc.opt = replyopts;
- if (replyopts.opt.opt.srr)
- daddr = replyopts.opt.opt.faddr;
+ if (replyopts->opt.srr)
+ daddr = replyopts->opt.faddr;
}
oif = arg->bound_dev_if;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d9c5c20b1c4..c062d9519818 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = 0;
dst = sk_dst_get(sk);
if (dst) {
- val = dst_mtu(dst);
+ val = dst4_mtu(dst);
dst_release(dst);
}
if (!val)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 019408d3ca2c..b1e1be00ff8b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
static void __init
ic_dhcp_init_options(u8 *options, struct ic_device *d)
{
- u8 mt = ((ic_servaddr == NONE)
- ? DHCPDISCOVER : DHCPREQUEST);
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 26, /* MTU */
+ 40, /* NIS domain name */
+ 42, /* NTP servers */
+ };
+ u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST;
u8 *e = options;
int len;
@@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
e += 4;
}
- /* always? */
- {
- static const u8 ic_req_params[] = {
- 1, /* Subnet mask */
- 3, /* Default gateway */
- 6, /* DNS server */
- 12, /* Host name */
- 15, /* Domain name */
- 17, /* Boot path */
- 26, /* MTU */
- 40, /* NIS domain name */
- 42, /* NTP servers */
- };
-
- *e++ = 55; /* Parameter request list */
- *e++ = sizeof(ic_req_params);
- memcpy(e, ic_req_params, sizeof(ic_req_params));
- e += sizeof(ic_req_params);
-
- if (ic_host_name_set) {
- *e++ = 12; /* host-name */
- len = strlen(utsname()->nodename);
- *e++ = len;
- memcpy(e, utsname()->nodename, len);
- e += len;
- }
- if (*vendor_class_identifier) {
- pr_info("DHCP: sending class identifier \"%s\"\n",
- vendor_class_identifier);
- *e++ = 60; /* Class-identifier */
- len = strlen(vendor_class_identifier);
- *e++ = len;
- memcpy(e, vendor_class_identifier, len);
- e += len;
- }
- len = strlen(dhcp_client_identifier + 1);
- /* the minimum length of identifier is 2, include 1 byte type,
- * and can not be larger than the length of options
- */
- if (len >= 1 && len < 312 - (e - options) - 1) {
- *e++ = 61;
- *e++ = len + 1;
- memcpy(e, dhcp_client_identifier, len + 1);
- e += len + 1;
- }
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
}
*e++ = 255; /* End of the list */
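
Aside: the ipconfig.c hunk only hoists the DHCP parameter table out of a block scope; the wire format it fills in is the usual tag/length/value layout (option 55 = parameter request list, 255 = end of options). A small standalone sketch of that packing, reusing the option codes from the hunk (buffer size and output are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    static const uint8_t req_params[] = { 1, 3, 6, 12, 15, 17, 26, 40, 42 };
    uint8_t options[312];                    /* classic BOOTP options area */
    uint8_t *e = options;

    *e++ = 55;                               /* parameter request list */
    *e++ = sizeof(req_params);               /* option length */
    memcpy(e, req_params, sizeof(req_params));
    e += sizeof(req_params);

    *e++ = 255;                              /* end of options */

    printf("encoded %zu option bytes:\n", (size_t)(e - options));
    for (const uint8_t *p = options; p < e; p++)
        printf("%02x ", (unsigned)*p);
    printf("\n");
    return 0;
}
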
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ca9eaee4c2ef..131382c388e9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
* allow to send ICMP, so that packets will disappear
* to blackhole.
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 8ddac1f595ed..82cf8a9e5ded 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
return fib_metrics;
}
-EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
+EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fae4aa4a5f09..fecf6621f679 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
goto free_nskb;
/* "Never happens" */
- if (nskb->len > dst_mtu(skb_dst(nskb)))
+ if (nskb->len > dst4_mtu(skb_dst(nskb)))
goto free_nskb;
nf_ct_attach(nskb, oldskb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cfbd563498e8..ebfc5a3d3ad6 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -690,6 +690,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net = sock_net(sk);
struct flowi4 fl4;
struct inet_sock *inet = inet_sk(sk);
@@ -697,7 +699,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct icmphdr user_icmph;
struct pingfakehdr pfh;
struct rtable *rt = NULL;
- struct ip_options_data opt_copy;
int free = 0;
__be32 saddr, daddr, faddr;
u8 scope;
@@ -746,9 +747,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5998c4cc6f47..e20c41206e29 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
@@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__be32 daddr;
__be32 saddr;
int uc_index, err;
- struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11d990703d31..06aa39ae80d6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- pr_warn("martian source %pI4 from %pI4, on dev %s\n",
- &daddr, &saddr, dev->name);
+ pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
@@ -2475,8 +2475,8 @@ martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (IN_DEV_LOG_MARTIANS(in_dev))
- net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
#endif
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a1a50a5c80dc..643763bc2142 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,7 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
-static int tcp_ecn_mode_max = 2;
+static int tcp_ecn_mode_max = 5;
static u32 icmp_errors_extension_mask_all =
GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
@@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "tcp_ecn_option_beacon",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5319ebe2452..6ce03a9adb4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
- * TCP splice context
- */
-struct tcp_splice_state {
- struct pipe_inode_info *pipe;
- size_t len;
- unsigned int flags;
-};
-
-/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the __sk_mem_schedule() is of this nature: accounting
@@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
struct sk_buff *skb = tcp_write_queue_tail(sk);
u32 tsflags = sockc->tsflags;
+ if (unlikely(!skb))
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
@@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
__tcp_push_pending_frames(sk, mss_now, nonagle);
}
-static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
{
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
@@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_IPV6_MOD(tcp_splice_read);
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int delta, amt;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
+ return;
+
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
+}
+
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
{
@@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
return err;
}
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
+
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct net_devmem_dmabuf_binding *binding = NULL;
@@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
tp->accecn_opt_tstamp = 0;
+ tp->pkts_acked_ewma = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ if (tcp_ecn_disabled(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED;
+ else if (tcp_ecn_mode_rfc3168(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168;
+ else if (tcp_ecn_mode_accecn(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN;
+ else if (tcp_ecn_mode_pending(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING;
info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
info->tcpi_received_ce = tp->received_ce;
@@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
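
Aside: tcp_stream_memory_free(), now defined in tcp.c above, gates writability on (notsent_bytes << wake) < tcp_notsent_lowat(tp), so the wake-up path (wake = 1) needs unsent data to drop below half the lowat threshold. A standalone sketch of that arithmetic with made-up sequence numbers and lowat:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the check: wake doubles the pressure on notsent bytes,
 * so EPOLLOUT from the write-space path needs notsent < lowat / 2. */
static bool stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
                               uint32_t notsent_lowat, int wake)
{
    uint32_t notsent_bytes = write_seq - snd_nxt;

    return (notsent_bytes << wake) < notsent_lowat;
}

int main(void)
{
    uint32_t lowat = 16384;

    /* 10000 unsent bytes: free on the plain poll check, not yet on wake-up. */
    printf("poll path: %d\n", stream_memory_free(110000, 100000, lowat, 0));
    printf("wake path: %d\n", stream_memory_free(110000, 100000, lowat, 1));
    return 0;
}
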
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index df758adbb445..e9f6c77e0631 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
}
@@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk,
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7d945a527daf..b30090cff3cf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -5,6 +5,92 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
+ }
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
+
void tcp_fastopen_init_key_once(struct net *net)
{
u8 key[TCP_FASTOPEN_KEY_LENGTH];
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 198f8a0d37be..e7b41abb82aa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
tcp_count_delivered_ce(tp, delivered);
}
+#define PKTS_ACKED_WEIGHT 6
+#define PKTS_ACKED_PREC 6
+#define ACK_COMP_THRESH 4
+
/* Returns the ECN CE delta */
static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delivered_pkts, u32 delivered_bytes,
@@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delta, safe_delta, d_ceb;
bool opt_deltas_valid;
u32 corrected_ace;
+ u32 ewma;
/* Reordered ACK or uncertain due to lack of data to send and ts */
if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
@@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
opt_deltas_valid = tcp_accecn_process_option(tp, skb,
delivered_bytes, flag);
+ if (delivered_pkts) {
+ if (!tp->pkts_acked_ewma) {
+ ewma = delivered_pkts << PKTS_ACKED_PREC;
+ } else {
+ ewma = tp->pkts_acked_ewma;
+ ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) +
+ (delivered_pkts << PKTS_ACKED_PREC)) >>
+ PKTS_ACKED_WEIGHT;
+ }
+ tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU);
+ }
+
if (!(flag & FLAG_SLOWPATH)) {
/* AccECN counter might overflow on large ACKs */
if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
@@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
if (d_ceb <
safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
return delta;
- }
+ } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC))
+ return delta;
return safe_delta;
}
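
Aside: pkts_acked_ewma added above is a fixed-point moving average of packets delivered per ACK (PKTS_ACKED_PREC = 6 fractional bits, gain 1/2^PKTS_ACKED_WEIGHT = 1/64), which the hunk compares against ACK_COMP_THRESH = 4 packets per ACK before preferring the raw ACE delta. A standalone sketch of the same arithmetic, with the constants copied from the hunk and the ACK sizes invented:

#include <stdio.h>

#define PKTS_ACKED_WEIGHT 6   /* EWMA gain of 1/64 */
#define PKTS_ACKED_PREC   6   /* 6 fractional bits of fixed point */
#define ACK_COMP_THRESH   4   /* compression threshold, packets per ACK */

/* Same update as the hunk: seed with the first sample, then blend each
 * new sample in with weight 1/64, clamping the result to 16 bits. */
static unsigned ewma_update(unsigned ewma, unsigned delivered_pkts)
{
    if (!ewma)
        return delivered_pkts << PKTS_ACKED_PREC;

    ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) +
            (delivered_pkts << PKTS_ACKED_PREC)) >> PKTS_ACKED_WEIGHT;
    return ewma > 0xFFFFu ? 0xFFFFu : ewma;
}

int main(void)
{
    unsigned acks[] = { 10, 3, 3, 3, 3, 3 };   /* delivered packets per ACK */
    unsigned ewma = 0;

    for (unsigned i = 0; i < sizeof(acks) / sizeof(acks[0]); i++) {
        ewma = ewma_update(ewma, acks[i]);
        printf("ack %u: delivered=%2u ewma=%5.2f pkts, compressed=%s\n",
               i, acks[i], ewma / 64.0,
               ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC) ? "yes" : "no");
    }
    return 0;
}
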
@@ -1558,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+ u32 end_seq, u64 xmit_time)
+{
+ u32 rtt_us;
+
+ rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+ if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ return;
+ }
+ tp->rack.advanced = 1;
+ tp->rack.rtt_us = rtt_us;
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
+ tp->rack.mstamp = xmit_time;
+ tp->rack.end_seq = end_seq;
+ }
+}
+
/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
@@ -1637,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
}
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Update the connection delivery information and generate a rate sample. */
+static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ bool is_sack_reneg, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = tp->tcp_mstamp;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available or
+ * in recovery from loss with SACK reneging. Rate samples taken during
+ * a SACK reneging event may overestimate bw by including packets that
+ * were SACKed before the reneg.
+ */
+ if (!rs->prior_mstamp || is_sack_reneg) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmistted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
+ */
+static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u64 tx_tstamp;
+
+ if (!scb->tx.delivered_mstamp)
+ return;
+
+ tx_tstamp = tcp_skb_timestamp_us(skb);
+ if (!rs->prior_delivered ||
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tx_tstamp;
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
+
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp = 0;
+}
+
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
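
Aside: tcp_rate_gen() above realizes the comment's bw = min(send_rate, ack_rate) rule by taking the longer of the send and ACK phase intervals, since for a fixed delivered count the smaller rate corresponds to the larger interval. A tiny standalone sketch with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t delivered   = 10;      /* packets (s)acked in this sample */
    uint64_t snd_us      = 20000;   /* last_snd_time - first_snd_time  */
    uint64_t ack_us      = 12000;   /* last_ack_time - first_ack_time  */

    /* Taking the longer phase interval is taking the smaller rate,
     * which guards against ACK compression inflating the estimate. */
    uint64_t interval_us = snd_us > ack_us ? snd_us : ack_us;

    printf("send_rate = %.2f pkt/ms, ack_rate = %.2f pkt/ms\n",
           delivered / (snd_us / 1000.0), delivered / (ack_us / 1000.0));
    printf("sample bw = %.2f pkt/ms over %llu us\n",
           delivered / (interval_us / 1000.0),
           (unsigned long long)interval_us);
    return 0;
}
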
@@ -3995,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
return delivered;
}
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_STATIC_REO_WND) ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -4129,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, flag);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
@@ -4179,7 +4426,7 @@ no_queue:
*/
tcp_ack_probe(sk);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
return 1;
@@ -4799,8 +5046,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+static void tcp_rcv_spurious_retrans(struct sock *sk,
+ const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
* If it seems our ACKs are not reaching the other side,
@@ -4820,6 +5070,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
/* Save last flowlabel after a spurious retrans. */
tcp_save_lrcv_flowlabel(sk, skb);
#endif
+ /* Check DSACK info to detect that the previous ACK carrying the
+ * AccECN option was lost after the second retransmision, and then
+ * stop sending AccECN option in all subsequent ACKs.
+ */
+ if (tcp_ecn_mode_accecn(tp) &&
+ tp->accecn_opt_sent_w_dsack &&
+ TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -5527,25 +5785,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
return next;
}
-/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct sk_buff *skb1;
-
- while (*p) {
- parent = *p;
- skb1 = rb_to_skb(parent);
- if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
- }
- rb_link_node(&skb->rbnode, parent, p);
- rb_insert_color(&skb->rbnode, root);
-}
-
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
@@ -5879,16 +6118,11 @@ static void tcp_new_space(struct sock *sk)
* small enough that tcp_stream_memory_free() decides it
* is time to generate EPOLLOUT.
*/
-void tcp_check_space(struct sock *sk)
+void __tcp_check_space(struct sock *sk)
{
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
static inline void tcp_data_snd_check(struct sock *sk)
@@ -6222,6 +6456,8 @@ step1:
if (th->syn) {
if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
+ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
if (tp->rx_opt.accecn &&
tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
@@ -6843,7 +7079,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th, skb);
+ tcp_ecn_rcv_syn(sk, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -7248,7 +7484,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
u32 ecn_ok_dst;
if (tcp_accecn_syn_requested(th) &&
- READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 ||
+ tcp_ca_needs_accecn(listen_sk))) {
inet_rsk(req)->ecn_ok = 1;
tcp_rsk(req)->accecn_ok = 1;
tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f8a9596e8f4d..6264fc0b2be5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -374,7 +374,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct dst_entry *dst;
- u32 mtu;
+ u32 mtu, dmtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
@@ -386,15 +386,14 @@ void tcp_v4_mtu_reduced(struct sock *sk)
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ dmtu = dst4_mtu(dst);
+ if (mtu < dmtu && ip_dont_fragment(sk, dst))
WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
- mtu = dst_mtu(dst);
-
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
ip_sk_accept_pmtu(sk) &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- tcp_sync_mss(sk, mtu);
+ inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
+ tcp_sync_mss(sk, dmtu);
/* Resend the TCP packet because it's
* clear that the old packet has been
@@ -1760,7 +1759,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
- tcp_sync_mss(newsk, dst_mtu(dst));
+ tcp_sync_mss(newsk, dst4_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -2110,14 +2109,6 @@ no_coalesce:
}
EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
-{
- struct tcphdr *th = (struct tcphdr *)skb->data;
-
- return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
-}
-EXPORT_IPV6_MOD(tcp_filter);
-
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
@@ -3418,20 +3409,6 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-/* @wake is one when sk_stream_write_space() calls us.
- * This sends EPOLLOUT only if notsent_bytes is half the limit.
- * This mimics the strategy used in sock_def_write_space().
- */
-bool tcp_stream_memory_free(const struct sock *sk, int wake)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 notsent_bytes = READ_ONCE(tp->write_seq) -
- READ_ONCE(tp->snd_nxt);
-
- return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
-}
-EXPORT_SYMBOL(tcp_stream_memory_free);
-
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
@@ -3474,6 +3451,8 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .freeptr_offset = offsetof(struct tcp_sock,
+ inet_conn.icsk_inet.sk.sk_freeptr),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bd5462154f97..ec128865f5c0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -481,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
tp->saw_accecn_opt = treq->saw_accecn_opt;
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND);
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
tp->prev_ecnfield = treq->syn_ect_rcv;
tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk))
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ else
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
}
@@ -748,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
- &tcp_rsk(req)->last_oow_ack_time) &&
-
- !tcp_rtx_synack(sk, req)) {
- unsigned long expires = jiffies;
-
- expires += tcp_reqsk_timeout(req);
- if (!fastopen)
- mod_timer_pending(&req->rsk_timer, expires);
- else
- req->rsk_timer.expires = expires;
+ &tcp_rsk(req)->last_oow_ack_time)) {
+ if (tcp_rsk(req)->accecn_ok) {
+ u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+
+ tcp_rsk(req)->syn_ect_rcv = ect_rcv;
+ if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV;
+ }
+ if (!tcp_rtx_synack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
+
+ expires += tcp_reqsk_timeout(req);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer,
+ expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
}
return NULL;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 942a948f1a31..3b1fdcd3cb29 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -304,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
goto out_check_final;
th2 = tcp_hdr(p);
- flush = (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ flush = (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 479afb714bdf..326b58ff1118 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
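
tcp_rbtree_insert() above keys the retransmit queue's rb-tree on TCP_SKB_CB(skb)->seq, so the queue can be walked in ascending sequence order with the standard rbtree iterators. A hypothetical walker using the rb_to_skb() helper the function relies on (illustration only, not part of the patch):

static void tcp_rtx_queue_walk_example(struct sock *sk)
{
	struct rb_node *node;

	/* rb_first() returns the lowest-seq skb; rb_next() keeps seq order. */
	for (node = rb_first(&sk->tcp_rtx_queue); node; node = rb_next(node)) {
		struct sk_buff *skb = rb_to_skb(node);

		pr_debug("rtx skb seq %u end_seq %u\n",
			 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
	}
}
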
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
@@ -334,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
return;
if (tcp_ecn_mode_accecn(tp)) {
- if (!tcp_accecn_ace_fail_recv(tp))
+ if (!tcp_accecn_ace_fail_recv(tp) &&
+ !tcp_accecn_ace_fail_send(tp))
INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
tcp_accecn_set_ace(tp, skb, th);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
} else {
@@ -712,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
if (tp) {
tp->accecn_minlen = 0;
tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack;
if (tp->accecn_opt_demand)
tp->accecn_opt_demand--;
}
+ } else if (tp) {
+ tp->accecn_opt_sent_w_dsack = 0;
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1106,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
if (treq->accecn_ok &&
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
- req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) {
opts->use_synack_ecn_bytes = 1;
remaining -= tcp_options_fit_accecn(opts, 0, remaining);
}
@@ -1186,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (tcp_ecn_mode_accecn(tp)) {
int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
- if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ if (ecn_opt && tp->saw_accecn_opt &&
+ (ecn_opt >= TCP_ACCECN_OPTION_PERSIST ||
+ !tcp_accecn_opt_fail_send(tp)) &&
(ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
tcp_accecn_option_beacon_check(sk))) {
opts->use_synack_ecn_bytes = 0;
@@ -1432,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
@@ -1530,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
*/
skb->pfmemalloc = 0;
- skb_push(skb, tcp_header_size);
+ __skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
@@ -3571,12 +3633,15 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback
- * As AccECN uses the same SYN flags (+ AE), this check covers both
- * cases.
- */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
- tcp_ecn_clear_syn(sk, skb);
+ if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) {
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check
+ * covers both cases.
+ */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) ==
+ TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+ }
/* Update global and local TCP statistics. */
segs = tcp_skb_pcount(skb);
@@ -3732,33 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
inet_csk(sk)->icsk_rto, true);
}
-/* We allow FIN packets to exceed memory limits to expedite
- * connection tear-down and (memory) recovery.
- * Otherwise tcp_send_fin() could be tempted to either delay the FIN
- * or even be forced to close the flow without any FIN.
- * In general, we want to allow one skb per socket to avoid hangs
- * with edge-triggered epoll().
- */
-void sk_forced_mem_schedule(struct sock *sk, int size)
-{
- int delta, amt;
-
- delta = size - sk->sk_forward_alloc;
- if (delta <= 0)
- return;
-
- amt = sk_mem_pages(delta);
- sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-
- if (mem_cgroup_sk_enabled(sk))
- mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
-
- if (sk->sk_bypass_prot_mem)
- return;
-
- sk_memory_allocated_add(sk, amt);
-}
-
/* Send a FIN. The caller locks the socket for us.
* We should try to send a FIN packet really hard, but eventually give up.
*/
@@ -3918,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
+ case TCP_SYNACK_RETRANS:
skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
@@ -4000,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th);
+ tcp_ecn_make_synack(req, th, synack_type);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
skb->mark = ireq->ir_mark;
@@ -4603,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
/* Paired with WRITE_ONCE() in sock_setsockopt() */
if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS,
NULL);
if (!res) {
TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
deleted file mode 100644
index a8f6d9d06f2e..000000000000
--- a/net/ipv4/tcp_rate.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <net/tcp.h>
-
-/* The bandwidth estimator estimates the rate at which the network
- * can currently deliver outbound data packets for this flow. At a high
- * level, it operates by taking a delivery rate sample for each ACK.
- *
- * A rate sample records the rate at which the network delivered packets
- * for this flow, calculated over the time interval between the transmission
- * of a data packet and the acknowledgment of that packet.
- *
- * Specifically, over the interval between each transmit and corresponding ACK,
- * the estimator generates a delivery rate sample. Typically it uses the rate
- * at which packets were acknowledged. However, the approach of using only the
- * acknowledgment rate faces a challenge under the prevalent ACK decimation or
- * compression: packets can temporarily appear to be delivered much quicker
- * than the bottleneck rate. Since it is physically impossible to do that in a
- * sustained fashion, when the estimator notices that the ACK rate is faster
- * than the transmit rate, it uses the latter:
- *
- * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
- * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
- * bw = min(send_rate, ack_rate)
- *
- * Notice the estimator essentially estimates the goodput, not always the
- * network bottleneck link rate when the sending or receiving is limited by
- * other factors like applications or receiver window limits. The estimator
- * deliberately avoids using the inter-packet spacing approach because that
- * approach requires a large number of samples and sophisticated filtering.
- *
- * TCP flows can often be application-limited in request/response workloads.
- * The estimator marks a bandwidth sample as application-limited if there
- * was some moment during the sampled window of packets when there was no data
- * ready to send in the write queue.
- */
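
The estimator described above reduces to one delivery-rate sample per ACK, clamped by the send rate. A minimal sketch of that arithmetic using generic kernel helpers (the function name is illustrative; the real computation is split across tcp_rate_skb_delivered() and tcp_rate_gen() below):

static u64 sample_bw_pkts_per_sec(u32 delivered, u64 snd_interval_us,
				  u64 ack_interval_us)
{
	/* bw = min(send_rate, ack_rate), expressed as max() on the intervals. */
	u64 interval_us = max(snd_interval_us, ack_interval_us);

	if (!interval_us)
		return 0;
	return div64_u64((u64)delivered * USEC_PER_SEC, interval_us);
}
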
-
-/* Snapshot the current delivery information in the skb, to generate
- * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
- */
-void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* In general we need to start delivery rate samples from the
- * time we received the most recent ACK, to ensure we include
- * the full time the network needs to deliver all in-flight
- * packets. If there are no packets in flight yet, then we
- * know that any ACKs after now indicate that the network was
- * able to deliver those packets completely in the sampling
- * interval between now and the next ACK.
- *
- * Note that we use packets_out instead of tcp_packets_in_flight(tp)
- * because the latter is a guess based on RTO and loss-marking
- * heuristics. We don't want spurious RTOs or loss markings to cause
- * a spuriously small time interval, causing a spuriously high
- * bandwidth estimate.
- */
- if (!tp->packets_out) {
- u64 tstamp_us = tcp_skb_timestamp_us(skb);
-
- tp->first_tx_mstamp = tstamp_us;
- tp->delivered_mstamp = tstamp_us;
- }
-
- TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
- TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
- TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
- TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
- TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
-}
-
-/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
- * delivery information when the skb was last transmitted.
- *
- * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
- * called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the most recently sent time and the highest
- * sequence.
- */
-void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
- struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u64 tx_tstamp;
-
- if (!scb->tx.delivered_mstamp)
- return;
-
- tx_tstamp = tcp_skb_timestamp_us(skb);
- if (!rs->prior_delivered ||
- tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
- scb->end_seq, rs->last_end_seq)) {
- rs->prior_delivered_ce = scb->tx.delivered_ce;
- rs->prior_delivered = scb->tx.delivered;
- rs->prior_mstamp = scb->tx.delivered_mstamp;
- rs->is_app_limited = scb->tx.is_app_limited;
- rs->is_retrans = scb->sacked & TCPCB_RETRANS;
- rs->last_end_seq = scb->end_seq;
-
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = tx_tstamp;
- /* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
-
- }
- /* Mark off the skb delivered once it's sacked to avoid being
- * used again when it's cumulatively acked. For acked packets
- * we don't need to reset since it'll be freed soon.
- */
- if (scb->sacked & TCPCB_SACKED_ACKED)
- scb->tx.delivered_mstamp = 0;
-}
-
-/* Update the connection delivery information and generate a rate sample. */
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- bool is_sack_reneg, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 snd_us, ack_us;
-
- /* Clear app limited if bubble is acked and gone. */
- if (tp->app_limited && after(tp->delivered, tp->app_limited))
- tp->app_limited = 0;
-
- /* TODO: there are multiple places throughout tcp_ack() to get
- * current time. Refactor the code using a new "tcp_acktag_state"
- * to carry current time, flags, stats like "tcp_sacktag_state".
- */
- if (delivered)
- tp->delivered_mstamp = tp->tcp_mstamp;
-
- rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
- rs->losses = lost; /* freshly marked lost */
- /* Return an invalid sample if no timing information is available or
- * in recovery from loss with SACK reneging. Rate samples taken during
- * a SACK reneging event may overestimate bw by including packets that
- * were SACKed before the reneg.
- */
- if (!rs->prior_mstamp || is_sack_reneg) {
- rs->delivered = -1;
- rs->interval_us = -1;
- return;
- }
- rs->delivered = tp->delivered - rs->prior_delivered;
-
- rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
- /* delivered_ce occupies less than 32 bits in the skb control block */
- rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
-
- /* Model sending data and receiving ACKs as separate pipeline phases
- * for a window. Usually the ACK phase is longer, but with ACK
- * compression the send phase can be longer. To be safe we use the
- * longer phase.
- */
- snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
- rs->prior_mstamp); /* ack phase */
- rs->interval_us = max(snd_us, ack_us);
-
- /* Record both segment send and ack receive intervals */
- rs->snd_interval_us = snd_us;
- rs->rcv_interval_us = ack_us;
-
- /* Normally we expect interval_us >= min-rtt.
- * Note that rate may still be over-estimated when a spuriously
- * retransmitted skb was first (s)acked, because "interval_us"
- * is under-estimated (by up to an RTT). However, continuously
- * measuring the delivery rate during loss recovery is crucial
- * for connections that suffer heavy or prolonged losses.
- */
- if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
- if (!rs->is_retrans)
- pr_debug("tcp rate: %ld %d %u %u %u\n",
- rs->interval_us, rs->delivered,
- inet_csk(sk)->icsk_ca_state,
- tp->rx_opt.sack_ok, tcp_min_rtt(tp));
- rs->interval_us = -1;
- return;
- }
-
- /* Record the last non-app-limited or the highest app-limited bw */
- if (!rs->is_app_limited ||
- ((u64)rs->delivered * tp->rate_interval_us >=
- (u64)tp->rate_delivered * rs->interval_us)) {
- tp->rate_delivered = rs->delivered;
- tp->rate_interval_us = rs->interval_us;
- tp->rate_app_limited = rs->is_app_limited;
- }
-}
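
tcp_rate_gen() hands its result to the congestion-control layer through the rate_sample: rs->delivered packets over rs->interval_us, with interval_us forced to -1 when the sample must be ignored. A hedged sketch of how a consumer might turn that into a bandwidth figure (not any in-tree module's actual code):

static u64 example_bw_from_sample(const struct rate_sample *rs)
{
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return 0;	/* invalid sample: no timing info or SACK reneging */

	return div64_u64((u64)rs->delivered * USEC_PER_SEC,
			 (u64)rs->interval_us);
}
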
-
-/* If a gap is detected between sends, mark the socket application-limited. */
-void tcp_rate_check_app_limited(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (/* We have less than one packet to send. */
- tp->write_seq - tp->snd_nxt < tp->mss_cache &&
- /* Nothing in sending host's qdisc queues or NIC tx queue. */
- sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
- /* We are not limited by CWND. */
- tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
- /* All lost packets have been retransmitted. */
- tp->lost_out <= tp->retrans_out)
- tp->app_limited =
- (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
-}
-EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c52fd3254b6e..139646751073 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk)
return !!timeout;
}
-/* Record the most recently (re)sent time among the (s)acked packets
- * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
- * draft-cheng-tcpm-rack-00.txt
- */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- u64 xmit_time)
-{
- u32 rtt_us;
-
- rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
- if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
- /* If the sacked packet was retransmitted, it's ambiguous
- * whether the retransmission or the original (or the prior
- * retransmission) was sacked.
- *
- * If the original is lost, there is no ambiguity. Otherwise
- * we assume the original can be delayed up to aRTT + min_rtt.
- * the aRTT term is bounded by the fast recovery or timeout,
- * so it's at least one RTT (i.e., retransmission is at least
- * an RTT later).
- */
- return;
- }
- tp->rack.advanced = 1;
- tp->rack.rtt_us = rtt_us;
- if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq)) {
- tp->rack.mstamp = xmit_time;
- tp->rack.end_seq = end_seq;
- }
-}
-
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them.
*/
@@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk)
tcp_rearm_rto(sk);
}
-/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
- *
- * If a DSACK is received that seems like it may have been due to reordering
- * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
- * by srtt), since there is a possibility that the spurious retransmission
- * was due to a reordering delay longer than reo_wnd.
- *
- * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
- * successful recoveries (this accounts for a full DSACK-based loss
- * recovery undo). After that, reset it to the default (min_rtt/4).
- *
- * reo_wnd is incremented at most once per RTT, so that the DSACK we
- * are reacting to is (approximately) due to a spurious retransmission
- * sent after reo_wnd was last updated.
- *
- * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than as an
- * absolute value, to account for changes in RTT.
- */
-void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
- TCP_RACK_STATIC_REO_WND) ||
- !rs->prior_delivered)
- return;
-
- /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
- if (before(rs->prior_delivered, tp->rack.last_delivered))
- tp->rack.dsack_seen = 0;
-
- /* Adjust the reo_wnd if update is pending */
- if (tp->rack.dsack_seen) {
- tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
- tp->rack.reo_wnd_steps + 1);
- tp->rack.dsack_seen = 0;
- tp->rack.last_delivered = tp->delivered;
- tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
- } else if (!tp->rack.reo_wnd_persist) {
- tp->rack.reo_wnd_steps = 1;
- }
-}
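
The steps-based accounting above only makes sense together with the conversion back to an absolute window. An illustrative helper mirroring that conversion (each step is min_rtt/4, capped by srtt; the in-tree conversion lives in tcp_rack_reo_wnd(), and srtt_us stores srtt << 3):

static u32 example_rack_reo_wnd_us(const struct tcp_sock *tp)
{
	return min_t(u32, (tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
		     tp->srtt_us >> 3);
}
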
-
/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
* the next unacked packet upon receiving
* a) three or more DUPACKs to start the fast recovery
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 160080c9021d..5a14a53a3c9e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/rstreason.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
@@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
* it's not good to give up too easily.
*/
tcp_rtx_synack(sk, req);
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
req->num_timeout++;
tcp_update_rto_stats(sk);
if (!tp->retrans_stamp)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee63af0ef42c..b96e47f1c8a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1193,7 +1193,7 @@ csum_partial:
send:
err = ip_send_skb(sock_net(sk), skb);
- if (err) {
+ if (unlikely(err)) {
if (err == -ENOBUFS &&
!inet_test_bit(RECVERR, sk)) {
UDP_INC_STATS(sock_net(sk),
@@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
@@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
- struct ip_options_data opt_copy;
int uc_index;
if (len > 0xFFFF)
@@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
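
The DEFINE_RAW_FLEX() change above swaps the old struct ip_options_data wrapper for an on-stack buffer sized for struct ip_options_rcu plus IP_OPTIONS_DATA_FIXED_SIZE trailing option bytes, exposed through a typed pointer; that is why ipc.opt is now assigned opt_copy directly instead of &opt_copy.opt. A generic sketch of the pattern from <linux/overflow.h>, with an illustrative type and size:

struct demo_opts {
	int	optlen;
	u8	__data[];		/* flexible array member */
};

static void demo_raw_flex(void)
{
	/* Stack storage for struct demo_opts plus 40 __data bytes;
	 * "buf" is a struct demo_opts * into that storage.
	 */
	DEFINE_RAW_FLEX(struct demo_opts, buf, __data, 40);

	buf->optlen = 0;
}
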
@@ -1793,14 +1794,32 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
}
if (unlikely(to_drop)) {
+ int err_ipv4 = 0;
+ int err_ipv6 = 0;
+
for (nb = 0; to_drop != NULL; nb++) {
skb = to_drop;
+ if (skb->protocol == htons(ETH_P_IP))
+ err_ipv4++;
+ else
+ err_ipv6++;
to_drop = skb->next;
skb_mark_not_on_list(skb);
- /* TODO: update SNMP values. */
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
}
numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ if (err_ipv4 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS,
+ err_ipv4);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS,
+ err_ipv4);
+ }
+ if (err_ipv6 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS,
+ err_ipv6);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS,
+ err_ipv6);
+ }
}
atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
@@ -2429,7 +2448,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) &&
+ UDP_SKB_CB(skb)->partial_cov)) {
u16 pcrlen = READ_ONCE(up->pcrlen);
/*
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 589456bd8b5f..6b1654c1ad4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
struct sk_buff *segs, *seg;
+ __be16 newlen, msslen;
struct udphdr *uh;
unsigned int mss;
bool copy_dtor;
__sum16 check;
- __be16 newlen;
int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
@@ -556,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
return segs;
}
+ msslen = htons(sizeof(*uh) + mss);
+
/* GSO partial and frag_list segmentation only requires splitting
* the frame into an MSS multiple and possibly a remainder, both
* cases return a GSO skb. So update the mss now.
@@ -585,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (!seg->next)
break;
- uh->len = newlen;
+ uh->len = msslen;
uh->check = check;
if (seg->ip_summed == CHECKSUM_PARTIAL)