Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Makefile                     |   2
-rw-r--r--  net/ipv4/cipso_ipv4.c                 |   3
-rw-r--r--  net/ipv4/fib_lookup.h                 |   6
-rw-r--r--  net/ipv4/fib_trie.c                   |   4
-rw-r--r--  net/ipv4/icmp.c                       | 139
-rw-r--r--  net/ipv4/igmp.c                       |   4
-rw-r--r--  net/ipv4/inet_connection_sock.c       |  24
-rw-r--r--  net/ipv4/ip_output.c                  |  17
-rw-r--r--  net/ipv4/ip_sockglue.c                |   2
-rw-r--r--  net/ipv4/ipconfig.c                   |  89
-rw-r--r--  net/ipv4/ipmr.c                       |   2
-rw-r--r--  net/ipv4/metrics.c                    |   2
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c   |   2
-rw-r--r--  net/ipv4/ping.c                       |   7
-rw-r--r--  net/ipv4/raw.c                        |   7
-rw-r--r--  net/ipv4/route.c                      |   8
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c            |   4
-rw-r--r--  net/ipv4/tcp.c                        |  84
-rw-r--r--  net/ipv4/tcp_cong.c                   |   5
-rw-r--r--  net/ipv4/tcp_fastopen.c               |  86
-rw-r--r--  net/ipv4/tcp_input.c                  | 305
-rw-r--r--  net/ipv4/tcp_ipv4.c                   |  37
-rw-r--r--  net/ipv4/tcp_minisocks.c              |  43
-rw-r--r--  net/ipv4/tcp_offload.c                |   3
-rw-r--r--  net/ipv4/tcp_output.c                 | 117
-rw-r--r--  net/ipv4/tcp_rate.c                   | 209
-rw-r--r--  net/ipv4/tcp_recovery.c               |  75
-rw-r--r--  net/ipv4/tcp_timer.c                  |   3
-rw-r--r--  net/ipv4/udp.c                        |  32
-rw-r--r--  net/ipv4/udp_offload.c                |   6
30 files changed, 770 insertions, 557 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ec36d2ec059e..18108a6f0499 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o tcp_ulp.o \ + tcp_recovery.o tcp_ulp.o \ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 709021197e1c..32b951ebc0c2 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2196,7 +2196,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ - ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + ret_val = skb_cow(skb, + skb_headroom(skb) + (len_delta > 0 ? len_delta : 0)); if (ret_val < 0) return ret_val; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index f9b9e26c32c1..0b72796dd1ad 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,8 +28,10 @@ struct fib_alias { /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { - if (!(fa->fa_state & FA_S_ACCESSED)) - fa->fa_state |= FA_S_ACCESSED; + u8 fa_state = READ_ONCE(fa->fa_state); + + if (!(fa_state & FA_S_ACCESSED)) + WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED); } /* Exported by fib_semantics.c */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 7e2c17fec3fc..1308213791f1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - state = fa->fa_state; + state = READ_ONCE(fa->fa_state); new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; @@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fib_remove_alias(t, tp, l, fa_to_delete); - if (fa_to_delete->fa_state & FA_S_ACCESSED) + if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4abbec2f47ef..e216b6df6331 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -112,7 +112,9 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options_data replyopts; + + /* Must be last as it ends in a flexible-array member. */ + struct ip_options_rcu replyopts; }; /* An array of errno for error messages from dest unreach. 
*/ @@ -353,9 +355,12 @@ void icmp_out_count(struct net *net, unsigned char type) static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct icmp_bxm *icmp_param = from; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); __wsum csum; + icmp_param = from; + csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len); @@ -413,7 +418,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt, skb)) return; /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ @@ -435,10 +440,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - if (icmp_param->replyopts.opt.opt.optlen) { - ipc.opt = &icmp_param->replyopts.opt; + if (icmp_param->replyopts.opt.optlen) { + ipc.opt = &icmp_param->replyopts; if (ipc.opt->opt.srr) - daddr = icmp_param->replyopts.opt.opt.faddr; + daddr = icmp_param->replyopts.opt.faddr; } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -491,8 +496,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, int err; memset(fl4, 0, sizeof(*fl4)); - fl4->daddr = (param->replyopts.opt.opt.srr ? - param->replyopts.opt.opt.faddr : iph->saddr); + fl4->daddr = (param->replyopts.opt.srr ? + param->replyopts.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); @@ -554,6 +559,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, /* steal dst entry from skb_in, don't drop refcnt */ skb_dstref_steal(skb_in); skb_dstref_restore(skb_in, orefdst); + + /* + * At this point, fl4_dec.daddr should NOT be local (we + * checked fl4_dec.saddr above). However, a race condition + * may occur if the address is added to the interface + * concurrently. In that case, ip_route_input() returns a + * LOCAL route with dst.output=ip_rt_bug, which must not + * be used for output. + */ + if (!err && rt2 && rt2->rt_type == RTN_LOCAL) { + net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n", + &fl4_dec.daddr, &fl4_dec.saddr); + dst_release(&rt2->dst); + err = -EINVAL; + } } if (err) @@ -775,9 +795,10 @@ free_skb: void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct inet_skb_parm *parm) { + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct iphdr *iph; int room; - struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; struct sk_buff *ext_skb; @@ -906,7 +927,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in, &parm->opt)) goto out_unlock; @@ -915,21 +936,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * Prepare data for ICMP header. 
*/ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); ipcm_init(&ipc); ipc.tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts.opt; + ipc.opt = &icmp_param->replyopts; ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, inet_dsfield_to_dscp(tos), mark, type, code, - &icmp_param); + icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -939,10 +960,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->dst); + room = dst4_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen; room -= sizeof(struct icmphdr); /* Guard against tiny mtu. We need to include at least one * IP network header for this message to make any sense. @@ -950,15 +971,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (room <= (int)sizeof(struct iphdr)) goto ende; - ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, + ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room, parm->iif); if (ext_skb) - icmp_param.skb = ext_skb; + icmp_param->skb = ext_skb; - icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data_len = icmp_param->skb->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); /* if we don't have a source address at this point, fall back to the * dummy address instead of sending out a packet with a source address @@ -969,7 +990,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, trace_icmp_send(skb_in, type, code); - icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); if (ext_skb) consume_skb(ext_skb); @@ -1031,16 +1052,22 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); - return; - } + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + /* IPPROTO_RAW sockets are not supposed to receive anything. 
*/ + if (protocol == IPPROTO_RAW) + goto out; raw_icmp_error(skb, protocol, info); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); + return; + +out: + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); } static bool icmp_tag_validation(int proto) @@ -1206,7 +1233,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) static enum skb_drop_reason icmp_echo(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net; net = skb_dst_dev_net_rcu(skb); @@ -1214,18 +1242,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = skb->len; + icmp_param->head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) - icmp_param.data.icmph.type = ICMP_ECHOREPLY; - else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) + if (icmp_param->data.icmph.type == ICMP_ECHO) + icmp_param->data.icmph.type = ICMP_ECHOREPLY; + else if (!icmp_build_probe(skb, &icmp_param->data.icmph)) return SKB_NOT_DROPPED_YET; - icmp_reply(&icmp_param, skb); + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; } @@ -1353,7 +1381,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe); */ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); /* * Too short. 
*/ @@ -1363,19 +1392,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - icmp_param.data.times[1] = inet_current_timestamp(); - icmp_param.data.times[2] = icmp_param.data.times[1]; - - BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); - - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; - icmp_param.data.icmph.code = 0; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = 0; - icmp_param.head_len = sizeof(struct icmphdr) + 12; - icmp_reply(&icmp_param, skb); + icmp_param->data.times[1] = inet_current_timestamp(); + icmp_param->data.times[2] = icmp_param->data.times[1]; + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4)); + + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param->data.icmph.code = 0; + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = 0; + icmp_param->head_len = sizeof(struct icmphdr) + 12; + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; out_err: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7182f1419c2a..0adc993c211d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = get_random_u32_below(in_dev->mr_maxdelay); + int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay)); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ - in_dev->mr_maxdelay = max_delay; + WRITE_ONCE(in_dev->mr_maxdelay, max_delay); /* RFC3376, 4.1.6. QRV and 4.1.7. 
QQIC, when the most recently * received value was zero, use the default or statically diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 97d57c52b9ad..5dfac6ce1110 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -20,6 +20,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> @@ -918,6 +919,16 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, } EXPORT_SYMBOL(inet_reqsk_alloc); +void __reqsk_free(struct request_sock *req) +{ + req->rsk_ops->destructor(req); + if (req->rsk_listener) + sock_put(req->rsk_listener); + kfree(req->saved_syn); + kmem_cache_free(req->rsk_ops->slab, req); +} +EXPORT_SYMBOL_GPL(__reqsk_free); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { @@ -1103,6 +1114,8 @@ static void reqsk_timer_handler(struct timer_list *t) (!resend || !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req)); @@ -1196,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, { struct sock *newsk = sk_clone_lock(sk, priority); struct inet_connection_sock *newicsk; - struct inet_request_sock *ireq; + const struct inet_request_sock *ireq; struct inet_sock *newinet; if (!newsk) @@ -1311,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk) return 0; } +static void reqsk_queue_alloc(struct request_sock_queue *queue) +{ + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; + + queue->rskq_accept_head = NULL; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ff11d3a85a36..e4790cc7b5c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? 
- dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; @@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || - (skb->len <= dst_mtu(&rt->dst) && + (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { - struct ip_options_data replyopts; + DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); @@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, int err; int oif; - if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; - if (replyopts.opt.opt.optlen) { - ipc.opt = &replyopts.opt; + if (replyopts->opt.optlen) { + ipc.opt = replyopts; - if (replyopts.opt.opt.srr) - daddr = replyopts.opt.opt.faddr; + if (replyopts->opt.srr) + daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6d9c5c20b1c4..c062d9519818 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, val = 0; dst = sk_dst_get(sk); if (dst) { - val = dst_mtu(dst); + val = dst4_mtu(dst); dst_release(dst); } if (!val) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 019408d3ca2c..b1e1be00ff8b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; static void __init ic_dhcp_init_options(u8 *options, struct ic_device *d) { - u8 mt = ((ic_servaddr == NONE) - ? DHCPDISCOVER : DHCPREQUEST); + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 26, /* MTU */ + 40, /* NIS domain name */ + 42, /* NTP servers */ + }; + u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST; u8 *e = options; int len; @@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d) e += 4; } - /* always? 
*/ - { - static const u8 ic_req_params[] = { - 1, /* Subnet mask */ - 3, /* Default gateway */ - 6, /* DNS server */ - 12, /* Host name */ - 15, /* Domain name */ - 17, /* Boot path */ - 26, /* MTU */ - 40, /* NIS domain name */ - 42, /* NTP servers */ - }; - - *e++ = 55; /* Parameter request list */ - *e++ = sizeof(ic_req_params); - memcpy(e, ic_req_params, sizeof(ic_req_params)); - e += sizeof(ic_req_params); - - if (ic_host_name_set) { - *e++ = 12; /* host-name */ - len = strlen(utsname()->nodename); - *e++ = len; - memcpy(e, utsname()->nodename, len); - e += len; - } - if (*vendor_class_identifier) { - pr_info("DHCP: sending class identifier \"%s\"\n", - vendor_class_identifier); - *e++ = 60; /* Class-identifier */ - len = strlen(vendor_class_identifier); - *e++ = len; - memcpy(e, vendor_class_identifier, len); - e += len; - } - len = strlen(dhcp_client_identifier + 1); - /* the minimum length of identifier is 2, include 1 byte type, - * and can not be larger than the length of options - */ - if (len >= 1 && len < 312 - (e - options) - 1) { - *e++ = 61; - *e++ = len + 1; - memcpy(e, dhcp_client_identifier, len + 1); - e += len + 1; - } + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } + if (*vendor_class_identifier) { + pr_info("DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; } *e++ = 255; /* End of the list */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ca9eaee4c2ef..131382c388e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear * to blackhole. 
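Editor's note: the ipconfig.c hunk above is purely structural; ic_req_params moves to the top of ic_dhcp_init_options() and the option-building code loses one level of indentation, while the DHCP type/length/value layout is unchanged. As a rough illustration of that layout, here is a minimal userspace sketch, not kernel code; the buffer size, host name and helper name are assumptions for the example. It appends the parameter request list (55), host-name (12) and end (255) options in the same pattern as the hunk.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Append DHCP options as type/length/value records, mirroring the
 * pattern used by ic_dhcp_init_options() in the hunk above.
 * The 312-byte option field matches the size checked by the kernel code.
 */
static size_t build_dhcp_options(uint8_t *opts, size_t space, const char *hostname)
{
        static const uint8_t req_params[] = {
                1,      /* Subnet mask */
                3,      /* Default gateway */
                6,      /* DNS server */
                12,     /* Host name */
                15,     /* Domain name */
                17,     /* Boot path */
                26,     /* MTU */
                40,     /* NIS domain name */
                42,     /* NTP servers */
        };
        uint8_t *e = opts;
        size_t len = strlen(hostname);

        if (len > 255)
                return 0;                       /* option length is one byte */
        if (space < 2 + sizeof(req_params) + 2 + len + 1)
                return 0;                       /* not enough room */

        *e++ = 55;                              /* Parameter request list */
        *e++ = sizeof(req_params);
        memcpy(e, req_params, sizeof(req_params));
        e += sizeof(req_params);

        *e++ = 12;                              /* Host name */
        *e++ = (uint8_t)len;
        memcpy(e, hostname, len);
        e += len;

        *e++ = 255;                             /* End of the option list */
        return (size_t)(e - opts);
}

int main(void)
{
        uint8_t opts[312];                      /* classic BOOTP/DHCP option field */
        size_t n = build_dhcp_options(opts, sizeof(opts), "testhost");

        printf("built %zu option bytes\n", n);
        return 0;
}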
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 8ddac1f595ed..82cf8a9e5ded 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, return fib_metrics; } -EXPORT_SYMBOL_GPL(ip_fib_metrics_init); +EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index fae4aa4a5f09..fecf6621f679 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) + if (nskb->len > dst4_mtu(skb_dst(nskb))) goto free_nskb; nf_ct_attach(nskb, oldskb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cfbd563498e8..ebfc5a3d3ad6 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -690,6 +690,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); @@ -697,7 +699,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; - struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; u8 scope; @@ -746,9 +747,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5998c4cc6f47..e20c41206e29 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; @@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be32 daddr; __be32 saddr; int uc_index, err; - struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; @@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11d990703d31..06aa39ae80d6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. 
*/ - pr_warn("martian source %pI4 from %pI4, on dev %s\n", - &daddr, &saddr, dev->name); + pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, @@ -2475,8 +2475,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) - net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", - &daddr, &saddr, dev->name); + net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); #endif goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a1a50a5c80dc..643763bc2142 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,7 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; -static int tcp_ecn_mode_max = 2; +static int tcp_ecn_mode_max = 5; static u32 icmp_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); @@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_option_beacon", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d5319ebe2452..6ce03a9adb4a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_IPV6_MOD(tcp_sockets_allocated); /* - * TCP splice context - */ -struct tcp_splice_state { - struct pipe_inode_info *pipe; - size_t len; - unsigned int flags; -}; - -/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting @@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) struct sk_buff *skb = tcp_write_queue_tail(sk); u32 tsflags = sockc->tsflags; + if (unlikely(!skb)) + skb = skb_rb_last(&sk->tcp_rtx_queue); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). 
+ */ +bool tcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); +} +EXPORT_SYMBOL(tcp_stream_memory_free); + static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) @@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now, __tcp_push_pending_frames(sk, mss_now, nonagle); } -static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; @@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_IPV6_MOD(tcp_splice_read); +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() + */ +void sk_forced_mem_schedule(struct sock *sk, int size) +{ + int delta, amt; + + delta = size - sk->sk_forward_alloc; + if (delta <= 0) + return; + + amt = sk_mem_pages(delta); + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); + + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); + + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_add(sk, amt); +} + struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { @@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return err; } +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? 
: 1; +} +EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; @@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; + tp->pkts_acked_ewma = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + if (tcp_ecn_disabled(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; + else if (tcp_ecn_mode_rfc3168(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; + else if (tcp_ecn_mode_accecn(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; + else if (tcp_ecn_mode_pending(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; @@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index df758adbb445..e9f6c77e0631 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); @@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); } @@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 7d945a527daf..b30090cff3cf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -5,6 +5,92 @@ #include <net/tcp.h> #include <net/busy_poll.h> +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. 
Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. + * + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; + + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->tfo_listener = false; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + reqsk_put(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. 
+ */ + req->rsk_timer.expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); +} + void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 198f8a0d37be..e7b41abb82aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, tcp_count_delivered_ce(tp, delivered); } +#define PKTS_ACKED_WEIGHT 6 +#define PKTS_ACKED_PREC 6 +#define ACK_COMP_THRESH 4 + /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, @@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delta, safe_delta, d_ceb; bool opt_deltas_valid; u32 corrected_ace; + u32 ewma; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) @@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, opt_deltas_valid = tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (delivered_pkts) { + if (!tp->pkts_acked_ewma) { + ewma = delivered_pkts << PKTS_ACKED_PREC; + } else { + ewma = tp->pkts_acked_ewma; + ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) + + (delivered_pkts << PKTS_ACKED_PREC)) >> + PKTS_ACKED_WEIGHT; + } + tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU); + } + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (d_ceb < safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) return delta; - } + } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC)) + return delta; return safe_delta; } @@ -1558,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, + u32 end_seq, u64 xmit_time) +{ + u32 rtt_us; + + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + return; + } + tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } +} + /* Mark the given newly-SACKed range as such, adjusting counters and hints. 
*/ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, @@ -1637,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. + */ + +/* Update the connection delivery information and generate a rate sample. */ +static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + bool is_sack_reneg, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = tp->tcp_mstamp; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. 
Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; + } +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the most recently sent time and the highest + * sequence. + */ +static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct tcp_sock *tp = tcp_sk(sk); + u64 tx_tstamp; + + if (!scb->tx.delivered_mstamp) + return; + + tx_tstamp = tcp_skb_timestamp_us(skb); + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp = 0; +} + /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ @@ -3995,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, return delivered; } +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. 
+ * + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_STATIC_REO_WND) || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -4129,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, flag); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { @@ -4179,7 +4426,7 @@ no_queue: */ tcp_ack_probe(sk); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); return 1; @@ -4799,8 +5046,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } -static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) +static void tcp_rcv_spurious_retrans(struct sock *sk, + const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. * If it seems our ACKs are not reaching the other side, @@ -4820,6 +5070,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) /* Save last flowlabel after a spurious retrans. */ tcp_save_lrcv_flowlabel(sk, skb); #endif + /* Check DSACK info to detect that the previous ACK carrying the + * AccECN option was lost after the second retransmision, and then + * stop sending AccECN option in all subsequent ACKs. 
+ */ + if (tcp_ecn_mode_accecn(tp) && + tp->accecn_opt_sent_w_dsack && + TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND); } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -5527,25 +5785,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, return next; } -/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sk_buff *skb1; - - while (*p) { - parent = *p; - skb1 = rb_to_skb(parent); - if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&skb->rbnode, parent, p); - rb_insert_color(&skb->rbnode, root); -} - /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * @@ -5879,16 +6118,11 @@ static void tcp_new_space(struct sock *sk) * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ -void tcp_check_space(struct sock *sk) +void __tcp_check_space(struct sock *sk) { - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } static inline void tcp_data_snd_check(struct sock *sk) @@ -6222,6 +6456,8 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); @@ -6843,7 +7079,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th, skb); + tcp_ecn_rcv_syn(sk, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -7248,7 +7484,8 @@ static void tcp_ecn_create_request(struct request_sock *req, u32 ecn_ok_dst; if (tcp_accecn_syn_requested(th) && - READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 || + tcp_ca_needs_accecn(listen_sk))) { inet_rsk(req)->ecn_ok = 1; tcp_rsk(req)->accecn_ok = 1; tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8a9596e8f4d..6264fc0b2be5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -374,7 +374,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; - u32 mtu; + u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; @@ -386,15 +386,14 @@ void tcp_v4_mtu_reduced(struct sock *sk) /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. 
*/ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + dmtu = dst4_mtu(dst); + if (mtu < dmtu && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - mtu = dst_mtu(dst); - if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - tcp_sync_mss(sk, mtu); + inet_csk(sk)->icsk_pmtu_cookie > dmtu) { + tcp_sync_mss(sk, dmtu); /* Resend the TCP packet because it's * clear that the old packet has been @@ -1760,7 +1759,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_ca_openreq_child(newsk, dst); - tcp_sync_mss(newsk, dst_mtu(dst)); + tcp_sync_mss(newsk, dst4_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -2110,14 +2109,6 @@ no_coalesce: } EXPORT_IPV6_MOD(tcp_add_backlog); -int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) -{ - struct tcphdr *th = (struct tcphdr *)skb->data; - - return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); -} -EXPORT_IPV6_MOD(tcp_filter); - static void tcp_v4_restore_cb(struct sk_buff *skb) { memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, @@ -3418,20 +3409,6 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* @wake is one when sk_stream_write_space() calls us. - * This sends EPOLLOUT only if notsent_bytes is half the limit. - * This mimics the strategy used in sock_def_write_space(). - */ -bool tcp_stream_memory_free(const struct sock *sk, int wake) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - - READ_ONCE(tp->snd_nxt); - - return (notsent_bytes << wake) < tcp_notsent_lowat(tp); -} -EXPORT_SYMBOL(tcp_stream_memory_free); - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -3474,6 +3451,8 @@ struct proto tcp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .freeptr_offset = offsetof(struct tcp_sock, + inet_conn.icsk_inet.sk.sk_freeptr), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bd5462154f97..ec128865f5c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -481,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk, tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND); + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
- TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk)) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + else + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } } @@ -748,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, - &tcp_rsk(req)->last_oow_ack_time) && - - !tcp_rtx_synack(sk, req)) { - unsigned long expires = jiffies; - - expires += tcp_reqsk_timeout(req); - if (!fastopen) - mod_timer_pending(&req->rsk_timer, expires); - else - req->rsk_timer.expires = expires; + &tcp_rsk(req)->last_oow_ack_time)) { + if (tcp_rsk(req)->accecn_ok) { + u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + + tcp_rsk(req)->syn_ect_rcv = ect_rcv; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV; + } + if (!tcp_rtx_synack(sk, req)) { + unsigned long expires = jiffies; + + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; + + expires += tcp_reqsk_timeout(req); + if (!fastopen) + mod_timer_pending(&req->rsk_timer, + expires); + else + req->rsk_timer.expires = expires; + } } return NULL; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 942a948f1a31..3b1fdcd3cb29 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -304,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, goto out_check_final; th2 = tcp_hdr(p); - flush = (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + flush = (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 479afb714bdf..326b58ff1118 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -66,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_to_skb(parent); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Account for new data that has been sent to the network. 
*/ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -334,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - if (!tcp_accecn_ace_fail_recv(tp)) + if (!tcp_accecn_ace_fail_recv(tp) && + !tcp_accecn_ace_fail_send(tp)) INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { @@ -712,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, if (tp) { tp->accecn_minlen = 0; tp->accecn_opt_tstamp = tp->tcp_mstamp; + tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack; if (tp->accecn_opt_demand) tp->accecn_opt_demand--; } + } else if (tp) { + tp->accecn_opt_sent_w_dsack = 0; } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1106,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { + synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1186,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && + if (ecn_opt && tp->saw_accecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_PERSIST || + !tcp_accecn_opt_fail_send(tp)) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; @@ -1432,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; +} + INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); @@ -1530,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ skb->pfmemalloc = 0; - skb_push(skb, tcp_header_size); + __skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); @@ -3571,12 +3633,15 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. - */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check + * covers both cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == + TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); @@ -3732,33 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, true); } -/* We allow to exceed memory limits for FIN packets to expedite - * connection tear down and (memory) recovery. - * Otherwise tcp_send_fin() could be tempted to either delay FIN - * or even be forced to close flow without any FIN. - * In general, we want to allow one skb per socket to avoid hangs - * with edge trigger epoll() - */ -void sk_forced_mem_schedule(struct sock *sk, int size) -{ - int delta, amt; - - delta = size - sk->sk_forward_alloc; - if (delta <= 0) - return; - - amt = sk_mem_pages(delta); - sk_forward_alloc_add(sk, amt << PAGE_SHIFT); - - if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); - - if (sk->sk_bypass_prot_mem) - return; - - sk_memory_allocated_add(sk, amt); -} - /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. 
*/ @@ -3918,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, switch (synack_type) { case TCP_SYNACK_NORMAL: + case TCP_SYNACK_RETRANS: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: @@ -4000,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th); + tcp_ecn_make_synack(req, th, synack_type); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; @@ -4603,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c deleted file mode 100644 index a8f6d9d06f2e..000000000000 --- a/net/ipv4/tcp_rate.c +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include <net/tcp.h> - -/* The bandwidth estimator estimates the rate at which the network - * can currently deliver outbound data packets for this flow. At a high - * level, it operates by taking a delivery rate sample for each ACK. - * - * A rate sample records the rate at which the network delivered packets - * for this flow, calculated over the time interval between the transmission - * of a data packet and the acknowledgment of that packet. - * - * Specifically, over the interval between each transmit and corresponding ACK, - * the estimator generates a delivery rate sample. Typically it uses the rate - * at which packets were acknowledged. However, the approach of using only the - * acknowledgment rate faces a challenge under the prevalent ACK decimation or - * compression: packets can temporarily appear to be delivered much quicker - * than the bottleneck rate. Since it is physically impossible to do that in a - * sustained fashion, when the estimator notices that the ACK rate is faster - * than the transmit rate, it uses the latter: - * - * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) - * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) - * bw = min(send_rate, ack_rate) - * - * Notice the estimator essentially estimates the goodput, not always the - * network bottleneck link rate when the sending or receiving is limited by - * other factors like applications or receiver window limits. The estimator - * deliberately avoids using the inter-packet spacing approach because that - * approach requires a large number of samples and sophisticated filtering. - * - * TCP flows can often be application-limited in request/response workloads. - * The estimator marks a bandwidth sample as application-limited if there - * was some moment during the sampled window of packets when there was no data - * ready to send in the write queue. - */ - -/* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). 
- */ -void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - /* In general we need to start delivery rate samples from the - * time we received the most recent ACK, to ensure we include - * the full time the network needs to deliver all in-flight - * packets. If there are no packets in flight yet, then we - * know that any ACKs after now indicate that the network was - * able to deliver those packets completely in the sampling - * interval between now and the next ACK. - * - * Note that we use packets_out instead of tcp_packets_in_flight(tp) - * because the latter is a guess based on RTO and loss-marking - * heuristics. We don't want spurious RTOs or loss markings to cause - * a spuriously small time interval, causing a spuriously high - * bandwidth estimate. - */ - if (!tp->packets_out) { - u64 tstamp_us = tcp_skb_timestamp_us(skb); - - tp->first_tx_mstamp = tstamp_us; - tp->delivered_mstamp = tstamp_us; - } - - TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; - TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; -} - -/* When an skb is sacked or acked, we fill in the rate sample with the (prior) - * delivery information when the skb was last transmitted. - * - * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is - * called multiple times. We favor the information from the most recently - * sent skb, i.e., the skb with the most recently sent time and the highest - * sequence. - */ -void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u64 tx_tstamp; - - if (!scb->tx.delivered_mstamp) - return; - - tx_tstamp = tcp_skb_timestamp_us(skb); - if (!rs->prior_delivered || - tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, - scb->end_seq, rs->last_end_seq)) { - rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; - rs->last_end_seq = scb->end_seq; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tx_tstamp; - /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being - * used again when it's cumulatively acked. For acked packets - * we don't need to reset since it'll be freed soon. - */ - if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp = 0; -} - -/* Update the connection delivery information and generate a rate sample. */ -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 snd_us, ack_us; - - /* Clear app limited if bubble is acked and gone. */ - if (tp->app_limited && after(tp->delivered, tp->app_limited)) - tp->app_limited = 0; - - /* TODO: there are multiple places throughout tcp_ack() to get - * current time. Refactor the code using a new "tcp_acktag_state" - * to carry current time, flags, stats like "tcp_sacktag_state". 
- */ - if (delivered) - tp->delivered_mstamp = tp->tcp_mstamp; - - rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ - rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available or - * in recovery from loss with SACK reneging. Rate samples taken during - * a SACK reneging event may overestimate bw by including packets that - * were SACKed before the reneg. - */ - if (!rs->prior_mstamp || is_sack_reneg) { - rs->delivered = -1; - rs->interval_us = -1; - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; - - rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; - /* delivered_ce occupies less than 32 bits in the skb control block */ - rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK - * compression the send phase can be longer. To be safe we use the - * longer phase. - */ - snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - - /* Record both segment send and ack receive intervals */ - rs->snd_interval_us = snd_us; - rs->rcv_interval_us = ack_us; - - /* Normally we expect interval_us >= min-rtt. - * Note that rate may still be over-estimated when a spuriously - * retransmistted skb was first (s)acked because "interval_us" - * is under-estimated (up to an RTT). However continuously - * measuring the delivery rate during loss recovery is crucial - * for connections suffer heavy or prolonged losses. - */ - if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - if (!rs->is_retrans) - pr_debug("tcp rate: %ld %d %u %u %u\n", - rs->interval_us, rs->delivered, - inet_csk(sk)->icsk_ca_state, - tp->rx_opt.sack_ok, tcp_min_rtt(tp)); - rs->interval_us = -1; - return; - } - - /* Record the last non-app-limited or the highest app-limited bw */ - if (!rs->is_app_limited || - ((u64)rs->delivered * tp->rate_interval_us >= - (u64)tp->rate_delivered * rs->interval_us)) { - tp->rate_delivered = rs->delivered; - tp->rate_interval_us = rs->interval_us; - tp->rate_app_limited = rs->is_app_limited; - } -} - -/* If a gap is detected between sends, mark the socket application-limited. */ -void tcp_rate_check_app_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (/* We have less than one packet to send. */ - tp->write_seq - tp->snd_nxt < tp->mss_cache && - /* Nothing in sending host's qdisc queues or NIC tx queue. */ - sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && - /* We are not limited by CWND. */ - tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && - /* All lost packets have been retransmitted. */ - tp->lost_out <= tp->retrans_out) - tp->app_limited = - (tp->delivered + tcp_packets_in_flight(tp)) ? 
: 1; -} -EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c52fd3254b6e..139646751073 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk) return !!timeout; } -/* Record the most recently (re)sent time among the (s)acked packets - * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from - * draft-cheng-tcpm-rack-00.txt - */ -void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - u64 xmit_time) -{ - u32 rtt_us; - - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { - /* If the sacked packet was retransmitted, it's ambiguous - * whether the retransmission or the original (or the prior - * retransmission) was sacked. - * - * If the original is lost, there is no ambiguity. Otherwise - * we assume the original can be delayed up to aRTT + min_rtt. - * the aRTT term is bounded by the fast recovery or timeout, - * so it's at least one RTT (i.e., retransmission is at least - * an RTT later). - */ - return; - } - tp->rack.advanced = 1; - tp->rack.rtt_us = rtt_us; - if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) { - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; - } -} - /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ @@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk) tcp_rearm_rto(sk); } -/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. - * - * If a DSACK is received that seems like it may have been due to reordering - * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded - * by srtt), since there is possibility that spurious retransmission was - * due to reordering delay longer than reo_wnd. - * - * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) - * no. of successful recoveries (accounts for full DSACK-based loss - * recovery undo). After that, reset it to default (min_rtt/4). - * - * At max, reo_wnd is incremented only once per rtt. So that the new - * DSACK on which we are reacting, is due to the spurious retx (approx) - * after the reo_wnd has been updated last time. - * - * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than - * absolute value to account for change in rtt. - */ -void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & - TCP_RACK_STATIC_REO_WND) || - !rs->prior_delivered) - return; - - /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ - if (before(rs->prior_delivered, tp->rack.last_delivered)) - tp->rack.dsack_seen = 0; - - /* Adjust the reo_wnd if update is pending */ - if (tp->rack.dsack_seen) { - tp->rack.reo_wnd_steps = min_t(u32, 0xFF, - tp->rack.reo_wnd_steps + 1); - tp->rack.dsack_seen = 0; - tp->rack.last_delivered = tp->delivered; - tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; - } else if (!tp->rack.reo_wnd_persist) { - tp->rack.reo_wnd_steps = 1; - } -} - /* RFC6582 NewReno recovery for non-SACK connection. 
It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 160080c9021d..5a14a53a3c9e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/rstreason.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) @@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * it's not good to give up too easily. */ tcp_rtx_synack(sk, req); + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ee63af0ef42c..b96e47f1c8a2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1193,7 +1193,7 @@ csum_partial: send: err = ip_send_skb(sock_net(sk), skb); - if (err) { + if (unlikely(err)) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), @@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); @@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; - struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) @@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } @@ -1793,14 +1794,32 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) } if (unlikely(to_drop)) { + int err_ipv4 = 0; + int err_ipv6 = 0; + for (nb = 0; to_drop != NULL; nb++) { skb = to_drop; + if (skb->protocol == htons(ETH_P_IP)) + err_ipv4++; + else + err_ipv6++; to_drop = skb->next; skb_mark_not_on_list(skb); - /* TODO: update SNMP values. 
*/ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); } numa_drop_add(&udp_sk(sk)->drop_counters, nb); + if (err_ipv4 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS, + err_ipv4); + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS, + err_ipv4); + } + if (err_ipv6 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS, + err_ipv6); + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS, + err_ipv6); + } } atomic_sub(total_size, &udp_prod_queue->rmem_alloc); @@ -2429,7 +2448,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && + UDP_SKB_CB(skb)->partial_cov)) { u16 pcrlen = READ_ONCE(up->pcrlen); /* diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 589456bd8b5f..6b1654c1ad4a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; + __be16 newlen, msslen; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; - __be16 newlen; int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; @@ -556,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return segs; } + msslen = htons(sizeof(*uh) + mss); + /* GSO partial and frag_list segmentation only requires splitting * the frame into an MSS multiple and possibly a remainder, both * cases return a GSO skb. So update the mss now. @@ -585,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (!seg->next) break; - uh->len = newlen; + uh->len = msslen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) |
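
The removed net/ipv4/tcp_rate.c header comment above describes the delivery-rate estimator as taking, per ACK, the smaller of the send rate and the ACK rate (bw = min(send_rate, ack_rate)), and tcp_rate_gen() realised that by sampling over the longer of the send phase and the ack phase; with this series the file is deleted and tcp_rate_skb_sent() reappears as a static helper in tcp_output.c. Below is a minimal standalone sketch of that min-of-two-rates sampling idea; it is not kernel code and every name in it is invented for illustration.

/* Standalone sketch (not kernel code) of the delivery-rate sampling scheme
 * described in the removed net/ipv4/tcp_rate.c comments: the sample covers
 * the longer of the send phase and the ack phase, which is equivalent to
 * taking min(send_rate, ack_rate) and keeps ACK compression from making the
 * path look faster than the sender transmitted. All names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct rate_sample_sketch {
	uint64_t first_tx_us;	/* send time of the first packet in the window */
	uint64_t last_tx_us;	/* send time of the most recently (s)acked packet */
	uint64_t first_ack_us;	/* delivered timestamp at the window start */
	uint64_t last_ack_us;	/* time the latest ACK was processed */
	uint32_t delivered;	/* packets newly (s)acked in the window */
};

/* Return the estimated goodput in packets per second, or 0 when the
 * interval is unusable (mirrors rs->interval_us = -1 in the kernel). */
static uint64_t rate_sample_bw(const struct rate_sample_sketch *rs,
			       uint64_t min_interval_us)
{
	uint64_t snd_us = rs->last_tx_us - rs->first_tx_us;	/* send phase */
	uint64_t ack_us = rs->last_ack_us - rs->first_ack_us;	/* ack phase */
	uint64_t interval_us = snd_us > ack_us ? snd_us : ack_us;

	if (interval_us == 0 || interval_us < min_interval_us)
		return 0;

	return (rs->delivered * 1000000ULL) / interval_us;
}

int main(void)
{
	struct rate_sample_sketch rs = {
		.first_tx_us = 0,	.last_tx_us = 20000,	/* 20 ms send phase */
		.first_ack_us = 5000,	.last_ack_us = 40000,	/* 35 ms ack phase */
		.delivered = 50,
	};

	/* Uses the longer phase (35 ms), as tcp_rate_gen() did. */
	printf("bw ~= %llu pkts/s\n",
	       (unsigned long long)rate_sample_bw(&rs, 1000));
	return 0;
}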

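The tcp_rack_update_reo_wnd() removal in tcp_recovery.c likewise drops a comment that spells out the RACK reordering-window policy: grow reo_wnd by one min_rtt/4 step (upper bounded by srtt) when a DSACK suggests a reordering-induced spurious retransmit, adjust at most once per round trip, keep the larger window for 16 successful recoveries, then fall back to the one-step default. The following is a rough standalone sketch of that policy with invented names; it uses plain integer comparisons rather than the kernel's sequence-number helpers.

/* Standalone sketch (not kernel code) of the reo_wnd stepping policy that
 * the removed tcp_rack_update_reo_wnd() comment describes. Names and the
 * simplified "one round trip has passed" check are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RACK_RECOVERY_THRESH	16	/* recoveries to keep a larger window */
#define RACK_MAX_STEPS		0xFF

struct rack_sketch {
	uint8_t  reo_wnd_steps;		/* window, in units of min_rtt/4 */
	uint8_t  reo_wnd_persist;	/* recoveries left before the reset */
	bool     dsack_seen;		/* DSACK seen since the last adjustment */
	uint32_t last_delivered;	/* delivered count at the last adjustment */
};

static void rack_update_reo_wnd(struct rack_sketch *r,
				uint32_t prior_delivered,
				uint32_t delivered_now)
{
	/* Disregard a DSACK if less than one round trip of deliveries has
	 * passed since reo_wnd was last adjusted. */
	if (prior_delivered < r->last_delivered)
		r->dsack_seen = false;

	if (r->dsack_seen) {
		if (r->reo_wnd_steps < RACK_MAX_STEPS)
			r->reo_wnd_steps++;
		r->dsack_seen = false;
		r->last_delivered = delivered_now;
		r->reo_wnd_persist = RACK_RECOVERY_THRESH;
	} else if (!r->reo_wnd_persist) {
		r->reo_wnd_steps = 1;	/* back to the min_rtt/4 default */
	}
}

/* reo_wnd in microseconds, bounded by srtt as the removed comment notes. */
static uint32_t rack_reo_wnd_us(const struct rack_sketch *r,
				uint32_t min_rtt_us, uint32_t srtt_us)
{
	uint32_t wnd = r->reo_wnd_steps * (min_rtt_us / 4);

	return wnd < srtt_us ? wnd : srtt_us;
}

int main(void)
{
	struct rack_sketch r = { .reo_wnd_steps = 1 };

	r.dsack_seen = true;
	rack_update_reo_wnd(&r, 100, 120);	/* DSACK: grow by one step */
	printf("reo_wnd ~= %u us\n",
	       (unsigned)rack_reo_wnd_us(&r, 40000 /* min_rtt */, 200000 /* srtt */));
	return 0;
}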