diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 395 |
1 files changed, 320 insertions, 75 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71b76e98371a..b44fdc309633 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,8 +70,10 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> @@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. - */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + if (tcp_accecn_opt_fail_recv(tp)) + return false; -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) @@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int flag) +{ + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -744,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1030,6 +1177,7 @@ struct tcp_sacktag_state { u64 last_sackt; u32 reord; u32 sack_delivered; + u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; @@ -1391,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, + int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); @@ -1451,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; + state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1487,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount, + start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); @@ -1772,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), + skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) @@ -2569,7 +2719,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { @@ -3280,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; + /* snd_una delta covers these skbs */ + sack->delivered_bytes -= skb->len; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) @@ -3376,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); } + + sack->delivered_bytes = (skb ? + TCP_SKB_CB(skb)->seq : tp->snd_una) - + prior_snd_una; } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb))) { @@ -3645,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3676,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk) WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3787,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3795,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3817,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; + sack_state.delivered_bytes = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3837,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -3851,7 +4024,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; - icsk->icsk_retransmits = 0; + WRITE_ONCE(icsk->icsk_retransmits, 0); #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) @@ -3912,8 +4085,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); - icsk->icsk_probes_out = 0; + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); + WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -3924,6 +4098,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3948,7 +4128,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3957,12 +4138,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3983,7 +4169,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -4083,6 +4269,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4174,6 +4361,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4234,11 +4427,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -4830,7 +5026,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } @@ -4890,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); /* Check if this incoming skb can be added to socket receive queues * while satisfying sk->sk_rcvbuf limit. + * + * In theory we should use skb->truesize, but this can cause problems + * when applications use too small SO_RCVBUF values. + * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, + * allowing RWIN to be close to available space. + * Whenever the receive queue gets full, we can receive a small packet + * filling RWIN, but with a high skb->truesize, because most NIC use 4K page + * plus sk_buff metadata even when receiving less than 1500 bytes of payload. + * + * Note that we use skb->len to decide to accept or drop this packet, + * but sk->sk_rmem_alloc is the sum of all skb->truesize. */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { - unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize; + unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return new_mem <= sk->sk_rcvbuf; + return rmem + skb->len <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, @@ -5871,6 +6078,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -5968,7 +6176,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -5979,6 +6187,16 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) { + accecn_reflector = true; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -5988,7 +6206,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6060,6 +6278,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made @@ -6114,6 +6333,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb, 0); + /* We know that such packets are checksummed * on entry. */ @@ -6162,6 +6383,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6202,6 +6425,8 @@ validate: return; step5: + tcp_ecn_received_counters_payload(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; @@ -6297,7 +6522,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; - if (mss == tp->rx_opt.user_mss) { + if (mss == READ_ONCE(tp->rx_opt.user_mss)) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ @@ -6452,7 +6677,9 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6524,7 +6751,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6583,7 +6810,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6636,7 +6863,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious @@ -6765,7 +6992,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6812,9 +7039,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -6984,6 +7214,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -6991,7 +7230,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7009,6 +7249,11 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; @@ -7048,8 +7293,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), @@ -7117,7 +7362,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return 0; } - mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss)); if (!mss) mss = af_ops->mss_clamp; @@ -7131,7 +7376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; @@ -7182,7 +7427,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; + tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss); tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); |