summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c395
1 files changed, 320 insertions, 75 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71b76e98371a..b44fdc309633 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,8 +70,10 @@
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
+#include <linux/bitops.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
@@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk)
(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
-static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
-{
- if (tcp_ecn_mode_rfc3168(tp))
- tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
-}
-
-static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
-{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-
- /* If the sender is telling us it has entered CWR, then its
- * cwnd may be very low (even just 1 packet), so we should ACK
- * immediately.
- */
- if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-}
-
-static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
-{
- tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-}
-
static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
+ /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_data_ecn_check() when the ECN codepoint of
+ * received TCP data contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ u32 delivered_bytes, int flag)
{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ u8 estimate_ecnfield = tp->est_ecnfield;
+ bool ambiguous_ecn_bytes_incr = false;
+ bool first_changed = false;
+ unsigned int optlen;
+ bool order1, res;
+ unsigned int i;
+ u8 *ptr;
-static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ if (tcp_accecn_opt_fail_recv(tp))
+ return false;
-static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
- return true;
- return false;
+ if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1) {
+ u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ }
+ return false;
+ }
+
+ if (estimate_ecnfield) {
+ u8 ecnfield = estimate_ecnfield - 1;
+
+ tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
+ return true;
+ }
+ return false;
+ }
+
+ ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
+ optlen = ptr[1] - 2;
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return false;
+ order1 = (ptr[0] == TCPOPT_ACCECN1);
+ ptr += 2;
+
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+ }
+
+ res = !!estimate_ecnfield;
+ for (i = 0; i < 3; i++) {
+ u32 init_offset;
+ u8 ecnfield;
+ s32 delta;
+ u32 *cnt;
+
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ break;
+
+ ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
+ init_offset = tcp_accecn_field_init_offset(ecnfield);
+ cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
+ delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
+ if (delta && delta < 0) {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ if (delta && ecnfield != estimate_ecnfield) {
+ if (!first_changed) {
+ tp->est_ecnfield = ecnfield;
+ first_changed = true;
+ } else {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ }
+
+ optlen -= TCPOLEN_ACCECN_PERFIELD;
+ ptr += TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (ambiguous_ecn_bytes_incr)
+ tp->est_ecnfield = 0;
+
+ return res;
}
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
@@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
- if (ece_ack)
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}
+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int flag)
+{
+ u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta, d_ceb;
+ bool opt_deltas_valid;
+ u32 corrected_ace;
+
+ /* Reordered ACK or uncertain due to lack of data to send and ts */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ opt_deltas_valid = tcp_accecn_process_option(tp, skb,
+ delivered_bytes, flag);
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ if (opt_deltas_valid) {
+ d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
+ if (!d_ceb)
+ return delta;
+
+ if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
+ (tcp_is_sack(tp) ||
+ ((1 << inet_csk(sk)->icsk_ca_state) &
+ (TCPF_CA_Open | TCPF_CA_CWR)))) {
+ u32 est_d_cep;
+
+ if (delivered_bytes <= d_ceb)
+ return safe_delta;
+
+ est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
+ delivered_pkts,
+ delivered_bytes);
+ return min(safe_delta,
+ delta +
+ (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
+ }
+
+ if (d_ceb > delta * tp->mss_cache)
+ return safe_delta;
+ if (d_ceb <
+ safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
+ return delta;
+ }
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts,
+ delivered_bytes, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -744,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
}
}
-static void tcp_rcvbuf_grow(struct sock *sk)
+void tcp_rcvbuf_grow(struct sock *sk)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -1030,6 +1177,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
u32 reord;
u32 sack_delivered;
+ u32 delivered_bytes;
int flag;
unsigned int mss_now;
struct rate_sample *rate;
@@ -1391,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount,
+ int dup_sack, int pcount, u32 plen,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1451,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
+ state->delivered_bytes += plen;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1487,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
- start_seq, end_seq, dup_sack, pcount,
+ start_seq, end_seq, dup_sack, pcount, skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
@@ -1772,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
+ skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
@@ -2569,7 +2719,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
if (tcp_is_non_sack_preventing_reopen(sk))
return true;
if (frto_undo || tcp_is_sack(tp)) {
@@ -3280,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
+ /* snd_una delta covers these skbs */
+ sack->delivered_bytes -= skb->len;
} else if (tcp_is_sack(tp)) {
tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
@@ -3376,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
}
+
+ sack->delivered_bytes = (skb ?
+ TCP_SKB_CB(skb)->seq : tp->snd_una) -
+ prior_snd_una;
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
@@ -3645,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
+static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 flags = 0;
+
+ if (accecn_reflector)
+ flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ __tcp_send_ack(sk, tp->rcv_nxt, flags);
+}
+
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
{
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -3676,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk)
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, accecn_reflector);
}
}
@@ -3787,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3795,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE)
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+
+ if (flag & FLAG_ECE) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
+ }
return delivered;
}
@@ -3817,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
sack_state.sack_delivered = 0;
+ sack_state.delivered_bytes = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3837,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
@@ -3851,7 +4024,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
- icsk->icsk_retransmits = 0;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
@@ -3912,8 +4085,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
- WRITE_ONCE(sk->sk_err_soft, 0);
- icsk->icsk_probes_out = 0;
+ if (READ_ONCE(sk->sk_err_soft))
+ WRITE_ONCE(sk->sk_err_soft, 0);
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
@@ -3924,6 +4098,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -3948,7 +4128,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3957,12 +4138,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3983,7 +4169,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4083,6 +4269,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->accecn = 0;
opt_rx->saw_unknown = 0;
while (length > 0) {
@@ -4174,6 +4361,12 @@ void tcp_parse_options(const struct net *net,
ptr, th->syn, foc, false);
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ /* Save offset of AccECN option in TCP header */
+ opt_rx->accecn = (ptr - 2) - (__u8 *)th;
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
@@ -4234,11 +4427,14 @@ static bool tcp_fast_parse_options(const struct net *net,
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
- if (tcp_parse_aligned_timestamp(tp, th))
+ if (tcp_parse_aligned_timestamp(tp, th)) {
+ tp->rx_opt.accecn = 0;
return true;
+ }
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
@@ -4830,7 +5026,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
noinline_for_tracing static void
tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
sk_skb_reason_drop(sk, skb, reason);
}
@@ -4890,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
/* Check if this incoming skb can be added to socket receive queues
* while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NIC use 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
*/
static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
{
- unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+ unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
- return new_mem <= sk->sk_rcvbuf;
+ return rmem + skb->len <= sk->sk_rcvbuf;
}
static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
@@ -5871,6 +6078,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool accecn_reflector = false;
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
@@ -5968,7 +6176,7 @@ step1:
if (tp->syn_fastopen && !tp->data_segs_in &&
sk->sk_state == TCP_ESTABLISHED)
tcp_fastopen_active_disable(sk);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5979,6 +6187,16 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (tcp_ecn_mode_accecn(tp)) {
+ accecn_reflector = true;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -5988,7 +6206,7 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, accecn_reflector);
SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
@@ -6060,6 +6278,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -6114,6 +6333,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
flag |= __tcp_replace_ts_recent(tp,
delta);
+ tcp_ecn_received_counters(sk, skb, 0);
+
/* We know that such packets are checksummed
* on entry.
*/
@@ -6162,6 +6383,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb,
+ len - tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -6202,6 +6425,8 @@ validate:
return;
step5:
+ tcp_ecn_received_counters_payload(sk, skb);
+
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
if ((int)reason < 0) {
reason = -reason;
@@ -6297,7 +6522,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
- if (mss == tp->rx_opt.user_mss) {
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
@@ -6452,7 +6677,9 @@ consume:
* state to ESTABLISHED..."
*/
- tcp_ecn_rcv_synack(tp, th);
+ if (tcp_ecn_mode_any(tp))
+ tcp_ecn_rcv_synack(sk, skb, th,
+ TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
@@ -6524,7 +6751,7 @@ consume:
TCP_DELACK_MAX, false);
goto consume;
}
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -6583,7 +6810,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -6636,7 +6863,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
tcp_try_undo_recovery(sk);
tcp_update_rto_time(tp);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
/* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
* retrans_stamp but don't enter CA_Loss, so in case that happened we
* need to zero retrans_stamp here to prevent spurious
@@ -6765,7 +6992,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
/* accept old ack during closing */
if ((int)reason < 0) {
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
reason = -reason;
goto discard;
}
@@ -6812,9 +7039,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
+ if (tcp_ecn_mode_accecn(tp))
+ tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
tcp_fast_path_on(tp);
if (sk->sk_shutdown & SEND_SHUTDOWN)
tcp_shutdown(sk, SEND_SHUTDOWN);
+
break;
case TCP_FIN_WAIT1: {
@@ -6984,6 +7214,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
bool ect, ecn_ok;
u32 ecn_ok_dst;
+ if (tcp_accecn_syn_requested(th) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ inet_rsk(req)->ecn_ok = 1;
+ tcp_rsk(req)->accecn_ok = 1;
+ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ return;
+ }
+
if (!th_ecn)
return;
@@ -6991,7 +7230,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
- if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ if (((!ect || th->res1 || th->ae) && ecn_ok) ||
+ tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
@@ -7009,6 +7249,11 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
+ tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_rsk(req)->accecn_fail_mode = 0;
+ tcp_rsk(req)->syn_ect_rcv = 0;
+ tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -7048,8 +7293,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
- xchg(&queue->synflood_warned, 1) == 0) {
+ if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) {
+ WRITE_ONCE(queue->synflood_warned, 1);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
proto, inet6_rcv_saddr(sk),
@@ -7117,7 +7362,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
return 0;
}
- mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
if (!mss)
mss = af_ops->mss_clamp;
@@ -7131,7 +7376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
{
struct tcp_fastopen_cookie foc = { .len = -1 };
struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
@@ -7182,7 +7427,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);