summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
authorIlpo Järvinen <ij@kernel.org>2025-09-16 10:24:30 +0200
committerPaolo Abeni <pabeni@redhat.com>2025-09-18 08:47:52 +0200
commitb5e74132dfbe60329b3ff0e5c485039f2e31605c (patch)
tree0245fbbfbf923e9a0664908428cafe690fea3620 /net/ipv4/tcp_output.c
parent77a4fdf43c5ec81a431770511505d371c8822837 (diff)
tcp: accecn: AccECN option
The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of tcp_info struct, as there is no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen <ij@kernel.org> Signed-off-by: Neal Cardwell <ncardwell@google.com> Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c165
1 files changed, 160 insertions, 5 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5be2b3eb73d3..34e5c83bbace 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_SMC BIT(9)
#define OPTION_MPTCP BIT(10)
#define OPTION_AO BIT(11)
+#define OPTION_ACCECN BIT(12)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -406,6 +407,8 @@ struct tcp_out_options {
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 num_accecn_fields:7, /* number of AccECN fields needed */
+ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
@@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
return ptr;
}
+/* Initial values for AccECN option, ordered is based on ECN field bits
+ * similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
+ */
+static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
struct tcp_out_options *opts,
struct tcp_key *key)
{
+ u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
+ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
__be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
@@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
+ if (OPTION_ACCECN & options) {
+ const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes :
+ tp->received_ecn_bytes;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ u32 e0b;
+ u32 e1b;
+ u32 ceb;
+ u8 len;
+
+ e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
+ e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
+ ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
+ len = TCPOLEN_ACCECN_BASE +
+ opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
+
+ if (opts->num_accecn_fields == 2) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ } else if (opts->num_accecn_fields == 1) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ leftover_highbyte = e1b & 0xff;
+ leftover_lowbyte = TCPOPT_NOP;
+ } else if (opts->num_accecn_fields == 0) {
+ leftover_highbyte = TCPOPT_ACCECN1;
+ leftover_lowbyte = len;
+ } else if (opts->num_accecn_fields == 3) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ *ptr++ = htonl(((e0b & 0xffffff) << 8) |
+ TCPOPT_NOP);
+ }
+ if (tp)
+ tp->accecn_minlen = 0;
+ }
+
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_WSCALE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
+ u8 highbyte = TCPOPT_NOP;
+
+ /* Do not split the leftover 2-byte to fit into a single
+ * NOP, i.e., replace this NOP only when 1 byte is leftover
+ * within leftover_highbyte.
+ */
+ if (unlikely(leftover_highbyte != TCPOPT_NOP &&
+ leftover_lowbyte == TCPOPT_NOP)) {
+ highbyte = leftover_highbyte;
+ leftover_highbyte = TCPOPT_NOP;
+ }
+ *ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
@@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
@@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
}
tp->rx_opt.dsack = 0;
+ } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
+ leftover_lowbyte != TCPOPT_NOP)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_NOP << 8) |
+ TCPOPT_NOP);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
@@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req,
}
}
+static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
+{
+ /* How much there's room for combining with the alignment padding? */
+ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
+ OPTION_SACK_ADVERTISE)
+ return 2;
+ else if (opts->options & OPTION_WSCALE)
+ return 1;
+ return 0;
+}
+
+/* Calculates how long AccECN option will fit to @remaining option space.
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+ int remaining)
+{
+ int size = TCP_ACCECN_MAXSIZE;
+ int max_combine_saving;
+ int align_size;
+
+ if (opts->use_synack_ecn_bytes)
+ max_combine_saving = tcp_synack_options_combine_saving(opts);
+ else
+ max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ while (opts->num_accecn_fields >= required) {
+ /* Pad to dword if cannot combine */
+ if ((size & 0x3) > max_combine_saving)
+ align_size = ALIGN(size, 4);
+ else
+ align_size = ALIGN_DOWN(size, 4);
+
+ if (remaining >= align_size) {
+ size = align_size;
+ break;
+ }
+
+ opts->num_accecn_fields--;
+ size -= TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (opts->num_accecn_fields < required)
+ return 0;
+
+ opts->options |= OPTION_ACCECN;
+ return size;
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ /* Simultaneous open SYN/ACK needs AccECN option but not SYN */
+ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+ tcp_ecn_mode_accecn(tp) &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE)) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_request_sock *treq = tcp_rsk(req);
if (tcp_key_is_md5(key)) {
opts->options |= OPTION_MD5;
@@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ if (treq->accecn_ok &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
synack_type, opts, &remaining);
@@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks = 0;
}
+ if (tcp_ecn_mode_accecn(tp) &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
+
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
unsigned int remaining = MAX_TCP_OPTION_SPACE - size;