summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-09-21 00:23:09 -0400
committerDavid S. Miller <davem@davemloft.net>2016-09-21 00:23:09 -0400
commita624f93ce6623d452e87d8dcf557e7c680822991 (patch)
treef6d7ecddbe3fd16a6a48374f2c647d40f25b77e5 /include
parent94d308d060cd3ee65152b8ebd7a1c24fa86eee82 (diff)
parent0f8782ea14974ce992618b55f0c041ef43ed0b78 (diff)
Merge branch 'tcp-bbr'
Neal Cardwell says: ==================== tcp: BBR congestion control algorithm This patch series implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A paper with a detailed description of BBR will be published in ACM Queue, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR is widely deployed in production at Google. The patch series starts with a set of supporting infrastructure changes, including a few that extend the congestion control framework. The last patch adds BBR as a TCP congestion control module. Please see individual patches for the details. - v3 -> v4: - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control" to use const to qualify all the constant parameters. Thanks to Stephen Hemminger. - In "tcp_bbr: add BBR congestion control", remove the bbr_rate_kbps() function, which had a 64-bit divide that would be problematic on some architectures, and just use bbr_rate_bytes_per_sec() directly. Thanks to Kenneth Klette Jonassen for suggesting this. - In "tcp: switch back to proper tcp_skb_cb size check in tcp_init()", switched from sizeof(skb->cb) to FIELD_SIZEOF. Thanks to Lance Richardson for suggesting this. - Updated "tcp_bbr: add BBR congestion control" commit message with performance data, more details about deployment at Google, and another reminder to use fq with BBR. - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control" to use MODULE_LICENSE("Dual BSD/GPL"). - v2 -> v3: fix another issue caught by build bots: - adjust rate_sample struct initialization syntax to allow gcc-4.4 to compile the "tcp: track data delivery rate for a TCP connection" patch; also adjusted some similar syntax in "tcp_bbr: add BBR congestion control" - v1 -> v2: fix issues caught by build bots: - fix "tcp: export data delivery rate" to use rate64 instead of rate, so there is a 64-bit numerator for the do_div call - fix conflicting definitions for minmax caused by "tcp: use windowed min filter library for TCP min_rtt estimation" with a new commit: tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict - fix warning about the use of __packed in "tcp: track data delivery rate for a TCP connection", which involves the addition of a new commit: tcp: switch back to proper tcp_skb_cb size check in tcp_init() ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/tcp.h14
-rw-r--r--include/linux/win_minmax.h37
-rw-r--r--include/net/inet_connection_sock.h4
-rw-r--r--include/net/tcp.h53
-rw-r--r--include/uapi/linux/inet_diag.h13
-rw-r--r--include/uapi/linux/pkt_sched.h2
-rw-r--r--include/uapi/linux/tcp.h3
7 files changed, 117 insertions, 9 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c723a465125d..a17ae7b85218 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -19,6 +19,7 @@
#include <linux/skbuff.h>
+#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -212,7 +213,8 @@ struct tcp_sock {
u8 reord; /* reordering detected */
} rack;
u16 advmss; /* Advertised MSS */
- u8 unused;
+ u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
+ unused:7;
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */
@@ -234,9 +236,7 @@ struct tcp_sock {
u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
- struct rtt_meas {
- u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
- } rtt_min[3];
+ struct minmax rtt_min;
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
@@ -268,6 +268,12 @@ struct tcp_sock {
* receiver in Recovery. */
u32 prr_out; /* Total number of pkts sent during Recovery. */
u32 delivered; /* Total data packets delivered incl. rexmits */
+ u32 lost; /* Total data packets lost incl. rexmits */
+ u32 app_limited; /* limited until "delivered" reaches this val */
+ struct skb_mstamp first_tx_mstamp; /* start of window send phase */
+ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
+ u32 rate_delivered; /* saved rate sample: packets delivered */
+ u32 rate_interval_us; /* saved rate sample: time elapsed */
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h
new file mode 100644
index 000000000000..56569604278f
--- /dev/null
+++ b/include/linux/win_minmax.h
@@ -0,0 +1,37 @@
+/**
+ * lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
+ *
+ */
+#ifndef MINMAX_H
+#define MINMAX_H
+
+#include <linux/types.h>
+
+/* A single data point for our parameterized min-max tracker */
+struct minmax_sample {
+ u32 t; /* time measurement was taken */
+ u32 v; /* value measured */
+};
+
+/* State for the parameterized min-max tracker */
+struct minmax {
+ struct minmax_sample s[3];
+};
+
+static inline u32 minmax_get(const struct minmax *m)
+{
+ return m->s[0].v;
+}
+
+static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas)
+{
+ struct minmax_sample val = { .t = t, .v = meas };
+
+ m->s[2] = m->s[1] = m->s[0] = val;
+ return m->s[0].v;
+}
+
+u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas);
+u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas);
+
+#endif
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 49dcad4fe99e..197a30d221e9 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -134,8 +134,8 @@ struct inet_connection_sock {
} icsk_mtup;
u32 icsk_user_timeout;
- u64 icsk_ca_priv[64 / sizeof(u64)];
-#define ICSK_CA_PRIV_SIZE (8 * sizeof(u64))
+ u64 icsk_ca_priv[88 / sizeof(u64)];
+#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64))
};
#define ICSK_TIME_RETRANS 1 /* Retransmit timer */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fdfbedd61c67..f83b7f220a65 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle);
bool tcp_may_send_now(struct sock *sk);
@@ -671,7 +673,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
- return tp->rtt_min[0].rtt;
+ return minmax_get(&tp->rtt_min);
}
/* Compute the actual receive window we are currently advertising.
@@ -763,8 +765,16 @@ struct tcp_skb_cb {
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
- /* There is space for up to 20 bytes */
- __u32 in_flight;/* Bytes in flight when packet sent */
+ /* There is space for up to 24 bytes */
+ __u32 in_flight:30,/* Bytes in flight at transmit */
+ is_app_limited:1, /* cwnd not fully used? */
+ unused:1;
+ /* pkts S/ACKed so far upon tx of skb, incl retrans: */
+ __u32 delivered;
+ /* start of send pipeline phase */
+ struct skb_mstamp first_tx_mstamp;
+ /* when we reached the "delivered" count */
+ struct skb_mstamp delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -860,6 +870,27 @@ struct ack_sample {
u32 in_flight;
};
+/* A rate sample measures the number of (original/retransmitted) data
+ * packets delivered "delivered" over an interval of time "interval_us".
+ * The tcp_rate.c code fills in the rate sample, and congestion
+ * control modules that define a cong_control function to run at the end
+ * of ACK processing can optionally chose to consult this sample when
+ * setting cwnd and pacing rate.
+ * A sample is invalid if "delivered" or "interval_us" is negative.
+ */
+struct rate_sample {
+ struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
+ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
+ s32 delivered; /* number of packets delivered over interval */
+ long interval_us; /* time for tp->delivered to incr "delivered" */
+ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
+ int losses; /* number of packets marked lost upon ACK */
+ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
+ u32 prior_in_flight; /* in flight before this ACK */
+ bool is_app_limited; /* is sample from packet with bubble in pipe? */
+ bool is_retrans; /* is sample from retransmission? */
+};
+
struct tcp_congestion_ops {
struct list_head list;
u32 key;
@@ -884,6 +915,14 @@ struct tcp_congestion_ops {
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
+ /* suggest number of segments for each skb to transmit (optional) */
+ u32 (*tso_segs_goal)(struct sock *sk);
+ /* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ u32 (*sndbuf_expand)(struct sock *sk);
+ /* call when packets are delivered to update cwnd and pacing rate,
+ * after all the ca_state processing. (optional)
+ */
+ void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
/* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
@@ -946,6 +985,14 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
+/* From tcp_rate.c */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs);
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ struct skb_mstamp *now, struct rate_sample *rs);
+void tcp_rate_check_app_limited(struct sock *sk);
+
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index b5c366f87b3e..509cd961068d 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -124,6 +124,7 @@ enum {
INET_DIAG_PEERS,
INET_DIAG_PAD,
INET_DIAG_MARK,
+ INET_DIAG_BBRINFO,
__INET_DIAG_MAX,
};
@@ -157,8 +158,20 @@ struct tcp_dctcp_info {
__u32 dctcp_ab_tot;
};
+/* INET_DIAG_BBRINFO */
+
+struct tcp_bbr_info {
+ /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
+ __u32 bbr_bw_lo; /* lower 32 bits of bw */
+ __u32 bbr_bw_hi; /* upper 32 bits of bw */
+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */
+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+};
+
union tcp_cc_info {
struct tcpvegas_info vegas;
struct tcp_dctcp_info dctcp;
+ struct tcp_bbr_info bbr;
};
#endif /* _UAPI_INET_DIAG_H_ */
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 2382eed50278..f8e39dbaa781 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -792,6 +792,8 @@ enum {
TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */
+ TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
+
__TCA_FQ_MAX
};
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 482898fc433a..73ac0db487f8 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -167,6 +167,7 @@ struct tcp_info {
__u8 tcpi_backoff;
__u8 tcpi_options;
__u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+ __u8 tcpi_delivery_rate_app_limited:1;
__u32 tcpi_rto;
__u32 tcpi_ato;
@@ -211,6 +212,8 @@ struct tcp_info {
__u32 tcpi_min_rtt;
__u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
+
+ __u64 tcpi_delivery_rate;
};
/* for TCP_MD5SIG socket option */