diff options
| author | David S. Miller <davem@nuts.davemloft.net> | 2004-09-06 19:20:50 -0700 |
|---|---|---|
| committer | Patrick McHardy <kaber@trash.net> | 2004-09-06 19:20:50 -0700 |
| commit | 14a1f44569619b2dfda526dc0f73b9bf0df74171 (patch) | |
| tree | f1200dfaf23a0013babab9fd458051b14d723ad1 /include | |
| parent | 10bc956350e6821a1a9757065962f1924649b12d (diff) | |
[TCP]: Make TSO play nice with congestion window.
Previously TSO would not abide by the congestion
window properly. Essentially, each TSO packet would
be treated just like 1 normal packet, even though a TSO
packet generates more than 1 normal packet. This
violates congestion window rules entirely.
So now we record the TSO factor, a count of how many
real packets a TSO packet will generate, and include
this in all the packet counting routines.
This initial version has a bug in that skb_entail() is
not the correct time to figure out the TSO factor for
the SKB, and tp->mss_tso_factor is not necessarily the
right value for a given SKB. Will fix this up next.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/tcp.h | 17 | ||||
| -rw-r--r-- | include/net/tcp.h | 115 |
2 files changed, 109 insertions, 23 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9c42ac0b0322..ebf15b6a8162 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -201,6 +201,10 @@ struct tcp_sack_block { __u32 end_seq; }; +typedef struct tcp_pcount { + __u32 val; +} tcp_pcount_t; + struct tcp_opt { int tcp_header_len; /* Bytes of tcp header to send */ @@ -250,6 +254,7 @@ struct tcp_opt { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 mss_tso_factor; /* Real packets per TSO packet */ __u16 mss_cache_std; /* Like mss_cache, but without TSO */ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ @@ -274,9 +279,9 @@ struct tcp_opt { __u32 rtt_seq; /* sequence number to update rttvar */ __u32 rto; /* retransmit timeout */ - __u32 packets_out; /* Packets which are "in flight" */ - __u32 left_out; /* Packets which leaved network */ - __u32 retrans_out; /* Retransmitted packets out */ + tcp_pcount_t packets_out; /* Packets which are "in flight" */ + tcp_pcount_t left_out; /* Packets which leaved network */ + tcp_pcount_t retrans_out; /* Retransmitted packets out */ /* @@ -337,9 +342,9 @@ struct tcp_opt { __u8 syn_retries; /* num of allowed syn retries */ __u8 ecn_flags; /* ECN status bits. 
*/ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ - __u32 lost_out; /* Lost packets */ - __u32 sacked_out; /* SACK'd packets */ - __u32 fackets_out; /* FACK'd packets */ + tcp_pcount_t lost_out; /* Lost packets */ + tcp_pcount_t sacked_out;/* SACK'd packets */ + tcp_pcount_t fackets_out;/* FACK'd packets */ __u32 high_seq; /* snd_nxt at onset of congestion */ __u32 retrans_stamp; /* Timestamp of the last retransmit, diff --git a/include/net/tcp.h b/include/net/tcp.h index a5be63c232e3..efda37b84207 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1047,13 +1047,18 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long * is not a big flaw. */ -static __inline__ unsigned int tcp_current_mss(struct sock *sk, int large) +static inline unsigned int tcp_current_mss(struct sock *sk, int large, int *factor) { struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - int mss_now = large && (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode ? - tp->mss_cache : tp->mss_cache_std; + int do_large, mss_now; + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + mss_now = do_large ? tp->mss_cache : tp->mss_cache_std; + if (factor) + *factor = do_large ? tp->mss_tso_factor : 1; if (dst) { u32 mtu = dst_pmtu(dst); @@ -1181,12 +1186,76 @@ struct tcp_skb_cb { __u16 urg_ptr; /* Valid w/URG flags is set. */ __u32 ack_seq; /* Sequence number ACK'd */ + __u32 tso_factor; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) #include <net/tcp_ecn.h> +/* Due to TSO, an SKB can be composed of multiple actual + * packets. To keep these tracked properly, we use this. 
+ */ +static inline int tcp_skb_pcount(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->tso_factor; +} + +static inline void tcp_inc_pcount(tcp_pcount_t *count, struct sk_buff *skb) +{ + count->val += tcp_skb_pcount(skb); +} + +static inline void tcp_inc_pcount_explicit(tcp_pcount_t *count, int amt) +{ + count->val += amt; +} + +static inline void tcp_dec_pcount_explicit(tcp_pcount_t *count, int amt) +{ + count->val -= amt; +} + +static inline void tcp_dec_pcount(tcp_pcount_t *count, struct sk_buff *skb) +{ + count->val -= tcp_skb_pcount(skb); +} + +static inline void tcp_dec_pcount_approx(tcp_pcount_t *count, + struct sk_buff *skb) +{ + if (count->val) { + count->val -= tcp_skb_pcount(skb); + if ((int)count->val < 0) + count->val = 0; + } +} + +static inline __u32 tcp_get_pcount(tcp_pcount_t *count) +{ + return count->val; +} + +static inline void tcp_set_pcount(tcp_pcount_t *count, __u32 val) +{ + count->val = val; +} + +static inline void tcp_packets_out_inc(struct sock *sk, struct tcp_opt *tp, + struct sk_buff *skb) +{ + int orig = tcp_get_pcount(&tp->packets_out); + + tcp_inc_pcount(&tp->packets_out, skb); + if (!orig) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); +} + +static inline void tcp_packets_out_dec(struct tcp_opt *tp, struct sk_buff *skb) +{ + tcp_dec_pcount(&tp->packets_out, skb); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. 
In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1203,7 +1272,9 @@ struct tcp_skb_cb { */ static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp) { - return tp->packets_out - tp->left_out + tp->retrans_out; + return (tcp_get_pcount(&tp->packets_out) - + tcp_get_pcount(&tp->left_out) + + tcp_get_pcount(&tp->retrans_out)); } /* Recalculate snd_ssthresh, we want to set it to: @@ -1304,9 +1375,15 @@ static inline __u32 tcp_current_ssthresh(struct tcp_opt *tp) static inline void tcp_sync_left_out(struct tcp_opt *tp) { - if (tp->sack_ok && tp->sacked_out >= tp->packets_out - tp->lost_out) - tp->sacked_out = tp->packets_out - tp->lost_out; - tp->left_out = tp->sacked_out + tp->lost_out; + if (tp->sack_ok && + (tcp_get_pcount(&tp->sacked_out) >= + tcp_get_pcount(&tp->packets_out) - tcp_get_pcount(&tp->lost_out))) + tcp_set_pcount(&tp->sacked_out, + (tcp_get_pcount(&tp->packets_out) - + tcp_get_pcount(&tp->lost_out))); + tcp_set_pcount(&tp->left_out, + (tcp_get_pcount(&tp->sacked_out) + + tcp_get_pcount(&tp->lost_out))); } extern void tcp_cwnd_application_limited(struct sock *sk); @@ -1315,14 +1392,16 @@ extern void tcp_cwnd_application_limited(struct sock *sk); static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp) { - if (tp->packets_out >= tp->snd_cwnd) { + __u32 packets_out = tcp_get_pcount(&tp->packets_out); + + if (packets_out >= tp->snd_cwnd) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; } else { /* Network starves. 
*/ - if (tp->packets_out > tp->snd_cwnd_used) - tp->snd_cwnd_used = tp->packets_out; + if (tcp_get_pcount(&tp->packets_out) > tp->snd_cwnd_used) + tp->snd_cwnd_used = tcp_get_pcount(&tp->packets_out); if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) tcp_cwnd_application_limited(sk); @@ -1388,7 +1467,7 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && ((nonagle&TCP_NAGLE_CORK) || (!nonagle && - tp->packets_out && + tcp_get_pcount(&tp->packets_out) && tcp_minshall_check(tp)))); } @@ -1398,6 +1477,8 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, unsigned cur_mss, int nonagle) { + int pkts = TCP_SKB_CB(skb)->tso_factor; + /* RFC 1122 - section 4.2.3.4 * * We must queue if @@ -1424,14 +1505,14 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, */ return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - ((tcp_packets_in_flight(tp) < tp->snd_cwnd) || + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); } static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp) { - if (!tp->packets_out && !tp->pending) + if (!tcp_get_pcount(&tp->packets_out) && !tp->pending) tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); } @@ -1464,7 +1545,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk, static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_opt *tp) { - __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1, NULL), tp->nonagle); } static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) @@ -1472,7 +1553,7 @@ static __inline__ int 
tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) struct sk_buff *skb = sk->sk_send_head; return (skb && - tcp_snd_test(tp, skb, tcp_current_mss(sk, 1), + tcp_snd_test(tp, skb, tcp_current_mss(sk, 1, NULL), tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); } @@ -1964,7 +2045,7 @@ static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_opt *tp) { return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache), + (__u32) (tp->mss_cache_std), 2U); } |
