Diffstat (limited to 'net/ipv4/tcp_output.c')

 -rw-r--r--   net/ipv4/tcp_output.c | 162

 1 file changed, 91 insertions, 71 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05..9c34b97d365d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,21 @@
 
 #include <trace/events/tcp.h>
 
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+	u64 val = tcp_clock_ns();
+
+	if (val > tp->tcp_clock_cache)
+		tp->tcp_clock_cache = val;
+
+	val = div_u64(val, NSEC_PER_USEC);
+	if (val > tp->tcp_mstamp)
+		tp->tcp_mstamp = val;
+}
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
@@ -179,21 +194,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
-
-u32 tcp_default_init_rwnd(u32 mss)
-{
-	/* Initial receive window should be twice of TCP_INIT_CWND to
-	 * enable proper sending of new unsent data during fast recovery
-	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
-	 * limit when mss is larger than 1460.
-	 */
-	u32 init_rwnd = TCP_INIT_CWND * 2;
-
-	if (mss > 1460)
-		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
-	return init_rwnd;
-}
-
 /* Determine a window scaling and initial window to offer.
  * Based on the assumption that the given amount of space
  * will be offered. Store the results in the tp structure.
@@ -228,7 +228,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
 		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
 	else
-		(*rcv_wnd) = space;
+		(*rcv_wnd) = min_t(u32, space, U16_MAX);
+
+	if (init_rcv_wnd)
+		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
 
 	(*rcv_wscale) = 0;
 	if (wscale_ok) {
@@ -241,11 +244,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 			(*rcv_wscale)++;
 		}
 	}
-
-	if (!init_rcv_wnd) /* Use default unless specified otherwise */
-		init_rcv_wnd = tcp_default_init_rwnd(mss);
-	*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
-
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
 }
@@ -977,28 +975,28 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+				      u64 prior_wstamp)
 {
-	u64 len_ns;
-	u32 rate;
+	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!tcp_needs_internal_pacing(sk))
-		return;
-	rate = sk->sk_pacing_rate;
-	if (!rate || rate == ~0U)
-		return;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+	if (sk->sk_pacing_status != SK_PACING_NONE) {
+		unsigned long rate = sk->sk_pacing_rate;
 
-	len_ns = (u64)skb->len * NSEC_PER_SEC;
-	do_div(len_ns, rate);
-	hrtimer_start(&tcp_sk(sk)->pacing_timer,
-		      ktime_add_ns(ktime_get(), len_ns),
-		      HRTIMER_MODE_ABS_PINNED_SOFT);
-	sock_hold(sk);
-}
+		/* Original sch_fq does not pace first 10 MSS
+		 * Note that tp->data_segs_out overflows after 2^32 packets,
+		 * this is a minor annoyance.
+		 */
+		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
 
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
-	skb->skb_mstamp = tp->tcp_mstamp;
+			/* take into account OS jitter */
+			len_ns -= min_t(u64, len_ns / 2, credit);
+			tp->tcp_wstamp_ns += len_ns;
+		}
+	}
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
 
@@ -1025,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	struct sk_buff *oskb = NULL;
 	struct tcp_md5sig_key *md5;
 	struct tcphdr *th;
+	u64 prior_wstamp;
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1045,7 +1044,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
-	skb->skb_mstamp = tp->tcp_mstamp;
+
+	prior_wstamp = tp->tcp_wstamp_ns;
+	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1137,7 +1140,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
 		tp->bytes_sent += skb->len - tcp_header_size;
-		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1149,8 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-	/* Our usage of tstamp should remain private */
-	skb->tstamp = 0;
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1163,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(tp, oskb);
+		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1698,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 {
 	u32 bytes, segs;
 
-	bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
-		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+	bytes = min_t(unsigned long,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift,
+		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
 
 	/* Goal is to send at least one packet per ms,
 	 * not one big TSO packet every 100 ms.
@@ -1966,7 +1968,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
 		goto send_now;
@@ -2172,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
-static bool tcp_pacing_check(const struct sock *sk)
+static bool tcp_pacing_check(struct sock *sk)
 {
-	return tcp_needs_internal_pacing(sk) &&
-	       hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_needs_internal_pacing(sk))
+		return false;
+
+	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+		return false;
+
+	if (!hrtimer_is_queued(&tp->pacing_timer)) {
+		hrtimer_start(&tp->pacing_timer,
+			      ns_to_ktime(tp->tcp_wstamp_ns),
+			      HRTIMER_MODE_ABS_PINNED_SOFT);
+		sock_hold(sk);
+	}
+	return true;
 }
 
 /* TCP Small Queues :
@@ -2192,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk)
 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 				  unsigned int factor)
 {
-	unsigned int limit;
+	unsigned long limit;
 
-	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
-	limit = min_t(u32, limit,
+	limit = max_t(unsigned long,
+		      2 * skb->truesize,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift);
+	limit = min_t(unsigned long, limit,
 		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
@@ -2304,18 +2321,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+			skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+			goto repair; /* Skip network transmission */
+		}
+
 		if (tcp_pacing_check(sk))
 			break;
 
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
-			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(tp, skb);
-			goto repair; /* Skip network transmission */
-		}
-
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
 			if (push_one == 2)
@@ -2437,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 	if (rto_delta_us > 0)
 		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+			     TCP_RTO_MAX, NULL);
 	return true;
 }
 
@@ -2887,7 +2905,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		} tcp_skb_tsorted_restore(skb);
 
 		if (!err) {
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
@@ -3002,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  inet_csk(sk)->icsk_rto,
-						  TCP_RTO_MAX);
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					     inet_csk(sk)->icsk_rto,
+					     TCP_RTO_MAX,
+					     skb);
 	}
 }
 
@@ -3205,10 +3224,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
-		skb->skb_mstamp = tcp_clock_us();
+		skb->skb_mstamp_ns = tcp_clock_ns();
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3424,7 +3443,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	syn->skb_mstamp = syn_data->skb_mstamp;
+	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
 
 	/* Now full SYN+DATA was cloned and sent (or not),
 	 * remove the SYN from the original skb (syn_data)
@@ -3734,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
 			icsk->icsk_probes_out = 1;
 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
 	}
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				  tcp_probe0_when(sk, probe_max),
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+			     tcp_probe0_when(sk, probe_max),
+			     TCP_RTO_MAX,
+			     NULL);
 }
 
 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
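
Note on the change above: instead of arming a pacing hrtimer for every transmitted packet, the new code keeps an earliest departure time in tp->tcp_wstamp_ns and only consults the hrtimer in tcp_pacing_check() when that time lies in the future. The stand-alone, user-space sketch below only illustrates the arithmetic that tcp_update_skb_after_send() performs on tp->tcp_wstamp_ns; the struct, function name and numeric values are hypothetical and not kernel API, and the real code uses kernel helpers (div64_ul, min_t) instead of plain C operators.

/* Sketch of the EDT (earliest departure time) update, assuming a
 * simplified pacing state; mirrors the arithmetic in the diff only.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct pacing_state {
	uint64_t tcp_wstamp_ns;	/* earliest departure time of next packet */
	uint64_t data_segs_out;	/* data segments sent so far */
};

static void advance_departure_time(struct pacing_state *st,
				   uint64_t prior_wstamp,
				   uint32_t skb_len,
				   unsigned long pacing_rate)
{
	uint64_t len_ns, credit;

	/* As in the diff: a rate of 0 or ~0UL means no pacing, and the
	 * first 10 MSS are not paced (matching sch_fq behaviour).
	 */
	if (!pacing_rate || pacing_rate == ~0UL || st->data_segs_out < 10)
		return;

	/* Time this packet occupies on the wire at the pacing rate. */
	len_ns = (uint64_t)skb_len * NSEC_PER_SEC / pacing_rate;

	/* Credit for time already elapsed since the previous departure
	 * (OS jitter), capped at half of the new slot.
	 */
	credit = st->tcp_wstamp_ns - prior_wstamp;
	if (credit > len_ns / 2)
		credit = len_ns / 2;

	st->tcp_wstamp_ns += len_ns - credit;
}

int main(void)
{
	struct pacing_state st = { .tcp_wstamp_ns = 1000000, .data_segs_out = 20 };

	/* 1460-byte packet at 1,250,000 bytes/s (10 Mbit/s): ~1.168 ms slot,
	 * so the next departure moves from 1,000,000 ns to 2,168,000 ns.
	 */
	advance_departure_time(&st, 1000000, 1460, 1250000);
	printf("next departure: %llu ns\n",
	       (unsigned long long)st.tcp_wstamp_ns);
	return 0;
}

In the kernel, the credit term comes from __tcp_transmit_skb() advancing tp->tcp_wstamp_ns to tp->tcp_clock_cache when real time has already passed the scheduled departure, so late sends are partially forgiven rather than compounding delay.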
