Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  532
1 files changed, 298 insertions, 234 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ce6b95091c0..fce2b80b2bc9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.205 2000/12/13 18:31:48 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.226 2001/03/07 22:00:57 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -137,7 +137,7 @@ static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *s
*
* "len" is invariant segment length, including TCP header.
*/
- len = skb->tail - skb->h.raw;
+ len += skb->data - skb->h.raw;
if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
@@ -378,7 +378,8 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_b
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_enter_quickack_mode(tp);
+ tcp_incr_quickack(tp);
+ tp->ack.ato = TCP_ATO_MIN;
} else {
int m = now - tp->ack.lrcvtime;
@@ -510,7 +511,7 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
}
/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes sucessfully
+ This function is called only, when TCP finishes successfully
i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
*/
void tcp_update_metrics(struct sock *sk)
@@ -1016,7 +1017,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->fackets_out = cnt;
}
}
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_sync_left_out(tp);
tp->reordering = min(tp->reordering, sysctl_tcp_reordering);
tp->ca_state = TCP_CA_Loss;
@@ -1052,6 +1053,15 @@ static inline int tcp_fackets_out(struct tcp_opt *tp)
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
+static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+}
+
+static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp)
+{
+ return tp->packets_out && tcp_skb_timedout(tp, skb_peek(&sk->write_queue));
+}
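
Note: the two helpers above compare how long the head of the write queue has been in flight against the current RTO. A minimal userspace sketch of the same check (not kernel code; names and tick values are illustrative), using unsigned 32-bit arithmetic so the comparison stays correct across timestamp wraparound:

    #include <stdint.h>
    #include <stdio.h>

    static int head_timed_out(uint32_t sent_when, uint32_t now, uint32_t rto)
    {
        /* Unsigned subtraction yields the elapsed ticks even if 'now' wrapped. */
        return (uint32_t)(now - sent_when) > rto;
    }

    int main(void)
    {
        /* Head segment sent just before the 32-bit tick counter wrapped. */
        uint32_t sent = 0xfffffff0u, now = 0x00000020u, rto = 0x30;
        printf("elapsed=%u timed_out=%d\n",
               (unsigned)(now - sent), head_timed_out(sent, now, rto));
        return 0;
    }
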
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
@@ -1157,7 +1167,13 @@ tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp)
if (tcp_fackets_out(tp) > tp->reordering)
return 1;
- /* Trick#3: It is still not OK... But will it be useful to delay
+ /* Trick#3: when we use the RFC2988 timer restart, fast
+ * retransmit can be triggered by a timeout of the queue head.
+ */
+ if (tcp_head_timedout(sk, tp))
+ return 1;
+
+ /* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
if (tp->packets_out <= tp->reordering &&
@@ -1178,8 +1194,10 @@ tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp)
*/
static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend)
{
- if (tp->sacked_out + 1 > tp->packets_out) {
- tp->sacked_out = tp->packets_out ? tp->packets_out - 1 : 0;
+ int holes = min(max(tp->lost_out, 1), tp->packets_out);
+
+ if (tp->sacked_out + holes > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
tcp_update_reordering(tp, tp->packets_out+addend, 0);
}
}
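
Note: the new clamp reserves room for at least one hole (more if segments are already marked lost) when bounding sacked_out by the packets in flight. A small standalone sketch of that arithmetic with made-up counters (plain ints rather than tcp_opt fields):

    #include <stdio.h>

    static int imin(int a, int b) { return a < b ? a : b; }
    static int imax(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
        int packets_out = 10, lost_out = 3, sacked_out = 9;

        /* Assume at least one hole, but never more holes than packets in flight. */
        int holes = imin(imax(lost_out, 1), packets_out);

        if (sacked_out + holes > packets_out) {
            sacked_out = packets_out - holes;   /* the kernel also bumps reordering here */
            printf("clamped sacked_out to %d\n", sacked_out);
        }
        return 0;
    }
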
@@ -1190,7 +1208,7 @@ static void tcp_add_reno_sack(struct tcp_opt *tp)
{
++tp->sacked_out;
tcp_check_reno_reordering(tp, 0);
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_sync_left_out(tp);
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -1198,17 +1216,14 @@ static void tcp_add_reno_sack(struct tcp_opt *tp)
static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked)
{
if (acked > 0) {
- /* One ACK eated lost packet. Must eat! */
- BUG_TRAP(tp->lost_out == 0);
-
- /* The rest eat duplicate ACKs. */
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
if (acked-1 >= tp->sacked_out)
tp->sacked_out = 0;
else
tp->sacked_out -= acked-1;
}
tcp_check_reno_reordering(tp, acked);
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_sync_left_out(tp);
}
static inline void tcp_reset_reno_sack(struct tcp_opt *tp)
@@ -1234,7 +1249,7 @@ tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_se
tp->lost_out++;
}
}
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_sync_left_out(tp);
}
/* Account newly detected lost packet(s) */
@@ -1249,6 +1264,24 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp)
} else {
tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
}
+
+ /* New heuristic: this is possible only after we switched
+ * to restarting the timer each time something is ACKed.
+ * Hence, we can detect timed-out packets during fast
+ * retransmit without falling back to slow start.
+ */
+ if (tcp_head_timedout(sk, tp)) {
+ struct sk_buff *skb;
+
+ for_retrans_queue(skb, sk, tp) {
+ if (tcp_skb_timedout(tp, skb) &&
+ !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out++;
+ }
+ }
+ tcp_sync_left_out(tp);
+ }
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -1490,7 +1523,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
/* D. Synchronize left_out to current state. */
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_sync_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
@@ -1516,8 +1549,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
case TCP_CA_Disorder:
tcp_try_undo_dsack(sk, tp);
- tp->undo_marker = 0;
- tp->ca_state = TCP_CA_Open;
+ if (!tp->undo_marker ||
+ /* In the SACK case do not go to Open, so that undo
+ * can keep catching all duplicate ACKs. */
+ IsReno(tp) || tp->snd_una != tp->high_seq) {
+ tp->undo_marker = 0;
+ tp->ca_state = TCP_CA_Open;
+ }
break;
case TCP_CA_Recovery:
@@ -1544,8 +1582,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
break;
case TCP_CA_Loss:
- if (flag & FLAG_ACKED)
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ if (flag&FLAG_DATA_ACKED)
+ tp->retransmits = 0;
if (!tcp_try_undo_loss(sk, tp)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
@@ -1593,7 +1631,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->ca_state = TCP_CA_Recovery;
}
- if (is_dupack)
+ if (is_dupack || tcp_head_timedout(sk, tp))
tcp_update_scoreboard(sk, tp);
tcp_cwnd_down(tp);
tcp_xmit_retransmit_queue(sk);
@@ -1613,16 +1651,18 @@ static void tcp_ack_saw_tstamp(struct tcp_opt *tp, int flag)
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+ *
+ * Changed: reset backoff as soon as we see the first valid sample.
+ * If we do not, we get a strongly overestimated rto. With timestamps,
+ * samples are accepted even from very old segments: e.g., when rtt=1
+ * increases to 8, we retransmit 5 times, and when the delayed answer
+ * arrives after 8 seconds, rto becomes 120 seconds! If at least one
+ * segment in the window is lost... Voila. --ANK (010210)
*/
seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
tcp_rtt_estimator(tp, seq_rtt);
tcp_set_rto(tp);
- if (tp->backoff) {
- if (!tp->retransmits || !(flag & FLAG_RETRANS_DATA_ACKED))
- tp->backoff = 0;
- else
- tp->rto <<= tp->backoff;
- }
+ tp->backoff = 0;
tcp_bound_rto(tp);
}
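
Note: the deleted branches rescaled a fresh RTT estimate by the stale backoff, which is exactly what the rtt=1-to-8 scenario in the comment blows up on. A rough standalone illustration of the two policies, with assumed numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned rto = 8;       /* seconds, after the RTT estimator caught up */
        unsigned backoff = 5;   /* five retransmissions already doubled the timer */

        /* Old policy: keep scaling the fresh estimate by the stale backoff. */
        unsigned old_rto = rto << backoff;           /* 8 << 5 = 256s */
        if (old_rto > 120) old_rto = 120;            /* clamped to TCP_RTO_MAX-like bound */

        /* New policy: a valid RTT sample means the path is alive, drop the backoff. */
        unsigned new_rto = rto;

        printf("old=%us new=%us\n", old_rto, new_rto);
        return 0;
    }
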
@@ -1642,15 +1682,7 @@ static void tcp_ack_no_tstamp(struct tcp_opt *tp, u32 seq_rtt, int flag)
tcp_rtt_estimator(tp, seq_rtt);
tcp_set_rto(tp);
- if (tp->backoff) {
- /* To relax it? We have valid sample as soon as we are
- * here. Why not to clear backoff?
- */
- if (!tp->retransmits)
- tp->backoff = 0;
- else
- tp->rto <<= tp->backoff;
- }
+ tp->backoff = 0;
tcp_bound_rto(tp);
}
@@ -1684,15 +1716,11 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
} else
tp->snd_cwnd_cnt++;
}
+ tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
- * RFC2988 recommends (and BSD does) to restart timer to now+rto,
- * which is certainly wrong and effectively means that
- * rto includes one more _full_ rtt.
- *
- * For details see:
- * ftp://ftp.inr.ac.ru:/ip-routing/README.rto
+ * RFC2988 recommends restarting the timer to now+rto.
*/
static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
@@ -1700,12 +1728,7 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
if (tp->packets_out==0) {
tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
} else {
- struct sk_buff *skb = skb_peek(&sk->write_queue);
- __u32 when = tp->rto + tp->rttvar - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
-
- if ((__s32)when < (__s32)tp->rttvar)
- when = tp->rttvar;
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, min(when, TCP_RTO_MAX));
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
}
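
Note: with this change the retransmit timer is simply re-armed for now + rto whenever the ACK leaves data outstanding, as RFC 2988 recommends, and cleared when nothing is in flight. A stripped-down sketch of that decision; the two timer functions here are placeholders, not the kernel API:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the real timer calls (illustrative only). */
    static void clear_retrans_timer(void)            { puts("timer cleared"); }
    static void arm_retrans_timer(uint32_t expires)  { printf("timer at %u\n", expires); }

    static void ack_packets_out(unsigned packets_out, uint32_t now, uint32_t rto)
    {
        if (packets_out == 0)
            clear_retrans_timer();
        else
            arm_retrans_timer(now + rto);   /* RFC 2988: restart to now + RTO */
    }

    int main(void)
    {
        ack_packets_out(3, 1000, 240);
        ack_packets_out(0, 1000, 240);
        return 0;
    }
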
@@ -1857,12 +1880,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_opt *tp,
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
- if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
-#ifdef TCP_FORMAL_WINDOW
- tcp_receive_window(tp) &&
-#endif
- !tp->urg_data)
- tcp_fast_path_on(tp);
+ tcp_fast_path_check(sk, tp);
if (nwin > tp->max_window) {
tp->max_window = nwin;
@@ -1873,16 +1891,6 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_opt *tp,
tp->snd_una = ack;
-#ifdef TCP_DEBUG
- if (before(tp->snd_una + tp->snd_wnd, tp->snd_nxt)) {
- if (tp->snd_nxt-(tp->snd_una + tp->snd_wnd) >= (1<<tp->snd_wscale)
- && net_ratelimit())
- printk(KERN_DEBUG "TCP: peer %u.%u.%u.%u:%u/%u shrinks window %u:%u:%u. Bad, what else can I say?\n",
- NIPQUAD(sk->daddr), htons(sk->dport), sk->num,
- tp->snd_una, tp->snd_wnd, tp->snd_nxt);
- }
-#endif
-
return flag;
}
@@ -2224,7 +2232,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
tcp_schedule_ack(tp);
sk->shutdown |= RCV_SHUTDOWN;
@@ -2506,10 +2513,27 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
+static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+{
+ return (int)skb->truesize <= sk->forward_alloc ||
+ tcp_mem_schedule(sk, skb->truesize, 1);
+}
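
Note: tcp_rmem_schedule() feeds the admission check that tcp_data_queue() now performs itself: queue only if the memory is already charged or can be scheduled, otherwise prune once and retry before dropping. A compact sketch of that pattern, with booleans standing in for the real accounting:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the real receive-memory accounting. */
    static bool over_rcvbuf  = true;
    static bool can_schedule = false;

    static int prune_queue(void)
    {
        /* Pretend collapsing/reclaiming freed enough memory. */
        over_rcvbuf  = false;
        can_schedule = true;
        return 0;
    }

    static bool admit_segment(void)
    {
        if (over_rcvbuf || !can_schedule) {
            if (prune_queue() < 0 || !can_schedule)
                return false;               /* drop the segment */
        }
        return true;                        /* safe to queue */
    }

    int main(void)
    {
        printf("admitted=%d\n", admit_segment());
        return 0;
    }
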
+
+static int tcp_prune_queue(struct sock *sk);
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
+ struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int eaten = 0;
+ int eaten = -1;
+
+ th = skb->h.th;
+ __skb_pull(skb, th->doff*4);
+
+ if (skb->len == 0 && !th->fin)
+ goto drop;
+
+ TCP_ECN_accept_cwr(tp, skb);
if (tp->dsack) {
tp->dsack = 0;
@@ -2535,26 +2559,32 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__set_current_state(TASK_RUNNING);
local_bh_enable();
- if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
+ if (skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
sk->err = EFAULT;
sk->error_report(sk);
}
local_bh_disable();
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
- eaten = (chunk == skb->len && !skb->h.th->fin);
+ eaten = (chunk == skb->len && !th->fin);
}
- if (!eaten) {
+ if (eaten <= 0) {
queue_and_out:
+ if (eaten < 0 &&
+ (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
+ !tcp_rmem_schedule(sk, skb))) {
+ if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
+ goto drop;
+ }
tcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->receive_queue, skb);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->len)
tcp_event_data_recv(sk, tp, skb);
- if(skb->h.th->fin)
- tcp_fin(skb, sk, skb->h.th);
+ if(th->fin)
+ tcp_fin(skb, sk, th);
if (skb_queue_len(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
@@ -2569,15 +2599,9 @@ queue_and_out:
if(tp->num_sacks)
tcp_sack_remove(tp);
- /* Turn on fast path. */
- if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
-#ifdef TCP_FORMAL_WINDOW
- tcp_receive_window(tp) &&
-#endif
- !tp->urg_data)
- tcp_fast_path_on(tp);
+ tcp_fast_path_check(sk, tp);
- if (eaten) {
+ if (eaten > 0) {
__kfree_skb(skb);
} else if (!sk->dead)
sk->data_ready(sk, 0);
@@ -2592,17 +2616,12 @@ queue_and_out:
out_of_window:
tcp_schedule_ack(tp);
+drop:
__kfree_skb(skb);
return;
}
- /* Out of window. F.e. zero window probe.
- *
- * Note: it is highly possible that we may open window and enqueue
- * this segment now. However, this will be known only after we queue
- * it, which will result in queue full of successive 1 byte BSD
- * window probes, it is SWS in fact. So, always reject it and send ACK.
- */
+ /* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt+tcp_receive_window(tp)))
goto out_of_window;
@@ -2626,6 +2645,12 @@ out_of_window:
TCP_ECN_check_ce(tp, skb);
+ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
+ !tcp_rmem_schedule(sk, skb)) {
+ if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
+ goto drop;
+ }
+
/* Disable header prediction. */
tp->pred_flags = 0;
tcp_schedule_ack(tp);
@@ -2704,52 +2729,142 @@ add_sack:
}
}
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff *head,
+ struct sk_buff *tail, u32 start, u32 end)
+{
+ struct sk_buff *skb;
-static void tcp_collapse_queue(struct sock *sk, struct sk_buff_head *q)
+ /* First, check that the queue is collapsible and find
+ * the point where collapsing can be useful. */
+ for (skb = head; skb != tail; ) {
+ /* No new bits? It is possible on ofo queue. */
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ struct sk_buff *next = skb->next;
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(TCPRcvCollapsed);
+ skb = next;
+ continue;
+ }
+
+ /* The first skb to collapse is:
+ * - not SYN/FIN and
+ * - bloated or contains data before "start" or
+ * overlaps to the next one.
+ */
+ if (!skb->h.th->syn && !skb->h.th->fin &&
+ (tcp_win_from_space(skb->truesize) > skb->len ||
+ before(TCP_SKB_CB(skb)->seq, start) ||
+ (skb->next != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+ break;
+
+ /* Decided to skip this, advance start seq. */
+ start = TCP_SKB_CB(skb)->end_seq;
+ skb = skb->next;
+ }
+ if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+ return;
+
+ while (before(start, end)) {
+ struct sk_buff *nskb;
+ int header = skb_headroom(skb);
+ int copy = (PAGE_SIZE - sizeof(struct sk_buff) -
+ sizeof(struct skb_shared_info) - header - 31)&~15;
+
+ /* Too big header? This can happen with IPv6. */
+ if (copy < 0)
+ return;
+ if (end-start < copy)
+ copy = end-start;
+ nskb = alloc_skb(copy+header, GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, header);
+ memcpy(nskb->head, skb->head, header);
+ nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
+ nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
+ nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+ __skb_insert(nskb, skb->prev, skb, skb->list);
+ tcp_set_owner_r(nskb, sk);
+
+ /* Copy data, releasing collapsed skbs. */
+ while (copy > 0) {
+ int offset = start - TCP_SKB_CB(skb)->seq;
+ int size = TCP_SKB_CB(skb)->end_seq - start;
+
+ if (offset < 0) BUG();
+ if (size > 0) {
+ size = min(copy, size);
+ if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+ BUG();
+ TCP_SKB_CB(nskb)->end_seq += size;
+ copy -= size;
+ start += size;
+ }
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ struct sk_buff *next = skb->next;
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(TCPRcvCollapsed);
+ skb = next;
+ if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+ return;
+ }
+ }
+ }
+}
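
Note: each replacement skb is sized so headroom plus data fit in one page after the skb bookkeeping, rounded down to a 16-byte multiple. A tiny sketch of just that arithmetic; the struct sizes below are assumptions, not the real layouts:

    #include <stdio.h>

    int main(void)
    {
        int page_size  = 4096;
        int sizeof_skb = 160;   /* assumed sizeof(struct sk_buff) */
        int sizeof_shi = 16;    /* assumed sizeof(struct skb_shared_info) */
        int header     = 128;   /* headroom copied from the old skb */

        /* Round down to a 16-byte boundary; a negative result means the
         * headers alone do not fit (possible with IPv6), so collapsing
         * is abandoned. */
        int copy = (page_size - sizeof_skb - sizeof_shi - header - 31) & ~15;

        printf("copy=%d\n", copy);
        return 0;
    }
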
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
{
- struct sk_buff *skb = skb_peek(q);
- struct sk_buff *skb_next;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+ struct sk_buff *head;
+ u32 start, end;
- while (skb &&
- skb != (struct sk_buff *)q &&
- (skb_next = skb->next) != (struct sk_buff *)q) {
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- struct tcp_skb_cb *scb_next = TCP_SKB_CB(skb_next);
+ if (skb == NULL)
+ return;
- if (scb->end_seq == scb_next->seq &&
- skb_tailroom(skb) >= skb_next->len &&
-#define TCP_DONT_COLLAPSE (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN)
- !(tcp_flag_word(skb->h.th)&TCP_DONT_COLLAPSE) &&
- !(tcp_flag_word(skb_next->h.th)&TCP_DONT_COLLAPSE)) {
- /* OK to collapse two skbs to one */
- memcpy(skb_put(skb, skb_next->len), skb_next->data, skb_next->len);
- __skb_unlink(skb_next, skb_next->list);
- scb->end_seq = scb_next->end_seq;
- __kfree_skb(skb_next);
- NET_INC_STATS_BH(TCPRcvCollapsed);
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ head = skb;
+
+ for (;;) {
+ skb = skb->next;
+
+ /* A segment is terminated when we see a gap or when
+ * we are at the end of the whole queue. */
+ if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+ tcp_collapse(sk, head, skb, start, end);
+ head = skb;
+ if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+ break;
+ /* Start new segment */
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
} else {
- /* Lots of spare tailroom, reallocate this skb to trim it. */
- if (tcp_win_from_space(skb->truesize) > skb->len &&
- skb_tailroom(skb) > sizeof(struct sk_buff) + 16) {
- struct sk_buff *nskb;
-
- nskb = skb_copy_expand(skb, skb_headroom(skb), 0, GFP_ATOMIC);
- if (nskb) {
- tcp_set_owner_r(nskb, sk);
- memcpy(nskb->data-skb_headroom(skb),
- skb->data-skb_headroom(skb),
- skb_headroom(skb));
- __skb_append(skb, nskb);
- __skb_unlink(skb, skb->list);
- __kfree_skb(skb);
- }
- }
- skb = skb_next;
+ if (before(TCP_SKB_CB(skb)->seq, start))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+ end = TCP_SKB_CB(skb)->end_seq;
}
}
}
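
Note: the walk above is interval merging: grow [start, end) while the next skb overlaps it and hand each maximal run to tcp_collapse(). A userspace sketch of the same run detection over a sorted array of (seq, end_seq) pairs (sequence-number wraparound ignored for brevity):

    #include <stdio.h>

    struct seg { unsigned seq, end_seq; };

    int main(void)
    {
        /* Out-of-order queue contents, already sorted by seq. */
        struct seg q[] = { {100, 200}, {150, 250}, {250, 300}, {400, 450} };
        int n = 4, i = 0;

        while (i < n) {
            unsigned start = q[i].seq, end = q[i].end_seq;
            int head = i;

            /* Grow the run while the next segment touches or overlaps it. */
            while (++i < n && q[i].seq <= end && q[i].end_seq >= start) {
                if (q[i].end_seq > end) end = q[i].end_seq;
                if (q[i].seq < start)   start = q[i].seq;
            }
            printf("collapse segments %d..%d covering [%u,%u)\n",
                   head, i - 1, start, end);
        }
        return 0;
    }
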
-/* Clean the out_of_order queue if we can, trying to get
+/* Reduce allocated memory if we can, trying to get
* the socket within its memory limits again.
*
* Return less than zero if we should start dropping frames
@@ -2769,8 +2884,10 @@ static int tcp_prune_queue(struct sock *sk)
else if (tcp_memory_pressure)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);
- tcp_collapse_queue(sk, &sk->receive_queue);
- tcp_collapse_queue(sk, &tp->out_of_order_queue);
+ tcp_collapse_ofo_queue(sk);
+ tcp_collapse(sk, sk->receive_queue.next,
+ (struct sk_buff*)&sk->receive_queue,
+ tp->copied_seq, tp->rcv_nxt);
tcp_mem_reclaim(sk);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
@@ -2804,59 +2921,10 @@ static int tcp_prune_queue(struct sock *sk)
NET_INC_STATS_BH(RcvPruned);
/* Massive buffer overcommit. */
+ tp->pred_flags = 0;
return -1;
}
-static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb)
-{
- return (int)skb->truesize <= sk->forward_alloc ||
- tcp_mem_schedule(sk, skb->truesize, 1);
-}
-
-/*
- * This routine handles the data. If there is room in the buffer,
- * it will be have already been moved into it. If there is no
- * room, then we will just have to discard the packet.
- */
-
-static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
-{
- struct tcphdr *th;
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- th = skb->h.th;
- skb_pull(skb, th->doff*4);
- skb_trim(skb, len - (th->doff*4));
-
- if (skb->len == 0 && !th->fin)
- goto drop;
-
- TCP_ECN_accept_cwr(tp, skb);
-
- /*
- * If our receive queue has grown past its limits shrink it.
- * Make sure to do this before moving rcv_nxt, otherwise
- * data might be acked for that we don't have enough room.
- */
- if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
- !tcp_rmem_schedule(sk, skb)) {
- if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
- goto drop;
- }
-
- tcp_data_queue(sk, skb);
-
-#ifdef TCP_DEBUG
- if (before(tp->rcv_nxt, tp->copied_seq)) {
- printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
- tp->rcv_nxt = tp->copied_seq;
- }
-#endif
- return;
-
-drop:
- __kfree_skb(skb);
-}
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
@@ -2937,7 +3005,7 @@ static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
- tcp_write_xmit(sk))
+ tcp_write_xmit(sk, tp->nonagle))
tcp_check_probe_timer(sk, tp);
}
@@ -3009,6 +3077,19 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
if (after(tp->copied_seq, ptr))
return;
+ /* Do not replay urg ptr.
+ *
+ * NOTE: interesting situation not covered by specs.
+ * A misbehaving sender may send an urg ptr pointing to a segment
+ * which we already have in the ofo queue. We are not able to fetch
+ * such data and will stay in TCP_URG_NOTYET until it is eaten
+ * by recvmsg(). It seems we are not obliged to handle such wicked
+ * situations, but it is worth thinking about the possibility of
+ * DoSes using some hypothetical application-level deadlock.
+ */
+ if (before(ptr, tp->rcv_nxt))
+ return;
+
/* Do we already have a newer (or duplicate) urgent pointer? */
if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
@@ -3027,9 +3108,27 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the semantics of SIOCATMARK (and thus sockatmark())
+ *
+ * NOTE. Double Dutch. Rendering to plain English: the author of the
+ * comment above did something like send("A", MSG_OOB); send("B", MSG_OOB);
+ * and expected both A and B to disappear from the stream. This is _wrong_.
+ * Though this happens in BSD with high probability, it is not guaranteed.
+ * Any application relying on this is buggy. Note also that the fix "works"
+ * only in this artificial test. Insert some normal data between A and B
+ * and we will diverge from BSD again. Verdict: it is better to remove it,
+ * to trap buggy users.
*/
- if (tp->urg_seq == tp->copied_seq)
- tp->copied_seq++; /* Move the copied sequence on correctly */
+ if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+ !sk->urginline &&
+ tp->copied_seq != tp->rcv_nxt) {
+ struct sk_buff *skb = skb_peek(&sk->receive_queue);
+ tp->copied_seq++;
+ if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+ __skb_unlink(skb, skb->list);
+ __kfree_skb(skb);
+ }
+ }
+
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
@@ -3038,7 +3137,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
}
/* This is the 'fast' part of urgent handling. */
-static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
+static inline void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -3048,11 +3147,14 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
/* Do we wait for any urgent data? - normally not... */
if (tp->urg_data == TCP_URG_NOTYET) {
- u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4) - th->syn;
/* Is the urgent pointer pointing into this packet? */
- if (ptr < len) {
- tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th);
+ if (ptr < skb->len) {
+ u8 tmp;
+ if (skb_copy_bits(skb, ptr, &tmp, 1))
+ BUG();
+ tp->urg_data = TCP_URG_VALID | tmp;
if (!sk->dead)
sk->data_ready(sk,0);
}
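
Note: the fast path now computes the urgent byte's offset from the start of the TCP header (which is where skb->data still points at this stage) and reads it with skb_copy_bits(), so non-linear skbs work too. A small sketch of the offset arithmetic with example values in place of the skb fields:

    #include <stdio.h>

    int main(void)
    {
        unsigned urg_seq = 1010;  /* sequence number of the urgent byte */
        unsigned seg_seq = 1000;  /* th->seq of this segment (host order) */
        unsigned doff    = 5;     /* TCP header length in 32-bit words */
        unsigned syn     = 0;     /* a SYN consumes one sequence number */
        unsigned skb_len = 60;    /* header + payload bytes in this skb */

        /* Offset of the urgent byte from the start of the TCP header. */
        unsigned ptr = urg_seq - seg_seq + doff * 4 - syn;

        if (ptr < skb_len)
            printf("urgent byte is %u bytes into the segment\n", ptr);
        return 0;
    }
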
@@ -3067,9 +3169,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
local_bh_enable();
if (skb->ip_summed==CHECKSUM_UNNECESSARY)
- err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk);
+ err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
else
- err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen);
+ err = skb_copy_and_csum_datagram_iovec(skb, hlen, tp->ucopy.iov);
if (!err) {
update:
@@ -3117,32 +3219,6 @@ tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
- * [ NOTE: actually, it was made incorrectly and nobody ever noticed
- * this! Reason is clear: 1. Correct senders do not send
- * to zero window. 2. Even if a sender sends to zero window,
- * nothing terrible occurs.
- *
- * For now I cleaned this and fast path is really always disabled,
- * when window is zero, but I would be more happy to remove these
- * checks. Code will be only cleaner and _faster_. --ANK
- *
- * Later note. I've just found that slow path also accepts
- * out of window segments, look at tcp_sequence(). So...
- * it is the last argument: I repair all and comment out
- * repaired code by TCP_FORMAL_WINDOW.
- * [ I remember one rhyme from a chidren's book. (I apologize,
- * the trasnlation is not rhymed 8)): people in one (jewish) village
- * decided to build sauna, but divided to two parties.
- * The first one insisted that battens should not be dubbed,
- * another objected that foots will suffer of splinters,
- * the first fended that dubbed wet battens are too slippy
- * and people will fall and it is much more serious!
- * Certaiinly, all they went to rabbi.
- * After some thinking, he judged: "Do not be lazy!
- * Certainly, dub the battens! But put them by dubbed surface down."
- * ]
- * ]
- *
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
@@ -3348,7 +3424,7 @@ slow_path:
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
TCP_INC_STATS_BH(TcpInErrs);
NET_INC_STATS_BH(TCPAbortOnSyn);
tcp_reset(sk);
@@ -3360,10 +3436,10 @@ step5:
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Process urgent data. */
- tcp_urg(sk, th, len);
+ tcp_urg(sk, skb, th);
/* step 7: process the segment text */
- tcp_data(skb, sk, len);
+ tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
@@ -3452,8 +3528,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
tp->snd_wnd = ntohs(th->window);
tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
- tp->syn_seq = TCP_SKB_CB(skb)->seq;
- tp->fin_seq = TCP_SKB_CB(skb)->seq;
if (tp->wscale_ok == 0) {
tp->snd_wscale = tp->rcv_wscale = 0;
@@ -3488,7 +3562,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
- * is initilized. */
+ * is initialized. */
tp->copied_seq = tp->rcv_nxt;
mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -3498,7 +3572,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- if (tp->write_pending || tp->defer_accept) {
+ if (tp->write_pending || tp->defer_accept || tp->ack.pingpong) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -3508,6 +3582,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
tcp_schedule_ack(tp);
tp->ack.lrcvtime = tcp_time_stamp;
+ tp->ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(tp);
tcp_enter_quickack_mode(tp);
tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
@@ -3683,21 +3759,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* step 4:
*
- * Check for a SYN, and ensure it matches the SYN we were
- * first sent. We have to handle the rather unusual (but valid)
- * sequence that KA9Q derived products may generate of
- *
- * SYN
- * SYN|ACK Data
- * ACK (lost)
- * SYN|ACK Data + More Data
- * .. we must ACK not RST...
- *
- * We keep syn_seq as the sequence space occupied by the
- * original syn.
+ * Check for a SYN in window.
*/
-
- if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(TCPAbortOnSyn);
tcp_reset(sk);
return 1;
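
Note: the replacement test rejects any SYN whose sequence number is not below rcv_nxt, using the usual mod-2^32 comparison. A standalone sketch of that check, with a before() equivalent to the kernel macro:

    #include <stdint.h>
    #include <stdio.h>

    /* Mod-2^32 sequence comparison, as in the kernel's before()/after(). */
    static int before(uint32_t seq1, uint32_t seq2)
    {
        return (int32_t)(seq1 - seq2) < 0;
    }

    static int syn_in_window(int syn, uint32_t seq, uint32_t rcv_nxt)
    {
        /* A SYN at or above rcv_nxt is in the window and forces a reset. */
        return syn && !before(seq, rcv_nxt);
    }

    int main(void)
    {
        printf("%d\n", syn_in_window(1, 2000, 1500));  /* 1: reset the connection */
        printf("%d\n", syn_in_window(1, 1000, 1500));  /* 0: old duplicate, ignore */
        return 0;
    }
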
@@ -3806,13 +3870,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
step6:
/* step 6: check the URG bit */
- tcp_urg(sk, th, len);
+ tcp_urg(sk, skb, th);
/* step 7: process the segment text */
switch (sk->state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
- if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
@@ -3830,7 +3894,7 @@ step6:
}
/* Fall through */
case TCP_ESTABLISHED:
- tcp_data(skb, sk, len);
+ tcp_data_queue(sk, skb);
queued = 1;
break;
}