From 10bc956350e6821a1a9757065962f1924649b12d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 2 Sep 2004 19:01:02 -0700 Subject: [NET]: Add reference counting to neigh_parms. I've added a refcnt on neigh_parms as well as a dead flag. The latter is checked under the tbl_lock before adding a neigh entry to the hash table. The non-trivial bit of the patch is the first chunk of net/core/neighbour.c. I removed that line because not doing so would mean that I have to drop the reference to the parms right there. That would've led to race conditions since many places dereference neigh->parms without holding locks. It's also unnecessary to reset n->parms since we're no longer in a hurry to see it go due to the new ref counting. You'll also notice that I've put all dereferences of dev->*_ptr under the rcu_read_lock(). Without this we may get a neigh_parms that's already been released. Incidentally a lot of these places were racy even before the RCU change. For example, in the IPv6 case neigh->parms may be set to a value that's just been released. Finally in order to make sure that all stale entries are purged as quickly as possible I've added neigh_ifdown/arp_ifdown calls after every neigh_parms_release call. In many cases we now have multiple calls to neigh_ifdown in the shutdown path. I didn't remove the earlier calls because there may be hidden dependencies for them to be there. Once the respective maintainers have looked at them we can probably remove most of them. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/atm/clip.c | 21 ++++++++++++++++++--- net/core/neighbour.c | 24 +++++++++++++++++++----- net/decnet/dn_dev.c | 1 + net/decnet/dn_neigh.c | 18 ++++++++++++++---- net/ipv4/arp.c | 23 ++++++++++++++++------- net/ipv4/devinet.c | 6 +++++- net/ipv6/addrconf.c | 1 + net/ipv6/ndisc.c | 18 ++++++++++++++---- 8 files changed, 88 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 5de7c1fd73b5..f7756e1f93ce 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* for struct rtable and routing */ #include /* icmp_send */ #include /* for HZ */ @@ -311,13 +312,27 @@ static int clip_constructor(struct neighbour *neigh) { struct atmarp_entry *entry = NEIGH2ENTRY(neigh); struct net_device *dev = neigh->dev; - struct in_device *in_dev = dev->ip_ptr; + struct in_device *in_dev; + struct neigh_parms *parms; DPRINTK("clip_constructor (neigh %p, entry %p)\n",neigh,entry); - if (!in_dev) return -EINVAL; neigh->type = inet_addr_type(entry->ip); if (neigh->type != RTN_UNICAST) return -EINVAL; - if (in_dev->arp_parms) neigh->parms = in_dev->arp_parms; + + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (!in_dev) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + if (parms) { + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + } + rcu_read_unlock(); + neigh->ops = &clip_neigh_ops; neigh->output = neigh->nud_state & NUD_VALID ? neigh->ops->connected_output : neigh->ops->output; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f5deae1541c4..c9a747e89e5d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -227,7 +227,6 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) we must kill timers etc. and move it to safe state. 
*/ - n->parms = &tbl->parms; skb_queue_purge(&n->arp_queue); n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) @@ -273,7 +272,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; - n->parms = &tbl->parms; + n->parms = neigh_parms_clone(&tbl->parms); init_timer(&n->timer); n->timer.function = neigh_timer_handler; n->timer.data = (unsigned long)n; @@ -340,12 +339,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, hash_val = tbl->hash(pkey, dev); write_lock_bh(&tbl->lock); + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); - write_unlock_bh(&tbl->lock); rc = n1; - goto out_neigh_release; + goto out_tbl_unlock; } } @@ -358,6 +361,8 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, rc = n; out: return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); out_neigh_release: neigh_release(n); goto out; @@ -494,6 +499,7 @@ void neigh_destroy(struct neighbour *neigh) skb_queue_purge(&neigh->arp_queue); dev_put(neigh->dev); + neigh_parms_put(neigh->parms); NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); @@ -1120,6 +1126,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, if (p) { memcpy(p, &tbl->parms, sizeof(*p)); p->tbl = tbl; + atomic_set(&p->refcnt, 1); INIT_RCU_HEAD(&p->rcu_head); p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); @@ -1141,7 +1148,7 @@ static void neigh_rcu_free_parms(struct rcu_head *head) struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); - kfree(parms); + neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) @@ -1154,6 +1161,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) for (p = 
&tbl->parms.next; *p; p = &(*p)->next) { if (*p == parms) { *p = parms->next; + parms->dead = 1; write_unlock_bh(&tbl->lock); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); return; @@ -1163,11 +1171,17 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) NEIGH_PRINTK1("neigh_parms_release: not found\n"); } +void neigh_parms_destroy(struct neigh_parms *parms) +{ + kfree(parms); +} + void neigh_table_init(struct neigh_table *tbl) { unsigned long now = jiffies; + atomic_set(&tbl->parms.refcnt, 1); INIT_RCU_HEAD(&tbl->parms.rcu_head); tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 5a05efb83092..733b1cf6c440 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1215,6 +1215,7 @@ static void dn_dev_delete(struct net_device *dev) dev->dn_ptr = NULL; neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); + neigh_ifdown(&dn_neigh_table, dev); if (dn_db->router) neigh_release(dn_db->router); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index ab64b850c12b..e874232ec54b 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -134,13 +135,22 @@ static int dn_neigh_construct(struct neighbour *neigh) { struct net_device *dev = neigh->dev; struct dn_neigh *dn = (struct dn_neigh *)neigh; - struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr; + struct dn_dev *dn_db; + struct neigh_parms *parms; - if (dn_db == NULL) + rcu_read_lock(); + dn_db = dev->dn_ptr; + if (dn_db == NULL) { + rcu_read_unlock(); return -EINVAL; + } - if (dn_db->neigh_parms) - neigh->parms = dn_db->neigh_parms; + parms = dn_db->neigh_parms; + if (parms) { + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + } + rcu_read_unlock(); if (dn_db->use_long) neigh->ops = &dn_long_ops; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 
562702d99ba2..f4e6a4a368ec 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -96,6 +96,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -237,16 +238,24 @@ static int arp_constructor(struct neighbour *neigh) { u32 addr = *(u32*)neigh->primary_key; struct net_device *dev = neigh->dev; - struct in_device *in_dev = in_dev_get(dev); - - if (in_dev == NULL) - return -EINVAL; + struct in_device *in_dev; + struct neigh_parms *parms; neigh->type = inet_addr_type(addr); - if (in_dev->arp_parms) - neigh->parms = in_dev->arp_parms; - in_dev_put(in_dev); + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + if (parms) { + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + } + rcu_read_unlock(); if (dev->hard_header == NULL) { neigh->nud_state = NUD_NOARP; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fc9930460864..19eb795a1140 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -184,6 +184,7 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { struct in_ifaddr *ifa; + struct net_device *dev; ASSERT_RTNL(); @@ -200,12 +201,15 @@ static void inetdev_destroy(struct in_device *in_dev) devinet_sysctl_unregister(&in_dev->cnf); #endif - in_dev->dev->ip_ptr = NULL; + dev = in_dev->dev; + dev->ip_ptr = NULL; #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(in_dev->arp_parms); #endif neigh_parms_release(&arp_tbl, in_dev->arp_parms); + arp_ifdown(dev); + call_rcu(&in_dev->rcu_head, in_dev_rcu_put); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7150375908a8..d2091c5ce489 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2072,6 +2072,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) neigh_sysctl_unregister(idev->nd_parms); #endif neigh_parms_release(&nd_tbl, idev->nd_parms); + neigh_ifdown(&nd_tbl, dev); 
in6_dev_put(idev); } return 0; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b278e5a04ca8..6d23ea909aca 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -58,6 +58,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -284,14 +285,23 @@ static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; struct net_device *dev = neigh->dev; - struct inet6_dev *in6_dev = in6_dev_get(dev); + struct inet6_dev *in6_dev; + struct neigh_parms *parms; int is_multicast = ipv6_addr_is_multicast(addr); - if (in6_dev == NULL) + rcu_read_lock(); + in6_dev = in6_dev_get(dev); + if (in6_dev == NULL) { + rcu_read_unlock(); return -EINVAL; + } - if (in6_dev->nd_parms) - neigh->parms = in6_dev->nd_parms; + parms = in6_dev->nd_parms; + if (parms) { + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + } + rcu_read_unlock(); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (dev->hard_header == NULL) { -- cgit v1.2.3 From 14a1f44569619b2dfda526dc0f73b9bf0df74171 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Sep 2004 19:20:50 -0700 Subject: [TCP]: Make TSO play nice with congestion window. Previously TSO would not abide by the congestion window properly. Essentially, each TSO packet would be treated just like 1 normal packet, even though a TSO packet generates more than 1 normal packet. This violates congestion window rules entirely. So now we record the TSO factor, a count of how many real packets a TSO packet will generate, and include this in all the packet counting routines. This initial version has a bug in that skb_entail() is not the correct time to figure out the TSO factor for the SKB, and tp->mss_tso_factor is not necessarily the right value for a given SKB. Will fix this up next. Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 17 ++-- include/net/tcp.h | 115 +++++++++++++++++---- net/ipv4/tcp.c | 23 +++-- net/ipv4/tcp_diag.c | 12 +-- net/ipv4/tcp_input.c | 256 +++++++++++++++++++++++++---------------------- net/ipv4/tcp_ipv4.c | 3 +- net/ipv4/tcp_minisocks.c | 10 +- net/ipv4/tcp_output.c | 114 +++++++++++++-------- net/ipv4/tcp_timer.c | 8 +- net/ipv6/tcp_ipv6.c | 3 +- 10 files changed, 350 insertions(+), 211 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9c42ac0b0322..ebf15b6a8162 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -201,6 +201,10 @@ struct tcp_sack_block { __u32 end_seq; }; +typedef struct tcp_pcount { + __u32 val; +} tcp_pcount_t; + struct tcp_opt { int tcp_header_len; /* Bytes of tcp header to send */ @@ -250,6 +254,7 @@ struct tcp_opt { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 mss_tso_factor; /* Real packets per TSO packet */ __u16 mss_cache_std; /* Like mss_cache, but without TSO */ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ @@ -274,9 +279,9 @@ struct tcp_opt { __u32 rtt_seq; /* sequence number to update rttvar */ __u32 rto; /* retransmit timeout */ - __u32 packets_out; /* Packets which are "in flight" */ - __u32 left_out; /* Packets which leaved network */ - __u32 retrans_out; /* Retransmitted packets out */ + tcp_pcount_t packets_out; /* Packets which are "in flight" */ + tcp_pcount_t left_out; /* Packets which leaved network */ + tcp_pcount_t retrans_out; /* Retransmitted packets out */ /* @@ -337,9 +342,9 @@ struct tcp_opt { __u8 syn_retries; /* num of allowed syn retries */ __u8 ecn_flags; /* ECN status bits. 
*/ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ - __u32 lost_out; /* Lost packets */ - __u32 sacked_out; /* SACK'd packets */ - __u32 fackets_out; /* FACK'd packets */ + tcp_pcount_t lost_out; /* Lost packets */ + tcp_pcount_t sacked_out;/* SACK'd packets */ + tcp_pcount_t fackets_out;/* FACK'd packets */ __u32 high_seq; /* snd_nxt at onset of congestion */ __u32 retrans_stamp; /* Timestamp of the last retransmit, diff --git a/include/net/tcp.h b/include/net/tcp.h index a5be63c232e3..efda37b84207 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1047,13 +1047,18 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long * is not a big flaw. */ -static __inline__ unsigned int tcp_current_mss(struct sock *sk, int large) +static inline unsigned int tcp_current_mss(struct sock *sk, int large, int *factor) { struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - int mss_now = large && (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode ? - tp->mss_cache : tp->mss_cache_std; + int do_large, mss_now; + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + mss_now = do_large ? tp->mss_cache : tp->mss_cache_std; + if (factor) + *factor = do_large ? tp->mss_tso_factor : 1; if (dst) { u32 mtu = dst_pmtu(dst); @@ -1181,12 +1186,76 @@ struct tcp_skb_cb { __u16 urg_ptr; /* Valid w/URG flags is set. */ __u32 ack_seq; /* Sequence number ACK'd */ + __u32 tso_factor; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) #include +/* Due to TSO, an SKB can be composed of multiple actual + * packets. To keep these tracked properly, we use this. 
+ */ +static inline int tcp_skb_pcount(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->tso_factor; +} + +static inline void tcp_inc_pcount(tcp_pcount_t *count, struct sk_buff *skb) +{ + count->val += tcp_skb_pcount(skb); +} + +static inline void tcp_inc_pcount_explicit(tcp_pcount_t *count, int amt) +{ + count->val += amt; +} + +static inline void tcp_dec_pcount_explicit(tcp_pcount_t *count, int amt) +{ + count->val -= amt; +} + +static inline void tcp_dec_pcount(tcp_pcount_t *count, struct sk_buff *skb) +{ + count->val -= tcp_skb_pcount(skb); +} + +static inline void tcp_dec_pcount_approx(tcp_pcount_t *count, + struct sk_buff *skb) +{ + if (count->val) { + count->val -= tcp_skb_pcount(skb); + if ((int)count->val < 0) + count->val = 0; + } +} + +static inline __u32 tcp_get_pcount(tcp_pcount_t *count) +{ + return count->val; +} + +static inline void tcp_set_pcount(tcp_pcount_t *count, __u32 val) +{ + count->val = val; +} + +static inline void tcp_packets_out_inc(struct sock *sk, struct tcp_opt *tp, + struct sk_buff *skb) +{ + int orig = tcp_get_pcount(&tp->packets_out); + + tcp_inc_pcount(&tp->packets_out, skb); + if (!orig) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); +} + +static inline void tcp_packets_out_dec(struct tcp_opt *tp, struct sk_buff *skb) +{ + tcp_dec_pcount(&tp->packets_out, skb); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. 
In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1203,7 +1272,9 @@ struct tcp_skb_cb { */ static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp) { - return tp->packets_out - tp->left_out + tp->retrans_out; + return (tcp_get_pcount(&tp->packets_out) - + tcp_get_pcount(&tp->left_out) + + tcp_get_pcount(&tp->retrans_out)); } /* Recalculate snd_ssthresh, we want to set it to: @@ -1304,9 +1375,15 @@ static inline __u32 tcp_current_ssthresh(struct tcp_opt *tp) static inline void tcp_sync_left_out(struct tcp_opt *tp) { - if (tp->sack_ok && tp->sacked_out >= tp->packets_out - tp->lost_out) - tp->sacked_out = tp->packets_out - tp->lost_out; - tp->left_out = tp->sacked_out + tp->lost_out; + if (tp->sack_ok && + (tcp_get_pcount(&tp->sacked_out) >= + tcp_get_pcount(&tp->packets_out) - tcp_get_pcount(&tp->lost_out))) + tcp_set_pcount(&tp->sacked_out, + (tcp_get_pcount(&tp->packets_out) - + tcp_get_pcount(&tp->lost_out))); + tcp_set_pcount(&tp->left_out, + (tcp_get_pcount(&tp->sacked_out) + + tcp_get_pcount(&tp->lost_out))); } extern void tcp_cwnd_application_limited(struct sock *sk); @@ -1315,14 +1392,16 @@ extern void tcp_cwnd_application_limited(struct sock *sk); static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp) { - if (tp->packets_out >= tp->snd_cwnd) { + __u32 packets_out = tcp_get_pcount(&tp->packets_out); + + if (packets_out >= tp->snd_cwnd) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; } else { /* Network starves. 
*/ - if (tp->packets_out > tp->snd_cwnd_used) - tp->snd_cwnd_used = tp->packets_out; + if (tcp_get_pcount(&tp->packets_out) > tp->snd_cwnd_used) + tp->snd_cwnd_used = tcp_get_pcount(&tp->packets_out); if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) tcp_cwnd_application_limited(sk); @@ -1388,7 +1467,7 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && ((nonagle&TCP_NAGLE_CORK) || (!nonagle && - tp->packets_out && + tcp_get_pcount(&tp->packets_out) && tcp_minshall_check(tp)))); } @@ -1398,6 +1477,8 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, unsigned cur_mss, int nonagle) { + int pkts = TCP_SKB_CB(skb)->tso_factor; + /* RFC 1122 - section 4.2.3.4 * * We must queue if @@ -1424,14 +1505,14 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, */ return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - ((tcp_packets_in_flight(tp) < tp->snd_cwnd) || + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); } static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp) { - if (!tp->packets_out && !tp->pending) + if (!tcp_get_pcount(&tp->packets_out) && !tp->pending) tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); } @@ -1464,7 +1545,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk, static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_opt *tp) { - __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1, NULL), tp->nonagle); } static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) @@ -1472,7 +1553,7 @@ static __inline__ int 
tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) struct sk_buff *skb = sk->sk_send_head; return (skb && - tcp_snd_test(tp, skb, tcp_current_mss(sk, 1), + tcp_snd_test(tp, skb, tcp_current_mss(sk, 1, NULL), tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); } @@ -1964,7 +2045,7 @@ static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_opt *tp) { return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache), + (__u32) (tp->mss_cache_std), 2U); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f34bdec2f31e..9df826c8e22b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -590,13 +590,14 @@ static inline int forced_push(struct tcp_opt *tp) } static inline void skb_entail(struct sock *sk, struct tcp_opt *tp, - struct sk_buff *skb) + struct sk_buff *skb, int tso_factor) { skb->csum = 0; TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = tp->write_seq; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->tso_factor = tso_factor; __skb_queue_tail(&sk->sk_write_queue, skb); sk_charge_skb(sk, skb); if (!sk->sk_send_head) @@ -632,7 +633,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_opt *tp = tcp_sk(sk); - int mss_now; + int mss_now, mss_factor_now; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -644,7 +645,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now); copied = 0; err = -EPIPE; @@ -668,7 +669,7 @@ new_segment: if (!skb) goto wait_for_memory; - skb_entail(sk, tp, skb); + skb_entail(sk, tp, skb, mss_factor_now); copy = mss_now; } @@ -719,7 +720,8 @@ wait_for_memory: if ((err = 
sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), + &mss_factor_now); } out: @@ -780,7 +782,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, mss_factor_now; int err, copied; long timeo; @@ -798,7 +800,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -843,7 +845,7 @@ new_segment: NETIF_F_HW_CSUM)) skb->ip_summed = CHECKSUM_HW; - skb_entail(sk, tp, skb); + skb_entail(sk, tp, skb, mss_factor_now); copy = mss_now; } @@ -962,7 +964,8 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), + &mss_factor_now); } } @@ -1818,7 +1821,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; - tp->packets_out = 0; + tcp_set_pcount(&tp->packets_out, 0); tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(tp, TCP_CA_Open); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 5506944b7e7c..e0f8a7664f7e 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -70,14 +70,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = (1000000*tp->rto)/HZ; info->tcpi_ato = (1000000*tp->ack.ato)/HZ; - info->tcpi_snd_mss = tp->mss_cache; + info->tcpi_snd_mss = tp->mss_cache_std; info->tcpi_rcv_mss = tp->ack.rcv_mss; - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; - info->tcpi_lost = tp->lost_out; - 
info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; + info->tcpi_unacked = tcp_get_pcount(&tp->packets_out); + info->tcpi_sacked = tcp_get_pcount(&tp->sacked_out); + info->tcpi_lost = tcp_get_pcount(&tp->lost_out); + info->tcpi_retrans = tcp_get_pcount(&tp->retrans_out); + info->tcpi_fackets = tcp_get_pcount(&tp->fackets_out); info->tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ; info->tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85643472b84d..f4ec16169906 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -897,7 +897,9 @@ static void tcp_update_reordering(struct tcp_opt *tp, int metric, int ts) #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->sack_ok, tp->ca_state, - tp->reordering, tp->fackets_out, tp->sacked_out, + tp->reordering, + tcp_get_pcount(&tp->fackets_out), + tcp_get_pcount(&tp->sacked_out), tp->undo_marker ? tp->undo_retrans : 0); #endif /* Disable FACK yet. 
*/ @@ -960,7 +962,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; - int reord = tp->packets_out; + int reord = tcp_get_pcount(&tp->packets_out); int prior_fackets; u32 lost_retrans = 0; int flag = 0; @@ -972,11 +974,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ sk->sk_route_caps &= ~NETIF_F_TSO; sk->sk_no_largesend = 1; tp->mss_cache = tp->mss_cache_std; + tp->mss_tso_factor = 1; } - if (!tp->sacked_out) - tp->fackets_out = 0; - prior_fackets = tp->fackets_out; + if (!tcp_get_pcount(&tp->sacked_out)) + tcp_set_pcount(&tp->fackets_out, 0); + prior_fackets = tcp_get_pcount(&tp->fackets_out); for (i=0; isacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out--; - tp->retrans_out--; + tcp_dec_pcount(&tp->lost_out, skb); + tcp_dec_pcount(&tp->retrans_out, skb); } } else { /* New sack for not retransmitted frame, @@ -1087,16 +1090,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out--; + tcp_dec_pcount(&tp->lost_out, skb); } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; - tp->sacked_out++; + tcp_inc_pcount(&tp->sacked_out, skb); - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; + if (fack_count > tcp_get_pcount(&tp->fackets_out)) + tcp_set_pcount(&tp->fackets_out, fack_count); } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); @@ -1110,7 +1113,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (dup_sack && (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, skb); } } } @@ -1134,12 
+1137,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache))) { + tp->mss_cache_std))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, skb); if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; flag |= FLAG_DATA_SACKED; NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); @@ -1148,15 +1151,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - tp->left_out = tp->sacked_out + tp->lost_out; + tcp_set_pcount(&tp->left_out, + (tcp_get_pcount(&tp->sacked_out) + + tcp_get_pcount(&tp->lost_out))); - if (reord < tp->fackets_out && tp->ca_state != TCP_CA_Loss) - tcp_update_reordering(tp, (tp->fackets_out + 1) - reord, 0); + if ((reord < tcp_get_pcount(&tp->fackets_out)) && + tp->ca_state != TCP_CA_Loss) + tcp_update_reordering(tp, + ((tcp_get_pcount(&tp->fackets_out) + 1) - + reord), 0); #if FASTRETRANS_DEBUG > 0 - BUG_TRAP((int)tp->sacked_out >= 0); - BUG_TRAP((int)tp->lost_out >= 0); - BUG_TRAP((int)tp->retrans_out >= 0); + BUG_TRAP((int)tcp_get_pcount(&tp->sacked_out) >= 0); + BUG_TRAP((int)tcp_get_pcount(&tp->lost_out) >= 0); + BUG_TRAP((int)tcp_get_pcount(&tp->retrans_out) >= 0); BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); #endif return flag; @@ -1186,7 +1194,7 @@ void tcp_enter_frto(struct sock *sk) * If something was really lost, it is eventually caught up * in tcp_enter_frto_loss. 
*/ - tp->retrans_out = 0; + tcp_set_pcount(&tp->retrans_out, 0); tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; @@ -1209,26 +1217,26 @@ static void tcp_enter_frto_loss(struct sock *sk) struct sk_buff *skb; int cnt = 0; - tp->sacked_out = 0; - tp->lost_out = 0; - tp->fackets_out = 0; + tcp_set_pcount(&tp->sacked_out, 0); + tcp_set_pcount(&tp->lost_out, 0); + tcp_set_pcount(&tp->fackets_out, 0); sk_stream_for_retrans_queue(skb, sk) { - cnt++; + cnt += TCP_SKB_CB(skb)->tso_factor;; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were * forward transmitted after RTO */ - if(!after(TCP_SKB_CB(skb)->end_seq, + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); } } else { - tp->sacked_out++; - tp->fackets_out = cnt; + tcp_inc_pcount(&tp->sacked_out, skb); + tcp_set_pcount(&tp->fackets_out, cnt); } } tcp_sync_left_out(tp); @@ -1250,12 +1258,12 @@ static void tcp_enter_frto_loss(struct sock *sk) void tcp_clear_retrans(struct tcp_opt *tp) { - tp->left_out = 0; - tp->retrans_out = 0; + tcp_set_pcount(&tp->left_out, 0); + tcp_set_pcount(&tp->retrans_out, 0); - tp->fackets_out = 0; - tp->sacked_out = 0; - tp->lost_out = 0; + tcp_set_pcount(&tp->fackets_out, 0); + tcp_set_pcount(&tp->sacked_out, 0); + tcp_set_pcount(&tp->lost_out, 0); tp->undo_marker = 0; tp->undo_retrans = 0; @@ -1289,17 +1297,17 @@ void tcp_enter_loss(struct sock *sk, int how) tp->undo_marker = tp->snd_una; sk_stream_for_retrans_queue(skb, sk) { - cnt++; + cnt += TCP_SKB_CB(skb)->tso_factor; if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); } 
else { - tp->sacked_out++; - tp->fackets_out = cnt; + tcp_inc_pcount(&tp->sacked_out, skb); + tcp_set_pcount(&tp->fackets_out, cnt); } } tcp_sync_left_out(tp); @@ -1336,7 +1344,8 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp) static inline int tcp_fackets_out(struct tcp_opt *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return IsReno(tp) ? tcp_get_pcount(&tp->sacked_out)+1 : + tcp_get_pcount(&tp->fackets_out); } static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb) @@ -1346,7 +1355,7 @@ static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb) static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp) { - return tp->packets_out && + return tcp_get_pcount(&tp->packets_out) && tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); } @@ -1446,8 +1455,10 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp) static int tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp) { + __u32 packets_out; + /* Trick#1: The loss is proven. */ - if (tp->lost_out) + if (tcp_get_pcount(&tp->lost_out)) return 1; /* Not-A-Trick#2 : Classic rule... */ @@ -1463,8 +1474,9 @@ tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp) /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ - if (tp->packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, tp->packets_out/2, sysctl_tcp_reordering) && + packets_out = tcp_get_pcount(&tp->packets_out); + if (packets_out <= tp->reordering && + tcp_get_pcount(&tp->sacked_out) >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && !tcp_may_send_now(sk, tp)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. 
@@ -1483,12 +1495,16 @@ static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend) { u32 holes; - holes = max(tp->lost_out, 1U); - holes = min(holes, tp->packets_out); + holes = max(tcp_get_pcount(&tp->lost_out), 1U); + holes = min(holes, tcp_get_pcount(&tp->packets_out)); - if (tp->sacked_out + holes > tp->packets_out) { - tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(tp, tp->packets_out+addend, 0); + if ((tcp_get_pcount(&tp->sacked_out) + holes) > + tcp_get_pcount(&tp->packets_out)) { + tcp_set_pcount(&tp->sacked_out, + (tcp_get_pcount(&tp->packets_out) - holes)); + tcp_update_reordering(tp, + tcp_get_pcount(&tp->packets_out)+addend, + 0); } } @@ -1496,7 +1512,7 @@ static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend) static void tcp_add_reno_sack(struct tcp_opt *tp) { - ++tp->sacked_out; + tcp_inc_pcount_explicit(&tp->sacked_out, 1); tcp_check_reno_reordering(tp, 0); tcp_sync_left_out(tp); } @@ -1507,10 +1523,10 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked { if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) - tp->sacked_out = 0; + if (acked-1 >= tcp_get_pcount(&tp->sacked_out)) + tcp_set_pcount(&tp->sacked_out, 0); else - tp->sacked_out -= acked-1; + tcp_dec_pcount_explicit(&tp->sacked_out, acked-1); } tcp_check_reno_reordering(tp, acked); tcp_sync_left_out(tp); @@ -1518,8 +1534,8 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked static inline void tcp_reset_reno_sack(struct tcp_opt *tp) { - tp->sacked_out = 0; - tp->left_out = tp->lost_out; + tcp_set_pcount(&tp->sacked_out, 0); + tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->lost_out)); } /* Mark head of queue up as lost. 
*/ @@ -1529,14 +1545,15 @@ tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_se struct sk_buff *skb; int cnt = packets; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(cnt <= tcp_get_pcount(&tp->packets_out)); sk_stream_for_retrans_queue(skb, sk) { - if (--cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + cnt -= TCP_SKB_CB(skb)->tso_factor; + if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); } } tcp_sync_left_out(tp); @@ -1547,7 +1564,7 @@ tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_se static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp) { if (IsFack(tp)) { - int lost = tp->fackets_out - tp->reordering; + int lost = tcp_get_pcount(&tp->fackets_out) - tp->reordering; if (lost <= 0) lost = 1; tcp_mark_head_lost(sk, tp, lost, tp->high_seq); @@ -1567,7 +1584,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp) if (tcp_skb_timedout(tp, skb) && !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); } } tcp_sync_left_out(tp); @@ -1632,8 +1649,9 @@ static void DBGUNDO(struct sock *sk, struct tcp_opt *tp, const char *msg) printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tp->left_out, - tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); + tp->snd_cwnd, tcp_get_pcount(&tp->left_out), + tp->snd_ssthresh, tp->prior_ssthresh, + tcp_get_pcount(&tp->packets_out)); } #else #define DBGUNDO(x...) do { } while (0) @@ -1703,13 +1721,13 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_opt *tp) static int tcp_try_undo_partial(struct sock *sk, struct tcp_opt *tp, int acked) { /* Partial ACK arrived. Force Hoe's retransmit. 
*/ - int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + int failed = IsReno(tp) || tcp_get_pcount(&tp->fackets_out)>tp->reordering; if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. */ - if (tp->retrans_out == 0) + if (tcp_get_pcount(&tp->retrans_out) == 0) tp->retrans_stamp = 0; tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); @@ -1736,8 +1754,8 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp) TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } DBGUNDO(sk, tp, "partial loss"); - tp->lost_out = 0; - tp->left_out = tp->sacked_out; + tcp_set_pcount(&tp->lost_out, 0); + tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->sacked_out)); tcp_undo_cwr(tp, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); tp->retransmits = 0; @@ -1760,9 +1778,9 @@ static __inline__ void tcp_complete_cwr(struct tcp_opt *tp) static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag) { - tp->left_out = tp->sacked_out; + tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->sacked_out)); - if (tp->retrans_out == 0) + if (tcp_get_pcount(&tp->retrans_out) == 0) tp->retrans_stamp = 0; if (flag&FLAG_ECE) @@ -1771,8 +1789,8 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag) if (tp->ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->left_out || - tp->retrans_out || + if (tcp_get_pcount(&tp->left_out) || + tcp_get_pcount(&tp->retrans_out) || tp->undo_marker) state = TCP_CA_Disorder; @@ -1806,11 +1824,11 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ - if (!tp->packets_out) - tp->sacked_out = 0; + if (!tcp_get_pcount(&tp->packets_out)) + tcp_set_pcount(&tp->sacked_out, 0); /* 2. SACK counts snd_fack in packets inaccurately. 
*/ - if (tp->sacked_out == 0) - tp->fackets_out = 0; + if (tcp_get_pcount(&tp->sacked_out) == 0) + tcp_set_pcount(&tp->fackets_out, 0); /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -1818,15 +1836,15 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + if (tcp_get_pcount(&tp->sacked_out) && tcp_check_sack_reneging(sk, tp)) return; /* C. Process data loss notification, provided it is valid. */ if ((flag&FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && tp->ca_state != TCP_CA_Open && - tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_get_pcount(&tp->fackets_out) > tp->reordering) { + tcp_mark_head_lost(sk, tp, tcp_get_pcount(&tp->fackets_out)-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -1837,7 +1855,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * when high_seq is ACKed. 
*/ if (tp->ca_state == TCP_CA_Open) { if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tcp_get_pcount(&tp->retrans_out) == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (tp->ca_state) { @@ -1884,7 +1902,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (IsReno(tp) && is_dupack) tcp_add_reno_sack(tp); } else { - int acked = prior_packets - tp->packets_out; + int acked = prior_packets - + tcp_get_pcount(&tp->packets_out); if (IsReno(tp)) tcp_remove_reno_sacks(sk, tp, acked); is_dupack = tcp_try_undo_partial(sk, tp, acked); @@ -1927,7 +1946,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tcp_get_pcount(&tp->retrans_out); if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) @@ -2156,7 +2175,7 @@ static void vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt) * is the cwnd during the previous RTT. 
*/ old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache; + tp->mss_cache_std; old_snd_cwnd = tp->vegas.beg_snd_cwnd; /* Save the extent of the current window so we can use this @@ -2327,7 +2346,7 @@ static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt) static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { - if (tp->packets_out==0) { + if (!tcp_get_pcount(&tp->packets_out)) { tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } else { tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); @@ -2343,7 +2362,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; - while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { + while ((skb = skb_peek(&sk->sk_write_queue)) && + skb != sk->sk_send_head) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -2361,7 +2381,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. 
*/ - if(!(scb->flags & TCPCB_FLAG_SYN)) { + if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; } else { acked |= FLAG_SYN_ACKED; @@ -2369,27 +2389,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (sacked) { - if(sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_RETRANS) { if(sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if(sacked & TCPCB_SACKED_ACKED) - tp->sacked_out--; - if(sacked & TCPCB_LOST) - tp->lost_out--; - if(sacked & TCPCB_URG) { + if (sacked & TCPCB_SACKED_ACKED) + tcp_dec_pcount(&tp->sacked_out, skb); + if (sacked & TCPCB_LOST) + tcp_dec_pcount(&tp->lost_out, skb); + if (sacked & TCPCB_URG) { if (tp->urg_mode && !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if (tp->fackets_out) - tp->fackets_out--; - tp->packets_out--; + tcp_dec_pcount_approx(&tp->fackets_out, skb); + tcp_packets_out_dec(tp, skb); __skb_unlink(skb, skb->list); sk_stream_free_skb(sk, skb); } @@ -2400,24 +2419,27 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } #if FASTRETRANS_DEBUG > 0 - BUG_TRAP((int)tp->sacked_out >= 0); - BUG_TRAP((int)tp->lost_out >= 0); - BUG_TRAP((int)tp->retrans_out >= 0); - if (!tp->packets_out && tp->sack_ok) { - if (tp->lost_out) { - printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, - tp->ca_state); - tp->lost_out = 0; + BUG_TRAP((int)tcp_get_pcount(&tp->sacked_out) >= 0); + BUG_TRAP((int)tcp_get_pcount(&tp->lost_out) >= 0); + BUG_TRAP((int)tcp_get_pcount(&tp->retrans_out) >= 0); + if (!tcp_get_pcount(&tp->packets_out) && tp->sack_ok) { + if (tcp_get_pcount(&tp->lost_out)) { + printk(KERN_DEBUG "Leak l=%u %d\n", + tcp_get_pcount(&tp->lost_out), + tp->ca_state); + tcp_set_pcount(&tp->lost_out, 0); } - if (tp->sacked_out) { - printk(KERN_DEBUG "Leak s=%u %d\n", tp->sacked_out, - tp->ca_state); - 
tp->sacked_out = 0; + if (tcp_get_pcount(&tp->sacked_out)) { + printk(KERN_DEBUG "Leak s=%u %d\n", + tcp_get_pcount(&tp->sacked_out), + tp->ca_state); + tcp_set_pcount(&tp->sacked_out, 0); } - if (tp->retrans_out) { - printk(KERN_DEBUG "Leak r=%u %d\n", tp->retrans_out, - tp->ca_state); - tp->retrans_out = 0; + if (tcp_get_pcount(&tp->retrans_out)) { + printk(KERN_DEBUG "Leak r=%u %d\n", + tcp_get_pcount(&tp->retrans_out), + tp->ca_state); + tcp_set_pcount(&tp->retrans_out, 0); } } #endif @@ -2712,19 +2734,19 @@ static void westwood_dupack_update(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); - tp->westwood.accounted += tp->mss_cache; - tp->westwood.cumul_ack = tp->mss_cache; + tp->westwood.accounted += tp->mss_cache_std; + tp->westwood.cumul_ack = tp->mss_cache_std; } static inline int westwood_may_change_cumul(struct tcp_opt *tp) { - return ((tp->westwood.cumul_ack) > tp->mss_cache); + return ((tp->westwood.cumul_ack) > tp->mss_cache_std); } static inline void westwood_partial_update(struct tcp_opt *tp) { tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache; + tp->westwood.cumul_ack = tp->mss_cache_std; } static inline void westwood_complete_update(struct tcp_opt *tp) @@ -2835,7 +2857,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ sk->sk_err_soft = 0; tp->rcv_tstamp = tcp_time_stamp; - prior_packets = tp->packets_out; + prior_packets = tcp_get_pcount(&tp->packets_out); if (!prior_packets) goto no_queue; @@ -3857,11 +3879,11 @@ static void tcp_new_space(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && + if (tcp_get_pcount(&tp->packets_out) < tp->snd_cwnd && !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && !tcp_memory_pressure && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { - int sndmem = max_t(u32, tp->mss_clamp, tp->mss_cache) + + int sndmem = max_t(u32, tp->mss_clamp, tp->mss_cache_std) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = 
max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2cfd74fbf566..0fb326e84f28 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2075,7 +2075,8 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_tso_factor = 1; tp->reordering = sysctl_tcp_reordering; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79c1884c2b8b..ab04144245e5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -752,11 +752,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->mdev = TCP_TIMEOUT_INIT; newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->left_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->fackets_out = 0; + tcp_set_pcount(&newtp->packets_out, 0); + tcp_set_pcount(&newtp->left_out, 0); + tcp_set_pcount(&newtp->retrans_out, 0); + tcp_set_pcount(&newtp->sacked_out, 0); + tcp_set_pcount(&newtp->fackets_out, 0); newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bd3d0133f724..0a70d082028c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -52,8 +52,7 @@ void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp->packets_out++ == 0) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_packets_out_inc(sk, tp, skb); } /* SND.NXT, if window was not shrunk. 
@@ -123,7 +122,8 @@ static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *s { u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + if (!tcp_get_pcount(&tp->packets_out) && + (s32)(now - tp->lsndtime) > tp->rto) tcp_cwnd_restart(tp, __sk_dst_get(sk)); tp->lsndtime = now; @@ -259,7 +259,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) */ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { - if(skb != NULL) { + if (skb != NULL) { struct inet_opt *inet = inet_sk(sk); struct tcp_opt *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -268,6 +268,8 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) int sysctl_flags; int err; + BUG_ON(!TCP_SKB_CB(skb)->tso_factor); + #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 @@ -414,8 +416,7 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp->packets_out++ == 0) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_packets_out_inc(sk, tp, skb); return; } } @@ -453,10 +454,13 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) flags = TCP_SKB_CB(skb)->flags; TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; - TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); + TCP_SKB_CB(buff)->sacked = + (TCP_SKB_CB(skb)->sacked & + (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL)); + TCP_SKB_CB(buff)->tso_factor = tp->mss_tso_factor; if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { - tp->lost_out++; - tp->left_out++; + tcp_inc_pcount(&tp->lost_out, buff); + tcp_inc_pcount(&tp->left_out, buff); } TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; @@ -594,9 +598,10 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ 
tp->pmtu_cookie = pmtu; tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_tso_factor = 1; if (sk->sk_route_caps & NETIF_F_TSO) { - int large_mss; + int large_mss, factor; large_mss = 65535 - tp->af_specific->net_header_len - tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len; @@ -604,8 +609,15 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) if (tp->max_window && large_mss > (tp->max_window>>1)) large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len); - /* Always keep large mss multiple of real mss. */ - tp->mss_cache = mss_now*(large_mss/mss_now); + /* Always keep large mss multiple of real mss, but + * do not exceed congestion window. + */ + factor = large_mss / mss_now; + if (factor > tp->snd_cwnd) + factor = tp->snd_cwnd; + + tp->mss_cache = mss_now * factor; + tp->mss_tso_factor = factor; } return mss_now; @@ -637,7 +649,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle) * We also handle things correctly when the user adds some * IP options mid-stream. Silly to do, but cover it. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk, 1, NULL); while ((skb = sk->sk_send_head) && tcp_snd_test(tp, skb, mss_now, @@ -662,7 +674,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } - return !tp->packets_out && sk->sk_send_head; + return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head; } return 0; } @@ -788,7 +800,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. 
*/ - if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + if (!skb_cloned(skb) && !skb_cloned(next_skb)) { int skb_size = skb->len, next_skb_size = next_skb->len; u16 flags = TCP_SKB_CB(skb)->flags; @@ -831,24 +843,23 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, next_skb); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { - tp->lost_out--; - tp->left_out--; + tcp_dec_pcount(&tp->lost_out, next_skb); + tcp_dec_pcount(&tp->left_out, next_skb); } /* Reno case is special. Sigh... */ - if (!tp->sack_ok && tp->sacked_out) { - tp->sacked_out--; - tp->left_out--; + if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) { + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + tcp_dec_pcount(&tp->left_out, next_skb); } /* Not quite right: it can be > snd.fack, but * it is better to underestimate fackets. 
*/ - if (tp->fackets_out) - tp->fackets_out--; + tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_packets_out_dec(tp, next_skb); sk_stream_free_skb(sk, next_skb); - tp->packets_out--; } } @@ -860,7 +871,7 @@ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk, 0, NULL); int lost = 0; sk_stream_for_retrans_queue(skb, sk) { @@ -868,11 +879,11 @@ void tcp_simple_retransmit(struct sock *sk) !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, skb); } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); lost = 1; } } @@ -905,7 +916,7 @@ void tcp_simple_retransmit(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = tcp_sk(sk); - unsigned int cur_mss = tcp_current_mss(sk, 0); + unsigned int cur_mss = tcp_current_mss(sk, 0, NULL); int err; /* Do not sent more than we queued. 1/4 is reserved for possible @@ -923,6 +934,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) sk->sk_route_caps &= ~NETIF_F_TSO; sk->sk_no_largesend = 1; tp->mss_cache = tp->mss_cache_std; + tp->mss_tso_factor = 1; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -938,12 +950,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; - if(skb->len > cur_mss) { - if(tcp_fragment(sk, skb, cur_mss)) + if (skb->len > cur_mss) { + if (tcp_fragment(sk, skb, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ - tp->packets_out++; + tcp_inc_pcount(&tp->packets_out, skb); } /* Collapse two adjacent packets if worthwhile and we can. 
*/ @@ -992,7 +1004,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; - tp->retrans_out++; + tcp_inc_pcount(&tp->retrans_out, skb); /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) @@ -1020,14 +1032,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt = tcp_get_pcount(&tp->lost_out); /* First pass: retransmit lost packets. */ if (packet_cnt) { sk_stream_for_retrans_queue(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + int pkts = TCP_SKB_CB(skb)->tso_factor; - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + if ((tcp_packets_in_flight(tp) + (pkts-1)) >= + tp->snd_cwnd) return; if (sacked&TCPCB_LOST) { @@ -1044,7 +1058,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } - if (--packet_cnt <= 0) + packet_cnt -= TCP_SKB_CB(skb)->tso_factor; + if (packet_cnt <= 0) break; } } @@ -1073,17 +1088,20 @@ void tcp_xmit_retransmit_queue(struct sock *sk) packet_cnt = 0; sk_stream_for_retrans_queue(skb, sk) { - if(++packet_cnt > tp->fackets_out) + int pkts = TCP_SKB_CB(skb)->tso_factor; + + packet_cnt += pkts; + if (packet_cnt > tcp_get_pcount(&tp->fackets_out)) break; - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + if ((tcp_packets_in_flight(tp) + (pkts-1)) >= tp->snd_cwnd) break; - if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) continue; /* Ok, retransmit it. */ - if(tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) break; if (skb == skb_peek(&sk->sk_write_queue)) @@ -1101,13 +1119,13 @@ void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); - unsigned int mss_now; + int mss_now; /* Optimization, tack on the FIN if we have a queue of * unsent frames. 
But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk, 1, NULL); if (sk->sk_send_head != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -1127,6 +1145,7 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->tso_factor = 1; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1158,6 +1177,7 @@ void tcp_send_active_reset(struct sock *sk, int priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->tso_factor = 1; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -1237,6 +1257,8 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->dest = req->rmt_port; TCP_SKB_CB(skb)->seq = req->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->tso_factor = 1; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(req->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -1338,6 +1360,7 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->tso_factor = 1; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -1350,7 +1373,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); - tp->packets_out++; + tcp_inc_pcount(&tp->packets_out, buff); tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); @@ -1437,6 +1460,7 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; + 
TCP_SKB_CB(buff)->tso_factor = 1; /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -1471,6 +1495,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; + TCP_SKB_CB(skb)->tso_factor = 1; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just @@ -1491,7 +1516,7 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = sk->sk_send_head) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; - int mss = tcp_current_mss(sk, 0); + int mss = tcp_current_mss(sk, 0, NULL); int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) @@ -1513,6 +1538,7 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_no_largesend = 1; sk->sk_route_caps &= ~NETIF_F_TSO; tp->mss_cache = tp->mss_cache_std; + tp->mss_tso_factor = 1; } } TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -1542,7 +1568,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk); - if (tp->packets_out || !sk->sk_send_head) { + if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; tp->backoff = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 72a5a50b50ab..c060bb333471 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -121,7 +121,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. 
*/ - (!tp->snd_wnd && !tp->packets_out)) + (!tp->snd_wnd && !tcp_get_pcount(&tp->packets_out))) do_reset = 1; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); @@ -269,7 +269,7 @@ static void tcp_probe_timer(struct sock *sk) struct tcp_opt *tp = tcp_sk(sk); int max_probes; - if (tp->packets_out || !sk->sk_send_head) { + if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) { tp->probes_out = 0; return; } @@ -316,7 +316,7 @@ static void tcp_retransmit_timer(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); - if (tp->packets_out == 0) + if (!tcp_get_pcount(&tp->packets_out)) goto out; BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); @@ -606,7 +606,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || sk->sk_send_head) + if (tcp_get_pcount(&tp->packets_out) || sk->sk_send_head) goto resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 73b34df7fd17..0ecd0d8dfa0f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1929,7 +1929,8 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_tso_factor = 1; tp->reordering = sysctl_tcp_reordering; -- cgit v1.2.3 From 95d267365e82205bada1b50fe699fb2284aa090e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Sep 2004 20:01:22 -0700 Subject: [TCP]: Calculate SKB tso factor more accurately. Eliminate tp->mss_tso_factor. Instead, we calculate the SKB tso factor as we walk the write queue for initial transmit or fragment SKBs. Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 8 +++----- net/ipv4/tcp.c | 21 +++++++++------------ net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_output.c | 49 +++++++++++++++++++++++++++++++++++-------------- net/ipv6/tcp_ipv6.c | 1 - 7 files changed, 47 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ebf15b6a8162..86ca98c5ef8f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -254,7 +254,6 @@ struct tcp_opt { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u32 mss_tso_factor; /* Real packets per TSO packet */ __u16 mss_cache_std; /* Like mss_cache, but without TSO */ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index efda37b84207..1de15c7a560a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1047,7 +1047,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long * is not a big flaw. */ -static inline unsigned int tcp_current_mss(struct sock *sk, int large, int *factor) +static inline unsigned int tcp_current_mss(struct sock *sk, int large) { struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); @@ -1057,8 +1057,6 @@ static inline unsigned int tcp_current_mss(struct sock *sk, int large, int *fact (sk->sk_route_caps & NETIF_F_TSO) && !tp->urg_mode); mss_now = do_large ? tp->mss_cache : tp->mss_cache_std; - if (factor) - *factor = do_large ? 
tp->mss_tso_factor : 1; if (dst) { u32 mtu = dst_pmtu(dst); @@ -1545,7 +1543,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk, static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_opt *tp) { - __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1, NULL), tp->nonagle); + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) @@ -1553,7 +1551,7 @@ static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) struct sk_buff *skb = sk->sk_send_head; return (skb && - tcp_snd_test(tp, skb, tcp_current_mss(sk, 1, NULL), + tcp_snd_test(tp, skb, tcp_current_mss(sk, 1), tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9df826c8e22b..36953ef7e6c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -590,14 +590,13 @@ static inline int forced_push(struct tcp_opt *tp) } static inline void skb_entail(struct sock *sk, struct tcp_opt *tp, - struct sk_buff *skb, int tso_factor) + struct sk_buff *skb) { skb->csum = 0; TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = tp->write_seq; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->tso_factor = tso_factor; __skb_queue_tail(&sk->sk_write_queue, skb); sk_charge_skb(sk, skb); if (!sk->sk_send_head) @@ -633,7 +632,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_opt *tp = tcp_sk(sk); - int mss_now, mss_factor_now; + int mss_now; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -645,7 +644,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); copied = 0; err = -EPIPE; 
@@ -669,7 +668,7 @@ new_segment: if (!skb) goto wait_for_memory; - skb_entail(sk, tp, skb, mss_factor_now); + skb_entail(sk, tp, skb); copy = mss_now; } @@ -720,8 +719,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), - &mss_factor_now); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); } out: @@ -782,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now, mss_factor_now; + int mss_now; int err, copied; long timeo; @@ -800,7 +798,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -845,7 +843,7 @@ new_segment: NETIF_F_HW_CSUM)) skb->ip_summed = CHECKSUM_HW; - skb_entail(sk, tp, skb, mss_factor_now); + skb_entail(sk, tp, skb); copy = mss_now; } @@ -964,8 +962,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), - &mss_factor_now); + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f4ec16169906..d7fb3cde4f20 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -974,7 +974,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ sk->sk_route_caps &= ~NETIF_F_TSO; sk->sk_no_largesend = 1; tp->mss_cache = tp->mss_cache_std; - tp->mss_tso_factor = 1; } if (!tcp_get_pcount(&tp->sacked_out)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0fb326e84f28..73f12904c7c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2076,7 +2076,6 @@ static int tcp_v4_init_sock(struct sock *sk) 
tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache_std = tp->mss_cache = 536; - tp->mss_tso_factor = 1; tp->reordering = sysctl_tcp_reordering; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a70d082028c..336c7121b6b6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -422,6 +422,23 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } +static void tcp_set_skb_tso_factor(struct sk_buff *skb, unsigned int mss, + unsigned int mss_std) +{ + if (skb->len <= mss_std) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + TCP_SKB_CB(skb)->tso_factor = 1; + } else { + unsigned int factor; + + factor = skb->len + (mss_std - 1); + factor /= mss; + TCP_SKB_CB(skb)->tso_factor = factor; + } +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -457,7 +474,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(buff)->sacked = (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL)); - TCP_SKB_CB(buff)->tso_factor = tp->mss_tso_factor; if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { tcp_inc_pcount(&tp->lost_out, buff); tcp_inc_pcount(&tp->left_out, buff); @@ -484,6 +500,10 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_factor(skb, tp->mss_cache, tp->mss_cache_std); + tcp_set_skb_tso_factor(buff, tp->mss_cache, tp->mss_cache_std); + /* Link BUFF into the send queue. 
*/ __skb_append(skb, buff); @@ -598,7 +618,6 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; tp->mss_cache = tp->mss_cache_std = mss_now; - tp->mss_tso_factor = 1; if (sk->sk_route_caps & NETIF_F_TSO) { int large_mss, factor; @@ -617,7 +636,6 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) factor = tp->snd_cwnd; tp->mss_cache = mss_now * factor; - tp->mss_tso_factor = factor; } return mss_now; @@ -634,7 +652,7 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_opt *tp = tcp_sk(sk); - unsigned int mss_now; + unsigned int mss_now, mss_std; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -649,7 +667,8 @@ int tcp_write_xmit(struct sock *sk, int nonagle) * We also handle things correctly when the user adds some * IP options mid-stream. Silly to do, but cover it. */ - mss_now = tcp_current_mss(sk, 1, NULL); + mss_now = tcp_current_mss(sk, 1); + mss_std = tp->mss_cache_std; while ((skb = sk->sk_send_head) && tcp_snd_test(tp, skb, mss_now, @@ -658,7 +677,8 @@ int tcp_write_xmit(struct sock *sk, int nonagle) if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; - } + } else + tcp_set_skb_tso_factor(skb, mss_now, mss_std); TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) @@ -871,7 +891,7 @@ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0, NULL); + unsigned int mss = tcp_current_mss(sk, 0); int lost = 0; sk_stream_for_retrans_queue(skb, sk) { @@ -916,7 +936,7 @@ void tcp_simple_retransmit(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = tcp_sk(sk); - unsigned int cur_mss = tcp_current_mss(sk, 0, NULL); + unsigned int cur_mss = tcp_current_mss(sk, 0); int err; /* Do not sent more than we 
queued. 1/4 is reserved for possible @@ -934,7 +954,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) sk->sk_route_caps &= ~NETIF_F_TSO; sk->sk_no_largesend = 1; tp->mss_cache = tp->mss_cache_std; - tp->mss_tso_factor = 1; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1125,7 +1144,7 @@ void tcp_send_fin(struct sock *sk) * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1, NULL); + mss_now = tcp_current_mss(sk, 1); if (sk->sk_send_head != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -1516,8 +1535,9 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = sk->sk_send_head) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; - int mss = tcp_current_mss(sk, 0, NULL); - int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss_std = tp->mss_cache_std; + unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; @@ -1538,9 +1558,10 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_no_largesend = 1; sk->sk_route_caps &= ~NETIF_F_TSO; tp->mss_cache = tp->mss_cache_std; - tp->mss_tso_factor = 1; } - } + } else + tcp_set_skb_tso_factor(skb, mss, mss_std); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0ecd0d8dfa0f..ebed7e197aac 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1930,7 +1930,6 @@ static int tcp_v6_init_sock(struct sock *sk) tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; tp->mss_cache_std = tp->mss_cache = 536; - tp->mss_tso_factor = 1; tp->reordering = sysctl_tcp_reordering; -- cgit v1.2.3 From 5a6bdc92894c920dcc7fcf7010d0eb05de2e3d21 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Mon, 6 Sep 2004 21:03:16 -0700 Subject: [TCP]: Make sure SKB tso factor is setup early enough. It needs to be set so that congestion window calculations have a valid value to work with. This means that doing it at write queue running time is too late. Signed-off-by: David S. Miller --- include/net/tcp.h | 7 +++++++ net/ipv4/tcp_output.c | 19 ++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1de15c7a560a..1a8a317f2bd5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1469,6 +1469,8 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n tcp_minshall_check(tp)))); } +extern void tcp_set_skb_tso_factor(struct sk_buff *, unsigned int, unsigned int); + /* This checks if the data bearing packet SKB (usually sk->sk_send_head) * should be put on the wire right now. */ @@ -1477,6 +1479,11 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, { int pkts = TCP_SKB_CB(skb)->tso_factor; + if (!pkts) { + tcp_set_skb_tso_factor(skb, cur_mss, tp->mss_cache_std); + pkts = TCP_SKB_CB(skb)->tso_factor; + } + /* RFC 1122 - section 4.2.3.4 * * We must queue if diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 336c7121b6b6..32174549304e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -422,8 +422,8 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } -static void tcp_set_skb_tso_factor(struct sk_buff *skb, unsigned int mss, - unsigned int mss_std) +void tcp_set_skb_tso_factor(struct sk_buff *skb, unsigned int mss, + unsigned int mss_std) { if (skb->len <= mss_std) { /* Avoid the costly divide in the normal @@ -652,7 +652,7 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_opt *tp = tcp_sk(sk); - unsigned int mss_now, mss_std; + unsigned int mss_now; /* If we are closed, the bytes will have to remain here. 
* In time closedown will finish, we empty the write queue and all @@ -668,7 +668,6 @@ int tcp_write_xmit(struct sock *sk, int nonagle) * IP options mid-stream. Silly to do, but cover it. */ mss_now = tcp_current_mss(sk, 1); - mss_std = tp->mss_cache_std; while ((skb = sk->sk_send_head) && tcp_snd_test(tp, skb, mss_now, @@ -677,8 +676,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle) if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; - } else - tcp_set_skb_tso_factor(skb, mss_now, mss_std); + } TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) @@ -1059,6 +1057,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) __u8 sacked = TCP_SKB_CB(skb)->sacked; int pkts = TCP_SKB_CB(skb)->tso_factor; + BUG_ON(!pkts); + if ((tcp_packets_in_flight(tp) + (pkts-1)) >= tp->snd_cwnd) return; @@ -1109,6 +1109,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) sk_stream_for_retrans_queue(skb, sk) { int pkts = TCP_SKB_CB(skb)->tso_factor; + BUG_ON(!pkts); + packet_cnt += pkts; if (packet_cnt > tcp_get_pcount(&tp->fackets_out)) break; @@ -1536,7 +1538,6 @@ int tcp_write_wakeup(struct sock *sk) before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; unsigned int mss = tcp_current_mss(sk, 0); - unsigned int mss_std = tp->mss_cache_std; unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) @@ -1559,8 +1560,8 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_route_caps &= ~NETIF_F_TSO; tp->mss_cache = tp->mss_cache_std; } - } else - tcp_set_skb_tso_factor(skb, mss, mss_std); + } else if (!TCP_SKB_CB(skb)->tso_factor) + tcp_set_skb_tso_factor(skb, mss, tp->mss_cache_std); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; -- cgit v1.2.3 From f77bdc6fc029002e55a99ada37f9b38854cceef3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 6 Sep 2004 22:55:45 -0700 Subject: [NET]: Fix CONFIG_COMPAT build with networking 
disabled. Signed-off-by: David S. Miller --- fs/compat_ioctl.c | 10 ++++++---- net/Makefile | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dc1a2d2c41d5..a493b5b5871b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -407,6 +408,7 @@ out: return err; } +#ifdef CONFIG_NET static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg) { struct compat_timeval __user *up = compat_ptr(arg); @@ -461,7 +463,6 @@ struct ifconf32 { compat_caddr_t ifcbuf; }; -#ifdef CONFIG_NET static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg) { struct net_device *dev; @@ -481,7 +482,6 @@ static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg) err = copy_to_user(compat_ptr(arg), &ifr32, sizeof(ifr32)); return (err ? -EFAULT : 0); } -#endif static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg) { @@ -797,6 +797,7 @@ static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) return ret; } +#endif struct hd_geometry32 { unsigned char heads; @@ -1872,7 +1873,8 @@ static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg) return -EINVAL; } -static int ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) +static __attribute_used__ int +ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) { return -EINVAL; } @@ -3162,7 +3164,6 @@ HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob) HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob) #ifdef CONFIG_NET HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32) -#endif HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf) HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc) HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc) @@ -3206,6 +3207,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc) /* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. 
*/ HANDLE_IOCTL(SIOCRTMSG, ret_einval) HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp) +#endif HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo) HANDLE_IOCTL(BLKRAGET, w_long) HANDLE_IOCTL(BLKGETSIZE, w_long) diff --git a/net/Makefile b/net/Makefile index 61740b47a67d..a46436e0fcc2 100644 --- a/net/Makefile +++ b/net/Makefile @@ -9,7 +9,8 @@ obj-y := nonet.o obj-$(CONFIG_NET) := socket.o core/ -obj-$(CONFIG_COMPAT) += compat.o +tmp-$(CONFIG_COMPAT) := compat.o +obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -- cgit v1.2.3 From 2f122062ec392241d0f0453423284861934acb94 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 6 Sep 2004 23:21:21 -0700 Subject: [AX25]: Fix digipeat leak. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 47fbd98e0e81..3a84182f4474 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1176,13 +1176,16 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, /* check if we can remove this feature. It is broken. 
*/ printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", current->comm); - if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) + if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { + kfree(digi); goto out; + } ax25_fillin_cb(ax25, ax25->ax25_dev); ax25_cb_add(ax25); } else { if (ax25->ax25_dev == NULL) { + kfree(digi); err = -EHOSTUNREACH; goto out; } @@ -1191,8 +1194,7 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_type == SOCK_SEQPACKET && (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi, ax25->ax25_dev->dev))) { - if (digi != NULL) - kfree(digi); + kfree(digi); err = -EADDRINUSE; /* Already such a connection */ ax25_cb_put(ax25t); goto out; -- cgit v1.2.3 From 0c5b8d8a0c82e3ec85588a306f9ae1df55706e4f Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 6 Sep 2004 23:24:36 -0700 Subject: [PACKET]: Fix deref before NULL check in packet_release() Using the automated source checker at coverity.com, they picked up on some code in packet_release() where a NULL check was done after dereferencing. Patch below. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 51d0514fd2a7..1b441a628b71 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -786,11 +786,13 @@ out: static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; - struct packet_opt *po = pkt_sk(sk); + struct packet_opt *po; if (!sk) return 0; + po = pkt_sk(sk); + write_lock_bh(&packet_sklist_lock); sk_del_node_init(sk); write_unlock_bh(&packet_sklist_lock); -- cgit v1.2.3 From cb40262783a52f8b4e44519b33684b76b2fa07dc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Sep 2004 23:35:09 -0700 Subject: [NET]: Fully plug neigh_create/inetdev_destroy race. 
So here is a patch to make sure that there is a barrier between the reading of dev->*_ptr and *dev->neigh_parms. With these barriers in place, it's clear that *dev->neigh_parms can no longer be NULL since once the parms are allocated, that pointer is never reset to NULL again. Therefore I've also removed the parms check in these paths. They were bogus to begin with since if they ever triggered then we'll have dead neigh entries stuck in the hash table. Unfortunately I couldn't arrange for this to happen with DECnet due to the dn_db->parms.up() call that's sandwiched between the assignment of dev->dn_ptr and dn_db->neigh_parms. So I've kept the parms check there but it will now fail instead of continuing. I've also added an smp_wmb() there so that at least we won't be reading garbage from dn_db->neigh_parms. DECnet is also buggy since there is no locking at all in the destruction path. It either needs locking or RCU like IPv4. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/s390/net/qeth_main.c | 8 +++----- net/atm/clip.c | 8 +++----- net/decnet/dn_dev.c | 2 ++ net/decnet/dn_neigh.c | 11 +++++++---- net/ipv4/arp.c | 8 +++----- net/ipv6/ndisc.c | 6 ++---- 6 files changed, 20 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index d5285d105c65..a8e034b156cf 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -6718,17 +6718,15 @@ qeth_arp_constructor(struct neighbour *neigh) } rcu_read_lock(); - in_dev = __in_dev_get(dev); + in_dev = rcu_dereference(__in_dev_get(dev)); if (in_dev == NULL) { rcu_read_unlock(); return -EINVAL; } parms = in_dev->arp_parms; - if (parms) { - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - } + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); neigh->type = inet_addr_type(*(u32 *) neigh->primary_key); diff --git a/net/atm/clip.c b/net/atm/clip.c 
index f7756e1f93ce..104dd4d19da4 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -320,17 +320,15 @@ static int clip_constructor(struct neighbour *neigh) if (neigh->type != RTN_UNICAST) return -EINVAL; rcu_read_lock(); - in_dev = __in_dev_get(dev); + in_dev = rcu_dereference(__in_dev_get(dev)); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } parms = in_dev->arp_parms; - if (parms) { - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - } + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); neigh->ops = &clip_neigh_ops; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 733b1cf6c440..a21a326808b4 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1108,6 +1109,7 @@ struct dn_dev *dn_dev_create(struct net_device *dev, int *err) memset(dn_db, 0, sizeof(struct dn_dev)); memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms)); + smp_wmb(); dev->dn_ptr = dn_db; dn_db->dev = dev; init_timer(&dn_db->timer); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index e874232ec54b..d3d6c592a5cb 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -139,17 +139,20 @@ static int dn_neigh_construct(struct neighbour *neigh) struct neigh_parms *parms; rcu_read_lock(); - dn_db = dev->dn_ptr; + dn_db = rcu_dereference(dev->dn_ptr); if (dn_db == NULL) { rcu_read_unlock(); return -EINVAL; } parms = dn_db->neigh_parms; - if (parms) { - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); + if (!parms) { + rcu_read_unlock(); + return -EINVAL; } + + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (dn_db->use_long) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f4e6a4a368ec..41e726ac3337 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -244,17 +244,15 @@ static int arp_constructor(struct neighbour *neigh) neigh->type 
= inet_addr_type(addr); rcu_read_lock(); - in_dev = __in_dev_get(dev); + in_dev = rcu_dereference(__in_dev_get(dev)); if (in_dev == NULL) { rcu_read_unlock(); return -EINVAL; } parms = in_dev->arp_parms; - if (parms) { - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - } + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (dev->hard_header == NULL) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6d23ea909aca..e1f5aeb79258 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -297,10 +297,8 @@ static int ndisc_constructor(struct neighbour *neigh) } parms = in6_dev->nd_parms; - if (parms) { - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - } + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; -- cgit v1.2.3 From 43da55cbd54ed79f38556b39facb89d06448a267 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 6 Sep 2004 23:36:36 -0700 Subject: [NET]: Do less atomic count changes in dev_queue_xmit. With suggestions from Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7a50c543e505..47b3d8497a5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1249,17 +1249,17 @@ int __skb_linearize(struct sk_buff *skb, int gfp_mask) return 0; } -#define HARD_TX_LOCK_BH(dev, cpu) { \ +#define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - spin_lock_bh(&dev->xmit_lock); \ + spin_lock(&dev->xmit_lock); \ dev->xmit_lock_owner = cpu; \ } \ } -#define HARD_TX_UNLOCK_BH(dev) { \ +#define HARD_TX_UNLOCK(dev) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ dev->xmit_lock_owner = -1; \ - spin_unlock_bh(&dev->xmit_lock); \ + spin_unlock(&dev->xmit_lock); \ } \ } @@ -1313,7 +1313,12 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_checksum_help(&skb, 0)) goto out_kfree_skb; - rcu_read_lock(); + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + local_bh_disable(); + /* Updates of qdisc are serialized by queue_lock. * The struct Qdisc which is pointed to by qdisc is now a * rcu structure - it may be accessed without acquiring @@ -1332,18 +1337,16 @@ int dev_queue_xmit(struct sk_buff *skb) #endif if (q->enqueue) { /* Grab device queue */ - spin_lock_bh(&dev->queue_lock); + spin_lock(&dev->queue_lock); rc = q->enqueue(skb, q); qdisc_run(dev); - spin_unlock_bh(&dev->queue_lock); - rcu_read_unlock(); + spin_unlock(&dev->queue_lock); rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; goto out; } - rcu_read_unlock(); /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... 
@@ -1358,12 +1361,11 @@ int dev_queue_xmit(struct sk_buff *skb) Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = get_cpu(); + int cpu = smp_processor_id(); /* ok because BHs are off */ if (dev->xmit_lock_owner != cpu) { - HARD_TX_LOCK_BH(dev, cpu); - put_cpu(); + HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { if (netdev_nit) @@ -1371,17 +1373,16 @@ int dev_queue_xmit(struct sk_buff *skb) rc = 0; if (!dev->hard_start_xmit(skb, dev)) { - HARD_TX_UNLOCK_BH(dev); + HARD_TX_UNLOCK(dev); goto out; } } - HARD_TX_UNLOCK_BH(dev); + HARD_TX_UNLOCK(dev); if (net_ratelimit()) printk(KERN_CRIT "Virtual device %s asks to " "queue packet!\n", dev->name); goto out_enetdown; } else { - put_cpu(); /* Recursion is detected! It is possible, * unfortunately */ if (net_ratelimit()) @@ -1394,6 +1395,7 @@ out_enetdown: out_kfree_skb: kfree_skb(skb); out: + local_bh_enable(); return rc; } -- cgit v1.2.3 From e31cd2a33e417cd7e9328dfa32449223c968297c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Sep 2004 23:40:52 -0700 Subject: [PKT_SCHED]: Fix gact compile warnings. Signed-off-by: David S. 
Miller --- net/sched/gact.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/gact.c b/net/sched/gact.c index cd1a58c60485..5607f5e8cd83 100644 --- a/net/sched/gact.c +++ b/net/sched/gact.c @@ -76,7 +76,9 @@ tcf_gact_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,int ov { struct rtattr *tb[TCA_GACT_MAX]; struct tc_gact *parm = NULL; +#ifdef CONFIG_GACT_PROB struct tc_gact_p *p_parm = NULL; +#endif struct tcf_gact *p = NULL; int ret = 0; int size = sizeof (*p); @@ -176,7 +178,9 @@ tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb->tail; struct tc_gact opt; +#ifdef CONFIG_GACT_PROB struct tc_gact_p p_opt; +#endif struct tcf_gact *p; struct tcf_t t; -- cgit v1.2.3 From a88031e42f7a3066c2cf73846ebb8c6d48d33cc9 Mon Sep 17 00:00:00 2001 From: Wensong Zhang Date: Mon, 6 Sep 2004 23:43:52 -0700 Subject: [IPVS] fixed to call nf_reset() to reset netfilter related fields Recommended by Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/ipvs/ip_vs_xmit.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 3a85f7a8d02a..9a8f051208d1 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -127,6 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ + nf_reset(skb); \ (skb)->nfcache |= NFC_IPVS_PROPERTY; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ @@ -201,9 +202,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif /* CONFIG_NETFILTER_DEBUG */ IP_VS_XMIT(skb, rt); LeaveFunction(10); @@ -280,9 +278,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif /* CONFIG_NETFILTER_DEBUG */ IP_VS_XMIT(skb, rt); LeaveFunction(10); @@ -418,10 +413,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif /* CONFIG_NETFILTER_DEBUG */ - IP_VS_XMIT(skb, rt); LeaveFunction(10); @@ -480,9 +471,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif /* CONFIG_NETFILTER_DEBUG */ IP_VS_XMIT(skb, rt); LeaveFunction(10); @@ -557,9 +545,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif /* CONFIG_NETFILTER_DEBUG */ IP_VS_XMIT(skb, rt); rc = NF_STOLEN; -- cgit v1.2.3 From 7ad9cf1c3ef14cd0cf165dcf76d52d25855ea15b 
Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 6 Sep 2004 23:49:43 -0700 Subject: [BRIDGE]: deadlock on device removal Fixes: https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=131569 Deadlock in bridge when removing device interface module. br_del_if assumes br->lock is not held. This fixes the case of: brctl addbr b0 brctl addif b0 eth0 rmmod eth0 Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_notify.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index d0702fbcb21d..f8fb49e34764 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -76,10 +76,12 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; case NETDEV_UNREGISTER: + spin_unlock_bh(&br->lock); br_del_if(br, dev); - break; + goto done; } spin_unlock_bh(&br->lock); + done: return NOTIFY_DONE; } -- cgit v1.2.3 From aeb2cc38c4d50ca1fc4a8815c7eba8bb8eb14059 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 6 Sep 2004 23:50:58 -0700 Subject: [NET]: Fix compat layer setsockopt overzealous conversions. compat_sys_setsockopt() is a little overzealous about converting 32-bit stuff into 64-bit. It should match on level _and_ optname, not just optname. Currently it eats the IPV6_V6ONLY sockopt because its value (26) happens to match SO_ATTACH_FILTER. This makes it at least check 'level' for everything but IPT_SO_SET_REPLACE == IPT6_SO_SET_REPLACE, because that does seem to be the same in different levels. But do_netfilter_replace() is another can of worms entirely -- it doesn't actually work either, because some netfilter modules (like ipt_limit) include kernel-only bits which change size in the structure they share with userspace. Signed-off-by: David S. 
Miller --- net/compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 998b21b65363..6080b6439b96 100644 --- a/net/compat.c +++ b/net/compat.c @@ -455,13 +455,15 @@ static int do_set_sock_timeout(int fd, int level, int optname, char __user *optv asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + /* SO_SET_REPLACE seems to be the same in all levels */ if (optname == IPT_SO_SET_REPLACE) return do_netfilter_replace(fd, level, optname, optval, optlen); - if (optname == SO_ATTACH_FILTER) + if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER) return do_set_attach_filter(fd, level, optname, optval, optlen); - if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) + if (level == SOL_SOCKET && + (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) return do_set_sock_timeout(fd, level, optname, optval, optlen); return sys_setsockopt(fd, level, optname, optval, optlen); -- cgit v1.2.3 From d81b34dd264c876be4b8fc86bc9754e2b566a6a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2004 23:52:08 -0700 Subject: [IPV4]: Mark inet_family_ops static Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 3 +-- net/sctp/protocol.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ed2923791e0e..c859b31fd0f1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -837,7 +837,7 @@ struct proto_ops inet_dgram_ops = { .sendpage = inet_sendpage, }; -struct net_proto_family inet_family_ops = { +static struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, @@ -1157,7 +1157,6 @@ EXPORT_SYMBOL(inet_accept); EXPORT_SYMBOL(inet_bind); EXPORT_SYMBOL(inet_dgram_connect); EXPORT_SYMBOL(inet_dgram_ops); -EXPORT_SYMBOL(inet_family_ops); EXPORT_SYMBOL(inet_getname); EXPORT_SYMBOL(inet_ioctl); EXPORT_SYMBOL(inet_listen); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index bae07708eb01..97f66fd770f4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -81,8 +81,6 @@ static struct sctp_af *sctp_af_v6_specific; kmem_cache_t *sctp_chunk_cachep; kmem_cache_t *sctp_bucket_cachep; -extern struct net_proto_family inet_family_ops; - extern int sctp_snmp_proc_init(void); extern int sctp_snmp_proc_exit(void); extern int sctp_eps_proc_init(void); -- cgit v1.2.3 From 50e2daaaae756c25216e2274acb61386f8f33a31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2004 23:53:00 -0700 Subject: [NET]: Unexport {alloc,free}_divert_blk() these are called by dev.c for every device (and nowhere else) Signed-off-by: David S. 
Miller --- net/core/dv.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/core/dv.c b/net/core/dv.c index c1340cc53b75..f8e3f9c6b282 100644 --- a/net/core/dv.c +++ b/net/core/dv.c @@ -553,6 +553,3 @@ void divert_frame(struct sk_buff *skb) break; } } - -EXPORT_SYMBOL(alloc_divert_blk); -EXPORT_SYMBOL(free_divert_blk); -- cgit v1.2.3 From 5c96c965be27556d2345a9534e36ff0e93604cec Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 7 Sep 2004 00:18:46 -0700 Subject: [NETFILTER]: Fix build error with CONFIG_SYSCTL disabled. ip_ct_log_invalid was added without testing that it compiles without CONFIG_SYSCTL. Since sysctl is the only way of turning it on, there should be no references to it if !CONFIG_SYSCTL. Also, that turns off CONFIG_PROC_FS, which elicits more warnings. Squish them too. Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- .../linux/netfilter_ipv4/ip_conntrack_protocol.h | 4 ++++ net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 2 +- net/ipv4/netfilter/ip_conntrack_standalone.c | 21 +++++++++++++++------ net/ipv4/netfilter/ip_queue.c | 2 ++ 4 files changed, 22 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index 55d57404acb8..6edb801fa51f 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -75,6 +75,7 @@ extern int ip_conntrack_protocol_tcp_init(void); /* Log invalid packets */ extern unsigned int ip_ct_log_invalid; +#ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) @@ -83,5 +84,8 @@ extern unsigned int ip_ct_log_invalid; ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ && net_ratelimit()) #endif +#else +#define LOG_INVALID(proto) 0 +#endif /* CONFIG_SYSCTL */ #endif /*_IP_CONNTRACK_PROTOCOL_H*/ diff
--git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 64755c5aed6e..3e51036e5065 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -623,8 +623,8 @@ int __init init(void) return ret; - cleanup: #ifdef CONFIG_SYSCTL + cleanup: ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); #endif out: diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index ceff26dbff47..f4c3899771c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -54,6 +54,7 @@ static int kill_proto(const struct ip_conntrack *i, void *data) *((u_int8_t *) data)); } +#ifdef CONFIG_PROC_FS static unsigned int print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple, struct ip_conntrack_protocol *proto) @@ -367,6 +368,7 @@ static struct file_operations ct_cpu_seq_fops = { .llseek = seq_lseek, .release = seq_release_private, }; +#endif static unsigned int ip_confirm(unsigned int hooknum, struct sk_buff **pskb, @@ -726,10 +728,15 @@ static ctl_table ip_ct_net_table[] = { }, { .ctl_name = 0 } }; -#endif + +EXPORT_SYMBOL(ip_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + static int init_or_cleanup(int init) { +#ifdef CONFIG_PROC_FS struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif int ret = 0; if (!init) goto cleanup; @@ -738,19 +745,20 @@ static int init_or_cleanup(int init) if (ret < 0) goto cleanup_nothing; - proc = proc_net_create("ip_conntrack", 0440, NULL); +#ifdef CONFIG_PROC_FS + proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); if (!proc) goto cleanup_init; - proc->proc_fops = &ct_file_ops; - proc_exp = proc_net_create("ip_conntrack_expect", 0440, NULL); + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); if (!proc_exp) goto cleanup_proc; - proc_exp->proc_fops = &exp_file_ops; proc_stat = 
proc_net_fops_create("ip_conntrack_stat", S_IRUGO, &ct_cpu_seq_fops); if (!proc_stat) goto cleanup_proc_exp; proc_stat->owner = THIS_MODULE; +#endif ret = nf_register_hook(&ip_conntrack_defrag_ops); if (ret < 0) { @@ -814,12 +822,14 @@ static int init_or_cleanup(int init) local_bh_enable(); nf_unregister_hook(&ip_conntrack_defrag_ops); cleanup_proc_stat: +#ifdef CONFIG_PROC_FS proc_net_remove("ip_conntrack_stat"); cleanup_proc_exp: proc_net_remove("ip_conntrack_exp"); cleanup_proc: proc_net_remove("ip_conntrack"); cleanup_init: +#endif /* CONFIG_PROC_FS */ ip_conntrack_cleanup(); cleanup_nothing: return ret; @@ -912,4 +922,3 @@ EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_put); -EXPORT_SYMBOL(ip_ct_log_invalid); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 906b89df2f19..26dca38f692a 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -619,6 +619,7 @@ static ctl_table ipq_root_table[] = { { .ctl_name = 0 } }; +#ifdef CONFIG_PROC_FS static int ipq_get_info(char *buffer, char **start, off_t offset, int length) { @@ -648,6 +649,7 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) len = 0; return len; } +#endif /* CONFIG_PROC_FS */ static int init_or_cleanup(int init) -- cgit v1.2.3 From ef31c0c83addad073e6fda9885f0196b9308b6e3 Mon Sep 17 00:00:00 2001 From: Pozsar Balazs Date: Tue, 7 Sep 2004 00:20:26 -0700 Subject: [PKT_SCHED]: Add missing MODULE_LICENSE. Signed-off-by: Pozsar Balazs Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index fe530156875a..ff61f8e698c9 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -714,3 +714,4 @@ static void __exit atm_exit(void) module_init(atm_init) module_exit(atm_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3