From 5eb8f4d74204864d8a4154772206ad747274d12d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Apr 2015 14:56:51 +0200 Subject: mac80211: don't warn when stopping VLAN with stations Stations assigned to an AP_VLAN type interface are flushed when the interface is stopped, but then we warn about it. Suppress the warning since there's nothing else that would ensure those stations are already removed at this point. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b4ac596a7cb7..bab5c63c0bad 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -819,13 +819,15 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * This is relevant only in WDS mode, in all other modes we've - * already removed all stations when disconnecting or similar, - * so warn otherwise. + * In WDS mode a station must exist here and be flushed, for + * AP_VLANs stations may exist since there's nothing else that + * would have removed them, but in other modes there shouldn't + * be any stations. */ flushed = sta_info_flush(sdata); - WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || - (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)); + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || + (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1))); /* don't count this interface for promisc/allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) -- cgit v1.2.3 From 60f4b626d5b5c5724b010200a8c2ff3169f6f4db Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 14:02:30 +0200 Subject: mac80211: fix rhashtable conversion My conversion of the mac80211 station hash table to rhashtable completely broke the lookup in sta_info_get() as it no longer took into account the virtual interface. Fix that. Fixes: 7bedd0cfad4e1 ("mac80211: use rhashtable for station table") Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 12971b71d0fa..39893178a1a9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -157,8 +157,24 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + struct rhash_head *tmp; + const struct bucket_table *tbl; + + rcu_read_lock(); + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - return rhashtable_lookup_fast(&local->sta_hash, addr, sta_rht_params); + for_each_sta_info(local, tbl, addr, sta, tmp) { + if (sta->sdata == sdata) { + rcu_read_unlock(); + /* this is safe as the caller must already hold + * another rcu read section or the mutex + */ + return sta; + } + } + rcu_read_unlock(); + return NULL; } /* -- cgit v1.2.3 From caf22d311a35d1c0c2c73e1f0988276d51175c8f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2015 11:10:10 +0200 Subject: mac80211: enable hash table shrinking The hashtable behaviour change was merged into the tree at about the same time as the mac80211 use of rhashtable, but of course these don't really conflict in the normal sense. Enable hash table shrinking now. Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 39893178a1a9..2880f2ae99ab 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -66,6 +66,7 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ + .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, sta.addr), .key_len = ETH_ALEN, -- cgit v1.2.3 From bdddbf6996c0b9299efc97b8f66e06286f3aa8c9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 29 Apr 2015 08:42:44 +0800 Subject: xfrm: fix a race in xfrm_state_lookup_byspi The returned xfrm_state should be hold before unlock xfrm_state_lock, otherwise the returned xfrm_state maybe be released. Fixes: c454997e6[{pktgen, xfrm} Introduce xfrm_state_lookup_byspi..] Cc: Fan Du Signed-off-by: Li RongQing Acked-by: Fan Du Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f5e39e35d73a..96688cd0f6f1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -927,8 +927,8 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, x->id.spi != spi) continue; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); -- cgit v1.2.3 From 0df48c26d8418c5c9fba63fac15b660d70ca2f1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:17 -0700 Subject: tcp: add tcpi_bytes_acked to tcp_info This patch tracks total number of bytes acked for a TCP socket. This is the sum of all changes done to tp->snd_una, and allows for precise tracking of delivered data. RFC4898 named this : tcpEStatsAppHCThruOctetsAcked This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_acked was placed near tp->snd_una for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ include/net/tcp.h | 2 +- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 6 +++++- net/ipv4/tcp_input.c | 13 +++++++++++-- 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0caa3a2d4106..0f73b43171da 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -150,6 +150,10 @@ struct tcp_sock { u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ + u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 051dc5c2802d..dd7b4ea6a10c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -576,7 +576,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) } /* tcp.c */ -void tcp_get_info(const struct sock *, struct tcp_info *); +void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3b9718328d8b..6666e98a0af9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -189,6 +189,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c5cd9efebbc..4bf0e8ca7b5b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2592,7 +2592,7 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2663,6 +2663,10 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) rate = READ_ONCE(sk->sk_max_pacing_rate); info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + + spin_lock_bh(&sk->sk_lock.slock); + info->tcpi_bytes_acked = tp->bytes_acked; + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a4d9b34bed4..378d3f4d4dc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3280,6 +3280,15 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } +/* If we update tp->snd_una, also update tp->bytes_acked */ +static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + tp->bytes_acked += delta; + tp->snd_una = ack; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -3315,7 +3324,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } } - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); return flag; } @@ -3497,7 +3506,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack_seq); - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); -- cgit v1.2.3 From bdd1f9edacb5f5835d1e6276571bbbe5b88ded48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:18 -0700 Subject: tcp: add tcpi_bytes_received to tcp_info This patch tracks total number of payload bytes received on a TCP socket. This is the sum of all changes done to tp->rcv_nxt RFC4898 named this : tcpEStatsAppHCThruOctetsReceived This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_received was placed near tp->rcv_nxt for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 17 +++++++++++++---- 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0f73b43171da..3b2911502a8c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -145,6 +145,10 @@ struct tcp_sock { * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ + u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6666e98a0af9..a48f93f3207b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -190,6 +190,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bf0e8ca7b5b..99fcc0b22c92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2666,6 +2666,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) spin_lock_bh(&sk->sk_lock.slock); info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3d87aca6be8..3c673d5e6cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 378d3f4d4dc3..7e6962bcfc30 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3289,6 +3289,15 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) tp->snd_una = ack; } +/* If we update tp->rcv_nxt, also update tp->bytes_received */ +static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) +{ + u32 delta = seq - tp->rcv_nxt; + + tp->bytes_received += delta; + tp->rcv_nxt = seq; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -4245,7 +4254,7 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4413,7 +4422,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); @@ -4506,7 +4515,7 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5254,7 +5263,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } -- cgit v1.2.3 From 64f40ff5bbdb1b679fb3c4dbc8230d6517d2b8dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 16:23:48 -0700 Subject: tcp: prepare CC get_info() access from getsockopt() We would like that optional info provided by Congestion Control modules using netlink can also be read using getsockopt() This patch changes get_info() to put this information in a buffer, instead of skb, like tcp_get_info(), so that following patch can reuse this common infrastructure. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Daniel Borkmann Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++++- include/uapi/linux/inet_diag.h | 4 ++++ net/ipv4/inet_diag.c | 8 +++++--- net/ipv4/tcp_dctcp.c | 20 ++++++++++---------- net/ipv4/tcp_illinois.c | 21 +++++++++++---------- net/ipv4/tcp_vegas.c | 19 ++++++++++--------- net/ipv4/tcp_vegas.h | 3 ++- 7 files changed, 46 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dd7b4ea6a10c..6d204f3f9df8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -804,6 +804,8 @@ enum tcp_ca_ack_event_flags { /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +union tcp_cc_info; + struct tcp_congestion_ops { struct list_head list; u32 key; @@ -829,7 +831,8 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us); /* get info for inet_diag (optional) */ - int (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); + size_t (*get_info)(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index d65c0a09efd3..c7093c75bdd6 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -143,4 +143,8 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bb77ebdae3b3..4d32262c7502 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -224,14 +224,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT) { - int err = 0; + union tcp_cc_info info; + size_t sz = 0; + int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) - err = ca_ops->get_info(sk, ext, skb); + sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); - if (err < 0) + if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4376016f7fa5..4c41c1287197 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) } } -static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); @@ -286,18 +287,17 @@ static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcp_dctcp_info info; - - memset(&info, 0, sizeof(info)); + memset(info, 0, sizeof(struct tcp_dctcp_info)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { - info.dctcp_enabled = 1; - info.dctcp_ce_state = (u16) ca->ce_state; - info.dctcp_alpha = ca->dctcp_alpha; - info.dctcp_ab_ecn = ca->acked_bytes_ecn; - info.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_enabled = 1; + info->dctcp.dctcp_ce_state = (u16) ca->ce_state; + info->dctcp.dctcp_alpha = ca->dctcp_alpha; + info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; + info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; } - return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + *attr = INET_DIAG_DCTCPINFO; + return sizeof(*info); } return 0; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 67476f085e48..f71002e4db0b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -300,24 +300,25 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct illinois *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rttcnt = ca->cnt_rtt, - .tcpv_minrtt = ca->base_rtt, - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->cnt_rtt; + info->vegas.tcpv_minrtt = ca->base_rtt; + info->vegas.tcpv_rtt = 0; - if (info.tcpv_rttcnt > 0) { + if (info->vegas.tcpv_rttcnt > 0) { u64 t = ca->sum_rtt; - do_div(t, info.tcpv_rttcnt); - info.tcpv_rtt = t; + do_div(t, info->vegas.tcpv_rttcnt); + info->vegas.tcpv_rtt = t; } - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c71a1b8f7bde..a6cea1d5e20d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -286,18 +286,19 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } /* Extract info for Tcp socket info provided via netlink. */ -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = ca->doing_vegas_now, - .tcpv_rttcnt = ca->cntRTT, - .tcpv_rtt = ca->baseRTT, - .tcpv_minrtt = ca->minRTT, - }; - - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + info->vegas.tcpv_enabled = ca->doing_vegas_now, + info->vegas.tcpv_rttcnt = ca->cntRTT, + info->vegas.tcpv_rtt = ca->baseRTT, + info->vegas.tcpv_minrtt = ca->minRTT, + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index e8a6b33cc61d..ef9da5306c68 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); #endif /* __TCP_VEGAS_H */ -- cgit v1.2.3 From 6e9250f59ef9efb932c84850cd221f22c2a03c4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 16:23:49 -0700 Subject: tcp: add TCP_CC_INFO socket option Some Congestion Control modules can provide per flow information, but current way to get this information is to use netlink. Like TCP_INFO, let's add TCP_CC_INFO so that applications can issue a getsockopt() if they have a socket file descriptor, instead of playing complex netlink games. Sample usage would be : union tcp_cc_info info; socklen_t len = sizeof(info); if (getsockopt(fd, SOL_TCP, TCP_CC_INFO, &info, &len) == -1) Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Daniel Borkmann Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a48f93f3207b..faa72f4fa547 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -112,6 +112,7 @@ enum { #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 99fcc0b22c92..46efa03d2b11 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -252,6 +252,7 @@ #include #include #include +#include #include #include #include @@ -2739,6 +2740,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; -- cgit v1.2.3 From 9dac8835440622c2f84591673969d510ce198c11 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 29 Apr 2015 11:28:30 -0700 Subject: tcp: update reordering first before detecting loss tcp_mark_lost_retrans is not used when FACK is disabled. Since tcp_update_reordering may disable FACK, it should be called first before tcp_mark_lost_retrans. Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7e6962bcfc30..bc790ea9960f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1820,14 +1820,12 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - tcp_mark_lost_retrans(sk); - - tcp_verify_left_out(tp); - if ((state.reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_mark_lost_retrans(sk); + tcp_verify_left_out(tp); out: #if FASTRETRANS_DEBUG > 0 -- cgit v1.2.3 From d24d81444f8caf1895256ef2d2e89ae8202a17e4 Mon Sep 17 00:00:00 2001 From: Gabriele Mazzotta Date: Sun, 26 Apr 2015 20:51:50 +0200 Subject: Bluetooth: Skip the shutdown routine if the interface is not up Most likely, the shutdown routine requires the interface to be up. This is the case for BTUSB_INTEL: the routine tries to send a command to the interface, but since this one is down, it fails and exits once HCI_INIT_TIMEOUT has expired. Signed-off-by: Gabriele Mazzotta Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 4.0.x --- net/bluetooth/hci_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 476709bd068a..4663c3dad3f5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1557,7 +1557,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) hdev->shutdown(hdev); -- cgit v1.2.3 From 2b4d413c3871af0f2ccd466fb6581ed2c2d89438 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 30 Apr 2015 17:44:53 +0200 Subject: mac802154: fix ieee802154_register_hw error handling Currently if ieee802154_if_add failed, we don't unregister the wpan phy which was registered before. This patch adds a correct error handling for unregister the wpan phy when ieee802154_if_add failed. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 8500378c8318..beece7b7a776 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -166,13 +166,15 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) if (IS_ERR(dev)) { rtnl_unlock(); rc = PTR_ERR(dev); - goto out_wq; + goto out_phy; } rtnl_unlock(); return 0; +out_phy: + wpan_phy_unregister(local->phy); out_wq: destroy_workqueue(local->workqueue); out: -- cgit v1.2.3 From 89eb6d0677a6daf134015bc7bd5ec1432911eed2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 30 Apr 2015 17:44:54 +0200 Subject: mac802154: llsec: fix return value check in llsec_key_alloc() In case of error, the functions crypto_alloc_aead() and crypto_alloc_blkcipher() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Signed-off-by: Wei Yongjun Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/llsec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index dcf73958133a..5b2be12832e6 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -134,7 +134,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm[i]) + if (IS_ERR(key->tfm[i])) goto err_tfm; if (crypto_aead_setkey(key->tfm[i], template->key, IEEE802154_LLSEC_KEY_SIZE)) @@ -144,7 +144,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) } key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm0) + if (IS_ERR(key->tfm0)) goto err_tfm; if (crypto_blkcipher_setkey(key->tfm0, template->key, -- cgit v1.2.3 From 1cc800e7aabb40e2d8ffc3d259e6a153007fa9eb Mon Sep 17 00:00:00 2001 From: Guido Günther Date: Thu, 30 Apr 2015 17:44:55 +0200 Subject: ieee802154: Add trace events for rdev->ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enabling tracing via echo 1 > /sys/kernel/debug/tracing/events/cfg802154/enable enables event tracing like iwpan dev wpan0 set pan_id 0xbeef cat /sys/kernel/debug/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:1 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | iwpan-2663 [000] .... 170.369142: 802154_rdev_set_pan_id: phy0, wpan_dev(1), pan id: 0xbeef iwpan-2663 [000] .... 170.369177: 802154_rdev_return_int: phy0, returned: 0 Signed-off-by: Guido Günther Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/Makefile | 4 +- net/ieee802154/rdev-ops.h | 77 ++++++++++++--- net/ieee802154/trace.c | 7 ++ net/ieee802154/trace.h | 246 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 322 insertions(+), 12 deletions(-) create mode 100644 net/ieee802154/trace.c create mode 100644 net/ieee802154/trace.h (limited to 'net') diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index 05dab2957cd4..4adfd4d5471b 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -3,7 +3,9 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o obj-y += 6lowpan/ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \ - header_ops.o sysfs.o nl802154.o + header_ops.o sysfs.o nl802154.o trace.o ieee802154_socket-y := socket.o +CFLAGS_trace.o := -I$(src) + ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 7c46732fad2b..624413ee095f 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -4,6 +4,7 @@ #include #include "core.h" +#include "trace.h" static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, @@ -24,73 +25,127 @@ static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, enum nl802154_iftype type, __le64 extended_addr) { - return rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, + int ret; + + trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); + ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, + extended_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { - return rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + int ret; + + trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev); + ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) { - return rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + int ret; + + trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel); + ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_cca_mode(struct cfg802154_registered_device *rdev, const struct wpan_phy_cca *cca) { - return rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + int ret; + + trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca); + ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) { - return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + int ret; + + trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_short_addr(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 short_addr) { - return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + int ret; + + trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - return rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); + ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + min_be, max_be); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - return rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, - max_csma_backoffs); + int ret; + + trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - return rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); + ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + max_frame_retries); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool mode) { - return rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + int ret; + + trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c new file mode 100644 index 000000000000..95f997fad755 --- /dev/null +++ b/net/ieee802154/trace.c @@ -0,0 +1,7 @@ +#include + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h new file mode 100644 index 000000000000..8ed9f975de01 --- /dev/null +++ b/net/ieee802154/trace.h @@ -0,0 +1,246 @@ +/* Based on net/wireless/tracing.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cfg802154 + +#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __RDEV_CFG802154_OPS_TRACE + +#include + +#include + +#define MAXNAME 32 +#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(wpan_phy), \ + MAXNAME) +#define WPAN_PHY_PR_FMT "%s" +#define WPAN_PHY_PR_ARG __entry->wpan_phy_name + +#define WPAN_DEV_ENTRY __field(u32, identifier) +#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \ + ? wpan_dev->identifier : 0) +#define WPAN_DEV_PR_FMT "wpan_dev(%u)" +#define WPAN_DEV_PR_ARG (__entry->identifier) + +#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define WPAN_CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? "true" : "false" + +/************************************************************* + * rdev->ops traces * + *************************************************************/ + +TRACE_EVENT(802154_rdev_add_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, char *name, + enum nl802154_iftype type, __le64 extended_addr), + TP_ARGS(wpan_phy, name, type, extended_addr), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __string(vir_intf_name, name ? name : "") + __field(enum nl802154_iftype, type) + __field(__le64, extended_addr) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __assign_str(vir_intf_name, name ? name : ""); + __entry->type = type; + __entry->extended_addr = extended_addr; + ), + TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx", + WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, + __entry->extended_addr) +); + +TRACE_EVENT(802154_rdev_del_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), + TP_ARGS(wpan_phy, wpan_dev), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG) +); + +TRACE_EVENT(802154_rdev_set_channel, + TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel), + TP_ARGS(wpan_phy, page, channel), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_rdev_set_cca_mode, + TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), + TP_ARGS(wpan_phy, cca), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_CCA_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_CCA_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_CCA_PR_ARG) +); + +DECLARE_EVENT_CLASS(802154_le16_template, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(__le16, le16arg) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->le16arg = le16arg; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) +); + +DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg) +); + +DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->le16arg) +); + +TRACE_EVENT(802154_rdev_set_backoff_exponent, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 min_be, u8 max_be), + TP_ARGS(wpan_phy, wpan_dev, min_be, max_be), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, min_be) + __field(u8, max_be) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->min_be = min_be; + __entry->max_be = max_be; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) +); + +TRACE_EVENT(802154_rdev_set_csma_backoffs, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 max_csma_backoffs), + TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max csma backoffs: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_rdev_set_max_frame_retries, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + s8 max_frame_retries), + TP_ARGS(wpan_phy, wpan_dev, max_frame_retries), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max frame retries: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_frame_retries) +); + +TRACE_EVENT(802154_rdev_set_lbt_mode, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + bool mode), + TP_ARGS(wpan_phy, wpan_dev, mode), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->mode = mode; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", lbt mode: %s", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_rdev_return_int, + TP_PROTO(struct wpan_phy *wpan_phy, int ret), + TP_ARGS(wpan_phy, ret), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(int, ret) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ret = ret; + ), + TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG, + __entry->ret) +); + +#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include -- cgit v1.2.3 From 5b4a10390460cccf17a9fac739e153d68cf25ef5 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Thu, 30 Apr 2015 17:44:57 +0200 Subject: cfg802154: pass name_assign_type to rdev_add_virtual_intf() This code is based on commit 6bab2e19c5ffd ("cfg80211: pass name_assign_type to rdev_add_virtual_intf()") This will expose in sysfs whether the ifname of a IEEE-802.15.4 device is set by userspace or generated by the kernel. We are using two types of name_assign_types o NET_NAME_ENUM: Default interface name provided by kernel o NET_NAME_USER: Interface name provided by user. Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 2 ++ net/ieee802154/nl-phy.c | 5 ++++- net/ieee802154/nl802154.c | 2 +- net/ieee802154/rdev-ops.h | 10 +++++++--- net/mac802154/cfg.c | 9 ++++++--- net/mac802154/ieee802154_i.h | 3 ++- net/mac802154/iface.c | 5 +++-- net/mac802154/main.c | 3 ++- 8 files changed, 27 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index eeda67652766..6ea16c84293b 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -30,11 +30,13 @@ struct wpan_phy_cca; struct cfg802154_ops { struct net_device * (*add_virtual_intf_deprecated)(struct wpan_phy *wpan_phy, const char *name, + unsigned char name_assign_type, int type); void (*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy, struct net_device *dev); int (*add_virtual_intf)(struct wpan_phy *wpan_phy, const char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr); int (*del_virtual_intf)(struct wpan_phy *wpan_phy, diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 1b9d25f6e898..346c6665d25e 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) int rc = -ENOBUFS; struct net_device *dev; int type = __IEEE802154_DEV_INVALID; + unsigned char name_assign_type; pr_debug("%s\n", __func__); @@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ + name_assign_type = NET_NAME_USER; } else { devname = "wpan%d"; + name_assign_type = NET_NAME_ENUM; } if (strlen(devname) >= IFNAMSIZ) @@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) } dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, - type); + name_assign_type, type); if (IS_ERR(dev)) { rc = PTR_ERR(dev); goto nla_put_failure; diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index a4daf91b8d0a..f3c12f6a4a39 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -589,7 +589,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) return rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL802154_ATTR_IFNAME]), - type, extended_addr); + NET_NAME_USER, type, extended_addr); } static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 624413ee095f..7b5a9dd94fe5 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -8,10 +8,12 @@ static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, - const char *name, int type) + const char *name, + unsigned char name_assign_type, + int type) { return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, - type); + name_assign_type, type); } static inline void @@ -23,13 +25,15 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { int ret; trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); - ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, + ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, + name_assign_type, type, extended_addr); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 5d9f68c75e5f..70be9c799f8a 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -22,13 +22,14 @@ static struct net_device * ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy, - const char *name, int type) + const char *name, + unsigned char name_assign_type, int type) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct net_device *dev; rtnl_lock(); - dev = ieee802154_if_add(local, name, type, + dev = ieee802154_if_add(local, name, name_assign_type, type, cpu_to_le64(0x0000000000000000ULL)); rtnl_unlock(); @@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { struct ieee802154_local *local = wpan_phy_priv(phy); struct net_device *err; - err = ieee802154_if_add(local, name, type, extended_addr); + err = ieee802154_if_add(local, name, name_assign_type, type, + extended_addr); return PTR_ERR_OR_ZERO(err); } diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index bebd70ffc7a3..127ba18386fc 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -182,7 +182,8 @@ void ieee802154_iface_exit(void); void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata); struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr); + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 extended_addr); void ieee802154_remove_interfaces(struct ieee802154_local *local); #endif /* __IEEE802154_I_H */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 38b56f9d9386..91b75abbd1a1 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -522,7 +522,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr) + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 extended_addr) { struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; @@ -531,7 +532,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, - NET_NAME_UNKNOWN, ieee802154_if_setup); + name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 8500378c8318..68b9667323ec 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -161,7 +161,8 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) rtnl_lock(); - dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE, + dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM, + NL802154_IFTYPE_NODE, cpu_to_le64(0x0000000000000000ULL)); if (IS_ERR(dev)) { rtnl_unlock(); -- cgit v1.2.3 From 1add15646672ff4e7fe59bec2afcb5a0c80c5e49 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 30 Apr 2015 17:45:04 +0200 Subject: ieee802154: trace: fix endian convertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fix endian convertions for extended address and short address handling when TP_printk is called. Signed-off-by: Alexander Aring Cc: Guido Günther Signed-off-by: Marcel Holtmann --- net/ieee802154/trace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 8ed9f975de01..5ac25eb6ed17 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -58,7 +58,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf, ), TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx", WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, - __entry->extended_addr) + __le64_to_cpu(__entry->extended_addr)) ); TRACE_EVENT(802154_rdev_del_virtual_intf, @@ -138,7 +138,8 @@ DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x", - WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->le16arg) + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) ); TRACE_EVENT(802154_rdev_set_backoff_exponent, -- cgit v1.2.3 From a5d28090405038ca1f40c13f38d6d4285456efee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2015 09:40:40 -0700 Subject: codel: fix maxpacket/mtu confusion Under presence of TSO/GSO/GRO packets, codel at low rates can be quite useless. In following example, not a single packet was ever dropped, while average delay in codel queue is ~100 ms ! qdisc codel 0: parent 1:12 limit 16000p target 5.0ms interval 100.0ms Sent 134376498 bytes 88797 pkt (dropped 0, overlimits 0 requeues 0) backlog 13626b 3p requeues 0 count 0 lastcount 0 ldelay 96.9ms drop_next 0us maxpacket 9084 ecn_mark 0 drop_overlimit 0 This comes from a confusion of what should be the minimal backlog. It is pretty clear it is not 64KB or whatever max GSO packet ever reached the qdisc. codel intent was to use MTU of the device. After the fix, we finally drop some packets, and rtt/cwnd of my single TCP flow are meeting our expectations. qdisc codel 0: parent 1:12 limit 16000p target 5.0ms interval 100.0ms Sent 102798497 bytes 67912 pkt (dropped 1365, overlimits 0 requeues 0) backlog 6056b 3p requeues 0 count 1 lastcount 1 ldelay 36.3ms drop_next 0us maxpacket 10598 ecn_mark 0 drop_overlimit 0 Signed-off-by: Eric Dumazet Cc: Kathleen Nichols Cc: Dave Taht Cc: Van Jacobson Signed-off-by: David S. Miller --- include/net/codel.h | 10 +++++++--- net/sched/sch_codel.c | 2 +- net/sched/sch_fq_codel.c | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/codel.h b/include/net/codel.h index aeee28081245..1e18005f7f65 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -120,11 +120,13 @@ static inline u32 codel_time_to_us(codel_time_t val) * struct codel_params - contains codel parameters * @target: target queue size (in time units) * @interval: width of moving time window + * @mtu: device mtu, or minimal queue backlog in bytes. * @ecn: is Explicit Congestion Notification enabled */ struct codel_params { codel_time_t target; codel_time_t interval; + u32 mtu; bool ecn; }; @@ -166,10 +168,12 @@ struct codel_stats { u32 ecn_mark; }; -static void codel_params_init(struct codel_params *params) +static void codel_params_init(struct codel_params *params, + const struct Qdisc *sch) { params->interval = MS2TIME(100); params->target = MS2TIME(5); + params->mtu = psched_mtu(qdisc_dev(sch)); params->ecn = false; } @@ -180,7 +184,7 @@ static void codel_vars_init(struct codel_vars *vars) static void codel_stats_init(struct codel_stats *stats) { - stats->maxpacket = 256; + stats->maxpacket = 0; } /* @@ -234,7 +238,7 @@ static bool codel_should_drop(const struct sk_buff *skb, stats->maxpacket = qdisc_pkt_len(skb); if (codel_time_before(vars->ldelay, params->target) || - sch->qstats.backlog <= stats->maxpacket) { + sch->qstats.backlog <= params->mtu) { /* went below - stay below for at least interval */ vars->first_above_time = 0; return false; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index de28f8e968e8..7a0bdb16ac92 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -164,7 +164,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = DEFAULT_CODEL_LIMIT; - codel_params_init(&q->params); + codel_params_init(&q->params, sch); codel_vars_init(&q->vars); codel_stats_init(&q->stats); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1e52decb7b59..c244c45b78d7 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -391,7 +391,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); - codel_params_init(&q->cparams); + codel_params_init(&q->cparams, sch); codel_stats_init(&q->cstats); q->cparams.ecn = true; -- cgit v1.2.3 From edac450d5b5d47aa11e614f19c8f9b17f362e608 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 1 May 2015 08:27:59 +0800 Subject: netlink: Remove max_size setting We currently limit the hash table size to 64K which is very bad as even 10 years ago it was relatively easy to generate millions of sockets. Since the hash table is naturally limited by memory allocation failure, we don't really need an explicit limit so this patch removes it. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ec4adbdcb9b4..daa0b818174b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3139,7 +3139,6 @@ static const struct rhashtable_params netlink_rhashtable_params = { .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, - .max_size = 65536, .automatic_shrinking = true, }; -- cgit v1.2.3 From c0adf54a10903b59037a4c5fcb933dfeeb7b2624 Mon Sep 17 00:00:00 2001 From: shamir rabinovitch Date: Thu, 30 Apr 2015 20:58:07 -0400 Subject: net/rds: fix unaligned memory access rdma_conn_param private data is copied using memcpy after headers such as cma_hdr (see cma_resolve_ib_udp as example). so the start of the private data is aligned to the end of the structure that come before. if this structure end with u32 the meaning is that the start of the private data will be 4 bytes aligned. structures that use u8/u16/u32/u64 are naturally aligned but in case the structure start is not 8 bytes aligned, all u64 members of this structure will not be aligned. to solve this issue we must use special macros that allow unaligned access to those unaligned members. Addresses the following kernel log seen when attempting to use RDMA: Kernel unaligned access at TPC[10507a88] rds_ib_cm_connect_complete+0x1bc/0x1e0 [rds_rdma] Acked-by: Chien Yen Signed-off-by: shamir rabinovitch [Minor tweaks for top of tree by:] Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 31b74f5e61ad..29144a60019f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp) { + /* dp structure start is not guaranteed to be 8 bytes aligned. + * Since dp_ack_seq is 64-bit extended load operations can be + * used so go through get_unaligned to avoid unaligned errors. + */ + u64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); + + if (dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + NULL); + } rds_connect_complete(conn); } -- cgit v1.2.3 From 2e70aedd3d522b018c01df172cd213a8a75e2d55 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 3 May 2015 08:04:28 +0800 Subject: Revert "net: kernel socket should be released in init_net namespace" This reverts commit c243d7e20996254f89c28d4838b5feca735c030d. That patch is solving a non-existant problem while creating a real problem. Just because a socket is allocated in the init name space doesn't mean that it gets hashed in the init name space. When we unhash it the name space must be the same as the one we had when we hashed it. So this patch is completely bogus and causes socket leaks. Reported-by: Andrey Wagin Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index e891bcf325ca..292f42228bfb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1474,8 +1474,8 @@ void sk_release_kernel(struct sock *sk) return; sock_hold(sk); - sock_net_set(sk, get_net(&init_net)); sock_release(sk->sk_socket); + sock_net_set(sk, get_net(&init_net)); sock_put(sk); } EXPORT_SYMBOL(sk_release_kernel); -- cgit v1.2.3 From 9507271d960a1911a51683888837d75c171cd91f Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 28 Apr 2015 16:29:53 -0400 Subject: svcrpc: fix potential GSSX_ACCEPT_SEC_CONTEXT decoding failures In an environment where the KDC is running Active Directory, the exported composite name field returned in the context could be large enough to span a page boundary. Attaching a scratch buffer to the decoding xdr_stream helps deal with those cases. The case where we saw this was actually due to behavior that's been fixed in newer gss-proxy versions, but we're fixing it here too. Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org Reviewed-by: Simo Sorce Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_rpc_xdr.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 1ec19f6f0c2b..eeeba5adee6d 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, { u32 value_follows; int err; + struct page *scratch; + + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); /* res->status */ err = gssx_dec_status(xdr, &res->status); if (err) - return err; + goto out_free; /* res->context_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_ctx(xdr, res->context_handle); if (err) - return err; + goto out_free; } else { res->context_handle = NULL; } @@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->output_token */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_buffer(xdr, res->output_token); if (err) - return err; + goto out_free; } else { res->output_token = NULL; } @@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->delegated_cred_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { /* we do not support upcall servers sending this data. */ - return -EINVAL; + err = -EINVAL; + goto out_free; } /* res->options */ err = gssx_dec_option_array(xdr, &res->options); +out_free: + __free_page(scratch); return err; } -- cgit v1.2.3 From d66bf7dd27573ee5ea90484899ee952c19ccb194 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 2 May 2015 21:33:44 -0400 Subject: net: core: Correct an over-stringent device loop detection. The code in __netdev_upper_dev_link() has an over-stringent loop detection logic that actually prevents valid configurations from working correctly. In particular, the logic returns an error if an upper device is already in the list of all upper devices for a given dev. This particular check seems to be a overzealous as it disallows perfectly valid configurations. For example: # ip l a link eth0 name eth0.10 type vlan id 10 # ip l a dev br0 typ bridge # ip l s eth0.10 master br0 # ip l s eth0 master br0 <--- Will fail If you switch the last two commands (add eth0 first), then both will succeed. If after that, you remove eth0 and try to re-add it, it will fail! It appears to be enough to simply check adj_list to keeps things safe. I've tried stacking multiple devices multiple times in all different combinations, and either rx_handler registration prevented the stacking of the device linking cought the error. Signed-off-by: Vladislav Yasevich Acked-by: Jiri Pirko Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c7ba0388f1be..2c1c67fad64d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5209,7 +5209,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) -- cgit v1.2.3 From e2783717a71e9babfdd7c36c7e35b790d2c01022 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 4 May 2015 11:51:38 -0400 Subject: net/rds: Fix new sparse warning c0adf54a109 introduced new sparse warnings: CHECK /home/dahern/kernels/linux.git/net/rds/ib_cm.c net/rds/ib_cm.c:191:34: warning: incorrect type in initializer (different base types) net/rds/ib_cm.c:191:34: expected unsigned long long [unsigned] [usertype] dp_ack_seq net/rds/ib_cm.c:191:34: got restricted __be64 net/rds/ib_cm.c:194:51: warning: cast to restricted __be64 The temporary variable for sequence number should have been declared as __be64 rather than u64. Make it so. Signed-off-by: David Ahern Cc: shamir rabinovitch Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 29144a60019f..8a09ee7db3c1 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -188,7 +188,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even * Since dp_ack_seq is 64-bit extended load operations can be * used so go through get_unaligned to avoid unaligned errors. */ - u64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); + __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); if (dp_ack_seq) rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), -- cgit v1.2.3 From c967a0873a7836c7a77bf611f1c7d3f47c554c45 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 5 May 2015 09:06:30 -0700 Subject: mpls: Move reserved label definitions Move to include/uapi/linux/mpls.h to be externally visibile. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 10 ++++++++++ net/mpls/af_mpls.c | 18 +++++++++--------- net/mpls/internal.h | 10 ---------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index bc9abfe88c9a..0fe6ea5a41d5 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -31,4 +31,14 @@ struct mpls_label { #define MPLS_LS_TTL_MASK 0x000000FF #define MPLS_LS_TTL_SHIFT 0 +/* Reserved labels */ +#define MPLS_LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ +#define MPLS_LABEL_ROUTER_ALERT 1 /* RFC3032 */ +#define MPLS_LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ +#define MPLS_LABEL_IMPLICIT_NULL 3 /* RFC3032 */ +#define MPLS_LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define MPLS_LABEL_GAL 13 /* RFC5586 */ +#define MPLS_LABEL_OAM_ALERT 14 /* RFC3429 */ +#define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ + #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 954810c76a86..b6eb7615960a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -647,7 +647,7 @@ int nla_get_labels(const struct nlattr *nla, return -EINVAL; switch (dec.label) { - case LABEL_IMPLICIT_NULL: + case MPLS_LABEL_IMPLICIT_NULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. @@ -935,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* In case the predefined labels need to be populated */ - if (limit > LABEL_IPV4_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(lo->addr_len); if (!rt0) @@ -945,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } - if (limit > LABEL_IPV6_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(lo->addr_len); if (!rt2) @@ -973,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit) memcpy(labels, old, cp_size); /* If needed set the predefined labels */ - if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) && - (limit > LABEL_IPV6_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV6_EXPLICIT_NULL], rt2); + if ((old_limit <= MPLS_LABEL_IPV6_EXPLICIT_NULL) && + (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6_EXPLICIT_NULL], rt2); rt2 = NULL; } - if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) && - (limit > LABEL_IPV4_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV4_EXPLICIT_NULL], rt0); + if ((old_limit <= MPLS_LABEL_IPV4_EXPLICIT_NULL) && + (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4_EXPLICIT_NULL], rt0); rt0 = NULL; } diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 693877d69606..b064c345042c 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,16 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H -#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ -#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */ -#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ -#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */ -#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ -#define LABEL_GAL 13 /* RFC5586 */ -#define LABEL_OAM_ALERT 14 /* RFC3429 */ -#define LABEL_EXTENSION 15 /* RFC7274 */ - - struct mpls_shim_hdr { __be32 label_stack_entry; }; -- cgit v1.2.3 From 31ccd0e66d41f73cfc21a8de976e713455205228 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Apr 2015 16:20:58 -0700 Subject: tcp_westwood: fix tcp_westwood_info() I forgot to update tcp_westwood when changing get_info() behavior, this patch should fix this. Fixes: 64f40ff5bbdb ("tcp: prepare CC get_info() access from getsockopt()") Reported-by: kbuild test robot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b3c57cceb990..c10732e39837 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,18 +256,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rtt = jiffies_to_usecs(ca->rtt), - .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = 0; + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } -- cgit v1.2.3 From f30bf2a5cac6c60ab366c4bc6db913597bf4d6ab Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 7 May 2015 15:12:21 +0300 Subject: ipvs: fix memory leak in ip_vs_ctl.c Fix memory leak introduced in commit a0840e2e165a ("IPVS: netns, ip_vs_ctl local vars moved to ipvs struct."): unreferenced object 0xffff88005785b800 (size 2048): comm "(-localed)", pid 1434, jiffies 4294755650 (age 1421.089s) hex dump (first 32 bytes): bb 89 0b 83 ff ff ff ff b0 78 f0 4e 00 88 ff ff .........x.N.... 04 00 00 00 a4 01 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc_track_caller+0x244/0x430 [] kmemdup+0x20/0x50 [] ip_vs_control_net_init+0x1f7/0x510 [] __ip_vs_init+0x100/0x250 [] ops_init+0x41/0x190 [] setup_net+0x93/0x150 [] copy_net_ns+0x82/0x140 [] create_new_namespaces+0xfd/0x190 [] unshare_nsproxy_namespaces+0x5a/0xc0 [] SyS_unshare+0x173/0x310 [] system_call_fastpath+0x12/0x6f [] 0xffffffffffffffff Fixes: a0840e2e165a ("IPVS: netns, ip_vs_ctl local vars moved to ipvs struct.") Signed-off-by: Tommi Rantala Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 49532672f66d..285eae3a1454 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3823,6 +3823,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(net, &ipvs->tot_stats); + + if (!net_eq(net, &init_net)) + kfree(ipvs->sysctl_tbl); } #else -- cgit v1.2.3 From e16e888b525503be05b3aea64190e8b3bdef44d0 Mon Sep 17 00:00:00 2001 From: Markus Stenberg Date: Tue, 5 May 2015 13:36:59 +0300 Subject: ipv6: Fixed source specific default route handling. If there are only IPv6 source specific default routes present, the host gets -ENETUNREACH on e.g. connect() because ip6_dst_lookup_tail calls ip6_route_output first, and given source address any, it fails, and ip6_route_get_saddr is never called. The change is to use the ip6_route_get_saddr, even if the initial ip6_route_output fails, and then doing ip6_route_output _again_ after we have appropriate source address available. Note that this is '99% fix' to the problem; a correct fix would be to do route lookups only within addrconf.c when picking a source address, and never call ip6_route_output before source address has been populated. Signed-off-by: Markus Stenberg Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 39 +++++++++++++++++++++++++++++++-------- net/ipv6/route.c | 5 +++-- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7fde1f265c90..c21777565c58 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -886,22 +886,45 @@ static int ip6_dst_lookup_tail(struct sock *sk, #endif int err; - if (!*dst) - *dst = ip6_route_output(net, sk, fl6); - - err = (*dst)->error; - if (err) - goto out_err_release; + /* The correct way to handle this would be to do + * ip6_route_get_saddr, and then ip6_route_output; however, + * the route-specific preferred source forces the + * ip6_route_output call _before_ ip6_route_get_saddr. + * + * In source specific routing (no src=any default route), + * ip6_route_output will fail given src=any saddr, though, so + * that's why we try it again later. + */ + if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct rt6_info *rt; + bool had_dst = *dst != NULL; - if (ipv6_addr_any(&fl6->saddr)) { - struct rt6_info *rt = (struct rt6_info *) *dst; + if (!had_dst) + *dst = ip6_route_output(net, sk, fl6); + rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; err = ip6_route_get_saddr(net, rt, &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) goto out_err_release; + + /* If we had an erroneous initial result, pretend it + * never existed and let the SA-enabled version take + * over. + */ + if (!had_dst && (*dst)->error) { + dst_release(*dst); + *dst = NULL; + } } + if (!*dst) + *dst = ip6_route_output(net, sk, fl6); + + err = (*dst)->error; + if (err) + goto out_err_release; + #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5c48293ff062..d3588885f097 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2245,9 +2245,10 @@ int ip6_route_get_saddr(struct net *net, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; int err = 0; - if (rt->rt6i_prefsrc.plen) + if (rt && rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, -- cgit v1.2.3 From f711a6ae062caeee46067b2f2f12ffda319ae73c Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 5 May 2015 15:20:51 -0400 Subject: net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection. When running RDS over TCP, the active (client) side connects to the listening ("passive") side at the RDS_TCP_PORT. After the connection is established, if the client side reboots (potentially without even sending a FIN) the server still has a TCP socket in the esablished state. If the server-side now gets a new SYN comes from the client with a different client port, TCP will create a new socket-pair, but the RDS layer will incorrectly pull up the old rds_connection (which is still associated with the stale t_sock and RDS socket state). This patch corrects this behavior by having rds_tcp_accept_one() always create a new connection for an incoming TCP SYN. The rds and tcp state associated with the old socket-pair is cleaned up via the rds_tcp_state_change() callback which would typically be invoked in most cases when the client-TCP sends a FIN on TCP restart, triggering a transition to CLOSE_WAIT state. In the rarer event of client death without a FIN, TCP_KEEPALIVE probes on the socket will detect the stale socket, and the TCP transition to CLOSE state will trigger the RDS state cleanup. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 4 ++++ net/rds/tcp_connect.c | 1 + net/rds/tcp_listen.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 14f041398ca1..60f0cd6ed15f 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *loop_trans; unsigned long flags; int ret; + struct rds_transport *otrans = trans; + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + goto new_conn; rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; +new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f9f564a6c960..973109c7b8e8 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: rds_connect_complete(conn); break; + case TCP_CLOSE_WAIT: case TCP_CLOSE: rds_conn_drop(conn); default: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 23ab4dcd1d9f..0da49e34495f 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work); static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); static struct socket *rds_tcp_listen_sock; +static int rds_tcp_keepalive(struct socket *sock) +{ + /* values below based on xs_udp_default_timeout */ + int keepidle = 5; /* send a probe 'keepidle' secs after last data */ + int keepcnt = 5; /* number of unack'ed probes before declaring dead */ + int keepalive = 1; + int ret = 0; + + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + if (ret < 0) + goto bail; + + /* KEEPINTVL is the interval between successive probes. We follow + * the model in xs_tcp_finish_connecting() and re-use keepidle. + */ + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); +bail: + return ret; +} + static int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; + struct rds_tcp_connection *rs_tcp; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); @@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock) if (ret < 0) goto out; + ret = rds_tcp_keepalive(new_sock); + if (ret < 0) + goto out; + rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock) ret = PTR_ERR(conn); goto out; } + /* An incoming SYN request came in, and TCP just accepted it. + * We always create a new conn for listen side of TCP, and do not + * add it to the c_hash_list. + * + * If the client reboots, this conn will need to be cleaned up. + * rds_tcp_state_change() will do that cleanup + */ + rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; + WARN_ON(!rs_tcp || rs_tcp->t_sock); /* * see the comment above rds_queue_delayed_reconnect() -- cgit v1.2.3 From c82ac7e69efe6dbe370d6ba84e2666d7692ef1c2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 5 May 2015 15:20:52 -0400 Subject: net/rds: RDS-TCP: only initiate reconnect attempt on outgoing TCP socket. When the peer of an RDS-TCP connection restarts, a reconnect attempt should only be made from the active side of the TCP connection, i.e. the side that has a transient TCP port number. Do not add the passive side of the TCP connection to the c_hash_node and thus avoid triggering rds_queue_reconnect() for passive rds connections. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 60f0cd6ed15f..da6da57e5f36 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -234,13 +234,22 @@ new_conn: /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(head, laddr, faddr, trans); + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + found = NULL; + else + found = rds_conn_lookup(head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head_rcu(&conn->c_hash_node, head); + if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || + (otrans->t_type != RDS_TRANS_TCP)) { + /* Only the active side should be added to + * reconnect list for TCP. + */ + hlist_add_head_rcu(&conn->c_hash_node, head); + } rds_cong_add_conn(conn); rds_conn_count++; } -- cgit v1.2.3 From d744318574090c3b796915d9d710bdb17db191a1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 May 2015 15:22:02 -0700 Subject: net_sched: fix a use-after-free in tc_ctl_tfilter() When tcf_destroy() returns true, tp could be already destroyed, we should not use tp->next after that. For long term, we probably should move tp list to list_head. Fixes: 1e052be69d04 ("net_sched: destroy proto tp when all filters are gone") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8b0470e418dc..b6ef9a04de06 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -308,12 +308,11 @@ replay: case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); if (err == 0) { - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); - if (tcf_destroy(tp, false)) { - struct tcf_proto *next = rtnl_dereference(tp->next); + struct tcf_proto *next = rtnl_dereference(tp->next); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); - } } goto errout; case RTM_GETTFILTER: -- cgit v1.2.3 From 78f5b899195019f71f7593c604d75ca61658eae3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 7 May 2015 08:08:51 -0700 Subject: mpls: Change reserved label names to be consistent with netbsd Since these are now visible to userspace it is nice to be consistent with BSD (sys/netmpls/mpls.h in netBSD). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 12 ++++++------ net/mpls/af_mpls.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index 0fe6ea5a41d5..139d4dd1cab8 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -32,13 +32,13 @@ struct mpls_label { #define MPLS_LS_TTL_SHIFT 0 /* Reserved labels */ -#define MPLS_LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ -#define MPLS_LABEL_ROUTER_ALERT 1 /* RFC3032 */ -#define MPLS_LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ -#define MPLS_LABEL_IMPLICIT_NULL 3 /* RFC3032 */ -#define MPLS_LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define MPLS_LABEL_IPV4NULL 0 /* RFC3032 */ +#define MPLS_LABEL_RTALERT 1 /* RFC3032 */ +#define MPLS_LABEL_IPV6NULL 2 /* RFC3032 */ +#define MPLS_LABEL_IMPLNULL 3 /* RFC3032 */ +#define MPLS_LABEL_ENTROPY 7 /* RFC6790 */ #define MPLS_LABEL_GAL 13 /* RFC5586 */ -#define MPLS_LABEL_OAM_ALERT 14 /* RFC3429 */ +#define MPLS_LABEL_OAMALERT 14 /* RFC3429 */ #define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b6eb7615960a..7b3f732269e4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -647,7 +647,7 @@ int nla_get_labels(const struct nlattr *nla, return -EINVAL; switch (dec.label) { - case MPLS_LABEL_IMPLICIT_NULL: + case MPLS_LABEL_IMPLNULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. @@ -935,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* In case the predefined labels need to be populated */ - if (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(lo->addr_len); if (!rt0) @@ -945,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } - if (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(lo->addr_len); if (!rt2) @@ -973,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit) memcpy(labels, old, cp_size); /* If needed set the predefined labels */ - if ((old_limit <= MPLS_LABEL_IPV6_EXPLICIT_NULL) && - (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6_EXPLICIT_NULL], rt2); + if ((old_limit <= MPLS_LABEL_IPV6NULL) && + (limit > MPLS_LABEL_IPV6NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } - if ((old_limit <= MPLS_LABEL_IPV4_EXPLICIT_NULL) && - (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4_EXPLICIT_NULL], rt0); + if ((old_limit <= MPLS_LABEL_IPV4NULL) && + (limit > MPLS_LABEL_IPV4NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } -- cgit v1.2.3 From fbf33a2802f71dba06faa948af805884ff16a2ab Mon Sep 17 00:00:00 2001 From: "Kretschmer, Mathias" Date: Fri, 8 May 2015 15:44:37 +0200 Subject: af_packet / TX_RING not fully non-blocking (w/ MSG_DONTWAIT). This patch fixes an issue where the send(MSG_DONTWAIT) call on a TX_RING is not fully non-blocking in cases where the device's sndBuf is full. We pass nonblock=true to sock_alloc_send_skb() and return any possibly occuring error code (most likely EGAIN) to the caller. As the fast-path stays as it is, we keep the unlikely() around skb == NULL. Signed-off-by: Mathias Kretschmer Signed-off-by: David S. Miller --- net/packet/af_packet.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5102c3cc4eec..b5989c6ee551 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2311,11 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll), - 0, &err); + !need_wait, &err); - if (unlikely(skb == NULL)) + if (unlikely(skb == NULL)) { + /* we assume the socket was initially writeable ... */ + if (likely(len_sum > 0)) + err = len_sum; goto out_status; - + } tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); if (tp_len > dev->mtu + dev->hard_header_len) { -- cgit v1.2.3 From 47b4e1fc4972cc43a19121bc2608a60aef3bf216 Mon Sep 17 00:00:00 2001 From: Janusz Dziedzic Date: Mon, 11 May 2015 11:31:15 +0200 Subject: mac80211: move WEP tailroom size check Remove checking tailroom when adding IV as it uses only headroom, and move the check to the ICV generation that actually needs the tailroom. In other case I hit such warning and datapath don't work, when testing: - IBSS + WEP - ath9k with hw crypt enabled - IPv6 data (ping6) WARNING: CPU: 3 PID: 13301 at net/mac80211/wep.c:102 ieee80211_wep_add_iv+0x129/0x190 [mac80211]() [...] Call Trace: [] dump_stack+0x45/0x57 [] warn_slowpath_common+0x8a/0xc0 [] warn_slowpath_null+0x1a/0x20 [] ieee80211_wep_add_iv+0x129/0x190 [mac80211] [] ieee80211_crypto_wep_encrypt+0x6b/0xd0 [mac80211] [] invoke_tx_handlers+0xc51/0xf30 [mac80211] [...] Cc: stable@vger.kernel.org Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- net/mac80211/wep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index a4220e92f0cc..efa3f48f1ec5 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -98,8 +98,7 @@ static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); - if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN || - skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) + if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -167,6 +166,9 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local, size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; + if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) + return -1; + iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; -- cgit v1.2.3 From 145a42b3a964c6647464f05bd58aa33787de7f75 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sat, 9 May 2015 22:01:47 -0400 Subject: net_sched: gred: use correct backlog value in WRED mode In WRED mode, the backlog for a single virtual queue (VQ) should not be used to determine queue behavior; instead the backlog is summed across all VQs. This sum is currently used when calculating the average queue lengths. It also needs to be used when determining if the queue's hard limit has been reached, or when reporting each VQ's backlog via netlink. q->backlog will only be used if the queue switches out of WRED mode. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a4ca4517cdc8..634529e0ce6b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -229,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) { q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } @@ -553,7 +553,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.limit = q->limit; opt.DP = q->DP; - opt.backlog = q->backlog; + opt.backlog = gred_backlog(table, q, sch); opt.prio = q->prio; opt.qth_min = q->parms.qth_min >> q->parms.Wlog; opt.qth_max = q->parms.qth_max >> q->parms.Wlog; -- cgit v1.2.3 From e3d8ecb70e16412b14fb11c1b68ecb533bd4ea64 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 11 May 2015 15:57:31 +0200 Subject: netns: return RTM_NEWNSID instead of RTM_GETNSID on a get Usually, RTM_NEWxxx is returned on a get (same as a dump). Fixes: 0c7aecd4bde4 ("netns: add rtnl cmd to add and get peer netns ids") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04ad36fc..572af0011997 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -601,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) } err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer, -1); + RTM_NEWNSID, net, peer, -1); if (err < 0) goto err_out; -- cgit v1.2.3 From 64aa42338e9a88c139b89797163714f0f95f3c6b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 13 May 2015 15:26:10 +0800 Subject: esp4: Use high-order sequence number bits for IV generation I noticed we were only using the low-order bits for IV generation when ESN is enabled. This is very bad because it means that the IV can repeat. We must use the full 64 bits. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 421a80b09b62..30b544f025ac 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -256,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) aead_givcrypt_set_crypt(req, sg, sg, clen, iv); aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); -- cgit v1.2.3 From 6d7258ca937027ae86d6d5938d7ae10b6d68f4a4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 13 May 2015 15:27:18 +0800 Subject: esp6: Use high-order sequence number bits for IV generation I noticed we were only using the low-order bits for IV generation when ESN is enabled. This is very bad because it means that the IV can repeat. We must use the full 64 bits. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 31f1b5d5e2ef..7c07ce36aae2 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) aead_givcrypt_set_crypt(req, sg, sg, clen, iv); aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); -- cgit v1.2.3 From be346ffaad9bc354075fba5cd009fc4519abdd64 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 12 May 2015 20:53:14 -0400 Subject: vlan: Correctly propagate promisc|allmulti flags in notifier. Currently vlan notifier handler will try to update all vlans for a device when that device comes up. A problem occurs, however, when the vlan device was set to promiscuous, but not by the user (ex: a bridge). In that case, dev->gflags are not updated. What results is that the lower device ends up with an extra promiscuity count. Here are the backtraces that prove this: [62852.052179] [] __dev_set_promiscuity+0x38/0x1e0 [62852.052186] [] ? _raw_spin_unlock_bh+0x1b/0x40 [62852.052188] [] ? dev_set_rx_mode+0x2e/0x40 [62852.052190] [] dev_set_promiscuity+0x24/0x50 [62852.052194] [] vlan_dev_open+0xd5/0x1f0 [8021q] [62852.052196] [] __dev_open+0xbf/0x140 [62852.052198] [] __dev_change_flags+0x9d/0x170 [62852.052200] [] dev_change_flags+0x29/0x60 The above comes from the setting the vlan device to IFF_UP state. [62852.053569] [] __dev_set_promiscuity+0x38/0x1e0 [62852.053571] [] ? vlan_dev_set_rx_mode+0x2b/0x30 [8021q] [62852.053573] [] __dev_change_flags+0xe5/0x170 [62852.053645] [] dev_change_flags+0x29/0x60 [62852.053647] [] vlan_device_event+0x18a/0x690 [8021q] [62852.053649] [] notifier_call_chain+0x4c/0x70 [62852.053651] [] raw_notifier_call_chain+0x16/0x20 [62852.053653] [] call_netdevice_notifiers+0x2d/0x60 [62852.053654] [] __dev_notify_flags+0x33/0xa0 [62852.053656] [] dev_change_flags+0x52/0x60 [62852.053657] [] do_setlink+0x397/0xa40 And this one comes from the notification code. What we end up with is a vlan with promiscuity count of 1 and and a physical device with a promiscuity count of 2. They should both have a count 1. To resolve this issue, vlan code can use dev_get_flags() api which correctly masks promiscuity and allmulti flags. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/8021q/vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 98a30a5b8664..59555f0f8fc8 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -443,7 +443,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; + flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; -- cgit v1.2.3 From 177d0506a911eb60b38b172215df5325ed94fa64 Mon Sep 17 00:00:00 2001 From: Wesley Kuo Date: Wed, 13 May 2015 10:33:15 +0800 Subject: Bluetooth: Fix remote name event return directly. This patch fixes hci_remote_name_evt dose not resolve name during discovery status is RESOLVING. Before simultaneous dual mode scan enabled, hci_check_pending_name will set discovery status to STOPPED eventually. Signed-off-by: Wesley Kuo Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4663c3dad3f5..c4802f3bd4c5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2854,9 +2854,11 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status, * state. If we were running both LE and BR/EDR inquiry * simultaneously, and BR/EDR inquiry is already * finished, stop discovery, otherwise BR/EDR inquiry - * will stop discovery when finished. + * will stop discovery when finished. If we will resolve + * remote device name, do not change discovery state. */ - if (!test_bit(HCI_INQUIRY, &hdev->flags)) + if (!test_bit(HCI_INQUIRY, &hdev->flags) && + hdev->discovery.state != DISCOVERY_RESOLVING) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } else { -- cgit v1.2.3 From 91dd93f956b9ea9ecf47fd4b9acd2d2e7f980303 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2015 17:24:50 -0700 Subject: netlink: move nl_table in read_mostly section netlink sockets creation and deletion heavily modify nl_table_users and nl_table_lock. If nl_table is sharing one cache line with one of them, netlink performance is really bad on SMP. ffffffff81ff5f00 B nl_table ffffffff81ff5f0c b nl_table_users Putting nl_table in read_mostly section increased performance of my open/delete netlink sockets test by about 80 % This came up while diagnosing a getaddrinfo() problem. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index daa0b818174b..dbe885901b34 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -89,7 +89,7 @@ static inline int netlink_is_kernel(struct sock *sk) return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; } -struct netlink_table *nl_table; +struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); -- cgit v1.2.3 From e87a468eb97da35d8dc00e8fa9828b4de4ab69d0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 14 May 2015 20:34:08 -0400 Subject: ipv6: Fix udp checksums with raw sockets It was reported that trancerout6 would cause a kernel to crash when trying to compute checksums on raw UDP packets. The cause was the check in __ip6_append_data that would attempt to use partial checksums on the packet. However, raw sockets do not initialize partial checksum fields so partial checksums can't be used. Solve this the same way IPv4 does it. raw sockets pass transhdrlen value of 0 to ip_append_data which causes the checksum to be computed in software. Use the same check in ip6_append_data (check transhdrlen). Reported-by: Wolfgang Walter CC: Wolfgang Walter CC: Eric Dumazet Signed-off-by: Vladislav Yasevich Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c21777565c58..bc09cb97b840 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1300,8 +1300,10 @@ emsgsize: /* If this is the first and only packet and device * supports checksum offloading, let's use it. + * Use transhdrlen, same as IPv4, because partial + * sums only work when transhdrlen is set. */ - if (!skb && sk->sk_protocol == IPPROTO_UDP && + if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && length + fragheaderlen < mtu && rt->dst.dev->features & NETIF_F_V6_CSUM && !exthdrlen) -- cgit v1.2.3 From eea39946a1f36e8a5a47c86e7ecfca6076868505 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 13 May 2015 21:17:41 -0700 Subject: rename RTNH_F_EXTERNAL to RTNH_F_OFFLOAD RTNH_F_EXTERNAL today is printed as "offload" in iproute2 output. This patch renames the flag to be consistent with what the user sees. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 +- net/ipv4/fib_trie.c | 2 +- net/switchdev/switchdev.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 974db03f7b1a..17fb02f488da 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -337,7 +337,7 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ -#define RTNH_F_EXTERNAL 8 /* Route installed externally */ +#define RTNH_F_OFFLOAD 8 /* offloaded route */ /* Macros to handle hexthops */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc602da2..64c2076ced54 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1764,7 +1764,7 @@ void fib_table_flush_external(struct fib_table *tb) /* record local slen */ slen = fa->fa_slen; - if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) continue; netdev_switch_fib_ipv4_del(n->key, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b85c333..055453d48668 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -338,7 +338,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, fi, tos, type, nlflags, tb_id); if (!err) - fi->fib_flags |= RTNH_F_EXTERNAL; + fi->fib_flags |= RTNH_F_OFFLOAD; } return err; @@ -364,7 +364,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, const struct swdev_ops *ops; int err = 0; - if (!(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) return 0; dev = netdev_switch_get_dev_by_nhs(fi); @@ -376,7 +376,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, fi, tos, type, tb_id); if (!err) - fi->fib_flags &= ~RTNH_F_EXTERNAL; + fi->fib_flags &= ~RTNH_F_OFFLOAD; } return err; -- cgit v1.2.3 From 595ca5880b37d4aa3c292d75531577175d36b225 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 May 2015 14:15:58 +0200 Subject: netfilter: avoid build error if TPROXY/SOCKET=y && NF_DEFRAG_IPV6=m With TPROXY=y but DEFRAG_IPV6=m we get build failure: net/built-in.o: In function `tproxy_tg_init': net/netfilter/xt_TPROXY.c:588: undefined reference to `nf_defrag_ipv6_enable' If DEFRAG_IPV6 is modular, TPROXY must be too. (or both must be builtin). This enforces =m for both. Reported-and-tested-by: Liu Hua Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f70e34a68f70..a0f3e6a3c7d1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -863,6 +863,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on (IPV6 || IPV6=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -1356,6 +1357,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK depends on (IPV6 || IPV6=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help -- cgit v1.2.3 From b3cad287d13b5f6695c6b4aab72969cd64bf0171 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 7 May 2015 14:54:16 +0200 Subject: conntrack: RFC5961 challenge ACK confuse conntrack LAST-ACK transition In compliance with RFC5961, the network stack send challenge ACK in response to spurious SYN packets, since commit 0c228e833c88 ("tcp: Restore RFC5961-compliant behavior for SYN packets"). This pose a problem for netfilter conntrack in state LAST_ACK, because this challenge ACK is (falsely) seen as ACKing last FIN, causing a false state transition (into TIME_WAIT). The challenge ACK is hard to distinguish from real last ACK. Thus, solution introduce a flag that tracks the potential for seeing a challenge ACK, in case a SYN packet is let through and current state is LAST_ACK. When conntrack transition LAST_ACK to TIME_WAIT happens, this flag is used for determining if we are expecting a challenge ACK. Scapy based reproducer script avail here: https://github.com/netoptimizer/network-testing/blob/master/scapy/tcp_hacks_3WHS_LAST_ACK.py Fixes: 0c228e833c88 ("tcp: Restore RFC5961-compliant behavior for SYN packets") Signed-off-by: Jesper Dangaard Brouer Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_tcp.h | 3 +++ net/netfilter/nf_conntrack_proto_tcp.c | 35 ++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_tcp.h b/include/uapi/linux/netfilter/nf_conntrack_tcp.h index 9993a421201c..ef9f80f0f529 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_tcp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_tcp.h @@ -42,6 +42,9 @@ enum tcp_conntrack { /* The field td_maxack has been set */ #define IP_CT_TCP_FLAG_MAXACK_SET 0x20 +/* Marks possibility for expected RFC5961 challenge ACK */ +#define IP_CT_EXP_CHALLENGE_ACK 0x40 + struct nf_ct_tcp_flags { __u8 flags; __u8 mask; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 5caa0c41bf26..70383de72054 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ @@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ @@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct, 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; memset(&ct->proto.tcp.seen[dir], 0, @@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct, * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and - * then we'll get in sync. Otherwise, the server ignores it. */ + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. + */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; @@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } + /* Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK) + ct->proto.tcp.last_flags |= + IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) @@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; + case TCP_CONNTRACK_TIME_WAIT: + /* RFC5961 compliance cause stack to send "challenge-ACK" + * e.g. in response to spurious SYNs. Conntrack MUST + * not believe this ACK is acking last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK && + index == TCP_ACK_SET && + ct->proto.tcp.last_dir != dir && + ct->proto.tcp.last_index == TCP_SYN_SET && + (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: challenge-ACK ignored "); + return NF_ACCEPT; /* Don't change state */ + } + break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) -- cgit v1.2.3 From 960bd2c26421d321e890f1936938196ead41976f Mon Sep 17 00:00:00 2001 From: Mirek Kratochvil Date: Fri, 15 May 2015 21:15:29 +0200 Subject: netfilter: nf_tables: fix bogus warning in nft_data_uninit() The values 0x00000000-0xfffffeff are reserved for userspace datatype. When, deleting set elements with maps, a bogus warning is triggered. WARNING: CPU: 0 PID: 11133 at net/netfilter/nf_tables_api.c:4481 nft_data_uninit+0x35/0x40 [nf_tables]() This fixes the check accordingly to enum definition in include/linux/netfilter/nf_tables.h Fixes: https://bugzilla.netfilter.org/show_bug.cgi?id=1013 Signed-off-by: Mirek Kratochvil Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ad9d11fb29fd..34ded09317e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4472,9 +4472,9 @@ EXPORT_SYMBOL_GPL(nft_data_init); */ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) { - switch (type) { - case NFT_DATA_VALUE: + if (type < NFT_DATA_VERDICT) return; + switch (type) { case NFT_DATA_VERDICT: return nft_verdict_uninit(data); default: -- cgit v1.2.3 From c0bb07df7d981e4091432754e30c9c720e2c0c78 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 16 May 2015 21:50:28 +0800 Subject: netlink: Reset portid after netlink_insert failure The commit c5adde9468b0714a051eac7f9666f23eb10b61f7 ("netlink: eliminate nl_sk_hash_lock") breaks the autobind retry mechanism because it doesn't reset portid after a failed netlink_insert. This means that should autobind fail the first time around, then the socket will be stuck in limbo as it can never be bound again since it already has a non-zero portid. Fixes: c5adde9468b0 ("netlink: eliminate nl_sk_hash_lock") Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index dbe885901b34..bf6e76643f78 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1081,6 +1081,7 @@ static int netlink_insert(struct sock *sk, u32 portid) if (err) { if (err == -EEXIST) err = -EADDRINUSE; + nlk_sk(sk)->portid = 0; sock_put(sk); } -- cgit v1.2.3 From ed2a80ab7b76f11af0b2c6255709c4ebf164b667 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2015 14:19:42 +0200 Subject: rtnl/bond: don't send rtnl msg for unregistered iface Before the patch, the command 'ip link add bond2 type bond mode 802.3ad' causes the kernel to send a rtnl message for the bond2 interface, with an ifindex 0. 'ip monitor' shows: 0: bond2: mtu 1500 state DOWN group default link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff 9: bond2@NONE: mtu 1500 qdisc noop state DOWN group default link/ether ea:3e:1f:53:92:7b brd ff:ff:ff:ff:ff:ff [snip] The patch fixes the spotted bug by checking in bond driver if the interface is registered before calling the notifier chain. It also adds a check in rtmsg_ifinfo() to prevent this kind of bug in the future. Fixes: d4261e565000 ("bonding: create netlink event when bonding option is changed") CC: Jiri Pirko Reported-by: Julien Meunier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/bonding/bond_options.c | 2 +- net/core/rtnetlink.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 4df28943d222..e8d3c1d35453 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -624,7 +624,7 @@ int __bond_opt_set(struct bonding *bond, out: if (ret) bond_opt_error_interpret(bond, opt, ret, val); - else + else if (bond->dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev); return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 666e0928ba40..8de36824018d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2416,6 +2416,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, { struct sk_buff *skb; + if (dev->reg_state != NETREG_REGISTERED) + return; + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); -- cgit v1.2.3 From 21858cd02dabcf290564cbf4769b101eba54d7bb Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Sat, 16 May 2015 00:24:59 +0200 Subject: tcp/ipv6: fix flow label setting in TIME_WAIT state commit 1d13a96c74fc ("ipv6: tcp: fix flowlabel value in ACK messages send from TIME_WAIT") added the flow label in the last TCP packets. Unfortunately, it was not casted properly. This patch replace the buggy shift with be32_to_cpu/cpu_to_be32. Fixes: 1d13a96c74fc ("ipv6: tcp: fix flowlabel value in ACK messages") Reported-by: Eric Dumazet Signed-off-by: Florent Fourcot Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e5d7649136fc..b5732a54f2ad 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -300,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; - tw->tw_flowlabel = np->flow_label >> 12; + tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b6575d665568..3adffb300238 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -914,7 +914,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, (tw->tw_flowlabel << 12)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); inet_twsk_put(tw); } -- cgit v1.2.3 From 22d3a3c829fa9ecdb493d1f1f2838d543f8d86a3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 May 2015 15:40:21 +0200 Subject: mac80211: don't use napi_gro_receive() outside NAPI context No matter how the driver manages its NAPI context, there's no way sending frames to it from a timer can be correct, since it would corrupt the internal GRO lists. To avoid that, always use the non-NAPI path when releasing frames from the timer. Cc: stable@vger.kernel.org Reported-by: Jean Trivelly Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/rx.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ab46ab4a7249..cdc374296245 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -205,6 +205,8 @@ enum ieee80211_packet_rx_flags { * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). + * @IEEE80211_RX_REORDER_TIMER: this frame is released by the + * reorder buffer timeout timer, not the normal RX path * * These flags are used across handling multiple interfaces * for a single frame. @@ -212,6 +214,7 @@ enum ieee80211_packet_rx_flags { enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), + IEEE80211_RX_REORDER_TIMER = BIT(2), }; struct ieee80211_rx_data { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 260eed45b6d2..5793f75c5ffd 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2121,7 +2121,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->local->napi) + if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) && + rx->local->napi) napi_gro_receive(rx->local->napi, skb); else netif_receive_skb(skb); @@ -3231,7 +3232,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .flags = 0, + .flags = IEEE80211_RX_REORDER_TIMER, }; struct tid_ampdu_rx *tid_agg_rx; -- cgit v1.2.3 From 33b4b015e1a1ca7a8fdce40af5e71642a8ea355c Mon Sep 17 00:00:00 2001 From: Henning Rogge Date: Mon, 18 May 2015 21:08:49 +0200 Subject: net/ipv6/udp: Fix ipv6 multicast socket filter regression Commit <5cf3d46192fc> ("udp: Simplify__udp*_lib_mcast_deliver") simplified the filter for incoming IPv6 multicast but removed the check of the local socket address and the UDP destination address. This patch restores the filter to prevent sockets bound to a IPv6 multicast IP to receive other UDP traffic link unicast. Signed-off-by: Henning Rogge Fixes: 5cf3d46192fc ("udp: Simplify__udp*_lib_mcast_deliver") Cc: "David S. Miller" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/udp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3477c919fcc8..c2ec41617a35 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -731,7 +731,9 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; if (!inet6_mc_check(sk, loc_addr, rmt_addr)) return false; -- cgit v1.2.3 From da34ac7626b571d262f92b93f11eb32dd58d9c4e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 May 2015 12:31:44 -0700 Subject: tcp: only undo on partial ACKs in CA_Loss Undo based on TCP timestamps should only happen on ACKs that advance SND_UNA, according to the Eifel algorithm in RFC 3522: Section 3.2: (4) If the value of the Timestamp Echo Reply field of the acceptable ACK's Timestamps option is smaller than the value of RetransmitTS, then proceed to step (5), Section Terminology: We use the term 'acceptable ACK' as defined in [RFC793]. That is an ACK that acknowledges previously unacknowledged data. This is because upon receiving an out-of-order packet, the receiver returns the last timestamp that advances RCV_NXT, not the current timestamp of the packet in the DUPACK. Without checking the flag, the DUPACK will cause tcp_packet_delayed() to return true and tcp_try_undo_loss() will revert cwnd reduction. Note that we check the condition in CA_Recovery already by only calling tcp_try_undo_partial() if FLAG_SND_UNA_ADVANCED is set or tcp_try_undo_recovery() if snd_una crosses high_seq. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bc790ea9960f..9faf775a8c4a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2698,11 +2698,16 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); + if ((flag & FLAG_SND_UNA_ADVANCED) && + tcp_try_undo_loss(sk, false)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ /* Step 3.b. A timeout is spurious if not all data are * lost, i.e., never-retransmitted data are (s)acked. */ - if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, true)) return; if (after(tp->snd_nxt, tp->high_seq) && @@ -2732,8 +2737,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - if (tcp_try_undo_loss(sk, false)) - return; tcp_xmit_retransmit_queue(sk); } -- cgit v1.2.3 From b7b0ed910cd8450db6d98cd4361c644bb1c88412 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 May 2015 12:31:45 -0700 Subject: tcp: don't over-send F-RTO probes After sending the new data packets to probe (step 2), F-RTO may incorrectly send more probes if the next ACK advances SND_UNA and does not sack new packet. However F-RTO RFC 5682 probes at most once. This bug may cause sender to always send new data instead of repairing holes, inducing longer HoL blocking on the receiver for the application. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9faf775a8c4a..243d674b3ef5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2710,9 +2710,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tcp_try_undo_loss(sk, true)) return; - if (after(tp->snd_nxt, tp->high_seq) && - (flag & FLAG_DATA_SACKED || is_dupack)) { - tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + if (after(tp->snd_nxt, tp->high_seq)) { + if (flag & FLAG_DATA_SACKED || is_dupack) + tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; __tcp_push_pending_frames(sk, tcp_current_mss(sk), -- cgit v1.2.3 From 3bfe049807c240344b407e3cfb74544927359817 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Sun, 17 May 2015 14:30:31 -0700 Subject: netfilter: nfnetlink_{log,queue}: Register pernet in first place nfnetlink_{log,queue}_init() register the netlink callback nf*_rcv_nl_event before registering the pernet_subsys, but the callback relies on data structures allocated by pernet init functions. When nfnetlink_{log,queue} is loaded, if a netlink message is received after the netlink callback is registered but before the pernet_subsys is registered, the kernel will panic in the sequence nfulnl_rcv_nl_event nfnl_log_pernet net_generic BUG_ON(id == 0) where id is nfnl_log_net_id. The panic can be easily reproduced in 4.0.3 by: while true ;do modprobe nfnetlink_log ; rmmod nfnetlink_log ; done & while true ;do ip netns add dummy ; ip netns del dummy ; done & This patch moves register_pernet_subsys to earlier in nfnetlink_log_init. Notice that the BUG_ON hit in 4.0.3 was recently removed in 2591ffd308 ["netns: remove BUG_ONs from net_generic()"]. Signed-off-by: Francesco Ruggeri Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 19 ++++++++++--------- net/netfilter/nfnetlink_queue_core.c | 18 +++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 3ad91266c821..4ef1fae8445e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1073,7 +1073,13 @@ static struct pernet_operations nfnl_log_net_ops = { static int __init nfnetlink_log_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); @@ -1088,28 +1094,23 @@ static int __init nfnetlink_log_init(void) goto cleanup_subsys; } - status = register_pernet_subsys(&nfnl_log_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_logger; - } return status; -cleanup_logger: - nf_log_unregister(&nfulnl_logger); cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); +out: return status; } static void __exit nfnetlink_log_fini(void) { - unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); } MODULE_DESCRIPTION("netfilter userspace logging"); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 0b98c7420239..11c7682fa0ea 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -1317,7 +1317,13 @@ static struct pernet_operations nfnl_queue_net_ops = { static int __init nfnetlink_queue_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); @@ -1326,19 +1332,13 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_notifier; } - status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); - goto cleanup_subsys; - } register_netdevice_notifier(&nfqnl_dev_notifier); nf_register_queue_handler(&nfqh); return status; -cleanup_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +out: return status; } @@ -1346,9 +1346,9 @@ static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); - unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } -- cgit v1.2.3 From 1086bbe97a074844188c6c988fa0b1a98c3ccbb9 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 19 May 2015 20:55:17 -0400 Subject: netfilter: ensure number of counters is >0 in do_replace() After improving setsockopt() coverage in trinity, I started triggering vmalloc failures pretty reliably from this code path: warn_alloc_failed+0xe9/0x140 __vmalloc_node_range+0x1be/0x270 vzalloc+0x4b/0x50 __do_replace+0x52/0x260 [ip_tables] do_ipt_set_ctl+0x15d/0x1d0 [ip_tables] nf_setsockopt+0x65/0x90 ip_setsockopt+0x61/0xa0 raw_setsockopt+0x16/0x60 sock_common_setsockopt+0x14/0x20 SyS_setsockopt+0x71/0xd0 It turns out we don't validate that the num_counters field in the struct we pass in from userspace is initialized. The same problem also exists in ebtables, arptables, ipv6, and the compat variants. Signed-off-by: Dave Jones Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 4 ++++ net/ipv4/netfilter/arp_tables.c | 6 ++++++ net/ipv4/netfilter/ip_tables.c | 6 ++++++ net/ipv6/netfilter/ip6_tables.c | 6 ++++++ 4 files changed, 22 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 91180a7fc943..24c7c96bf5f8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1117,6 +1117,8 @@ static int do_replace(struct net *net, const void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; tmp.name[sizeof(tmp.name) - 1] = 0; @@ -2159,6 +2161,8 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 13bfe84bf3ca..a61200754f4b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1075,6 +1075,9 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1499,6 +1502,9 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c69db7fa25ee..2d0e265fef6e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1262,6 +1262,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1809,6 +1812,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1a732a1d3c8e..62f5b0d0bc9b 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1275,6 +1275,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1822,6 +1825,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); -- cgit v1.2.3 From faecbb45ebefb20260ad4a631e011e93c896cb73 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 May 2015 13:42:25 +0200 Subject: Revert "netfilter: bridge: query conntrack about skb dnat" This reverts commit c055d5b03bb4cb69d349d787c9787c0383abd8b2. There are two issues: 'dnat_took_place' made me think that this is related to -j DNAT/MASQUERADE. But thats only one part of the story. This is also relevant for SNAT when we undo snat translation in reverse/reply direction. Furthermore, I originally wanted to do this mainly to avoid storing ipv6 addresses once we make DNAT/REDIRECT work for ipv6 on bridges. However, I forgot about SNPT/DNPT which is stateless. So we can't escape storing address for ipv6 anyway. Might as well do it for ipv4 too. Reported-and-tested-by: Bernhard Thaler Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 1 + net/bridge/br_netfilter.c | 27 +++++++++------------------ 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66e374d62f64..f15154a879c7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -176,6 +176,7 @@ struct nf_bridge_info { struct net_device *physindev; struct net_device *physoutdev; char neigh_header[8]; + __be32 ipv4_daddr; }; #endif diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index ab55e2472beb..60ddfbeb47f5 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -37,10 +37,6 @@ #include #include -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include -#endif - #include #include "br_private.h" #ifdef CONFIG_SYSCTL @@ -350,24 +346,15 @@ free_skb: return 0; } -static bool dnat_took_place(const struct sk_buff *skb) +static bool daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) { -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_untracked(ct)) - return false; - - return test_bit(IPS_DST_NAT_BIT, &ct->status); -#else - return false; -#endif + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. + * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge @@ -421,7 +408,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) nf_bridge->pkt_otherhost = false; } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - if (dnat_took_place(skb)) { + if (daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -632,6 +619,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { + struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); @@ -669,6 +657,9 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, if (!setup_pre_routing(skb)) return NF_DROP; + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + skb->protocol = htons(ETH_P_IP); NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, -- cgit v1.2.3 From 252ec2b3aa6f6a9ac29c6539027db600c11bf45e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 May 2015 13:36:38 +0200 Subject: mac80211: don't split remain-on-channel for coalescing Due to remain-on-channel scheduling delays, when we split an ROC while coalescing, we'll usually get a picture like this: existing ROC: |------------------| current time: ^ new ROC: |------| |-------| If the expected response frames are then transmitted by the peer in the hole between the two fragments of the new ROC, we miss them and the process (e.g. ANQP query) fails. mac80211 expects that the window to miss something is small: existing ROC: |------------------| new ROC: |------||-------| but that's normally not the case. To avoid this problem, coalesce only if the new ROC's duration is <= the remaining time on the existing one: existing ROC: |------------------| new ROC: |-----| and never split a new one but schedule it afterwards instead: existing ROC: |------------------| new ROC: |-------------| type=bugfix bug=not-tracked fixes=unknown Reported-by: Matti Gottlieb Reviewed-by: EliadX Peller Reviewed-by: Matti Gottlieb Tested-by: Matti Gottlieb Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 59 +++++++++------------------------------------- net/mac80211/ieee80211_i.h | 6 ----- 2 files changed, 11 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 265e42721a66..ff347a0eebd4 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2495,51 +2495,22 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { - unsigned long j = jiffies; - unsigned long cur_roc_end = cur_roc->hw_start_time + - msecs_to_jiffies(cur_roc->duration); - struct ieee80211_roc_work *next_roc; - int new_dur; + unsigned long now = jiffies; + unsigned long remaining = cur_roc->hw_start_time + + msecs_to_jiffies(cur_roc->duration) - + now; if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) return false; - if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end)) + /* if it doesn't fit entirely, schedule a new one */ + if (new_roc->duration > jiffies_to_msecs(remaining)) return false; ieee80211_handle_roc_started(new_roc); - new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j); - - /* cur_roc is long enough - add new_roc to the dependents list. */ - if (new_dur <= 0) { - list_add_tail(&new_roc->list, &cur_roc->dependents); - return true; - } - - new_roc->duration = new_dur; - - /* - * if cur_roc was already coalesced before, we might - * want to extend the next roc instead of adding - * a new one. - */ - next_roc = list_entry(cur_roc->list.next, - struct ieee80211_roc_work, list); - if (&next_roc->list != &local->roc_list && - next_roc->chan == new_roc->chan && - next_roc->sdata == new_roc->sdata && - !WARN_ON(next_roc->started)) { - list_add_tail(&new_roc->list, &next_roc->dependents); - next_roc->duration = max(next_roc->duration, - new_roc->duration); - next_roc->type = max(next_roc->type, new_roc->type); - return true; - } - - /* add right after cur_roc */ - list_add(&new_roc->list, &cur_roc->list); - + /* add to dependents so we send the expired event properly */ + list_add_tail(&new_roc->list, &cur_roc->dependents); return true; } @@ -2652,17 +2623,9 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled * when the master one begins. If it has begun, - * check that there's still a minimum time left and - * if so, start this one, transmitting the frame, but - * add it to the list directly after this one with - * a reduced time so we'll ask the driver to execute - * it right after finishing the previous one, in the - * hope that it'll also be executed right afterwards, - * effectively extending the old one. - * If there's no minimum time left, just add it to the - * normal list. - * TODO: the ROC type is ignored here, assuming that it - * is better to immediately use the current ROC. + * check if it fits entirely within the existing one, + * in which case it will just be dependent as well. + * Otherwise, schedule it by itself. */ if (!tmp->hw_begun) { list_add_tail(&roc->list, &tmp->dependents); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index cdc374296245..c0a9187bc3a9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -328,12 +328,6 @@ struct mesh_preq_queue { u8 flags; }; -#if HZ/100 == 0 -#define IEEE80211_ROC_MIN_LEFT 1 -#else -#define IEEE80211_ROC_MIN_LEFT (HZ/100) -#endif - struct ieee80211_roc_work { struct list_head list; struct list_head dependents; -- cgit v1.2.3 From f9dca80b98caac8b4bfb43a2edf1e9f877ccf322 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 13 May 2015 09:16:48 +0000 Subject: mac80211: fix AP_VLAN crypto tailroom calculation Some splats I was seeing: (a) WARNING: CPU: 1 PID: 0 at /devel/src/linux/net/mac80211/wep.c:102 ieee80211_wep_add_iv (b) WARNING: CPU: 1 PID: 0 at /devel/src/linux/net/mac80211/wpa.c:73 ieee80211_tx_h_michael_mic_add (c) WARNING: CPU: 3 PID: 0 at /devel/src/linux/net/mac80211/wpa.c:433 ieee80211_crypto_ccmp_encrypt I've seen (a) and (b) with ath9k hw crypto and (c) with ath9k sw crypto. All of them were related to insufficient skb tailroom and I was able to trigger these with ping6 program. AP_VLANs may inherit crypto keys from parent AP. This wasn't considered and yielded problems in some setups resulting in inability to transmit data because mac80211 wouldn't resize skbs when necessary and subsequently drop some packets due to insufficient tailroom. For efficiency purposes don't inspect both AP_VLAN and AP sdata looking for tailroom counter. Instead update AP_VLAN tailroom counters whenever their master AP tailroom counter changes. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 6 ++++ net/mac80211/key.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++------ net/mac80211/key.h | 1 + net/mac80211/util.c | 3 ++ 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bab5c63c0bad..84cef600c573 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -522,6 +522,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; + + mutex_lock(&local->key_mtx); + sdata->crypto_tx_tailroom_needed_cnt += + master->crypto_tx_tailroom_needed_cnt; + mutex_unlock(&local->key_mtx); + break; } case NL80211_IFTYPE_AP: diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2291cd730091..a907f2d5c12d 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -58,6 +58,22 @@ static void assert_key_lock(struct ieee80211_local *local) lockdep_assert_held(&local->key_mtx); } +static void +update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) +{ + struct ieee80211_sub_if_data *vlan; + + if (sdata->vif.type != NL80211_IFTYPE_AP) + return; + + mutex_lock(&sdata->local->mtx); + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + vlan->crypto_tx_tailroom_needed_cnt += delta; + + mutex_unlock(&sdata->local->mtx); +} + static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) { /* @@ -79,6 +95,8 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ + update_vlan_tailroom_need_count(sdata, 1); + if (!sdata->crypto_tx_tailroom_needed_cnt++) { /* * Flush all XMIT packets currently using HW encryption or no @@ -88,6 +106,15 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) } } +static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, + int delta) +{ + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); + + update_vlan_tailroom_need_count(sdata, -delta); + sdata->crypto_tx_tailroom_needed_cnt -= delta; +} + static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; @@ -144,7 +171,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); @@ -541,7 +568,7 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key, schedule_delayed_work(&sdata->dec_tailroom_needed_wk, HZ/2); } else { - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); } } @@ -631,6 +658,7 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; + struct ieee80211_sub_if_data *vlan; ASSERT_RTNL(); @@ -639,7 +667,14 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt = 0; + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || + vlan->crypto_tx_tailroom_pending_dec); + } list_for_each_entry(key, &sdata->key_list, list) { increment_tailroom_need_count(sdata); @@ -649,6 +684,22 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->local->key_mtx); } +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *vlan; + + mutex_lock(&sdata->local->key_mtx); + + sdata->crypto_tx_tailroom_needed_cnt = 0; + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + vlan->crypto_tx_tailroom_needed_cnt = 0; + } + + mutex_unlock(&sdata->local->key_mtx); +} + void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, @@ -688,8 +739,8 @@ static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, { struct ieee80211_key *key, *tmp; - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; ieee80211_debugfs_key_remove_mgmt_default(sdata); @@ -709,6 +760,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vlan; + struct ieee80211_sub_if_data *master; struct ieee80211_key *key, *tmp; LIST_HEAD(keys); @@ -728,8 +780,20 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, false); - WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || - sdata->crypto_tx_tailroom_pending_dec); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + if (sdata->bss) { + master = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt != + master->crypto_tx_tailroom_needed_cnt); + } + } else { + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + } + if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || @@ -793,8 +857,8 @@ void ieee80211_delayed_tailroom_dec(struct work_struct *wk) */ mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; mutex_unlock(&sdata->local->key_mtx); } diff --git a/net/mac80211/key.h b/net/mac80211/key.h index c5a31835be0e..96557dd1e77d 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -161,6 +161,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata); #define key_mtx_dereference(local, ref) \ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 79412f16b61d..b864ebc6ab8f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2022,6 +2022,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->sta_mtx); /* add back keys */ + list_for_each_entry(sdata, &local->interfaces, list) + ieee80211_reset_crypto_tx_tailroom(sdata); + list_for_each_entry(sdata, &local->interfaces, list) if (ieee80211_sdata_running(sdata)) ieee80211_enable_keys(sdata); -- cgit v1.2.3 From 35f1b4e96b9258a3668872b1139c51e5a23eb876 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Mon, 18 May 2015 20:53:55 +0200 Subject: ipv6: do not delete previously existing ECMP routes if add fails If adding a nexthop of an IPv6 multipath route fails, comment in ip6_route_multipath() says we are going to delete all nexthops already added. However, current implementation deletes even the routes it hasn't even tried to add yet. For example, running ip route add 1234:5678::/64 \ nexthop via fe80::aa dev dummy1 \ nexthop via fe80::bb dev dummy1 \ nexthop via fe80::cc dev dummy1 twice results in removing all routes first command added. Limit the second (delete) run to nexthops that succeeded in the first (add) run. Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Signed-off-by: Michal Kubecek Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d3588885f097..3821a3517478 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2504,9 +2504,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add) int attrlen; int err = 0, last_err = 0; + remaining = cfg->fc_mp_len; beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - remaining = cfg->fc_mp_len; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { @@ -2536,6 +2536,7 @@ beginning: * next hops that have been already added. */ add = 0; + remaining = cfg->fc_mp_len - remaining; goto beginning; } } -- cgit v1.2.3 From 27596472473a02cfef2908a6bcda7e55264ba6b7 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Mon, 18 May 2015 20:54:00 +0200 Subject: ipv6: fix ECMP route replacement When replacing an IPv6 multipath route with "ip route replace", i.e. NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first matching route without fixing its siblings, resulting in corrupted siblings linked list; removing one of the siblings can then end in an infinite loop. IPv6 ECMP implementation is a bit different from IPv4 so that route replacement cannot work in exactly the same way. This should be a reasonable approximation: 1. If the new route is ECMP-able and there is a matching ECMP-able one already, replace it and all its siblings (if any). 2. If the new route is ECMP-able and no matching ECMP-able route exists, replace first matching non-ECMP-able (if any) or just add the new one. 3. If the new route is not ECMP-able, replace first matching non-ECMP-able route (if any) or add the new route. We also need to remove the NLM_F_REPLACE flag after replacing old route(s) by first nexthop of an ECMP route so that each subsequent nexthop does not replace previous one. Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Signed-off-by: Michal Kubecek Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++++++++++++++++++-- net/ipv6/route.c | 11 +++++++---- 2 files changed, 44 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 96dbffff5a24..bde57b113009 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -693,6 +693,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, { struct rt6_info *iter = NULL; struct rt6_info **ins; + struct rt6_info **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -716,8 +717,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (iter->dst.dev == rt->dst.dev && @@ -753,9 +759,17 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (iter->rt6i_metric > rt->rt6i_metric) break; +next_iter: ins = &iter->dst.rt6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = *ins; + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; @@ -815,6 +829,8 @@ add: } } else { + int nsiblings; + if (!found) { if (add) goto add; @@ -835,8 +851,27 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); rt6_release(iter); + + if (nsiblings) { + /* Replacing an ECMP route, remove all siblings */ + ins = &rt->dst.rt6_next; + iter = *ins; + while (iter) { + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->dst.rt6_next; + fib6_purge_rt(iter, fn, info->nl_net); + rt6_release(iter); + nsiblings--; + } else { + ins = &iter->dst.rt6_next; + } + iter = *ins; + } + WARN_ON(nsiblings != 0); + } } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3821a3517478..c73ae5039e46 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2541,11 +2541,14 @@ beginning: } } /* Because each route is added like a single route we remove - * this flag after the first nexthop (if there is a collision, - * we have already fail to add the first nexthop: - * fib6_add_rt2node() has reject it). + * these flags after the first nexthop: if there is a collision, + * we have already failed to add the first nexthop: + * fib6_add_rt2node() has rejected it; when replacing, old + * nexthops have been replaced by first new, the rest should + * be added to it. */ - cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | + NLM_F_REPLACE); rtnh = rtnh_next(rtnh, &remaining); } -- cgit v1.2.3 From b0494532214bdfbf241e94fabab5dd46f7b82631 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 May 2015 17:53:10 +0300 Subject: libceph: request a new osdmap if lingering request maps to no osd This commit does two things. First, if there are any homeless lingering requests, we now request a new osdmap even if the osdmap that is being processed brought no changes, i.e. if a given lingering request turned homeless in one of the previous epochs and remained homeless in the current epoch. Not doing so leaves us with a stale osdmap and as a result we may miss our window for reestablishing the watch and lose notifies. MON=1 OSD=1: # cat linger-needmap.sh #!/bin/bash rbd create --size 1 test DEV=$(rbd map test) ceph osd out 0 rbd map dne/dne # obtain a new osdmap as a side effect (!) sleep 1 ceph osd in 0 rbd resize --size 2 test # rbd info test | grep size -> 2M # blockdev --getsize $DEV -> 1M N.B.: Not obtaining a new osdmap in between "osd out" and "osd in" above is enough to make it miss that resize notify, but that is a bug^Wlimitation of ceph watch/notify v1. Second, homeless lingering requests are now kicked just like those lingering requests whose mapping has changed. This is mainly to recognize that a homeless lingering request makes no sense and to preserve the invariant that a registered lingering request is not sitting on any of r_req_lru_item lists. This spares us a WARN_ON, which commit ba9d114ec557 ("libceph: clear r_req_lru_item in __unregister_linger_request()") tried to fix the _wrong_ way. Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41a4abc7e98e..31d4b1ebff01 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2017,20 +2017,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, err = __map_request(osdc, req, force_resend || force_resend_writes); dout("__map_request returned %d\n", err); - if (err == 0) - continue; /* no change and no osd was specified */ if (err < 0) continue; /* hrm! */ - if (req->r_osd == NULL) { - dout("tid %llu maps to no valid osd\n", req->r_tid); - needmap++; /* request a newer map */ - continue; - } + if (req->r_osd == NULL || err > 0) { + if (req->r_osd == NULL) { + dout("lingering %p tid %llu maps to no osd\n", + req, req->r_tid); + /* + * A homeless lingering request makes + * no sense, as it's job is to keep + * a particular OSD connection open. + * Request a newer map and kick the + * request, knowing that it won't be + * resent until we actually get a map + * that can tell us where to send it. + */ + needmap++; + } - dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + dout("kicking lingering %p tid %llu osd%d\n", req, + req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); + __register_request(osdc, req); + __unregister_linger_request(osdc, req); + } } reset_changed_osds(osdc); mutex_unlock(&osdc->request_mutex); -- cgit v1.2.3 From 521a04d06a729e5971cdee7f84080387ed320527 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 May 2015 17:53:34 +0300 Subject: Revert "libceph: clear r_req_lru_item in __unregister_linger_request()" This reverts commit ba9d114ec5578e6e99a4dfa37ff8ae688040fd64. .. which introduced a regression that prevented all lingering requests requeued in kick_requests() from ever being sent to the OSDs, resulting in a lot of missed notifies. In retrospect it's pretty obvious that r_req_lru_item item in the case of lingering requests can be used not only for notarget, but also for unsent linkage due to how tightly actual map and enqueue operations are coupled in __map_request(). The assertion that was being silenced is taken care of in the previous ("libceph: request a new osdmap if lingering request maps to no osd") commit: by always kicking homeless lingering requests we ensure that none of them ends up on the notarget list outside of the critical section guarded by request_mutex. Cc: stable@vger.kernel.org # 3.18+, needs b0494532214b "libceph: request a new osdmap if lingering request maps to no osd" Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 31d4b1ebff01..c4ec9239249a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1306,8 +1306,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } - - list_del_init(&req->r_req_lru_item); /* can be on notarget */ ceph_osdc_put_request(req); } -- cgit v1.2.3 From 407d34ef294727bdc200934c38d9a8241f4a5547 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 May 2015 00:38:12 +0800 Subject: xfrm: Always zero high-order sequence number bits As we're now always including the high bits of the sequence number in the IV generation process we need to ensure that they don't contain crap. This patch ensures that the high sequence bits are always zeroed so that we don't leak random data into the IV. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_replay.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index dab57daae408..4fd725a0c500 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -99,6 +99,7 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(x->replay.oseq == 0)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); @@ -177,6 +178,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(replay_esn->oseq == 0)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); -- cgit v1.2.3 From c78e1746d3ad7d548bdf3fe491898cc453911a49 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 May 2015 17:13:33 +0200 Subject: net: sched: fix call_rcu() race on classifier module unloads Vijay reported that a loop as simple as ... while true; do tc qdisc add dev foo root handle 1: prio tc filter add dev foo parent 1: u32 match u32 0 0 flowid 1 tc qdisc del dev foo root rmmod cls_u32 done ... will panic the kernel. Moreover, he bisected the change apparently introducing it to 78fd1d0ab072 ("netlink: Re-add locking to netlink_lookup() and seq walker"). The removal of synchronize_net() from the netlink socket triggering the qdisc to be removed, seems to have uncovered an RCU resp. module reference count race from the tc API. Given that RCU conversion was done after e341694e3eb5 ("netlink: Convert netlink_lookup() to use RCU protected hash table") which added the synchronize_net() originally, occasion of hitting the bug was less likely (not impossible though): When qdiscs that i) support attaching classifiers and, ii) have at least one of them attached, get deleted, they invoke tcf_destroy_chain(), and thus call into ->destroy() handler from a classifier module. After RCU conversion, all classifier that have an internal prio list, unlink them and initiate freeing via call_rcu() deferral. Meanhile, tcf_destroy() releases already reference to the tp->ops->owner module before the queued RCU callback handler has been invoked. Subsequent rmmod on the classifier module is then not prevented since all module references are already dropped. By the time, the kernel invokes the RCU callback handler from the module, that function address is then invalid. One way to fix it would be to add an rcu_barrier() to unregister_tcf_proto_ops() to wait for all pending call_rcu()s to complete. synchronize_rcu() is not appropriate as under heavy RCU callback load, registered call_rcu()s could be deferred longer than a grace period. In case we don't have any pending call_rcu()s, the barrier is allowed to return immediately. Since we came here via unregister_tcf_proto_ops(), there are no users of a given classifier anymore. Further nested call_rcu()s pointing into the module space are not being done anywhere. Only cls_bpf_delete_prog() may schedule a work item, to unlock pages eventually, but that is not in the range/context of cls_bpf anymore. Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto") Fixes: 9888faefe132 ("net: sched: cls_basic use RCU") Reported-by: Vijay Subramanian Signed-off-by: Daniel Borkmann Cc: John Fastabend Cc: Eric Dumazet Cc: Thomas Graf Cc: Jamal Hadi Salim Cc: Alexei Starovoitov Tested-by: Vijay Subramanian Acked-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b6ef9a04de06..a75864d93142 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -81,6 +81,11 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) struct tcf_proto_ops *t; int rc = -ENOENT; + /* Wait for outstanding call_rcu()s, if any, from a + * tcf_proto_ops's destroy() handler. + */ + rcu_barrier(); + write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (t == ops) { -- cgit v1.2.3 From d654976cbf852ee20612ee10dbe57cdacda9f452 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 May 2015 21:51:19 -0700 Subject: tcp: fix a potential deadlock in tcp_get_info() Taking socket spinlock in tcp_get_info() can deadlock, as inet_diag_dump_icsk() holds the &hashinfo->ehash_locks[i], while packet processing can use the reverse locking order. We could avoid this locking for TCP_LISTEN states, but lockdep would certainly get confused as all TCP sockets share same lockdep classes. [ 523.722504] ====================================================== [ 523.728706] [ INFO: possible circular locking dependency detected ] [ 523.734990] 4.1.0-dbg-DEV #1676 Not tainted [ 523.739202] ------------------------------------------------------- [ 523.745474] ss/18032 is trying to acquire lock: [ 523.750002] (slock-AF_INET){+.-...}, at: [] tcp_get_info+0x2c4/0x360 [ 523.758129] [ 523.758129] but task is already holding lock: [ 523.763968] (&(&hashinfo->ehash_locks[i])->rlock){+.-...}, at: [] inet_diag_dump_icsk+0x1d5/0x6c0 [ 523.774661] [ 523.774661] which lock already depends on the new lock. [ 523.774661] [ 523.782850] [ 523.782850] the existing dependency chain (in reverse order) is: [ 523.790326] -> #1 (&(&hashinfo->ehash_locks[i])->rlock){+.-...}: [ 523.796599] [] lock_acquire+0xbb/0x270 [ 523.802565] [] _raw_spin_lock+0x38/0x50 [ 523.808628] [] __inet_hash_nolisten+0x78/0x110 [ 523.815273] [] tcp_v4_syn_recv_sock+0x24b/0x350 [ 523.822067] [] tcp_check_req+0x3c1/0x500 [ 523.828199] [] tcp_v4_do_rcv+0x239/0x3d0 [ 523.834331] [] tcp_v4_rcv+0xa8e/0xc10 [ 523.840202] [] ip_local_deliver_finish+0x133/0x3e0 [ 523.847214] [] ip_local_deliver+0xaa/0xc0 [ 523.853440] [] ip_rcv_finish+0x168/0x5c0 [ 523.859624] [] ip_rcv+0x307/0x420 Lets use u64_sync infrastructure instead. As a bonus, 64bit arches get optimized, as these are nop for them. Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 11 +++++++---- net/ipv4/tcp_fastopen.c | 4 ++++ net/ipv4/tcp_input.c | 4 ++++ 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3b2911502a8c..e8bbf403618f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -158,6 +158,8 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */ + u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46efa03d2b11..f1377f2a0472 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -402,6 +402,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -2598,6 +2599,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); @@ -2665,10 +2667,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate = READ_ONCE(sk->sk_max_pacing_rate); info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; - spin_lock_bh(&sk->sk_lock.slock); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; - spin_unlock_bh(&sk->sk_lock.slock); + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3c673d5e6cff..46b087a27503 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 243d674b3ef5..c9ab964189a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3286,7 +3286,9 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; + u64_stats_update_begin(&tp->syncp); tp->bytes_acked += delta; + u64_stats_update_end(&tp->syncp); tp->snd_una = ack; } @@ -3295,7 +3297,9 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; + u64_stats_update_begin(&tp->syncp); tp->bytes_received += delta; + u64_stats_update_end(&tp->syncp); tp->rcv_nxt = seq; } -- cgit v1.2.3 From 381c759d9916c42959515ad34a6d467e24a88e93 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 May 2015 04:58:12 -0500 Subject: ipv4: Avoid crashing in ip_error ip_error does not check if in_dev is NULL before dereferencing it. IThe following sequence of calls is possible: CPU A CPU B ip_rcv_finish ip_route_input_noref() ip_route_input_slow() inetdev_destroy() dst_input() With the result that a network device can be destroyed while processing an input packet. A crash was triggered with only unicast packets in flight, and forwarding enabled on the only network device. The error condition was created by the removal of the network device. As such it is likely the that error code was -EHOSTUNREACH, and the action taken by ip_error (if in_dev had been accessible) would have been to not increment any counters and to have tried and likely failed to send an icmp error as the network device is going away. Therefore handle this weird case by just dropping the packet if !in_dev. It will result in dropping the packet sooner, and will not result in an actual change of behavior. Fixes: 251da4130115b ("ipv4: Cache ip_error() routes even when not forwarding.") Reported-by: Vittorio Gambaletta Tested-by: Vittorio Gambaletta Signed-off-by: Vittorio Gambaletta Signed-off-by: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bff62fc87b8e..f45f2a12f37b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -902,6 +902,10 @@ static int ip_error(struct sk_buff *skb) bool send; int code; + /* IP on this device is disabled. */ + if (!in_dev) + goto out; + net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { -- cgit v1.2.3 From d4e64c2909231222ceba0999d921e7ac8908f656 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 22 May 2015 13:40:09 +0200 Subject: ipv4: fill in table id when replacing a route When replacing an IPv4 route, tb_id member of the new fib_alias structure is not set in the replace code path so that the new route is ignored. Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") Signed-off-by: Michal Kubecek Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 64c2076ced54..09b62e17dd8c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1164,6 +1164,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; + new_fa->tb_id = tb->tb_id; err = netdev_switch_fib_ipv4_add(key, plen, fi, new_fa->fa_tos, -- cgit v1.2.3 From 47cc84ce0c2fe75c99ea5963c4b5704dd78ead54 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 22 May 2015 12:18:59 -0300 Subject: bridge: fix parsing of MLDv2 reports When more than a multicast address is present in a MLDv2 report, all but the first address is ignored, because the code breaks out of the loop if there has not been an error adding that address. This has caused failures when two guests connected through the bridge tried to communicate using IPv6. Neighbor discoveries would not be transmitted to the other guest when both used a link-local address and a static address. This only happens when there is a MLDv2 querier in the network. The fix will only break out of the loop when there is a failure adding a multicast address. The mdb before the patch: dev ovirtmgmt port vnet0 grp ff02::1:ff7d:6603 temp dev ovirtmgmt port vnet1 grp ff02::1:ff7d:6604 temp dev ovirtmgmt port bond0.86 grp ff02::2 temp After the patch: dev ovirtmgmt port vnet0 grp ff02::1:ff7d:6603 temp dev ovirtmgmt port vnet1 grp ff02::1:ff7d:6604 temp dev ovirtmgmt port bond0.86 grp ff02::fb temp dev ovirtmgmt port bond0.86 grp ff02::2 temp dev ovirtmgmt port bond0.86 grp ff02::d temp dev ovirtmgmt port vnet0 grp ff02::1:ff00:76 temp dev ovirtmgmt port bond0.86 grp ff02::16 temp dev ovirtmgmt port vnet1 grp ff02::1:ff00:77 temp dev ovirtmgmt port bond0.86 grp ff02::1:ff00:def temp dev ovirtmgmt port bond0.86 grp ff02::1:ffa1:40bf temp Fixes: 08b202b67264 ("bridge br_multicast: IPv6 MLD support.") Reported-by: Rik Theys Signed-off-by: Thadeu Lima de Souza Cascardo Tested-by: Rik Theys Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4b6722f8f179..a3abe6ed111e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1072,7 +1072,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, vid); - if (!err) + if (err) break; } -- cgit v1.2.3 From f96dee13b8e10f00840124255bed1d8b4c6afd6f Mon Sep 17 00:00:00 2001 From: Arun Parameswaran Date: Wed, 20 May 2015 14:35:30 -0700 Subject: net: core: 'ethtool' issue with querying phy settings When trying to configure the settings for PHY1, using commands like 'ethtool -s eth0 phyad 1 speed 100', the 'ethtool' seems to modify other settings apart from the speed of the PHY1, in the above case. The ethtool seems to query the settings for PHY0, and use this as the base to apply the new settings to the PHY1. This is causing the other settings of the PHY 1 to be wrongly configured. The issue is caused by the '_ethtool_get_settings()' API, which gets called because of the 'ETHTOOL_GSET' command, is clearing the 'cmd' pointer (of type 'struct ethtool_cmd') by calling memset. This clears all the parameters (if any) passed for the 'ETHTOOL_GSET' cmd. So the driver's callback is always invoked with 'cmd->phy_address' as '0'. The '_ethtool_get_settings()' is called from other files in the 'net/core'. So the fix is applied to the 'ethtool_get_settings()' which is only called in the context of the 'ethtool'. Signed-off-by: Arun Parameswaran Reviewed-by: Ray Jui Reviewed-by: Scott Branden Signed-off-by: David S. Miller --- net/core/ethtool.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1d00b8922902..1347e11f5cc9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -359,7 +359,15 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) int err; struct ethtool_cmd cmd; - err = __ethtool_get_settings(dev, &cmd); + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + cmd.cmd = ETHTOOL_GSET; + + err = dev->ethtool_ops->get_settings(dev, &cmd); if (err < 0) return err; -- cgit v1.2.3 From 93a33a584e2a49a217118148125944fd02d47b54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 May 2015 13:28:29 -0700 Subject: bridge: fix lockdep splat Following lockdep splat was reported : [ 29.382286] =============================== [ 29.382315] [ INFO: suspicious RCU usage. ] [ 29.382344] 4.1.0-0.rc0.git11.1.fc23.x86_64 #1 Not tainted [ 29.382380] ------------------------------- [ 29.382409] net/bridge/br_private.h:626 suspicious rcu_dereference_check() usage! [ 29.382455] other info that might help us debug this: [ 29.382507] rcu_scheduler_active = 1, debug_locks = 0 [ 29.382549] 2 locks held by swapper/0/0: [ 29.382576] #0: (((&p->forward_delay_timer))){+.-...}, at: [] call_timer_fn+0x5/0x4f0 [ 29.382660] #1: (&(&br->lock)->rlock){+.-...}, at: [] br_forward_delay_timer_expired+0x31/0x140 [bridge] [ 29.382754] stack backtrace: [ 29.382787] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.0-0.rc0.git11.1.fc23.x86_64 #1 [ 29.382838] Hardware name: LENOVO 422916G/LENOVO, BIOS A1KT53AUS 04/07/2015 [ 29.382882] 0000000000000000 3ebfc20364115825 ffff880666603c48 ffffffff81892d4b [ 29.382943] 0000000000000000 ffffffff81e124e0 ffff880666603c78 ffffffff8110bcd7 [ 29.383004] ffff8800785c9d00 ffff88065485ac58 ffff880c62002800 ffff880c5fc88ac0 [ 29.383065] Call Trace: [ 29.383084] [] dump_stack+0x4c/0x65 [ 29.383130] [] lockdep_rcu_suspicious+0xe7/0x120 [ 29.383178] [] br_fill_ifinfo+0x4a9/0x6a0 [bridge] [ 29.383225] [] br_ifinfo_notify+0x11b/0x4b0 [bridge] [ 29.383271] [] ? br_hold_timer_expired+0x70/0x70 [bridge] [ 29.383320] [] br_forward_delay_timer_expired+0x58/0x140 [bridge] [ 29.383371] [] ? br_hold_timer_expired+0x70/0x70 [bridge] [ 29.383416] [] call_timer_fn+0xc3/0x4f0 [ 29.383454] [] ? call_timer_fn+0x5/0x4f0 [ 29.383493] [] ? lock_release_holdtime.part.29+0xf/0x200 [ 29.383541] [] ? br_hold_timer_expired+0x70/0x70 [bridge] [ 29.383587] [] run_timer_softirq+0x244/0x490 [ 29.383629] [] __do_softirq+0xec/0x670 [ 29.383666] [] irq_exit+0x145/0x150 [ 29.383703] [] smp_apic_timer_interrupt+0x46/0x60 [ 29.383744] [] apic_timer_interrupt+0x73/0x80 [ 29.383782] [] ? cpuidle_enter_state+0x5f/0x2f0 [ 29.383832] [] ? cpuidle_enter_state+0x5b/0x2f0 Problem here is that br_forward_delay_timer_expired() is a timer handler, calling br_ifinfo_notify() which assumes either rcu_read_lock() or RTNL are held. Simplest fix seems to add rcu read lock section. Signed-off-by: Eric Dumazet Reported-by: Josh Boyer Reported-by: Dominick Grift Cc: Vlad Yasevich Signed-off-by: David S. Miller --- net/bridge/br_stp_timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 4fcaa67750fd..7caf7fae2d5b 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -97,7 +97,9 @@ static void br_forward_delay_timer_expired(unsigned long arg) netif_carrier_on(br->dev); } br_log_state(p); + rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); + rcu_read_unlock(); spin_unlock(&br->lock); } -- cgit v1.2.3 From b48732e4a48d80ed4a14812f0bab09560846514e Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 26 May 2015 08:22:19 -0700 Subject: unix/caif: sk_socket can disappear when state is unlocked got a rare NULL pointer dereference in clear_bit Signed-off-by: Mark Salyzyn Acked-by: Hannes Frederic Sowa ---- v2: switch to sock_flag(sk, SOCK_DEAD) and added net/caif/caif_socket.c v3: return -ECONNRESET in upstream caller of wait function for SOCK_DEAD Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 8 ++++++++ net/unix/af_unix.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4ec0c803aef1..112ad784838a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -330,6 +330,10 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -373,6 +377,10 @@ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb; lock_sock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5266ea7b922b..06430598cf51 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1880,6 +1880,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -1939,6 +1943,10 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb, *last; unix_state_lock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } last = skb = skb_peek(&sk->sk_receive_queue); again: if (skb == NULL) { -- cgit v1.2.3 From 86e363dc3b50bfd50a1f315934583fbda673ab8d Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 26 May 2015 16:08:48 -0700 Subject: net_sched: invoke ->attach() after setting dev->qdisc For mq qdisc, we add per tx queue qdisc to root qdisc for display purpose, however, that happens too early, before the new dev->qdisc is finally set, this causes q->list points to an old root qdisc which is going to be freed right before assigning with a new one. Fix this by moving ->attach() after setting dev->qdisc. For the record, this fixes the following crash: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 975 at lib/list_debug.c:59 __list_del_entry+0x5a/0x98() list_del corruption. prev->next should be ffff8800d1998ae8, but was 6b6b6b6b6b6b6b6b CPU: 1 PID: 975 Comm: tc Not tainted 4.1.0-rc4+ #1019 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000009 ffff8800d73fb928 ffffffff81a44e7f 0000000047574756 ffff8800d73fb978 ffff8800d73fb968 ffffffff810790da ffff8800cfc4cd20 ffffffff814e725b ffff8800d1998ae8 ffffffff82381250 0000000000000000 Call Trace: [] dump_stack+0x4c/0x65 [] warn_slowpath_common+0x9c/0xb6 [] ? __list_del_entry+0x5a/0x98 [] warn_slowpath_fmt+0x46/0x48 [] ? dev_graft_qdisc+0x5e/0x6a [] __list_del_entry+0x5a/0x98 [] list_del+0xe/0x2d [] qdisc_list_del+0x1e/0x20 [] qdisc_destroy+0x30/0xd6 [] qdisc_graft+0x11d/0x243 [] tc_get_qdisc+0x1a6/0x1d4 [] ? mark_lock+0x2e/0x226 [] rtnetlink_rcv_msg+0x181/0x194 [] ? rtnl_lock+0x17/0x19 [] ? rtnl_lock+0x17/0x19 [] ? __rtnl_unlock+0x17/0x17 [] netlink_rcv_skb+0x4d/0x93 [] rtnetlink_rcv+0x26/0x2d [] netlink_unicast+0xcb/0x150 [] ? might_fault+0x59/0xa9 [] netlink_sendmsg+0x4fa/0x51c [] sock_sendmsg_nosec+0x12/0x1d [] sock_sendmsg+0x29/0x2e [] ___sys_sendmsg+0x1b4/0x23a [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? current_kernel_time+0xe/0x32 [] ? lock_release_holdtime.part.29+0x71/0x7f [] ? read_seqcount_begin.constprop.27+0x5f/0x76 [] ? trace_hardirqs_on_caller+0x17d/0x199 [] ? __fget_light+0x50/0x78 [] __sys_sendmsg+0x42/0x60 [] SyS_sendmsg+0x12/0x1c [] system_call_fastpath+0x12/0x6f ---[ end trace ef29d3fb28e97ae7 ]--- For long term, we probably need to clean up the qdisc_graft() code in case it hides other bugs like this. Fixes: 95dc19299f74 ("pkt_sched: give visibility to mq slave qdiscs") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ad9eed70bc8f..1e1c89e51a11 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -815,10 +815,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); - if (new && new->ops->attach) { - new->ops->attach(new); - num_q = 0; - } + if (new && new->ops->attach) + goto skip; for (i = 0; i < num_q; i++) { struct netdev_queue *dev_queue = dev_ingress_queue(dev); @@ -834,12 +832,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_destroy(old); } +skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) atomic_inc(&new->refcnt); dev->qdisc = new ? : &noop_qdisc; + + if (new && new->ops->attach) + new->ops->attach(new); } else { notify_and_destroy(net, skb, n, classid, old, new); } -- cgit v1.2.3 From cd5279c194f89c9b97c294af4aaf4ea8c5e3c704 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:43 -0700 Subject: ip_vti/ip6_vti: Do not touch skb->mark on xmit Instead of modifying skb->mark we can simply modify the flowi_mark that is generated as a result of the xfrm_decode_session. By doing this we don't need to actually touch the skb->mark and it can be preserved as it passes out through the tunnel. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 5 +++-- net/ipv6/ip6_vti.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9f7269f3c54a..4c318e1c13c8 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -216,8 +216,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(tunnel->parms.o_key); - switch (skb->protocol) { case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); @@ -233,6 +231,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); + return vti_xmit(skb, dev, &fl); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ed9d681207fa..104de4da3ff3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -495,7 +495,6 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) int ret; memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(t->parms.o_key); switch (skb->protocol) { case htons(ETH_P_IPV6): @@ -516,6 +515,9 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(t->parms.o_key); + ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; -- cgit v1.2.3 From 049f8e2e28d9c3dac0744cc2f19d3157c7fb5646 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:49 -0700 Subject: xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input This change makes it so that if a tunnel is defined we just use the mark from the tunnel instead of the mark from the skb header. By doing this we can avoid the need to set skb->mark inside of the tunnel receive functions. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 526c4feb3b50..b58286ecd156 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include static struct kmem_cache *secpath_cachep __read_mostly; @@ -186,6 +188,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) struct xfrm_state *x = NULL; xfrm_address_t *daddr; struct xfrm_mode *inner_mode; + u32 mark = skb->mark; unsigned int family; int decaps = 0; int async = 0; @@ -203,6 +206,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; + /* if tunnel is present override skb->mark value with tunnel i_key */ + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { + switch (family) { + case AF_INET: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); + break; + case AF_INET6: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); + break; + } + } + /* Allocate new secpath or COW existing one. */ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { struct sec_path *sp; @@ -229,7 +244,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family); + x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); -- cgit v1.2.3 From d55c670cbc54b2270a465cdc382ce71adae45785 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 27 May 2015 07:16:54 -0700 Subject: ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call The vti6_rcv_cb and vti_rcv_cb calls were leaving the skb->mark modified after completing the function. This resulted in the original skb->mark value being lost. Since we only need skb->mark to be set for xfrm_policy_check we can pull the assignment into the rcv_cb calls and then just restore the original mark after xfrm_policy_check has been completed. Signed-off-by: Alexander Duyck Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 9 +++++++-- net/ipv6/ip6_vti.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 4c318e1c13c8..0c152087ca15 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -65,7 +65,6 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - skb->mark = be32_to_cpu(tunnel->parms.i_key); return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + u32 orig_mark = skb->mark; + int ret; if (!tunnel) return 1; @@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(tunnel->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 104de4da3ff3..ff3bd863fa03 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -322,7 +322,6 @@ static int vti6_rcv(struct sk_buff *skb) } XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - skb->mark = be32_to_cpu(t->parms.i_key); rcu_read_unlock(); @@ -342,6 +341,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + u32 orig_mark = skb->mark; + int ret; if (!t) return 1; @@ -358,7 +359,11 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(t->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); -- cgit v1.2.3 From 71d9f6149cac8fc6646adfb2a6f3b0de6ddd23f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 May 2015 04:42:54 -0700 Subject: bridge: fix br_multicast_query_expired() bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit br_multicast_query_expired() querier argument is a pointer to a struct bridge_mcast_querier : struct bridge_mcast_querier { struct br_ip addr; struct net_bridge_port __rcu *port; }; Intent of the code was to clear port field, not the pointer to querier. Fixes: 2cd4143192e8 ("bridge: memorize and export selected IGMP/MLD querier port") Signed-off-by: Eric Dumazet Acked-by: Thadeu Lima de Souza Cascardo Acked-by: Linus Lüssing Cc: Linus Lüssing Cc: Steinar H. Gunderson Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a3abe6ed111e..22fd0419b314 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1822,7 +1822,7 @@ static void br_multicast_query_expired(struct net_bridge *br, if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; - RCU_INIT_POINTER(querier, NULL); + RCU_INIT_POINTER(querier->port, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } -- cgit v1.2.3 From beb39db59d14990e401e235faf66a6b9b31240b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 30 May 2015 09:16:53 -0700 Subject: udp: fix behavior of wrong checksums We have two problems in UDP stack related to bogus checksums : 1) We return -EAGAIN to application even if receive queue is not empty. This breaks applications using edge trigger epoll() 2) Under UDP flood, we can loop forever without yielding to other processes, potentially hanging the host, especially on non SMP. This patch is an attempt to make things better. We might in the future add extra support for rt applications wanting to better control time spent doing a recv() in a hostile environment. For example we could validate checksums before queuing packets in socket receive queue. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++---- net/ipv6/udp.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d10b7e0112eb..1c92ea67baef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1345,10 +1345,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c2ec41617a35..e51fc3eee6db 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -525,10 +525,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } -- cgit v1.2.3 From 9f950415e4e28e7cfae2e416b43e862e8101d996 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 29 May 2015 13:47:07 -0400 Subject: tcp: fix child sockets to use system default congestion control if not set Linux 3.17 and earlier are explicitly engineered so that if the app doesn't specifically request a CC module on a listener before the SYN arrives, then the child gets the system default CC when the connection is established. See tcp_init_congestion_control() in 3.17 or earlier, which says "if no choice made yet assign the current value set as default". The change ("net: tcp: assign tcp cong_ops when tcp sk is created") altered these semantics, so that children got their parent listener's congestion control even if the system default had changed after the listener was created. This commit returns to those original semantics from 3.17 and earlier, since they are the original semantics from 2007 in 4d4d3d1e8 ("[TCP]: Congestion control initialization."), and some Linux congestion control workflows depend on that. In summary, if a listener socket specifically sets TCP_CONGESTION to "x", or the route locks the CC module to "x", then the child gets "x". Otherwise the child gets current system default from net.ipv4.tcp_congestion_control. That's the behavior in 3.17 and earlier, and this commit restores that. Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created") Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Stephen Hemminger Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_cong.c | 5 ++++- net/ipv4/tcp_minisocks.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 497bc14cdb85..0320bbb7d7b5 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -98,7 +98,8 @@ struct inet_connection_sock { const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:7, + __u8 icsk_ca_state:6, + icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; __u8 icsk_pending; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7a5ae50c80c8..84be008c945c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; + icsk->icsk_ca_setsockopt = 1; if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = __tcp_ca_find_autoload(name); /* No change asking for existing value */ - if (ca == icsk->icsk_ca_ops) + if (ca == icsk->icsk_ca_ops) { + icsk->icsk_ca_setsockopt = 1; goto out; + } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b5732a54f2ad..17e7339ee5ca 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -420,7 +420,10 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_unlock(); } - if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) + /* If no valid choice made yet, assign current system default ca. */ + if (!ca_got_dst && + (!icsk->icsk_ca_setsockopt || + !try_module_get(icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); -- cgit v1.2.3 From 24595346d79b6bd98a77d24c493e8490639788fc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 29 May 2015 10:29:46 -0700 Subject: net: dsa: Properly propagate errors from dsa_switch_setup_one While shuffling some code around, dsa_switch_setup_one() was introduced, and it was modified to return either an error code using ERR_PTR() or a NULL pointer when running out of memory or failing to setup a switch. This is a problem for its caler: dsa_switch_setup() which uses IS_ERR() and expects to find an error code, not a NULL pointer, so we still try to proceed with dsa_switch_setup() and operate on invalid memory addresses. This can be easily reproduced by having e.g: the bcm_sf2 driver built-in, but having no such switch, such that drv->setup will fail. Fix this by using PTR_ERR() consistently which is both more informative and avoids for the caller to use IS_ERR_OR_NULL(). Fixes: df197195a5248 ("net: dsa: split dsa_switch_setup into two functions") Reported-by: Andrew Lunn Signed-off-by: Florian Fainelli Tested-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e6f6cc3a1bcf..392e29a0227d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -359,7 +359,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, */ ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); if (ds == NULL) - return NULL; + return ERR_PTR(-ENOMEM); ds->dst = dst; ds->index = index; @@ -370,7 +370,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ret = dsa_switch_setup_one(ds, parent); if (ret) - return NULL; + return ERR_PTR(ret); return ds; } -- cgit v1.2.3 From d26e2c9ffa385dd1b646f43c1397ba12af9ed431 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 28 May 2015 10:26:18 +0200 Subject: Revert "netfilter: ensure number of counters is >0 in do_replace()" This partially reverts commit 1086bbe97a07 ("netfilter: ensure number of counters is >0 in do_replace()") in net/bridge/netfilter/ebtables.c. Setting rules with ebtables does not work any more with 1086bbe97a07 place. There is an error message and no rules set in the end. e.g. ~# ebtables -t nat -A POSTROUTING --src 12:34:56:78:9a:bc -j DROP Unable to update the kernel. Two possible causes: 1. Multiple ebtables programs were executing simultaneously. The ebtables userspace tool doesn't by default support multiple ebtables programs running Reverting the ebtables part of 1086bbe97a07 makes this work again. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 24c7c96bf5f8..91180a7fc943 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1117,8 +1117,6 @@ static int do_replace(struct net *net, const void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; - if (tmp.num_counters == 0) - return -EINVAL; tmp.name[sizeof(tmp.name) - 1] = 0; @@ -2161,8 +2159,6 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; - if (tmp.num_counters == 0) - return -EINVAL; memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); -- cgit v1.2.3 From 18ec898ee54e03a9aab8b54db50cb2b36209d313 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Jun 2015 14:43:50 -0700 Subject: Revert "net: core: 'ethtool' issue with querying phy settings" This reverts commit f96dee13b8e10f00840124255bed1d8b4c6afd6f. It isn't right, ethtool is meant to manage one PHY instance per netdevice at a time, and this is selected by the SET command. Therefore by definition the GET command must only return the settings for the configured and selected PHY. Reported-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1347e11f5cc9..1d00b8922902 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -359,15 +359,7 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) int err; struct ethtool_cmd cmd; - if (!dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - - cmd.cmd = ETHTOOL_GSET; - - err = dev->ethtool_ops->get_settings(dev, &cmd); + err = __ethtool_get_settings(dev, &cmd); if (err < 0) return err; -- cgit v1.2.3 From ccd740cbc6e01b2a08baa341867063ed2887f4b9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 29 May 2015 11:28:26 -0700 Subject: vti6: Add pmtu handling to vti6_xmit. We currently rely on the PMTU discovery of xfrm. However if a packet is localy sent, the PMTU mechanism of xfrm tries to to local socket notification what might not work for applications like ping that don't check for this. So add pmtu handling to vti6_xmit to report MTU changes immediately. Signed-off-by: Steffen Klassert Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ff3bd863fa03..0224c032dca5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -435,6 +435,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device *tdev; struct xfrm_state *x; int err = -1; + int mtu; if (!dst) goto tx_err_link_failure; @@ -468,6 +469,19 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + + return -EMSGSIZE; + } + err = dst_output(skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); -- cgit v1.2.3