diff options
Diffstat (limited to 'net/ipv4')
54 files changed, 1397 insertions, 1446 deletions
| diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 775824720b6b..238225b0c970 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX  config NET_IP_TUNNEL  	tristate +	select DST_CACHE  	default n  config NET_IPGRE @@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET  	  If unsure, say Y. -config INET_LRO -	tristate "Large Receive Offload (ipv4/tcp)" -	default y -	---help--- -	  Support for Large Receive Offload (ipv4/tcp). - -	  If unsure, say Y. -  config INET_DIAG  	tristate "INET: socket monitoring interface"  	default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 62c049b647e9..bfa133691cde 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o  obj-$(CONFIG_INET_IPCOMP) += ipcomp.o  obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o  obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o  obj-$(CONFIG_INET_TUNNEL) += tunnel4.o  obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o  obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c5db6636704..9e481992dbae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -370,7 +370,11 @@ lookup_protocol:  		 */  		inet->inet_sport = htons(inet->inet_num);  		/* Add to protocol hash chains. */ -		sk->sk_prot->hash(sk); +		err = sk->sk_prot->hash(sk); +		if (err) { +			sk_common_release(sk); +			goto out; +		}  	}  	if (sk->sk_prot->init) { @@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p)  }  EXPORT_SYMBOL(inet_unregister_protosw); -/* - *      Shall we try to damage output packets if routing dev changes? 
- */ - -int sysctl_ip_dynaddr __read_mostly; -  static int inet_sk_reselect_saddr(struct sock *sk)  {  	struct inet_sock *inet = inet_sk(sk); @@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)  	if (new_saddr == old_saddr)  		return 0; -	if (sysctl_ip_dynaddr > 1) { +	if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {  		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",  			__func__, &old_saddr, &new_saddr);  	} @@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)  	 * Besides that, it does not check for connection  	 * uniqueness. Wait for troubles.  	 */ -	__sk_prot_rehash(sk); -	return 0; +	return __sk_prot_rehash(sk);  }  int inet_sk_rebuild_header(struct sock *sk) @@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk)  		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.  		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme  		 */ -		if (!sysctl_ip_dynaddr || +		if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||  		    sk->sk_state != TCP_SYN_SENT ||  		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||  		    (err = inet_sk_reselect_saddr(sk)) != 0) @@ -1383,6 +1380,45 @@ out:  	return pp;  } +static struct sk_buff **ipip_gro_receive(struct sk_buff **head, +					 struct sk_buff *skb) +{ +	if (NAPI_GRO_CB(skb)->encap_mark) { +		NAPI_GRO_CB(skb)->flush = 1; +		return NULL; +	} + +	NAPI_GRO_CB(skb)->encap_mark = 1; + +	return inet_gro_receive(head, skb); +} + +#define SECONDS_PER_DAY	86400 + +/* inet_current_timestamp - Return IP network timestamp + * + * Return milliseconds since midnight in network byte order. + */ +__be32 inet_current_timestamp(void) +{ +	u32 secs; +	u32 msecs; +	struct timespec64 ts; + +	ktime_get_real_ts64(&ts); + +	/* Get secs since midnight. */ +	(void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); +	/* Convert to msecs. */ +	msecs = secs * MSEC_PER_SEC; +	/* Convert nsec to msec. 
*/ +	msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; + +	/* Convert to network byte order. */ +	return htonl(msecs); +} +EXPORT_SYMBOL(inet_current_timestamp); +  int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)  {  	if (sk->sk_family == AF_INET) @@ -1425,6 +1461,13 @@ out_unlock:  	return err;  } +static int ipip_gro_complete(struct sk_buff *skb, int nhoff) +{ +	skb->encapsulation = 1; +	skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP; +	return inet_gro_complete(skb, nhoff); +} +  int inet_ctl_sock_create(struct sock **sk, unsigned short family,  			 unsigned short type, unsigned char protocol,  			 struct net *net) @@ -1652,8 +1695,8 @@ static struct packet_offload ip_packet_offload __read_mostly = {  static const struct net_offload ipip_offload = {  	.callbacks = {  		.gso_segment	= inet_gso_segment, -		.gro_receive	= inet_gro_receive, -		.gro_complete	= inet_gro_complete, +		.gro_receive	= ipip_gro_receive, +		.gro_complete	= ipip_gro_complete,  	},  }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59b3e0e8fd51..c34c7544d1db 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  	 */  	if (!in_dev) -		goto out; +		goto out_free_skb;  	arp = arp_hdr(skb); @@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  	default:  		if (arp->ar_pro != htons(ETH_P_IP) ||  		    htons(dev_type) != arp->ar_hrd) -			goto out; +			goto out_free_skb;  		break;  	case ARPHRD_ETHER:  	case ARPHRD_FDDI: @@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&  		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||  		    arp->ar_pro != htons(ETH_P_IP)) -			goto out; +			goto out_free_skb;  		break;  	case ARPHRD_AX25:  		if (arp->ar_pro != htons(AX25_P_IP) ||  		    arp->ar_hrd != htons(ARPHRD_AX25)) -			goto out; +			goto out_free_skb;  		
break;  	case ARPHRD_NETROM:  		if (arp->ar_pro != htons(AX25_P_IP) ||  		    arp->ar_hrd != htons(ARPHRD_NETROM)) -			goto out; +			goto out_free_skb;  		break;  	} @@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  	if (arp->ar_op != htons(ARPOP_REPLY) &&  	    arp->ar_op != htons(ARPOP_REQUEST)) -		goto out; +		goto out_free_skb;  /*   *	Extract fields @@ -733,7 +733,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)   */  	if (ipv4_is_multicast(tip) ||  	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) -		goto out; +		goto out_free_skb; + + /* +  *	For some 802.11 wireless deployments (and possibly other networks), +  *	there will be an ARP proxy and gratuitous ARP frames are attacks +  *	and thus should not be accepted. +  */ +	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) +		goto out_free_skb;  /*   *     Special case: We must set Frame Relay source Q.922 address @@ -770,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  		    !arp_ignore(in_dev, sip, tip))  			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,  				     sha, dev->dev_addr, sha, reply_dst); -		goto out; +		goto out_consume_skb;  	}  	if (arp->ar_op == htons(ARPOP_REQUEST) && @@ -795,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  					neigh_release(n);  				}  			} -			goto out; +			goto out_consume_skb;  		} else if (IN_DEV_FORWARD(in_dev)) {  			if (addr_type == RTN_UNICAST  &&  			    (arp_fwd_proxy(in_dev, dev, rt) || @@ -818,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  						       in_dev->arp_parms, skb);  					goto out_free_dst;  				} -				goto out; +				goto out_consume_skb;  			}  		}  	} @@ -868,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)  		neigh_release(n);  	} -out: +out_consume_skb:  	
consume_skb(skb); +  out_free_dst:  	dst_release(reply_dst); -	return 0; +	return NET_RX_SUCCESS; + +out_free_skb: +	kfree_skb(skb); +	return NET_RX_DROP;  }  static void parp_redo(struct sk_buff *skb) @@ -916,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,  consumeskb:  	consume_skb(skb); -	return 0; +	return NET_RX_SUCCESS;  freeskb:  	kfree_skb(skb);  out_of_mem: -	return 0; +	return NET_RX_DROP;  }  /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f6303b17546b..e333bc86bd39 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -334,6 +334,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  	ASSERT_RTNL(); +	if (in_dev->dead) +		goto no_promotions; +  	/* 1. Deleting primary ifaddr forces deletion all secondaries  	 * unless alias promotion is set  	 **/ @@ -380,6 +383,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  			fib_del_ifaddr(ifa, ifa1);  	} +no_promotions:  	/* 2. Unlink it */  	*ifap = ifa1->ifa_next; @@ -1194,6 +1198,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)  	__be32 addr = 0;  	struct in_device *in_dev;  	struct net *net = dev_net(dev); +	int master_idx;  	rcu_read_lock();  	in_dev = __in_dev_get_rcu(dev); @@ -1214,12 +1219,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)  	if (addr)  		goto out_unlock;  no_in_dev: +	master_idx = l3mdev_master_ifindex_rcu(dev); + +	/* For VRFs, the VRF device takes the place of the loopback device, +	 * with addresses on it being preferred.  Note in such cases the +	 * loopback device will be among the devices that fail the master_idx +	 * equality check in the loop below. 
+	 */ +	if (master_idx && +	    (dev = dev_get_by_index_rcu(net, master_idx)) && +	    (in_dev = __in_dev_get_rcu(dev))) { +		for_primary_ifa(in_dev) { +			if (ifa->ifa_scope != RT_SCOPE_LINK && +			    ifa->ifa_scope <= scope) { +				addr = ifa->ifa_local; +				goto out_unlock; +			} +		} endfor_ifa(in_dev); +	}  	/* Not loopback addresses on loopback should be preferred  	   in this case. It is important that lo is the first interface  	   in dev_base list.  	 */  	for_each_netdev_rcu(net, dev) { +		if (l3mdev_master_ifindex_rcu(dev) != master_idx) +			continue; +  		in_dev = __in_dev_get_rcu(dev);  		if (!in_dev)  			continue; @@ -1731,17 +1757,20 @@ static int inet_netconf_msgsize_devconf(int type)  {  	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))  		   + nla_total_size(4);	/* NETCONFA_IFINDEX */ +	bool all = false; + +	if (type == NETCONFA_ALL) +		all = true; -	/* type -1 is used for ALL */ -	if (type == -1 || type == NETCONFA_FORWARDING) +	if (all || type == NETCONFA_FORWARDING)  		size += nla_total_size(4); -	if (type == -1 || type == NETCONFA_RP_FILTER) +	if (all || type == NETCONFA_RP_FILTER)  		size += nla_total_size(4); -	if (type == -1 || type == NETCONFA_MC_FORWARDING) +	if (all || type == NETCONFA_MC_FORWARDING)  		size += nla_total_size(4); -	if (type == -1 || type == NETCONFA_PROXY_NEIGH) +	if (all || type == NETCONFA_PROXY_NEIGH)  		size += nla_total_size(4); -	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) +	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)  		size += nla_total_size(4);  	return size; @@ -1754,36 +1783,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,  {  	struct nlmsghdr  *nlh;  	struct netconfmsg *ncm; +	bool all = false;  	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),  			flags);  	if (!nlh)  		return -EMSGSIZE; +	if (type == NETCONFA_ALL) +		all = true; +  	ncm = nlmsg_data(nlh);  	ncm->ncm_family = AF_INET;  	if (nla_put_s32(skb, NETCONFA_IFINDEX, 
ifindex) < 0)  		goto nla_put_failure; -	/* type -1 is used for ALL */ -	if ((type == -1 || type == NETCONFA_FORWARDING) && +	if ((all || type == NETCONFA_FORWARDING) &&  	    nla_put_s32(skb, NETCONFA_FORWARDING,  			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)  		goto nla_put_failure; -	if ((type == -1 || type == NETCONFA_RP_FILTER) && +	if ((all || type == NETCONFA_RP_FILTER) &&  	    nla_put_s32(skb, NETCONFA_RP_FILTER,  			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)  		goto nla_put_failure; -	if ((type == -1 || type == NETCONFA_MC_FORWARDING) && +	if ((all || type == NETCONFA_MC_FORWARDING) &&  	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,  			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)  		goto nla_put_failure; -	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && +	if ((all || type == NETCONFA_PROXY_NEIGH) &&  	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,  			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)  		goto nla_put_failure; -	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && +	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&  	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,  			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)  		goto nla_put_failure; @@ -1871,14 +1903,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,  	}  	err = -ENOBUFS; -	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); +	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);  	if (!skb)  		goto errout;  	err = inet_netconf_fill_devconf(skb, ifindex, devconf,  					NETLINK_CB(in_skb).portid,  					nlh->nlmsg_seq, RTM_NEWNETCONF, 0, -					-1); +					NETCONFA_ALL);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */  		WARN_ON(err == -EMSGSIZE); @@ -1922,7 +1954,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,  						      cb->nlh->nlmsg_seq,  						      RTM_NEWNETCONF,  						      NLM_F_MULTI, -						      -1) < 0) { +						      NETCONFA_ALL) < 0) {  				
rcu_read_unlock();  				goto done;  			} @@ -1938,7 +1970,7 @@ cont:  					      NETLINK_CB(cb->skb).portid,  					      cb->nlh->nlmsg_seq,  					      RTM_NEWNETCONF, NLM_F_MULTI, -					      -1) < 0) +					      NETCONFA_ALL) < 0)  			goto done;  		else  			h++; @@ -1949,7 +1981,7 @@ cont:  					      NETLINK_CB(cb->skb).portid,  					      cb->nlh->nlmsg_seq,  					      RTM_NEWNETCONF, NLM_F_MULTI, -					      -1) < 0) +					      NETCONFA_ALL) < 0)  			goto done;  		else  			h++; @@ -2185,6 +2217,8 @@ static struct devinet_sysctl_table {  					"igmpv3_unsolicited_report_interval"),  		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,  					"ignore_routes_with_linkdown"), +		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, +					"drop_gratuitous_arp"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2192,6 +2226,8 @@ static struct devinet_sysctl_table {  					      "promote_secondaries"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,  					      "route_localnet"), +		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, +					      "drop_unicast_in_l2_multicast"),  	},  }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 473447593060..8a9246deccfe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,7 +280,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)  	struct in_device *in_dev;  	struct fib_result res;  	struct rtable *rt; -	struct flowi4 fl4;  	struct net *net;  	int scope; @@ -296,14 +295,13 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)  	scope = RT_SCOPE_UNIVERSE;  	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { -		fl4.flowi4_oif = 0; -		fl4.flowi4_iif = LOOPBACK_IFINDEX; -		fl4.daddr = ip_hdr(skb)->saddr; -		fl4.saddr = 0; -		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); -		fl4.flowi4_scope = scope; -		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? 
skb->mark : 0; -		fl4.flowi4_tun_key.tun_id = 0; +		struct flowi4 fl4 = { +			.flowi4_iif = LOOPBACK_IFINDEX, +			.daddr = ip_hdr(skb)->saddr, +			.flowi4_tos = RT_TOS(ip_hdr(skb)->tos), +			.flowi4_scope = scope, +			.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0, +		};  		if (!fib_lookup(net, &fl4, &res, 0))  			return FIB_RES_PREFSRC(net, res);  	} else { @@ -922,6 +920,9 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)  		subnet = 1;  	} +	if (in_dev->dead) +		goto no_promotions; +  	/* Deletion is more complicated than add.  	 * We should take care of not to delete too much :-)  	 * @@ -997,6 +998,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)  		}  	} +no_promotions:  	if (!(ok & BRD_OK))  		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);  	if (subnet && ifa->ifa_prefixlen < 31) { diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dcf6991..a0586b4a197d 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -48,7 +48,7 @@ static inline struct fou *fou_from_sock(struct sock *sk)  	return sk->sk_user_data;  } -static void fou_recv_pull(struct sk_buff *skb, size_t len) +static int fou_recv_pull(struct sk_buff *skb, size_t len)  {  	struct iphdr *iph = ip_hdr(skb); @@ -59,6 +59,7 @@ static void fou_recv_pull(struct sk_buff *skb, size_t len)  	__skb_pull(skb, len);  	skb_postpull_rcsum(skb, udp_hdr(skb), len);  	skb_reset_transport_header(skb); +	return iptunnel_pull_offloads(skb);  }  static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -68,9 +69,14 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)  	if (!fou)  		return 1; -	fou_recv_pull(skb, sizeof(struct udphdr)); +	if (fou_recv_pull(skb, sizeof(struct udphdr))) +		goto drop;  	return -fou->protocol; + +drop: +	kfree_skb(skb); +	return 0;  }  static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, @@ -170,6 +176,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)  	
__skb_pull(skb, sizeof(struct udphdr) + hdrlen);  	skb_reset_transport_header(skb); +	if (iptunnel_pull_offloads(skb)) +		goto drop; +  	return -guehdr->proto_ctype;  drop: @@ -319,8 +328,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,  	skb_gro_pull(skb, hdrlen); -	flush = 0; -  	for (p = *head; p; p = p->next) {  		const struct guehdr *guehdr2; @@ -352,6 +359,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,  		goto out_unlock;  	pp = ops->callbacks.gro_receive(head, skb); +	flush = 0;  out_unlock:  	rcu_read_unlock(); @@ -774,7 +782,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,  	uh->dest = e->dport;  	uh->source = sport;  	uh->len = htons(skb->len); -	uh->check = 0;  	udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,  		     fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +791,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,  int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,  		     u8 *protocol, struct flowi4 *fl4)  { -	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); -	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; +	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : +						       SKB_GSO_UDP_TUNNEL;  	__be16 sport; -	skb = iptunnel_handle_offloads(skb, csum, type); +	skb = iptunnel_handle_offloads(skb, type);  	if (IS_ERR(skb))  		return PTR_ERR(skb); @@ -804,8 +811,8 @@ EXPORT_SYMBOL(fou_build_header);  int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,  		     u8 *protocol, struct flowi4 *fl4)  { -	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); -	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; +	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? 
SKB_GSO_UDP_TUNNEL_CSUM : +						       SKB_GSO_UDP_TUNNEL;  	struct guehdr *guehdr;  	size_t hdrlen, optlen = 0;  	__be16 sport; @@ -814,7 +821,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,  	if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&  	    skb->ip_summed == CHECKSUM_PARTIAL) { -		csum = false;  		optlen += GUE_PLEN_REMCSUM;  		type |= SKB_GSO_TUNNEL_REMCSUM;  		need_priv = true; @@ -822,7 +828,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,  	optlen += need_priv ? GUE_LEN_PRIV : 0; -	skb = iptunnel_handle_offloads(skb, csum, type); +	skb = iptunnel_handle_offloads(skb, type);  	if (IS_ERR(skb))  		return PTR_ERR(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5a8ee3282550..c47539d04b88 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -18,15 +18,13 @@  static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  				       netdev_features_t features)  { +	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);  	struct sk_buff *segs = ERR_PTR(-EINVAL); -	netdev_features_t enc_features; -	int ghl; -	struct gre_base_hdr *greh;  	u16 mac_offset = skb->mac_header; -	int mac_len = skb->mac_len;  	__be16 protocol = skb->protocol; -	int tnl_hlen; -	bool csum; +	u16 mac_len = skb->mac_len; +	int gre_offset, outer_hlen; +	bool need_csum, ufo;  	if (unlikely(skb_shinfo(skb)->gso_type &  				~(SKB_GSO_TCPV4 | @@ -43,74 +41,75 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  	if (!skb->encapsulation)  		goto out; -	if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) +	if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))  		goto out; -	greh = (struct gre_base_hdr *)skb_transport_header(skb); - -	ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); -	if (unlikely(ghl < sizeof(*greh))) +	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))  		goto out; -	csum = !!(greh->flags & GRE_CSUM); -	if (csum) -		skb->encap_hdr_csum = 1; -  	/* setup inner skb. 
*/ -	skb->protocol = greh->protocol;  	skb->encapsulation = 0; - -	if (unlikely(!pskb_may_pull(skb, ghl))) -		goto out; - -	__skb_pull(skb, ghl); +	SKB_GSO_CB(skb)->encap_level = 0; +	__skb_pull(skb, tnl_hlen);  	skb_reset_mac_header(skb);  	skb_set_network_header(skb, skb_inner_network_offset(skb));  	skb->mac_len = skb_inner_network_offset(skb); +	skb->protocol = skb->inner_protocol; + +	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); +	skb->encap_hdr_csum = need_csum; + +	ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + +	features &= skb->dev->hw_enc_features; + +	/* The only checksum offload we care about from here on out is the +	 * outer one so strip the existing checksum feature flags based +	 * on the fact that we will be computing our checksum in software. +	 */ +	if (ufo) { +		features &= ~NETIF_F_CSUM_MASK; +		if (!need_csum) +			features |= NETIF_F_HW_CSUM; +	}  	/* segment inner packet. */ -	enc_features = skb->dev->hw_enc_features & features; -	segs = skb_mac_gso_segment(skb, enc_features); +	segs = skb_mac_gso_segment(skb, features);  	if (IS_ERR_OR_NULL(segs)) { -		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); +		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, +				     mac_len);  		goto out;  	} +	outer_hlen = skb_tnl_header_len(skb); +	gre_offset = outer_hlen - tnl_hlen;  	skb = segs; -	tnl_hlen = skb_tnl_header_len(skb);  	do { -		__skb_push(skb, ghl); -		if (csum) { -			__be32 *pcsum; - -			if (skb_has_shared_frag(skb)) { -				int err; - -				err = __skb_linearize(skb); -				if (err) { -					kfree_skb_list(segs); -					segs = ERR_PTR(err); -					goto out; -				} -			} - -			skb_reset_transport_header(skb); +		struct gre_base_hdr *greh; +		__be32 *pcsum; -			greh = (struct gre_base_hdr *) -			    skb_transport_header(skb); -			pcsum = (__be32 *)(greh + 1); -			*pcsum = 0; -			*(__sum16 *)pcsum = gso_make_checksum(skb, 0); +		/* Set up inner headers if we are offloading inner checksum */ +		if 
(skb->ip_summed == CHECKSUM_PARTIAL) { +			skb_reset_inner_headers(skb); +			skb->encapsulation = 1;  		} -		__skb_push(skb, tnl_hlen - ghl); -		skb_reset_inner_headers(skb); -		skb->encapsulation = 1; +		skb->mac_len = mac_len; +		skb->protocol = protocol; +		__skb_push(skb, outer_hlen);  		skb_reset_mac_header(skb);  		skb_set_network_header(skb, mac_len); -		skb->mac_len = mac_len; -		skb->protocol = protocol; +		skb_set_transport_header(skb, gre_offset); + +		if (!need_csum) +			continue; + +		greh = (struct gre_base_hdr *)skb_transport_header(skb); +		pcsum = (__be32 *)(greh + 1); + +		*pcsum = 0; +		*(__sum16 *)pcsum = gso_make_checksum(skb, 0);  	} while ((skb = skb->next));  out:  	return segs; @@ -128,6 +127,11 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,  	struct packet_offload *ptype;  	__be16 type; +	if (NAPI_GRO_CB(skb)->encap_mark) +		goto out; + +	NAPI_GRO_CB(skb)->encap_mark = 1; +  	off = skb_gro_offset(skb);  	hlen = off + sizeof(*greh);  	greh = skb_gro_header_fast(skb, off); @@ -177,8 +181,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,  					     null_compute_pseudo);  	} -	flush = 0; -  	for (p = *head; p; p = p->next) {  		const struct gre_base_hdr *greh2; @@ -215,6 +217,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,  	skb_gro_postpull_rcsum(skb, greh, grehlen);  	pp = ptype->callbacks.gro_receive(head, skb); +	flush = 0;  out_unlock:  	rcu_read_unlock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c908..6333489771ed 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb)   */  static bool icmp_timestamp(struct sk_buff *skb)  { -	struct timespec tv;  	struct icmp_bxm icmp_param;  	/*  	 *	Too short. 
@@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb)  	/*  	 *	Fill in the current time as ms since midnight UT:  	 */ -	getnstimeofday(&tv); -	icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + -					 tv.tv_nsec / NSEC_PER_MSEC); +	icmp_param.data.times[1] = inet_current_timestamp();  	icmp_param.data.times[2] = icmp_param.data.times[1];  	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))  		BUG(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b3086cf27027..9b4ca87f70ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -107,12 +107,6 @@  #include <linux/seq_file.h>  #endif -#define IP_MAX_MEMBERSHIPS	20 -#define IP_MAX_MSF		10 - -/* IGMP reports for link-local multicast groups are enabled by default */ -int sysctl_igmp_llm_reports __read_mostly = 1; -  #ifdef CONFIG_IP_MULTICAST  /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -432,6 +426,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,  	int type, int gdeleted, int sdeleted)  {  	struct net_device *dev = pmc->interface->dev; +	struct net *net = dev_net(dev);  	struct igmpv3_report *pih;  	struct igmpv3_grec *pgr = NULL;  	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -439,7 +434,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,  	if (pmc->multiaddr == IGMP_ALL_HOSTS)  		return skb; -	if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) +	if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)  		return skb;  	isquery = type == IGMPV3_MODE_IS_INCLUDE || @@ -542,6 +537,7 @@ empty_source:  static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)  {  	struct sk_buff *skb = NULL; +	struct net *net = dev_net(in_dev->dev);  	int type;  	if (!pmc) { @@ -550,7 +546,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)  			if (pmc->multiaddr == IGMP_ALL_HOSTS)  				continue; 
 			if (ipv4_is_local_multicast(pmc->multiaddr) && -			     !sysctl_igmp_llm_reports) +			     !net->ipv4.sysctl_igmp_llm_reports)  				continue;  			spin_lock_bh(&pmc->lock);  			if (pmc->sfcount[MCAST_EXCLUDE]) @@ -686,7 +682,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,  	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)  		return igmpv3_send_report(in_dev, pmc); -	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) +	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)  		return 0;  	if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -765,9 +761,10 @@ static void igmp_ifc_timer_expire(unsigned long data)  static void igmp_ifc_event(struct in_device *in_dev)  { +	struct net *net = dev_net(in_dev->dev);  	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))  		return; -	in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; +	in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  	igmp_ifc_start_timer(in_dev, 1);  } @@ -857,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  static bool igmp_heard_report(struct in_device *in_dev, __be32 group)  {  	struct ip_mc_list *im; +	struct net *net = dev_net(in_dev->dev);  	/* Timers are only set for non-local groups */  	if (group == IGMP_ALL_HOSTS)  		return false; -	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) +	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)  		return false;  	rcu_read_lock(); @@ -886,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  	__be32			group = ih->group;  	int			max_delay;  	int			mark = 0; +	struct net		*net = dev_net(in_dev->dev);  	if (len == 8) { @@ -971,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  		if (im->multiaddr == IGMP_ALL_HOSTS)  			continue;  		if (ipv4_is_local_multicast(im->multiaddr) && -		    !sysctl_igmp_llm_reports) +		    
!net->ipv4.sysctl_igmp_llm_reports)  			continue;  		spin_lock_bh(&im->lock);  		if (im->tm_running) @@ -1087,6 +1086,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)  static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)  {  	struct ip_mc_list *pmc; +	struct net *net = dev_net(in_dev->dev);  	/* this is an "ip_mc_list" for convenience; only the fields below  	 * are actually used. In particular, the refcnt and users are not @@ -1101,7 +1101,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)  	pmc->interface = im->interface;  	in_dev_hold(in_dev);  	pmc->multiaddr = im->multiaddr; -	pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; +	pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  	pmc->sfmode = im->sfmode;  	if (pmc->sfmode == MCAST_INCLUDE) {  		struct ip_sf_list *psf; @@ -1186,6 +1186,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)  {  	struct in_device *in_dev = im->interface;  #ifdef CONFIG_IP_MULTICAST +	struct net *net = dev_net(in_dev->dev);  	int reporter;  #endif @@ -1197,7 +1198,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)  #ifdef CONFIG_IP_MULTICAST  	if (im->multiaddr == IGMP_ALL_HOSTS)  		return; -	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) +	if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)  		return;  	reporter = im->reporter; @@ -1222,6 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im)  static void igmp_group_added(struct ip_mc_list *im)  {  	struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST +	struct net *net = dev_net(in_dev->dev); +#endif  	if (im->loaded == 0) {  		im->loaded = 1; @@ -1231,7 +1235,7 @@ static void igmp_group_added(struct ip_mc_list *im)  #ifdef CONFIG_IP_MULTICAST  	if (im->multiaddr == IGMP_ALL_HOSTS)  		return; -	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) +	if 
(ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)  		return;  	if (in_dev->dead) @@ -1244,7 +1248,7 @@ static void igmp_group_added(struct ip_mc_list *im)  	}  	/* else, v3 */ -	im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; +	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  	igmp_ifc_event(in_dev);  #endif  } @@ -1313,6 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev,  void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)  {  	struct ip_mc_list *im; +#ifdef CONFIG_IP_MULTICAST +	struct net *net = dev_net(in_dev->dev); +#endif  	ASSERT_RTNL(); @@ -1339,7 +1346,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)  	spin_lock_init(&im->lock);  #ifdef CONFIG_IP_MULTICAST  	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); -	im->unsolicit_count = sysctl_igmp_qrv; +	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;  #endif  	im->next_rcu = in_dev->mc_list; @@ -1532,6 +1539,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)  #ifdef CONFIG_IP_MULTICAST  	struct ip_mc_list *im;  	int type; +	struct net *net = dev_net(in_dev->dev);  	ASSERT_RTNL(); @@ -1539,7 +1547,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)  		if (im->multiaddr == IGMP_ALL_HOSTS)  			continue;  		if (ipv4_is_local_multicast(im->multiaddr) && -		    !sysctl_igmp_llm_reports) +		    !net->ipv4.sysctl_igmp_llm_reports)  			continue;  		/* a failover is happening and switches @@ -1638,6 +1646,9 @@ void ip_mc_down(struct in_device *in_dev)  void ip_mc_init_dev(struct in_device *in_dev)  { +#ifdef CONFIG_IP_MULTICAST +	struct net *net = dev_net(in_dev->dev); +#endif  	ASSERT_RTNL();  #ifdef CONFIG_IP_MULTICAST @@ -1645,7 +1656,7 @@ void ip_mc_init_dev(struct in_device *in_dev)  			(unsigned long)in_dev);  	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,  			(unsigned long)in_dev); -	in_dev->mr_qrv = sysctl_igmp_qrv; +	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;  #endif  	
spin_lock_init(&in_dev->mc_tomb_lock); @@ -1656,11 +1667,14 @@ void ip_mc_init_dev(struct in_device *in_dev)  void ip_mc_up(struct in_device *in_dev)  {  	struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST +	struct net *net = dev_net(in_dev->dev); +#endif  	ASSERT_RTNL();  #ifdef CONFIG_IP_MULTICAST -	in_dev->mr_qrv = sysctl_igmp_qrv; +	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;  #endif  	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1726,11 +1740,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)  /*   *	Join a socket to a group   */ -int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; -int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; -#ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; -#endif  static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,  	__be32 *psfsrc) @@ -1755,6 +1764,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,  	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {  #ifdef CONFIG_IP_MULTICAST  		struct in_device *in_dev = pmc->interface; +		struct net *net = dev_net(in_dev->dev);  #endif  		/* no more filters for this source */ @@ -1765,7 +1775,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,  #ifdef CONFIG_IP_MULTICAST  		if (psf->sf_oldin &&  		    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { -			psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; +			psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  			psf->sf_next = pmc->tomb;  			pmc->tomb = psf;  			rv = 1; @@ -1823,12 +1833,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  	    pmc->sfcount[MCAST_INCLUDE]) {  #ifdef CONFIG_IP_MULTICAST  		struct ip_sf_list *psf; +		struct net *net = dev_net(in_dev->dev);  #endif  		/* filter mode change */  		pmc->sfmode = MCAST_INCLUDE;  #ifdef CONFIG_IP_MULTICAST -		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; +		pmc->crcount = 
in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  		in_dev->mr_ifc_count = pmc->crcount;  		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0; @@ -1995,6 +2006,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {  #ifdef CONFIG_IP_MULTICAST  		struct ip_sf_list *psf; +		struct net *net = dev_net(pmc->interface->dev);  		in_dev = pmc->interface;  #endif @@ -2006,7 +2018,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  #ifdef CONFIG_IP_MULTICAST  		/* else no filters; keep old mode for reports */ -		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; +		pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;  		in_dev->mr_ifc_count = pmc->crcount;  		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0; @@ -2073,7 +2085,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)  		count++;  	}  	err = -ENOBUFS; -	if (count >= sysctl_igmp_max_memberships) +	if (count >= net->ipv4.sysctl_igmp_max_memberships)  		goto done;  	iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);  	if (!iml) @@ -2245,7 +2257,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  	}  	/* else, add a new source to the filter */ -	if (psl && psl->sl_count >= sysctl_igmp_max_msf) { +	if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {  		err = -ENOBUFS;  		goto done;  	} @@ -2918,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net)  		goto out_sock;  	} +	/* Sysctl initialization */ +	net->ipv4.sysctl_igmp_max_memberships = 20; +	net->ipv4.sysctl_igmp_max_msf = 10; +	/* IGMP reports for link-local multicast groups are enabled by default */ +	net->ipv4.sysctl_igmp_llm_reports = 1; +	net->ipv4.sysctl_igmp_qrv = 2;  	return 0;  out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 64148914803a..bc5196ea1bdf 100644 --- a/net/ipv4/inet_connection_sock.c 
+++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@  #include <net/tcp_states.h>  #include <net/xfrm.h>  #include <net/tcp.h> +#include <net/sock_reuseport.h>  #ifdef INET_CSK_DEBUG  const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,  			if ((!reuse || !sk2->sk_reuse ||  			    sk2->sk_state == TCP_LISTEN) &&  			    (!reuseport || !sk2->sk_reuseport || -			    (sk2->sk_state != TCP_TIME_WAIT && +			     rcu_access_pointer(sk->sk_reuseport_cb) || +			     (sk2->sk_state != TCP_TIME_WAIT &&  			     !uid_eq(uid, sock_i_uid(sk2))))) {  				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);  /* Obtain a reference to a local port for the given sock,   * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect())   */  int inet_csk_get_port(struct sock *sk, unsigned short snum)  { -	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; +	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; +	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; +	int ret = 1, attempts = 5, port = snum; +	int smallest_size = -1, smallest_port;  	struct inet_bind_hashbucket *head; -	struct inet_bind_bucket *tb; -	int ret, attempts = 5;  	struct net *net = sock_net(sk); -	int smallest_size = -1, smallest_rover; +	int i, low, high, attempt_half; +	struct inet_bind_bucket *tb;  	kuid_t uid = sock_i_uid(sk); -	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 
1 : 0; +	u32 remaining, offset; -	local_bh_disable(); -	if (!snum) { -		int remaining, rover, low, high; +	if (port) { +have_port: +		head = &hinfo->bhash[inet_bhashfn(net, port, +						  hinfo->bhash_size)]; +		spin_lock_bh(&head->lock); +		inet_bind_bucket_for_each(tb, &head->chain) +			if (net_eq(ib_net(tb), net) && tb->port == port) +				goto tb_found; +		goto tb_not_found; +	}  again: -		inet_get_local_port_range(net, &low, &high); -		if (attempt_half) { -			int half = low + ((high - low) >> 1); - -			if (attempt_half == 1) -				high = half; -			else -				low = half; -		} -		remaining = (high - low) + 1; -		smallest_rover = rover = prandom_u32() % remaining + low; - -		smallest_size = -1; -		do { -			if (inet_is_local_reserved_port(net, rover)) -				goto next_nolock; -			head = &hashinfo->bhash[inet_bhashfn(net, rover, -					hashinfo->bhash_size)]; -			spin_lock(&head->lock); -			inet_bind_bucket_for_each(tb, &head->chain) -				if (net_eq(ib_net(tb), net) && tb->port == rover) { -					if (((tb->fastreuse > 0 && -					      sk->sk_reuse && -					      sk->sk_state != TCP_LISTEN) || -					     (tb->fastreuseport > 0 && -					      sk->sk_reuseport && -					      uid_eq(tb->fastuid, uid))) && -					    (tb->num_owners < smallest_size || smallest_size == -1)) { -						smallest_size = tb->num_owners; -						smallest_rover = rover; -					} -					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { -						snum = rover; -						goto tb_found; -					} -					goto next; +	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 
1 : 0; +other_half_scan: +	inet_get_local_port_range(net, &low, &high); +	high++; /* [32768, 60999] -> [32768, 61000[ */ +	if (high - low < 4) +		attempt_half = 0; +	if (attempt_half) { +		int half = low + (((high - low) >> 2) << 1); + +		if (attempt_half == 1) +			high = half; +		else +			low = half; +	} +	remaining = high - low; +	if (likely(remaining > 1)) +		remaining &= ~1U; + +	offset = prandom_u32() % remaining; +	/* __inet_hash_connect() favors ports having @low parity +	 * We do the opposite to not pollute connect() users. +	 */ +	offset |= 1U; +	smallest_size = -1; +	smallest_port = low; /* avoid compiler warning */ + +other_parity_scan: +	port = low + offset; +	for (i = 0; i < remaining; i += 2, port += 2) { +		if (unlikely(port >= high)) +			port -= remaining; +		if (inet_is_local_reserved_port(net, port)) +			continue; +		head = &hinfo->bhash[inet_bhashfn(net, port, +						  hinfo->bhash_size)]; +		spin_lock_bh(&head->lock); +		inet_bind_bucket_for_each(tb, &head->chain) +			if (net_eq(ib_net(tb), net) && tb->port == port) { +				if (((tb->fastreuse > 0 && reuse) || +				     (tb->fastreuseport > 0 && +				      sk->sk_reuseport && +				      !rcu_access_pointer(sk->sk_reuseport_cb) && +				      uid_eq(tb->fastuid, uid))) && +				    (tb->num_owners < smallest_size || smallest_size == -1)) { +					smallest_size = tb->num_owners; +					smallest_port = port;  				} -			break; -		next: -			spin_unlock(&head->lock); -		next_nolock: -			if (++rover > high) -				rover = low; -		} while (--remaining > 0); - -		/* Exhausted local port range during search?  It is not -		 * possible for us to be holding one of the bind hash -		 * locks if this test triggers, because if 'remaining' -		 * drops to zero, we broke out of the do/while loop at -		 * the top level, not from the 'break;' statement. 
-		 */ -		ret = 1; -		if (remaining <= 0) { -			if (smallest_size != -1) { -				snum = smallest_rover; -				goto have_snum; -			} -			if (attempt_half == 1) { -				/* OK we now try the upper half of the range */ -				attempt_half = 2; -				goto again; +				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) +					goto tb_found; +				goto next_port;  			} -			goto fail; -		} -		/* OK, here is the one we will use.  HEAD is -		 * non-NULL and we hold it's mutex. -		 */ -		snum = rover; -	} else { -have_snum: -		head = &hashinfo->bhash[inet_bhashfn(net, snum, -				hashinfo->bhash_size)]; -		spin_lock(&head->lock); -		inet_bind_bucket_for_each(tb, &head->chain) -			if (net_eq(ib_net(tb), net) && tb->port == snum) -				goto tb_found; +		goto tb_not_found; +next_port: +		spin_unlock_bh(&head->lock); +		cond_resched(); +	} + +	if (smallest_size != -1) { +		port = smallest_port; +		goto have_port;  	} -	tb = NULL; -	goto tb_not_found; +	offset--; +	if (!(offset & 1)) +		goto other_parity_scan; + +	if (attempt_half == 1) { +		/* OK we now try the upper half of the range */ +		attempt_half = 2; +		goto other_half_scan; +	} +	return ret; + +tb_not_found: +	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, +				     net, head, port); +	if (!tb) +		goto fail_unlock;  tb_found:  	if (!hlist_empty(&tb->owners)) {  		if (sk->sk_reuse == SK_FORCE_REUSE)  			goto success; -		if (((tb->fastreuse > 0 && -		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) || +		if (((tb->fastreuse > 0 && reuse) ||  		     (tb->fastreuseport > 0 && +		      !rcu_access_pointer(sk->sk_reuseport_cb) &&  		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && -		    smallest_size == -1) { +		    smallest_size == -1)  			goto success; -		} else { -			ret = 1; -			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { -				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || -				     (tb->fastreuseport > 0 && -				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && -				    
smallest_size != -1 && --attempts >= 0) { -					spin_unlock(&head->lock); -					goto again; -				} - -				goto fail_unlock; +		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { +			if ((reuse || +			     (tb->fastreuseport > 0 && +			      sk->sk_reuseport && +			      !rcu_access_pointer(sk->sk_reuseport_cb) && +			      uid_eq(tb->fastuid, uid))) && +			    smallest_size != -1 && --attempts >= 0) { +				spin_unlock_bh(&head->lock); +				goto again;  			} +			goto fail_unlock;  		} -	} -tb_not_found: -	ret = 1; -	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, -					net, head, snum)) == NULL) -		goto fail_unlock; -	if (hlist_empty(&tb->owners)) { -		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) -			tb->fastreuse = 1; -		else +		if (!reuse)  			tb->fastreuse = 0; +		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) +			tb->fastreuseport = 0; +	} else { +		tb->fastreuse = reuse;  		if (sk->sk_reuseport) {  			tb->fastreuseport = 1;  			tb->fastuid = uid; -		} else -			tb->fastreuseport = 0; -	} else { -		if (tb->fastreuse && -		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) -			tb->fastreuse = 0; -		if (tb->fastreuseport && -		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) +		} else {  			tb->fastreuseport = 0; +		}  	}  success:  	if (!inet_csk(sk)->icsk_bind_hash) -		inet_bind_hash(sk, tb, snum); +		inet_bind_hash(sk, tb, port);  	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);  	ret = 0;  fail_unlock: -	spin_unlock(&head->lock); -fail: -	local_bh_enable(); +	spin_unlock_bh(&head->lock);  	return ret;  }  EXPORT_SYMBOL_GPL(inet_csk_get_port); @@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);  #define AF_INET_FAMILY(fam) true  #endif -/* Only thing we need from tcp.h */ -extern int sysctl_tcp_synack_retries; - -  /* Decide when to expire the request and when to resend SYN-ACK */  static inline void syn_ack_recalc(struct request_sock *req, const int thresh,  				  const int max_retries, @@ -557,6 +548,7 
@@ static void reqsk_timer_handler(unsigned long data)  {  	struct request_sock *req = (struct request_sock *)data;  	struct sock *sk_listener = req->rsk_listener; +	struct net *net = sock_net(sk_listener);  	struct inet_connection_sock *icsk = inet_csk(sk_listener);  	struct request_sock_queue *queue = &icsk->icsk_accept_queue;  	int qlen, expire = 0, resend = 0; @@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data)  	if (sk_state_load(sk_listener) != TCP_LISTEN)  		goto drop; -	max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; +	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;  	thresh = max_retries;  	/* Normally all the openreqs are young and become mature  	 * (i.e. converted to established socket) for first timeout. @@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct inet_sock *inet = inet_sk(sk); +	int err = -EADDRINUSE;  	reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)  		inet->inet_sport = htons(inet->inet_num);  		sk_dst_reset(sk); -		sk->sk_prot->hash(sk); +		err = sk->sk_prot->hash(sk); -		return 0; +		if (likely(!err)) +			return 0;  	}  	sk->sk_state = TCP_CLOSE; -	return -EADDRINUSE; +	return err;  }  EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6029157a19ed..5fdb02f5598e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net,  	struct sock *sk;  	if (req->sdiag_family == AF_INET) -		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], +		sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],  				 req->id.idiag_dport, req->id.idiag_src[0],  				 req->id.idiag_sport, req->id.idiag_if);  #if IS_ENABLED(CONFIG_IPV6)  	else if (req->sdiag_family == AF_INET6) {  		if 
(ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&  		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) -			sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], +			sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],  					 req->id.idiag_dport, req->id.idiag_src[3],  					 req->id.idiag_sport, req->id.idiag_if);  		else -			sk = inet6_lookup(net, hashinfo, +			sk = inet6_lookup(net, hashinfo, NULL, 0,  					  (struct in6_addr *)req->id.idiag_dst,  					  req->id.idiag_dport,  					  (struct in6_addr *)req->id.idiag_src, @@ -879,6 +879,7 @@ next_normal:  		}  		spin_unlock_bh(lock); +		cond_resched();  	}  done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc5980797fc..bc68eced0105 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@  #include <linux/wait.h>  #include <linux/vmalloc.h> +#include <net/addrconf.h>  #include <net/inet_connection_sock.h>  #include <net/inet_hashtables.h>  #include <net/secure_seq.h>  #include <net/ip.h> +#include <net/sock_reuseport.h>  static u32 inet_ehashfn(const struct net *net, const __be32 laddr,  			const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net,  struct sock *__inet_lookup_listener(struct net *net,  				    struct inet_hashinfo *hashinfo, +				    struct sk_buff *skb, int doff,  				    const __be32 saddr, __be16 sport,  				    const __be32 daddr, const unsigned short hnum,  				    const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,  	unsigned int hash = inet_lhashfn(net, hnum);  	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];  	int score, hiscore, matches = 0, reuseport = 0; +	bool select_ok = true;  	u32 phash = 0;  	rcu_read_lock(); @@ -229,6 +233,15 @@ begin:  			if (reuseport) {  				phash = inet_ehashfn(net, daddr, hnum,  						     saddr, sport); +				if (select_ok) { +					struct 
sock *sk2; +					sk2 = reuseport_select_sock(sk, phash, +								    skb, doff); +					if (sk2) { +						result = sk2; +						goto found; +					} +				}  				matches = 1;  			}  		} else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin:  	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)  		goto begin;  	if (result) { +found:  		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))  			result = NULL;  		else if (unlikely(compute_score(result, net, hnum, daddr,  				  dif) < hiscore)) {  			sock_put(result); +			select_ok = false;  			goto begin;  		}  	} @@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)  }  EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, +				   struct inet_listen_hashbucket *ilb, +				   int (*saddr_same)(const struct sock *sk1, +						     const struct sock *sk2, +						     bool match_wildcard)) +{ +	struct sock *sk2; +	struct hlist_nulls_node *node; +	kuid_t uid = sock_i_uid(sk); + +	sk_nulls_for_each_rcu(sk2, node, &ilb->head) { +		if (sk2 != sk && +		    sk2->sk_family == sk->sk_family && +		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) && +		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if && +		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && +		    saddr_same(sk, sk2, false)) +			return reuseport_add_sock(sk, sk2); +	} + +	/* Initial allocation may have already happened via setsockopt */ +	if (!rcu_access_pointer(sk->sk_reuseport_cb)) +		return reuseport_alloc(sk); +	return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, +		 int (*saddr_same)(const struct sock *sk1, +				   const struct sock *sk2, +				   bool match_wildcard))  {  	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;  	struct inet_listen_hashbucket *ilb; +	int err = 0;  	if (sk->sk_state != TCP_LISTEN) {  		inet_ehash_nolisten(sk, osk); -		return; +		return 0;  	}  	WARN_ON(!sk_unhashed(sk));  	
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];  	spin_lock(&ilb->lock); +	if (sk->sk_reuseport) { +		err = inet_reuseport_add_sock(sk, ilb, saddr_same); +		if (err) +			goto unlock; +	}  	__sk_nulls_add_node_rcu(sk, &ilb->head);  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock:  	spin_unlock(&ilb->lock); + +	return err;  }  EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk)  { +	int err = 0; +  	if (sk->sk_state != TCP_CLOSE) {  		local_bh_disable(); -		__inet_hash(sk, NULL); +		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);  		local_bh_enable();  	} + +	return err;  }  EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk)  		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);  	spin_lock_bh(lock); +	if (rcu_access_pointer(sk->sk_reuseport_cb)) +		reuseport_detach_sock(sk);  	done = __sk_nulls_del_node_init_rcu(sk);  	if (done)  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  			struct sock *, __u16, struct inet_timewait_sock **))  {  	struct inet_hashinfo *hinfo = death_row->hashinfo; -	const unsigned short snum = inet_sk(sk)->inet_num; +	struct inet_timewait_sock *tw = NULL;  	struct inet_bind_hashbucket *head; -	struct inet_bind_bucket *tb; -	int ret; +	int port = inet_sk(sk)->inet_num;  	struct net *net = sock_net(sk); +	struct inet_bind_bucket *tb; +	u32 remaining, offset; +	int ret, i, low, high; +	static u32 hint; + +	if (port) { +		head = &hinfo->bhash[inet_bhashfn(net, port, +						  hinfo->bhash_size)]; +		tb = inet_csk(sk)->icsk_bind_hash; +		spin_lock_bh(&head->lock); +		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { +			inet_ehash_nolisten(sk, NULL); +			spin_unlock_bh(&head->lock); +			return 0; +		} +		spin_unlock(&head->lock); +		/* No definite answer... 
Walk to established hash table */ +		ret = check_established(death_row, sk, port, NULL); +		local_bh_enable(); +		return ret; +	} -	if (!snum) { -		int i, remaining, low, high, port; -		static u32 hint; -		u32 offset = hint + port_offset; -		struct inet_timewait_sock *tw = NULL; +	inet_get_local_port_range(net, &low, &high); +	high++; /* [32768, 60999] -> [32768, 61000[ */ +	remaining = high - low; +	if (likely(remaining > 1)) +		remaining &= ~1U; -		inet_get_local_port_range(net, &low, &high); -		remaining = (high - low) + 1; +	offset = (hint + port_offset) % remaining; +	/* In first pass we try ports of @low parity. +	 * inet_csk_get_port() does the opposite choice. +	 */ +	offset &= ~1U; +other_parity_scan: +	port = low + offset; +	for (i = 0; i < remaining; i += 2, port += 2) { +		if (unlikely(port >= high)) +			port -= remaining; +		if (inet_is_local_reserved_port(net, port)) +			continue; +		head = &hinfo->bhash[inet_bhashfn(net, port, +						  hinfo->bhash_size)]; +		spin_lock_bh(&head->lock); -		/* By starting with offset being an even number, -		 * we tend to leave about 50% of ports for other uses, -		 * like bind(0). +		/* Does not bother with rcv_saddr checks, because +		 * the established check is already unique enough.  		 */ -		offset &= ~1; - -		local_bh_disable(); -		for (i = 0; i < remaining; i++) { -			port = low + (i + offset) % remaining; -			if (inet_is_local_reserved_port(net, port)) -				continue; -			head = &hinfo->bhash[inet_bhashfn(net, port, -					hinfo->bhash_size)]; -			spin_lock(&head->lock); - -			/* Does not bother with rcv_saddr checks, -			 * because the established check is already -			 * unique enough. 
-			 */ -			inet_bind_bucket_for_each(tb, &head->chain) { -				if (net_eq(ib_net(tb), net) && -				    tb->port == port) { -					if (tb->fastreuse >= 0 || -					    tb->fastreuseport >= 0) -						goto next_port; -					WARN_ON(hlist_empty(&tb->owners)); -					if (!check_established(death_row, sk, -								port, &tw)) -						goto ok; +		inet_bind_bucket_for_each(tb, &head->chain) { +			if (net_eq(ib_net(tb), net) && tb->port == port) { +				if (tb->fastreuse >= 0 || +				    tb->fastreuseport >= 0)  					goto next_port; -				} +				WARN_ON(hlist_empty(&tb->owners)); +				if (!check_established(death_row, sk, +						       port, &tw)) +					goto ok; +				goto next_port;  			} - -			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, -					net, head, port); -			if (!tb) { -				spin_unlock(&head->lock); -				break; -			} -			tb->fastreuse = -1; -			tb->fastreuseport = -1; -			goto ok; - -		next_port: -			spin_unlock(&head->lock);  		} -		local_bh_enable(); - -		return -EADDRNOTAVAIL; -ok: -		hint += (i + 2) & ~1; - -		/* Head lock still held and bh's disabled */ -		inet_bind_hash(sk, tb, port); -		if (sk_unhashed(sk)) { -			inet_sk(sk)->inet_sport = htons(port); -			inet_ehash_nolisten(sk, (struct sock *)tw); +		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, +					     net, head, port); +		if (!tb) { +			spin_unlock_bh(&head->lock); +			return -ENOMEM;  		} -		if (tw) -			inet_twsk_bind_unhash(tw, hinfo); -		spin_unlock(&head->lock); +		tb->fastreuse = -1; +		tb->fastreuseport = -1; +		goto ok; +next_port: +		spin_unlock_bh(&head->lock); +		cond_resched(); +	} -		if (tw) -			inet_twsk_deschedule_put(tw); +	offset++; +	if ((offset & 1) && remaining > 1) +		goto other_parity_scan; -		ret = 0; -		goto out; -	} +	return -EADDRNOTAVAIL; -	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; -	tb  = inet_csk(sk)->icsk_bind_hash; -	spin_lock_bh(&head->lock); -	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { -		inet_ehash_nolisten(sk, 
NULL); -		spin_unlock_bh(&head->lock); -		return 0; -	} else { -		spin_unlock(&head->lock); -		/* No definite answer... Walk to established hash table */ -		ret = check_established(death_row, sk, snum, NULL); -out: -		local_bh_enable(); -		return ret; +ok: +	hint += i + 2; + +	/* Head lock still held and bh's disabled */ +	inet_bind_hash(sk, tb, port); +	if (sk_unhashed(sk)) { +		inet_sk(sk)->inet_sport = htons(port); +		inet_ehash_nolisten(sk, (struct sock *)tw);  	} +	if (tw) +		inet_twsk_bind_unhash(tw, hinfo); +	spin_unlock(&head->lock); +	if (tw) +		inet_twsk_deschedule_put(tw); +	local_bh_enable(); +	return 0;  }  /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49b28fb..000000000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - *  linux/net/ipv4/inet_lro.c - * - *  Large Receive Offload (ipv4 / tcp) - * - *  (C) Copyright IBM Corp. 2007 - * - *  Authors: - *       Jan-Bernd Themann <themann@de.ibm.com> - *       Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ -	(ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, -			    int len, const struct net_lro_desc *lro_desc) -{ -        /* check ip header: don't aggregate padded frames */ -	if (ntohs(iph->tot_len) != len) -		return -1; - -	if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) -		return -1; - -	if (iph->ihl != IPH_LEN_WO_OPTIONS) -		return -1; - -	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || -	    tcph->rst || tcph->syn || tcph->fin) -		return -1; - -	if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) -		return -1; - -	if (tcph->doff != TCPH_LEN_WO_OPTIONS && -	    tcph->doff != TCPH_LEN_W_TIMESTAMP) -		return -1; - -	/* check tcp options (only timestamp allowed) */ -	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { -		__be32 *topt = (__be32 *)(tcph + 1); - -		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) -				   | (TCPOPT_TIMESTAMP << 8) -				   | TCPOLEN_TIMESTAMP)) -			return -1; - -		/* timestamp should be in right order */ -		topt++; -		if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), -				      ntohl(*topt))) -			return -1; - -		/* timestamp reply should not be zero */ -		topt++; -		if (*topt == 0) -			return -1; -	} - -	return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ -	struct iphdr *iph = 
lro_desc->iph; -	struct tcphdr *tcph = lro_desc->tcph; -	__be32 *p; -	__wsum tcp_hdr_csum; - -	tcph->ack_seq = lro_desc->tcp_ack; -	tcph->window = lro_desc->tcp_window; - -	if (lro_desc->tcp_saw_tstamp) { -		p = (__be32 *)(tcph + 1); -		*(p+2) = lro_desc->tcp_rcv_tsecr; -	} - -	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); -	iph->tot_len = htons(lro_desc->ip_tot_len); - -	tcph->check = 0; -	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); -	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); -	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, -					lro_desc->ip_tot_len - -					IP_HDR_LEN(iph), IPPROTO_TCP, -					lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ -	__wsum tcp_csum; -	__wsum tcp_hdr_csum; -	__wsum tcp_ps_hdr_csum; - -	tcp_csum = ~csum_unfold(tcph->check); -	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - -	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -					     len + TCP_HDR_LEN(tcph), -					     IPPROTO_TCP, 0); - -	return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), -			tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, -			  struct iphdr *iph, struct tcphdr *tcph) -{ -	int nr_frags; -	__be32 *ptr; -	u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - -	nr_frags = skb_shinfo(skb)->nr_frags; -	lro_desc->parent = skb; -	lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); -	lro_desc->iph = iph; -	lro_desc->tcph = tcph; -	lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; -	lro_desc->tcp_ack = tcph->ack_seq; -	lro_desc->tcp_window = tcph->window; - -	lro_desc->pkt_aggr_cnt = 1; -	lro_desc->ip_tot_len = ntohs(iph->tot_len); - -	if (tcph->doff == 8) { -		ptr = (__be32 *)(tcph+1); -		lro_desc->tcp_saw_tstamp = 1; -		lro_desc->tcp_rcv_tsval = *(ptr+1); -		lro_desc->tcp_rcv_tsecr = *(ptr+2); -	} - -	lro_desc->mss = tcp_data_len; -	lro_desc->active = 
1; - -	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, -						tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ -	memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, -			   struct tcphdr *tcph, int tcp_data_len) -{ -	struct sk_buff *parent = lro_desc->parent; -	__be32 *topt; - -	lro_desc->pkt_aggr_cnt++; -	lro_desc->ip_tot_len += tcp_data_len; -	lro_desc->tcp_next_seq += tcp_data_len; -	lro_desc->tcp_window = tcph->window; -	lro_desc->tcp_ack = tcph->ack_seq; - -	/* don't update tcp_rcv_tsval, would not work with PAWS */ -	if (lro_desc->tcp_saw_tstamp) { -		topt = (__be32 *) (tcph + 1); -		lro_desc->tcp_rcv_tsecr = *(topt + 2); -	} - -	lro_desc->data_csum = csum_block_add(lro_desc->data_csum, -					     lro_tcp_data_csum(iph, tcph, -							       tcp_data_len), -					     parent->len); - -	parent->len += tcp_data_len; -	parent->data_len += tcp_data_len; -	if (tcp_data_len > lro_desc->mss) -		lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, -			   struct iphdr *iph, struct tcphdr *tcph) -{ -	struct sk_buff *parent = lro_desc->parent; -	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - -	lro_add_common(lro_desc, iph, tcph, tcp_data_len); - -	skb_pull(skb, (skb->len - tcp_data_len)); -	parent->truesize += skb->truesize; - -	if (lro_desc->last_skb) -		lro_desc->last_skb->next = skb; -	else -		skb_shinfo(parent)->frag_list = skb; - -	lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, -			      struct iphdr *iph, -			      struct tcphdr *tcph) -{ -	if ((lro_desc->iph->saddr != iph->saddr) || -	    (lro_desc->iph->daddr != iph->daddr) || -	    (lro_desc->tcph->source != tcph->source) || -	    (lro_desc->tcph->dest != tcph->dest)) -		return -1; -	return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, -					 
struct net_lro_desc *lro_arr, -					 struct iphdr *iph, -					 struct tcphdr *tcph) -{ -	struct net_lro_desc *lro_desc = NULL; -	struct net_lro_desc *tmp; -	int max_desc = lro_mgr->max_desc; -	int i; - -	for (i = 0; i < max_desc; i++) { -		tmp = &lro_arr[i]; -		if (tmp->active) -			if (!lro_check_tcp_conn(tmp, iph, tcph)) { -				lro_desc = tmp; -				goto out; -			} -	} - -	for (i = 0; i < max_desc; i++) { -		if (!lro_arr[i].active) { -			lro_desc = &lro_arr[i]; -			goto out; -		} -	} - -	LRO_INC_STATS(lro_mgr, no_desc); -out: -	return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, -		      struct net_lro_desc *lro_desc) -{ -	if (lro_desc->pkt_aggr_cnt > 1) -		lro_update_tcp_ip_header(lro_desc); - -	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - -	if (lro_mgr->features & LRO_F_NAPI) -		netif_receive_skb(lro_desc->parent); -	else -		netif_rx(lro_desc->parent); - -	LRO_INC_STATS(lro_mgr, flushed); -	lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, -			  void *priv) -{ -	struct net_lro_desc *lro_desc; -	struct iphdr *iph; -	struct tcphdr *tcph; -	u64 flags; -	int vlan_hdr_len = 0; - -	if (!lro_mgr->get_skb_header || -	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, -				    &flags, priv)) -		goto out; - -	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) -		goto out; - -	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); -	if (!lro_desc) -		goto out; - -	if ((skb->protocol == htons(ETH_P_8021Q)) && -	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) -		vlan_hdr_len = VLAN_HLEN; - -	if (!lro_desc->active) { /* start new lro session */ -		if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) -			goto out; - -		skb->ip_summed = lro_mgr->ip_summed_aggr; -		lro_init_desc(lro_desc, skb, iph, tcph); -		LRO_INC_STATS(lro_mgr, aggregated); -		return 0; -	} - -	if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) -		goto out2; - -	if (lro_tcp_ip_check(iph, tcph, 
skb->len, lro_desc)) -		goto out2; - -	lro_add_packet(lro_desc, skb, iph, tcph); -	LRO_INC_STATS(lro_mgr, aggregated); - -	if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || -	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) -		lro_flush(lro_mgr, lro_desc); - -	return 0; - -out2: /* send aggregated SKBs to stack */ -	lro_flush(lro_mgr, lro_desc); - -out: -	return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, -		     struct sk_buff *skb, -		     void *priv) -{ -	if (__lro_proc_skb(lro_mgr, skb, priv)) { -		if (lro_mgr->features & LRO_F_NAPI) -			netif_receive_skb(skb); -		else -			netif_rx(skb); -	} -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ -	int i; -	struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - -	for (i = 0; i < lro_mgr->max_desc; i++) { -		if (lro_desc[i].active) -			lro_flush(lro_mgr, &lro_desc[i]); -	} -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index da0d7ce85844..af18f1e4889e 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s  	if (unlikely(opt->optlen))  		ip_forward_options(skb); -	skb_sender_cpu_clear(skb);  	return dst_output(net, sk, skb);  } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 187c6fcc3027..efbd47d1a531 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@   * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c   * as well. Or notify me, at least. --ANK   */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64;  static const char ip_frag_cache_name[] = "ip4-frags";  struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)  	qp->daddr = arg->iph->daddr;  	qp->vif = arg->vif;  	qp->user = arg->user; -	qp->peer = sysctl_ipfrag_max_dist ? +	qp->peer = q->net->max_dist ?  		
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :  		NULL;  } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,  static int ip_frag_too_far(struct ipq *qp)  {  	struct inet_peer *peer = qp->peer; -	unsigned int max = sysctl_ipfrag_max_dist; +	unsigned int max = qp->q.net->max_dist;  	unsigned int start, end;  	int rc; @@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	}, +	{ +		.procname	= "ipfrag_max_dist", +		.data		= &init_net.ipv4.frags.max_dist, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero +	},  	{ }  }; @@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	}, -	{ -		.procname	= "ipfrag_max_dist", -		.data		= &sysctl_ipfrag_max_dist, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &zero -	},  	{ }  }; @@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)  		table[1].data = &net->ipv4.frags.low_thresh;  		table[1].extra2 = &net->ipv4.frags.high_thresh;  		table[2].data = &net->ipv4.frags.timeout; - -		/* Don't export sysctls to unprivileged users */ -		if (net->user_ns != &init_user_ns) -			table[0].procname = NULL; +		table[3].data = &net->ipv4.frags.max_dist;  	}  	hdr = register_net_sysctl(net, "net/ipv4", table); @@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)  	 */  	net->ipv4.frags.timeout = IP_FRAG_TIME; +	net->ipv4.frags.max_dist = 64; +  	res = inet_frags_init_net(&net->ipv4.frags);  	if (res)  		return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 41ba68de46d8..31936d387cfd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,  				return -EINVAL;  		}  	} -	
return iptunnel_pull_header(skb, hdr_len, tpi->proto); +	return iptunnel_pull_header(skb, hdr_len, tpi->proto, false);  }  static void ipgre_err(struct sk_buff *skb, u32 info, @@ -440,6 +440,17 @@ drop:  	return 0;  } +static __sum16 gre_checksum(struct sk_buff *skb) +{ +	__wsum csum; + +	if (skb->ip_summed == CHECKSUM_PARTIAL) +		csum = lco_csum(skb); +	else +		csum = skb_checksum(skb, 0, skb->len, 0); +	return csum_fold(csum); +} +  static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,  			 __be16 proto, __be32 key, __be32 seq)  { @@ -467,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,  		    !(skb_shinfo(skb)->gso_type &  		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {  			*ptr = 0; -			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, -								 skb->len, 0)); +			*(__sum16 *)ptr = gre_checksum(skb);  		}  	}  } @@ -493,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,  static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,  					   bool csum)  { -	return iptunnel_handle_offloads(skb, csum, -					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); +	return iptunnel_handle_offloads(skb, csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE);  }  static struct rtable *gre_get_rt(struct sk_buff *skb, @@ -518,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct ip_tunnel_info *tun_info;  	const struct ip_tunnel_key *key; +	struct rtable *rt = NULL;  	struct flowi4 fl; -	struct rtable *rt;  	int min_headroom;  	int tunnel_hlen;  	__be16 df, flags; +	bool use_cache;  	int err;  	tun_info = skb_tunnel_info(skb); @@ -531,9 +541,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)  		goto err_free_skb;  	key = &tun_info->key; -	rt = gre_get_rt(skb, dev, &fl, key); -	if (IS_ERR(rt)) -		goto err_free_skb; +	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); +	if (use_cache) +		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); +	if (!rt) { +		rt = gre_get_rt(skb, dev, &fl, key); +		if (IS_ERR(rt)) +				goto err_free_skb; +		if (use_cache) +			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, +					  fl.saddr); +	}  	tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d77eb0c3b684..e3d782746d9d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,12 @@ drop:  	return true;  } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); -  static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct rtable *rt; -	if (sysctl_ip_early_demux && +	if (net->ipv4.sysctl_ip_early_demux &&  	    !skb_dst(skb) &&  	    !skb->sk &&  	    !ip_is_fragment(iph)) { @@ -362,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)  	rt = skb_rtable(skb);  	if (rt->rt_type == RTN_MULTICAST) {  		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); -	} else if (rt->rt_type == RTN_BROADCAST) +	} else if (rt->rt_type == RTN_BROADCAST) {  		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); +	} else if 
(skb->pkt_type == PACKET_BROADCAST || +		   skb->pkt_type == PACKET_MULTICAST) { +		struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + +		/* RFC 1122 3.3.6: +		 * +		 *   When a host sends a datagram to a link-layer broadcast +		 *   address, the IP destination address MUST be a legal IP +		 *   broadcast or IP multicast address. +		 * +		 *   A host SHOULD silently discard a datagram that is received +		 *   via a link-layer broadcast (see Section 2.4) but does not +		 *   specify an IP multicast or broadcast destination address. +		 * +		 * This doesn't explicitly say L2 *broadcast*, but broadcast is +		 * in a way a form of multicast and the most common use case for +		 * this is 802.11 protecting against cross-station spoofing (the +		 * so-called "hole-196" attack) so do it for both. +		 */ +		if (in_dev && +		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) +			goto drop; +	}  	return dst_input(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bd246792360b..4d158ff1def1 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,  		if (opt->ts_needaddr)  			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);  		if (opt->ts_needtime) { -			struct timespec tv;  			__be32 midtime; -			getnstimeofday(&tv); -			midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); + +			midtime = inet_current_timestamp();  			memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);  		}  		return; @@ -415,11 +414,10 @@ int ip_options_compile(struct net *net,  					break;  				}  				if (timeptr) { -					struct timespec tv; -					u32  midtime; -					getnstimeofday(&tv); -					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; -					put_unaligned_be32(midtime, timeptr); +					__be32 midtime; + +					midtime = inet_current_timestamp(); +					memcpy(timeptr, &midtime, 4);  					opt->is_changed = 1;  				}  			} 
else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 565bf64b2b7d..124bf0a66328 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@  #include <linux/netlink.h>  #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); -  static int  ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,  	    unsigned int mtu, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a50124260f5a..035ad645a8d9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -573,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  			    int optname, char __user *optval, unsigned int optlen)  {  	struct inet_sock *inet = inet_sk(sk); +	struct net *net = sock_net(sk);  	int val = 0, err;  	bool needs_rtnl = setsockopt_needs_rtnl(optname); @@ -912,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  		}  		/* numsrc >= (1G-4) overflow in 32 bits */  		if (msf->imsf_numsrc >= 0x3ffffffcU || -		    msf->imsf_numsrc > sysctl_igmp_max_msf) { +		    msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {  			kfree(msf);  			err = -ENOBUFS;  			break; @@ -1067,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  		/* numsrc >= (4G-140)/128 overflow in 32 bits */  		if (gsf->gf_numsrc >= 0x1ffffff || -		    gsf->gf_numsrc > sysctl_igmp_max_msf) { +		    gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {  			err = -ENOBUFS;  			goto mc_msf_out;  		} @@ -1342,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,  		val = inet->tos;  		break;  	case IP_TTL: +	{ +		struct net *net = sock_net(sk);  		val = (inet->uc_ttl == -1 ? 
-		       sysctl_ip_default_ttl : +		       net->ipv4.sysctl_ip_default_ttl :  		       inet->uc_ttl);  		break; +	}  	case IP_HDRINCL:  		val = inet->hdrincl;  		break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 336e6892a93c..6aad0192443d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)  			 IP_TNL_HASH_BITS);  } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, -			     struct dst_entry *dst, __be32 saddr) -{ -	struct dst_entry *old_dst; - -	dst_clone(dst); -	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); -	dst_release(old_dst); -	idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, -			   struct dst_entry *dst, __be32 saddr) -{ -	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ -	tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ -	int i; - -	for_each_possible_cpu(i) -		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, -					u32 cookie, __be32 *saddr) -{ -	struct ip_tunnel_dst *idst; -	struct dst_entry *dst; - -	rcu_read_lock(); -	idst = raw_cpu_ptr(t->dst_cache); -	dst = rcu_dereference(idst->dst); -	if (dst && !atomic_inc_not_zero(&dst->__refcnt)) -		dst = NULL; -	if (dst) { -		if (!dst->obsolete || dst->ops->check(dst, cookie)) { -			*saddr = idst->saddr; -		} else { -			tunnel_dst_reset(t); -			dst_release(dst); -			dst = NULL; -		} -	} -	rcu_read_unlock(); -	return (struct rtable *)dst; -} -  static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,  				__be16 flags, __be32 key)  { @@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)  		if (!IS_ERR(rt)) {  			tdev = rt->dst.dev; -			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); +			
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, +					  fl4.saddr);  			ip_rt_put(rt);  		}  		if (dev->type != ARPHRD_ETHER) @@ -731,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)  		goto tx_error; -	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; +	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : +			 NULL;  	if (!rt) {  		rt = ip_route_output_key(tunnel->net, &fl4); @@ -741,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  			goto tx_error;  		}  		if (connected) -			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); +			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, +					  fl4.saddr);  	}  	if (rt->dst.dev == dev) { @@ -837,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,  		if (set_mtu)  			dev->mtu = mtu;  	} -	ip_tunnel_dst_reset_all(t); +	dst_cache_reset(&t->dst_cache);  	netdev_state_change(dev);  } @@ -976,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)  	struct ip_tunnel *tunnel = netdev_priv(dev);  	gro_cells_destroy(&tunnel->gro_cells); -	free_percpu(tunnel->dst_cache); +	dst_cache_destroy(&tunnel->dst_cache);  	free_percpu(dev->tstats);  	free_netdev(dev);  } @@ -1170,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev)  	if (!dev->tstats)  		return -ENOMEM; -	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); -	if (!tunnel->dst_cache) { +	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); +	if (err) {  		free_percpu(dev->tstats); -		return -ENOMEM; +		return err;  	}  	err = gro_cells_init(&tunnel->gro_cells, dev);  	if (err) { -		free_percpu(tunnel->dst_cache); +		dst_cache_destroy(&tunnel->dst_cache);  		free_percpu(dev->tstats);  		return err;  	} @@ -1208,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev)  	if (itn->fb_tunnel_dev != dev)  		ip_tunnel_del(itn, netdev_priv(dev)); -	ip_tunnel_dst_reset_all(tunnel); +	
dst_cache_reset(&tunnel->dst_cache);  }  EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415c0b2d..02dd990af542 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,  }  EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, +			 bool xnet)  {  	if (unlikely(!pskb_may_pull(skb, hdr_len)))  		return -ENOMEM; @@ -109,14 +110,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)  		skb->protocol = inner_proto;  	} -	nf_reset(skb); -	secpath_reset(skb);  	skb_clear_hash_if_not_l4(skb); -	skb_dst_drop(skb);  	skb->vlan_tci = 0;  	skb_set_queue_mapping(skb, 0); -	skb->pkt_type = PACKET_HOST; -	return 0; +	skb_scrub_packet(skb, xnet); + +	return iptunnel_pull_offloads(skb);  }  EXPORT_SYMBOL_GPL(iptunnel_pull_header); @@ -148,7 +147,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,  EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);  struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, -					 bool csum_help,  					 int gso_type_mask)  {  	int err; @@ -166,20 +164,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,  		return skb;  	} -	/* If packet is not gso and we are resolving any partial checksum, -	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL -	 * on the outer header without confusing devices that implement -	 * NETIF_F_IP_CSUM with encapsulation. 
-	 */ -	if (csum_help) -		skb->encapsulation = 0; - -	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { -		err = skb_checksum_help(skb); -		if (unlikely(err)) -			goto error; -	} else if (skb->ip_summed != CHECKSUM_PARTIAL) +	if (skb->ip_summed != CHECKSUM_PARTIAL) {  		skb->ip_summed = CHECKSUM_NONE; +		/* We clear encapsulation here to prevent badly-written +		 * drivers potentially deciding to offload an inner checksum +		 * if we set CHECKSUM_PARTIAL on the outer header. +		 * This should go away when the drivers are all fixed. +		 */ +		skb->encapsulation = 0; +	}  	return skb;  error: @@ -406,6 +399,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {  void __init ip_tunnel_core_init(void)  { +	/* If you land here, make sure whether increasing ip_tunnel_info's +	 * options_len is a reasonable choice with its usage in front ends +	 * (f.e., it's part of flow keys, etc). +	 */ +	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); +  	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);  	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);  } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da61e747..ec51d02166de 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)  	if (tunnel) {  		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))  			goto drop; -		if (iptunnel_pull_header(skb, 0, tpi.proto)) +		if (iptunnel_pull_header(skb, 0, tpi.proto, false))  			goto drop;  		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);  	} @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)  	if (unlikely(skb->protocol != htons(ETH_P_IP)))  		goto tx_error; -	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); +	skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);  	if (IS_ERR(skb))  		goto out; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b488cac9c5ca..bf081927e06b 100644 --- 
a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1780,9 +1780,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len  	return ret;  } -struct xt_table *arpt_register_table(struct net *net, -				     const struct xt_table *table, -				     const struct arpt_replace *repl) +static void __arpt_unregister_table(struct xt_table *table) +{ +	struct xt_table_info *private; +	void *loc_cpu_entry; +	struct module *table_owner = table->me; +	struct arpt_entry *iter; + +	private = xt_unregister_table(table); + +	/* Decrease module usage counts and free resources */ +	loc_cpu_entry = private->entries; +	xt_entry_foreach(iter, loc_cpu_entry, private->size) +		cleanup_entry(iter); +	if (private->number > private->initial_entries) +		module_put(table_owner); +	xt_free_table_info(private); +} + +int arpt_register_table(struct net *net, +			const struct xt_table *table, +			const struct arpt_replace *repl, +			const struct nf_hook_ops *ops, +			struct xt_table **res)  {  	int ret;  	struct xt_table_info *newinfo; @@ -1791,10 +1811,8 @@ struct xt_table *arpt_register_table(struct net *net,  	struct xt_table *new_table;  	newinfo = xt_alloc_table_info(repl->size); -	if (!newinfo) { -		ret = -ENOMEM; -		goto out; -	} +	if (!newinfo) +		return -ENOMEM;  	loc_cpu_entry = newinfo->entries;  	memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1809,30 +1827,28 @@ struct xt_table *arpt_register_table(struct net *net,  		ret = PTR_ERR(new_table);  		goto out_free;  	} -	return new_table; + +	/* set res now, will see skbs right after nf_register_net_hooks */ +	WRITE_ONCE(*res, new_table); + +	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); +	if (ret != 0) { +		__arpt_unregister_table(new_table); +		*res = NULL; +	} + +	return ret;  out_free:  	xt_free_table_info(newinfo); -out: -	return ERR_PTR(ret); +	return ret;  } -void arpt_unregister_table(struct xt_table *table) +void arpt_unregister_table(struct net 
*net, struct xt_table *table, +			   const struct nf_hook_ops *ops)  { -	struct xt_table_info *private; -	void *loc_cpu_entry; -	struct module *table_owner = table->me; -	struct arpt_entry *iter; - -	private = xt_unregister_table(table); - -	/* Decrease module usage counts and free resources */ -	loc_cpu_entry = private->entries; -	xt_entry_foreach(iter, loc_cpu_entry, private->size) -		cleanup_entry(iter); -	if (private->number > private->initial_entries) -		module_put(table_owner); -	xt_free_table_info(private); +	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +	__arpt_unregister_table(table);  }  /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 1897ee160920..dd8c80dc32a2 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");  #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \  			   (1 << NF_ARP_FORWARD)) +static int __net_init arptable_filter_table_init(struct net *net); +  static const struct xt_table packet_filter = {  	.name		= "filter",  	.valid_hooks	= FILTER_VALID_HOOKS,  	.me		= THIS_MODULE,  	.af		= NFPROTO_ARP,  	.priority	= NF_IP_PRI_FILTER, +	.table_init	= arptable_filter_table_init,  };  /* The work comes in here from netfilter.c */ @@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,  static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_net_init(struct net *net) +static int __net_init arptable_filter_table_init(struct net *net)  {  	struct arpt_replace *repl; -	 +	int err; + +	if (net->ipv4.arptable_filter) +		return 0; +  	repl = arpt_alloc_initial_table(&packet_filter);  	if (repl == NULL)  		return -ENOMEM; -	net->ipv4.arptable_filter = -		arpt_register_table(net, &packet_filter, repl); +	err = arpt_register_table(net, &packet_filter, repl, 
arpfilter_ops, +				  &net->ipv4.arptable_filter);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); +	return err;  }  static void __net_exit arptable_filter_net_exit(struct net *net)  { -	arpt_unregister_table(net->ipv4.arptable_filter); +	if (!net->ipv4.arptable_filter) +		return; +	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); +	net->ipv4.arptable_filter = NULL;  }  static struct pernet_operations arptable_filter_net_ops = { -	.init = arptable_filter_net_init,  	.exit = arptable_filter_net_exit,  }; @@ -62,26 +71,23 @@ static int __init arptable_filter_init(void)  {  	int ret; +	arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); +	if (IS_ERR(arpfilter_ops)) +		return PTR_ERR(arpfilter_ops); +  	ret = register_pernet_subsys(&arptable_filter_net_ops); -	if (ret < 0) +	if (ret < 0) { +		kfree(arpfilter_ops);  		return ret; - -	arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); -	if (IS_ERR(arpfilter_ops)) { -		ret = PTR_ERR(arpfilter_ops); -		goto cleanup_table;  	} -	return ret; -cleanup_table: -	unregister_pernet_subsys(&arptable_filter_net_ops);  	return ret;  }  static void __exit arptable_filter_fini(void)  { -	xt_hook_unlink(&packet_filter, arpfilter_ops);  	unregister_pernet_subsys(&arptable_filter_net_ops); +	kfree(arpfilter_ops);  }  module_init(arptable_filter_init); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..e53f8d6f326d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2062,9 +2062,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  	return ret;  } -struct xt_table *ipt_register_table(struct net *net, -				    const struct xt_table *table, -				    const struct ipt_replace *repl) +static void __ipt_unregister_table(struct net *net, struct xt_table *table) +{ +	struct xt_table_info *private; +	void *loc_cpu_entry; +	struct module *table_owner = table->me; +	
struct ipt_entry *iter; + +	private = xt_unregister_table(table); + +	/* Decrease module usage counts and free resources */ +	loc_cpu_entry = private->entries; +	xt_entry_foreach(iter, loc_cpu_entry, private->size) +		cleanup_entry(iter, net); +	if (private->number > private->initial_entries) +		module_put(table_owner); +	xt_free_table_info(private); +} + +int ipt_register_table(struct net *net, const struct xt_table *table, +		       const struct ipt_replace *repl, +		       const struct nf_hook_ops *ops, struct xt_table **res)  {  	int ret;  	struct xt_table_info *newinfo; @@ -2073,10 +2091,8 @@ struct xt_table *ipt_register_table(struct net *net,  	struct xt_table *new_table;  	newinfo = xt_alloc_table_info(repl->size); -	if (!newinfo) { -		ret = -ENOMEM; -		goto out; -	} +	if (!newinfo) +		return -ENOMEM;  	loc_cpu_entry = newinfo->entries;  	memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2091,30 +2107,27 @@ struct xt_table *ipt_register_table(struct net *net,  		goto out_free;  	} -	return new_table; +	/* set res now, will see skbs right after nf_register_net_hooks */ +	WRITE_ONCE(*res, new_table); + +	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); +	if (ret != 0) { +		__ipt_unregister_table(net, new_table); +		*res = NULL; +	} + +	return ret;  out_free:  	xt_free_table_info(newinfo); -out: -	return ERR_PTR(ret); +	return ret;  } -void ipt_unregister_table(struct net *net, struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table, +			  const struct nf_hook_ops *ops)  { -	struct xt_table_info *private; -	void *loc_cpu_entry; -	struct module *table_owner = table->me; -	struct ipt_entry *iter; - -	private = xt_unregister_table(table); - -	/* Decrease module usage counts and free resources */ -	loc_cpu_entry = private->entries; -	xt_entry_foreach(iter, loc_cpu_entry, private->size) -		cleanup_entry(iter, net); -	if (private->number > private->initial_entries) -		module_put(table_owner); -	
xt_free_table_info(private); +	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +	__ipt_unregister_table(net, table);  }  /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc556514ba..7b8fbb352877 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -21,6 +21,7 @@ static struct iphdr *  synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)  {  	struct iphdr *iph; +	struct net *net = sock_net(skb->sk);  	skb_reset_network_header(skb);  	iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)  	iph->tos	= 0;  	iph->id		= 0;  	iph->frag_off	= htons(IP_DF); -	iph->ttl	= sysctl_ip_default_ttl; +	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;  	iph->protocol	= IPPROTO_TCP;  	iph->check	= 0;  	iph->saddr	= saddr; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 397ef2dd133e..7667f223d7f8 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");  #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \  			    (1 << NF_INET_FORWARD) | \  			    (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_filter_table_init(struct net *net);  static const struct xt_table packet_filter = {  	.name		= "filter", @@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {  	.me		= THIS_MODULE,  	.af		= NFPROTO_IPV4,  	.priority	= NF_IP_PRI_FILTER, +	.table_init	= iptable_filter_table_init,  };  static unsigned int @@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,  static struct nf_hook_ops *filter_ops __read_mostly;  /* Default to forward because I got too much mail already. 
*/ -static bool forward = true; +static bool forward __read_mostly = true;  module_param(forward, bool, 0000); -static int __net_init iptable_filter_net_init(struct net *net) +static int __net_init iptable_filter_table_init(struct net *net)  {  	struct ipt_replace *repl; +	int err; + +	if (net->ipv4.iptable_filter) +		return 0;  	repl = ipt_alloc_initial_table(&packet_filter);  	if (repl == NULL) @@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net)  	((struct ipt_standard *)repl->entries)[1].target.verdict =  		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; -	net->ipv4.iptable_filter = -		ipt_register_table(net, &packet_filter, repl); +	err = ipt_register_table(net, &packet_filter, repl, filter_ops, +				 &net->ipv4.iptable_filter);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); +	return err; +} + +static int __net_init iptable_filter_net_init(struct net *net) +{ +	if (net == &init_net || !forward) +		return iptable_filter_table_init(net); + +	return 0;  }  static void __net_exit iptable_filter_net_exit(struct net *net)  { -	ipt_unregister_table(net, net->ipv4.iptable_filter); +	if (!net->ipv4.iptable_filter) +		return; +	ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); +	net->ipv4.iptable_filter = NULL;  }  static struct pernet_operations iptable_filter_net_ops = { @@ -82,24 +99,21 @@ static int __init iptable_filter_init(void)  {  	int ret; +	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); +	if (IS_ERR(filter_ops)) +		return PTR_ERR(filter_ops); +  	ret = register_pernet_subsys(&iptable_filter_net_ops);  	if (ret < 0) -		return ret; - -	/* Register hooks */ -	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); -	if (IS_ERR(filter_ops)) { -		ret = PTR_ERR(filter_ops); -		unregister_pernet_subsys(&iptable_filter_net_ops); -	} +		kfree(filter_ops);  	return ret;  }  static void __exit iptable_filter_fini(void)  { -	xt_hook_unlink(&packet_filter, filter_ops);  	
unregister_pernet_subsys(&iptable_filter_net_ops); +	kfree(filter_ops);  }  module_init(iptable_filter_init); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index ba5d392a13c4..57fc97cdac70 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");  			    (1 << NF_INET_LOCAL_OUT) | \  			    (1 << NF_INET_POST_ROUTING)) +static int __net_init iptable_mangle_table_init(struct net *net); +  static const struct xt_table packet_mangler = {  	.name		= "mangle",  	.valid_hooks	= MANGLE_VALID_HOOKS,  	.me		= THIS_MODULE,  	.af		= NFPROTO_IPV4,  	.priority	= NF_IP_PRI_MANGLE, +	.table_init	= iptable_mangle_table_init,  };  static unsigned int @@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv,  }  static struct nf_hook_ops *mangle_ops __read_mostly; - -static int __net_init iptable_mangle_net_init(struct net *net) +static int __net_init iptable_mangle_table_init(struct net *net)  {  	struct ipt_replace *repl; +	int ret; + +	if (net->ipv4.iptable_mangle) +		return 0;  	repl = ipt_alloc_initial_table(&packet_mangler);  	if (repl == NULL)  		return -ENOMEM; -	net->ipv4.iptable_mangle = -		ipt_register_table(net, &packet_mangler, repl); +	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, +				 &net->ipv4.iptable_mangle);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); +	return ret;  }  static void __net_exit iptable_mangle_net_exit(struct net *net)  { -	ipt_unregister_table(net, net->ipv4.iptable_mangle); +	if (!net->ipv4.iptable_mangle) +		return; +	ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); +	net->ipv4.iptable_mangle = NULL;  }  static struct pernet_operations iptable_mangle_net_ops = { -	.init = iptable_mangle_net_init,  	.exit = iptable_mangle_net_exit,  }; @@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void)  {  	int ret; +	mangle_ops = 
xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); +	if (IS_ERR(mangle_ops)) { +		ret = PTR_ERR(mangle_ops); +		return ret; +	} +  	ret = register_pernet_subsys(&iptable_mangle_net_ops); -	if (ret < 0) +	if (ret < 0) { +		kfree(mangle_ops);  		return ret; +	} -	/* Register hooks */ -	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); -	if (IS_ERR(mangle_ops)) { -		ret = PTR_ERR(mangle_ops); +	ret = iptable_mangle_table_init(&init_net); +	if (ret) {  		unregister_pernet_subsys(&iptable_mangle_net_ops); +		kfree(mangle_ops);  	}  	return ret; @@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void)  static void __exit iptable_mangle_fini(void)  { -	xt_hook_unlink(&packet_mangler, mangle_ops);  	unregister_pernet_subsys(&iptable_mangle_net_ops); +	kfree(mangle_ops);  }  module_init(iptable_mangle_init); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ae2cd2752046..138a24bc76ad 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -18,6 +18,8 @@  #include <net/netfilter/nf_nat_core.h>  #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init iptable_nat_table_init(struct net *net); +  static const struct xt_table nf_nat_ipv4_table = {  	.name		= "nat",  	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) | @@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {  			  (1 << NF_INET_LOCAL_IN),  	.me		= THIS_MODULE,  	.af		= NFPROTO_IPV4, +	.table_init	= iptable_nat_table_init,  };  static unsigned int iptable_nat_do_chain(void *priv, @@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {  	},  }; -static int __net_init iptable_nat_net_init(struct net *net) +static int __net_init iptable_nat_table_init(struct net *net)  {  	struct ipt_replace *repl; +	int ret; + +	if (net->ipv4.nat_table) +		return 0;  	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);  	if (repl == NULL)  		return -ENOMEM; -	net->ipv4.nat_table = 
ipt_register_table(net, &nf_nat_ipv4_table, repl); +	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, +				 nf_nat_ipv4_ops, &net->ipv4.nat_table);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.nat_table); +	return ret;  }  static void __net_exit iptable_nat_net_exit(struct net *net)  { -	ipt_unregister_table(net, net->ipv4.nat_table); +	if (!net->ipv4.nat_table) +		return; +	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); +	net->ipv4.nat_table = NULL;  }  static struct pernet_operations iptable_nat_net_ops = { -	.init	= iptable_nat_net_init,  	.exit	= iptable_nat_net_exit,  };  static int __init iptable_nat_init(void)  { -	int err; +	int ret = register_pernet_subsys(&iptable_nat_net_ops); -	err = register_pernet_subsys(&iptable_nat_net_ops); -	if (err < 0) -		goto err1; +	if (ret) +		return ret; -	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); -	if (err < 0) -		goto err2; -	return 0; - -err2: -	unregister_pernet_subsys(&iptable_nat_net_ops); -err1: -	return err; +	ret = iptable_nat_table_init(&init_net); +	if (ret) +		unregister_pernet_subsys(&iptable_nat_net_ops); +	return ret;  }  static void __exit iptable_nat_exit(void)  { -	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));  	unregister_pernet_subsys(&iptable_nat_net_ops);  } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1ba02811acb0..2642ecd2645c 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -10,12 +10,15 @@  #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_raw_table_init(struct net *net); +  static const struct xt_table packet_raw = {  	.name = "raw",  	.valid_hooks =  RAW_VALID_HOOKS,  	.me = THIS_MODULE,  	.af = NFPROTO_IPV4,  	.priority = NF_IP_PRI_RAW, +	.table_init = iptable_raw_table_init,  };  /* The work comes in here from netfilter.c. 
*/ @@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,  static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_net_init(struct net *net) +static int __net_init iptable_raw_table_init(struct net *net)  {  	struct ipt_replace *repl; +	int ret; + +	if (net->ipv4.iptable_raw) +		return 0;  	repl = ipt_alloc_initial_table(&packet_raw);  	if (repl == NULL)  		return -ENOMEM; -	net->ipv4.iptable_raw = -		ipt_register_table(net, &packet_raw, repl); +	ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, +				 &net->ipv4.iptable_raw);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); +	return ret;  }  static void __net_exit iptable_raw_net_exit(struct net *net)  { -	ipt_unregister_table(net, net->ipv4.iptable_raw); +	if (!net->ipv4.iptable_raw) +		return; +	ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); +	net->ipv4.iptable_raw = NULL;  }  static struct pernet_operations iptable_raw_net_ops = { -	.init = iptable_raw_net_init,  	.exit = iptable_raw_net_exit,  }; @@ -61,15 +70,20 @@ static int __init iptable_raw_init(void)  {  	int ret; +	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); +	if (IS_ERR(rawtable_ops)) +		return PTR_ERR(rawtable_ops); +  	ret = register_pernet_subsys(&iptable_raw_net_ops); -	if (ret < 0) +	if (ret < 0) { +		kfree(rawtable_ops);  		return ret; +	} -	/* Register hooks */ -	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); -	if (IS_ERR(rawtable_ops)) { -		ret = PTR_ERR(rawtable_ops); +	ret = iptable_raw_table_init(&init_net); +	if (ret) {  		unregister_pernet_subsys(&iptable_raw_net_ops); +		kfree(rawtable_ops);  	}  	return ret; @@ -77,8 +91,8 @@ static int __init iptable_raw_init(void)  static void __exit iptable_raw_fini(void)  { -	xt_hook_unlink(&packet_raw, rawtable_ops);  	unregister_pernet_subsys(&iptable_raw_net_ops); +	kfree(rawtable_ops);  }  module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c 
b/net/ipv4/netfilter/iptable_security.c index c2e23d5e9cd4..ff226596e4b5 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");  				(1 << NF_INET_FORWARD) | \  				(1 << NF_INET_LOCAL_OUT) +static int __net_init iptable_security_table_init(struct net *net); +  static const struct xt_table security_table = {  	.name		= "security",  	.valid_hooks	= SECURITY_VALID_HOOKS,  	.me		= THIS_MODULE,  	.af		= NFPROTO_IPV4,  	.priority	= NF_IP_PRI_SECURITY, +	.table_init	= iptable_security_table_init,  };  static unsigned int @@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb,  static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_net_init(struct net *net) +static int __net_init iptable_security_table_init(struct net *net)  {  	struct ipt_replace *repl; +	int ret; + +	if (net->ipv4.iptable_security) +		return 0;  	repl = ipt_alloc_initial_table(&security_table);  	if (repl == NULL)  		return -ENOMEM; -	net->ipv4.iptable_security = -		ipt_register_table(net, &security_table, repl); +	ret = ipt_register_table(net, &security_table, repl, sectbl_ops, +				 &net->ipv4.iptable_security);  	kfree(repl); -	return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); +	return ret;  }  static void __net_exit iptable_security_net_exit(struct net *net)  { -	ipt_unregister_table(net, net->ipv4.iptable_security); +	if (!net->ipv4.iptable_security) +		return; + +	ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); +	net->ipv4.iptable_security = NULL;  }  static struct pernet_operations iptable_security_net_ops = { -	.init = iptable_security_net_init,  	.exit = iptable_security_net_exit,  }; @@ -78,27 +88,29 @@ static int __init iptable_security_init(void)  {  	int ret; +	sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); +	if (IS_ERR(sectbl_ops)) +		return PTR_ERR(sectbl_ops); +  	
ret = register_pernet_subsys(&iptable_security_net_ops); -	if (ret < 0) +	if (ret < 0) { +		kfree(sectbl_ops);  		return ret; - -	sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); -	if (IS_ERR(sectbl_ops)) { -		ret = PTR_ERR(sectbl_ops); -		goto cleanup_table;  	} -	return ret; +	ret = iptable_security_table_init(&init_net); +	if (ret) { +		unregister_pernet_subsys(&iptable_security_net_ops); +		kfree(sectbl_ops); +	} -cleanup_table: -	unregister_pernet_subsys(&iptable_security_net_ops);  	return ret;  }  static void __exit iptable_security_fini(void)  { -	xt_hook_unlink(&security_table, sectbl_ops);  	unregister_pernet_subsys(&iptable_security_net_ops); +	kfree(sectbl_ops);  }  module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a04dee536b8e..d88da36b383c 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,  	err = ip_defrag(net, skb, user);  	local_bh_enable(); -	if (!err) { -		ip_send_check(ip_hdr(skb)); +	if (!err)  		skb->ignore_df = 1; -	}  	return err;  } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 61c7cc22ea68..f8aad03d674b 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,  				    u8 proto, void *data, __sum16 *check,  				    int datalen, int oldlen)  { -	const struct iphdr *iph = ip_hdr(skb); -	struct rtable *rt = skb_rtable(skb); -  	if (skb->ip_summed != CHECKSUM_PARTIAL) { -		if (!(rt->rt_flags & RTCF_LOCAL) && -		    (!skb->dev || skb->dev->features & -		     (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { -			skb->ip_summed = CHECKSUM_PARTIAL; -			skb->csum_start = skb_headroom(skb) + -					  skb_network_offset(skb) + -					  ip_hdrlen(skb); -			
skb->csum_offset = (void *)check - data; -			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, -						    datalen, proto, 0); -		} else { -			*check = 0; -			*check = csum_tcpudp_magic(iph->saddr, iph->daddr, -						   datalen, proto, -						   csum_partial(data, datalen, -								0)); -			if (proto == IPPROTO_UDP && !*check) -				*check = CSUM_MANGLED_0; -		} +		const struct iphdr *iph = ip_hdr(skb); + +		skb->ip_summed = CHECKSUM_PARTIAL; +		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + +			ip_hdrlen(skb); +		skb->csum_offset = (void *)check - data; +		*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, +					    proto, 0);  	} else  		inet_proto_csum_replace2(check, skb,  					 htons(oldlen), htons(datalen), true); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index c6eb42100e9a..ea91058b5f6f 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -108,10 +108,18 @@ static int masq_inet_event(struct notifier_block *this,  			   unsigned long event,  			   void *ptr)  { -	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; +	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;  	struct netdev_notifier_info info; -	netdev_notifier_info_init(&info, dev); +	/* The masq_dev_notifier will catch the case of the device going +	 * down.  So if the inetdev is dead and being destroyed we have +	 * no work to do.  Otherwise this is an individual address removal +	 * and we have to perform the flush. 
+	 */ +	if (idev->dead) +		return NOTIFY_DONE; + +	netdev_notifier_info_init(&info, idev->dev);  	return masq_device_event(this, event, &info);  } diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index b72ffc58e255..51ced81b616c 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,  	memset(&range, 0, sizeof(range));  	range.flags = priv->flags; - +	if (priv->sreg_proto_min) { +		range.min_proto.all = +			*(__be16 *)&regs->data[priv->sreg_proto_min]; +		range.max_proto.all = +			*(__be16 *)&regs->data[priv->sreg_proto_max]; +	}  	regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,  						    &range, pkt->out);  } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d3a27165f9cc..cf9700b1a106 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail:  }  EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk)  {  	pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);  	BUG(); /* "Please do not press this button again." */ + +	return 0;  }  void ping_unhash(struct sock *sk) @@ -1140,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v)  	return 0;  } -static const struct seq_operations ping_v4_seq_ops = { -	.show		= ping_v4_seq_show, -	.start		= ping_v4_seq_start, -	.next		= ping_seq_next, -	.stop		= ping_seq_stop, -}; -  static int ping_seq_open(struct inode *inode, struct file *file)  {  	struct ping_seq_afinfo *afinfo = PDE_DATA(inode); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7a3adf..9f665b63a927 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	seq_printf(seq, "\nIp: %d %d",  		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 
1 : 2, -		   sysctl_ip_default_ttl); +		   net->ipv4.sysctl_ip_default_ttl);  	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);  	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7113bae4e6a0..8d22de74080c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = {  	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),  }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk)  {  	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;  	struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk)  	sk_add_node(sk, head);  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);  	write_unlock_bh(&h->lock); + +	return 0;  }  EXPORT_SYMBOL_GPL(raw_hash_sk); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c49020..4c04f09338e3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,8 +19,6 @@  #include <net/tcp.h>  #include <net/route.h> -extern int sysctl_tcp_syncookies; -  static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;  #define COOKIEBITS 24	/* Upper bits store count */ @@ -50,8 +48,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;  #define TSBITS	6  #define TSMASK	(((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], -		      ipv4_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);  static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,  		       u32 count, int c) @@ -307,7 +304,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)  	__u8 rcv_wscale;  	struct flowi4 fl4; -	if (!sysctl_tcp_syncookies || !th->ack || th->rst) +	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)  		goto out;  	if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 
4d367b4139a3..1e1fe6086dd9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -283,31 +283,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "ip_default_ttl", -		.data		= &sysctl_ip_default_ttl, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &ip_ttl_min, -		.extra2		= &ip_ttl_max, -	}, -	{ -		.procname	= "tcp_syn_retries", -		.data		= &sysctl_tcp_syn_retries, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &tcp_syn_retries_min, -		.extra2		= &tcp_syn_retries_max -	}, -	{ -		.procname	= "tcp_synack_retries", -		.data		= &sysctl_tcp_synack_retries, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "tcp_max_orphans",  		.data		= &sysctl_tcp_max_orphans,  		.maxlen		= sizeof(int), @@ -322,51 +297,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "ip_early_demux", -		.data		= &sysctl_ip_early_demux, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{ -		.procname	= "ip_dynaddr", -		.data		= &sysctl_ip_dynaddr, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{ -		.procname	= "tcp_retries1", -		.data		= &sysctl_tcp_retries1, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra2		= &tcp_retr1_max -	}, -	{ -		.procname	= "tcp_retries2", -		.data		= &sysctl_tcp_retries2, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{ -		.procname	= "tcp_fin_timeout", -		.data		= &sysctl_tcp_fin_timeout, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_jiffies, -	}, -#ifdef CONFIG_SYN_COOKIES -	{ -		.procname	= "tcp_syncookies", -		.data		= &sysctl_tcp_syncookies, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= 
proc_dointvec -	}, -#endif -	{  		.procname	= "tcp_fastopen",  		.data		= &sysctl_tcp_fastopen,  		.maxlen		= sizeof(int), @@ -415,30 +345,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "igmp_max_memberships", -		.data		= &sysctl_igmp_max_memberships, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{ -		.procname	= "igmp_max_msf", -		.data		= &sysctl_igmp_max_msf, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -#ifdef CONFIG_IP_MULTICAST -	{ -		.procname	= "igmp_qrv", -		.data		= &sysctl_igmp_qrv, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &one -	}, -#endif -	{  		.procname	= "inet_peer_threshold",  		.data		= &inet_peer_threshold,  		.maxlen		= sizeof(int), @@ -460,13 +366,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec_jiffies,  	},  	{ -		.procname	= "tcp_orphan_retries", -		.data		= &sysctl_tcp_orphan_retries, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "tcp_fack",  		.data		= &sysctl_tcp_fack,  		.maxlen		= sizeof(int), @@ -481,13 +380,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec,  	},  	{ -		.procname	= "tcp_reordering", -		.data		= &sysctl_tcp_reordering, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "tcp_max_reordering",  		.data		= &sysctl_tcp_max_reordering,  		.maxlen		= sizeof(int), @@ -517,13 +409,6 @@ static struct ctl_table ipv4_table[] = {  		.extra1		= &one,  	},  	{ -		.procname	= "tcp_notsent_lowat", -		.data		= &sysctl_tcp_notsent_lowat, -		.maxlen		= sizeof(sysctl_tcp_notsent_lowat), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{  		.procname	= "tcp_rmem",  		.data		= &sysctl_tcp_rmem,  		.maxlen		= sizeof(sysctl_tcp_rmem), @@ -845,6 +730,29 @@ static struct ctl_table 
ipv4_net_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ +		.procname	= "ip_dynaddr", +		.data		= &init_net.ipv4.sysctl_ip_dynaddr, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "ip_early_demux", +		.data		= &init_net.ipv4.sysctl_ip_early_demux, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "ip_default_ttl", +		.data		= &init_net.ipv4.sysctl_ip_default_ttl, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &ip_ttl_min, +		.extra2		= &ip_ttl_max, +	}, +	{  		.procname	= "ip_local_port_range",  		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),  		.data		= &init_net.ipv4.ip_local_ports.range, @@ -934,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = {  	},  	{  		.procname	= "igmp_link_local_mcast_reports", -		.data		= &sysctl_igmp_llm_reports, +		.data		= &init_net.ipv4.sysctl_igmp_llm_reports, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "igmp_max_memberships", +		.data		= &init_net.ipv4.sysctl_igmp_max_memberships,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec  	},  	{ +		.procname	= "igmp_max_msf", +		.data		= &init_net.ipv4.sysctl_igmp_max_msf, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +#ifdef CONFIG_IP_MULTICAST +	{ +		.procname	= "igmp_qrv", +		.data		= &init_net.ipv4.sysctl_igmp_qrv, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one +	}, +#endif +	{  		.procname	= "tcp_keepalive_time",  		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,  		.maxlen		= sizeof(int), @@ -960,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	}, +	{ +		.procname	= "tcp_syn_retries", +		.data		= &init_net.ipv4.sysctl_tcp_syn_retries, +		.maxlen		= 
sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &tcp_syn_retries_min, +		.extra2		= &tcp_syn_retries_max +	}, +	{ +		.procname	= "tcp_synack_retries", +		.data		= &init_net.ipv4.sysctl_tcp_synack_retries, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +#ifdef CONFIG_SYN_COOKIES +	{ +		.procname	= "tcp_syncookies", +		.data		= &init_net.ipv4.sysctl_tcp_syncookies, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +#endif +	{ +		.procname	= "tcp_reordering", +		.data		= &init_net.ipv4.sysctl_tcp_reordering, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "tcp_retries1", +		.data		= &init_net.ipv4.sysctl_tcp_retries1, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra2		= &tcp_retr1_max +	}, +	{ +		.procname	= "tcp_retries2", +		.data		= &init_net.ipv4.sysctl_tcp_retries2, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "tcp_orphan_retries", +		.data		= &init_net.ipv4.sysctl_tcp_orphan_retries, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "tcp_fin_timeout", +		.data		= &init_net.ipv4.sysctl_tcp_fin_timeout, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_jiffies, +	}, +	{ +		.procname	= "tcp_notsent_lowat", +		.data		= &init_net.ipv4.sysctl_tcp_notsent_lowat, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	},  	{ }  }; @@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)  	if (!net->ipv4.sysctl_local_reserved_ports)  		goto err_ports; +	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; +	net->ipv4.sysctl_ip_dynaddr = 0; +	net->ipv4.sysctl_ip_early_demux = 1; +  	return 0;  err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 483ffdf5aa4d..08b8b960a8ed 
100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@  #define pr_fmt(fmt) "TCP: " fmt +#include <crypto/hash.h>  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/types.h> @@ -266,7 +267,6 @@  #include <linux/swap.h>  #include <linux/cache.h>  #include <linux/err.h> -#include <linux/crypto.h>  #include <linux/time.h>  #include <linux/slab.h> @@ -282,8 +282,6 @@  #include <asm/unaligned.h>  #include <net/busy_poll.h> -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; -  int sysctl_tcp_min_tso_segs __read_mostly = 2;  int sysctl_tcp_autocorking __read_mostly = 1; @@ -406,7 +404,7 @@ void tcp_init_sock(struct sock *sk)  	tp->mss_cache = TCP_MSS_DEFAULT;  	u64_stats_init(&tp->syncp); -	tp->reordering = sysctl_tcp_reordering; +	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;  	tcp_enable_early_retrans(tp);  	tcp_assign_congestion_control(sk); @@ -558,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  			return -EINVAL;  		slow = lock_sock_fast(sk); -		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) -			answ = 0; -		else if (sock_flag(sk, SOCK_URGINLINE) || -			 !tp->urg_data || -			 before(tp->urg_seq, tp->copied_seq) || -			 !before(tp->urg_seq, tp->rcv_nxt)) { - -			answ = tp->rcv_nxt - tp->copied_seq; - -			/* Subtract 1, if FIN was received */ -			if (answ && sock_flag(sk, SOCK_DONE)) -				answ--; -		} else -			answ = tp->urg_seq - tp->copied_seq; +		answ = tcp_inq(sk);  		unlock_sock_fast(sk, slow);  		break;  	case SIOCATMARK: @@ -1466,8 +1451,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)  	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {  		offset = seq - TCP_SKB_CB(skb)->seq; -		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) +		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { +			pr_err_once("%s: found a SYN, please report !\n", __func__);  			offset--; +		}  		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 
{  			*off = offset;  			return skb; @@ -1657,8 +1644,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,  				break;  			offset = *seq - TCP_SKB_CB(skb)->seq; -			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) +			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { +				pr_err_once("%s: found a SYN, please report !\n", __func__);  				offset--; +			}  			if (offset < skb->len)  				goto found_ok_skb;  			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -2326,6 +2315,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct net *net = sock_net(sk);  	int val;  	int err = 0; @@ -2522,7 +2512,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  	case TCP_LINGER2:  		if (val < 0)  			tp->linger2 = -1; -		else if (val > sysctl_tcp_fin_timeout / HZ) +		else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)  			tp->linger2 = 0;  		else  			tp->linger2 = val * HZ; @@ -2639,6 +2629,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	const struct inet_connection_sock *icsk = inet_csk(sk);  	u32 now = tcp_time_stamp;  	unsigned int start; +	int notsent_bytes;  	u64 rate64;  	u32 rate; @@ -2719,6 +2710,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));  	info->tcpi_segs_out = tp->segs_out;  	info->tcpi_segs_in = tp->segs_in; + +	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); +	info->tcpi_notsent_bytes = max(0, notsent_bytes); + +	info->tcpi_min_rtt = tcp_min_rtt(tp); +	info->tcpi_data_segs_in = tp->data_segs_in; +	info->tcpi_data_segs_out = tp->data_segs_out;  }  EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2727,6 +2725,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	int val, len;  	if 
(get_user(len, optlen)) @@ -2761,12 +2760,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  		val = keepalive_probes(tp);  		break;  	case TCP_SYNCNT: -		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; +		val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;  		break;  	case TCP_LINGER2:  		val = tp->linger2;  		if (val >= 0) -			val = (val ? : sysctl_tcp_fin_timeout) / HZ; +			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;  		break;  	case TCP_DEFER_ACCEPT:  		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -2943,17 +2942,26 @@ static bool tcp_md5sig_pool_populated = false;  static void __tcp_alloc_md5sig_pool(void)  { +	struct crypto_ahash *hash;  	int cpu; +	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); +	if (IS_ERR(hash)) +		return; +  	for_each_possible_cpu(cpu) { -		if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { -			struct crypto_hash *hash; +		struct ahash_request *req; -			hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); -			if (IS_ERR(hash)) -				return; -			per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; -		} +		if (per_cpu(tcp_md5sig_pool, cpu).md5_req) +			continue; + +		req = ahash_request_alloc(hash, GFP_KERNEL); +		if (!req) +			return; + +		ahash_request_set_callback(req, 0, NULL, NULL); + +		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;  	}  	/* before setting tcp_md5sig_pool_populated, we must commit all writes  	 * to memory. 
See smp_rmb() in tcp_get_md5sig_pool() @@ -3003,7 +3011,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,  {  	struct scatterlist sg;  	struct tcphdr hdr; -	int err;  	/* We are not allowed to change tcphdr, make a local copy */  	memcpy(&hdr, th, sizeof(hdr)); @@ -3011,8 +3018,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,  	/* options aren't included in the hash */  	sg_init_one(&sg, &hdr, sizeof(hdr)); -	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); -	return err; +	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr)); +	return crypto_ahash_update(hp->md5_req);  }  EXPORT_SYMBOL(tcp_md5_hash_header); @@ -3021,7 +3028,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  {  	struct scatterlist sg;  	const struct tcphdr *tp = tcp_hdr(skb); -	struct hash_desc *desc = &hp->md5_desc; +	struct ahash_request *req = hp->md5_req;  	unsigned int i;  	const unsigned int head_data_len = skb_headlen(skb) > header_len ?  					   skb_headlen(skb) - header_len : 0; @@ -3031,7 +3038,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  	sg_init_table(&sg, 1);  	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); -	if (crypto_hash_update(desc, &sg, head_data_len)) +	ahash_request_set_crypt(req, &sg, NULL, head_data_len); +	if (crypto_ahash_update(req))  		return 1;  	for (i = 0; i < shi->nr_frags; ++i) { @@ -3041,7 +3049,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  		sg_set_page(&sg, page, skb_frag_size(f),  			    offset_in_page(offset)); -		if (crypto_hash_update(desc, &sg, skb_frag_size(f))) +		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); +		if (crypto_ahash_update(req))  			return 1;  	} @@ -3058,7 +3067,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke  	struct scatterlist sg;  	sg_init_one(&sg, key->key, key->keylen); -	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); +	ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); +	return 
crypto_ahash_update(hp->md5_req);  }  EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 55be6ac70cff..cffd8f9ed1a9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -1,3 +1,4 @@ +#include <linux/crypto.h>  #include <linux/err.h>  #include <linux/init.h>  #include <linux/kernel.h> @@ -124,6 +125,49 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,  	return false;  } + +/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, + * queue this additional data / FIN. + */ +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) +		return; + +	skb = skb_clone(skb, GFP_ATOMIC); +	if (!skb) +		return; + +	skb_dst_drop(skb); +	/* segs_in has been initialized to 1 in tcp_create_openreq_child(). +	 * Hence, reset segs_in to 0 before calling tcp_segs_in() +	 * to avoid double counting.  Also, tcp_segs_in() expects +	 * skb->len to include the tcp_hdrlen.  Hence, it should +	 * be called before __skb_pull(). 
+	 */ +	tp->segs_in = 0; +	tcp_segs_in(tp, skb); +	__skb_pull(skb, tcp_hdrlen(skb)); +	skb_set_owner_r(skb, sk); + +	TCP_SKB_CB(skb)->seq++; +	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + +	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +	__skb_queue_tail(&sk->sk_receive_queue, skb); +	tp->syn_data_acked = 1; + +	/* u64_stats_update_begin(&tp->syncp) not needed here, +	 * as we certainly are not changing upper 32bit value (0) +	 */ +	tp->bytes_received = skb->len; + +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) +		tcp_fin(sk); +} +  static struct sock *tcp_fastopen_create_child(struct sock *sk,  					      struct sk_buff *skb,  					      struct dst_entry *dst, @@ -132,7 +176,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,  	struct tcp_sock *tp;  	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;  	struct sock *child; -	u32 end_seq;  	bool own_req;  	req->num_retrans = 0; @@ -178,35 +221,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,  	tcp_init_metrics(child);  	tcp_init_buffer_space(child); -	/* Queue the data carried in the SYN packet. -	 * We used to play tricky games with skb_get(). -	 * With lockless listener, it is a dead end. -	 * Do not think about it. -	 * -	 * XXX (TFO) - we honor a zero-payload TFO request for now, -	 * (any reason not to?) but no need to queue the skb since -	 * there is no data. How about SYN+FIN? 
-	 */ -	end_seq = TCP_SKB_CB(skb)->end_seq; -	if (end_seq != TCP_SKB_CB(skb)->seq + 1) { -		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - -		if (likely(skb2)) { -			skb_dst_drop(skb2); -			__skb_pull(skb2, tcp_hdrlen(skb)); -			skb_set_owner_r(skb2, child); -			__skb_queue_tail(&child->sk_receive_queue, skb2); -			tp->syn_data_acked = 1; - -			/* u64_stats_update_begin(&tp->syncp) not needed here, -			 * as we certainly are not changing upper 32bit value (0) -			 */ -			tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; -		} else { -			end_seq = TCP_SKB_CB(skb)->seq + 1; -		} -	} -	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; +	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + +	tcp_fastopen_add_skb(child, skb); + +	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;  	/* tcp_conn_request() is sending the SYNACK,  	 * and queues the child into listener accept queue.  	 */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b2c8e90a475..e6e65f79ade8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1;  int sysctl_tcp_window_scaling __read_mostly = 1;  int sysctl_tcp_sack __read_mostly = 1;  int sysctl_tcp_fack __read_mostly = 1; -int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;  int sysctl_tcp_max_reordering __read_mostly = 300; -EXPORT_SYMBOL(sysctl_tcp_reordering);  int sysctl_tcp_dsack __read_mostly = 1;  int sysctl_tcp_app_win __read_mostly = 31;  int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -126,6 +124,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;  #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)  #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) +#define REXMIT_NONE	0 /* no loss recovery to do */ +#define REXMIT_LOST	1 /* retransmit packets marked lost */ +#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */ +  /* Adapt the MSS value used to make delayed ack decision to the   * real world.   
*/ @@ -1210,6 +1212,7 @@ static u8 tcp_sacktag_one(struct sock *sk,  		sacked |= TCPCB_SACKED_ACKED;  		state->flag |= FLAG_DATA_SACKED;  		tp->sacked_out += pcount; +		tp->delivered += pcount;  /* Out-of-order packets delivered */  		fack_count += pcount; @@ -1821,8 +1824,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)  static void tcp_add_reno_sack(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	u32 prior_sacked = tp->sacked_out; +  	tp->sacked_out++;  	tcp_check_reno_reordering(sk, 0); +	if (tp->sacked_out > prior_sacked) +		tp->delivered++; /* Some out-of-order packet is delivered */  	tcp_verify_left_out(tp);  } @@ -1834,6 +1841,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)  	if (acked > 0) {  		/* One ACK acked hole. The rest eat duplicate ACKs. */ +		tp->delivered += max_t(int, acked - tp->sacked_out, 1);  		if (acked - 1 >= tp->sacked_out)  			tp->sacked_out = 0;  		else @@ -1873,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk)  {  	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	struct sk_buff *skb;  	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;  	bool is_reneg;			/* is receiver reneging on SACKs? */ @@ -1923,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk)  	 * suggests that the degree of reordering is over-estimated.  	 
*/  	if (icsk->icsk_ca_state <= TCP_CA_Disorder && -	    tp->sacked_out >= sysctl_tcp_reordering) +	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)  		tp->reordering = min_t(unsigned int, tp->reordering, -				       sysctl_tcp_reordering); +				       net->ipv4.sysctl_tcp_reordering);  	tcp_set_ca_state(sk, TCP_CA_Loss);  	tp->high_seq = tp->snd_nxt;  	tcp_ecn_queue_cwr(tp); @@ -2109,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)  {  	struct tcp_sock *tp = tcp_sk(sk);  	__u32 packets_out; +	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;  	/* Trick#1: The loss is proven. */  	if (tp->lost_out) @@ -2123,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)  	 */  	packets_out = tp->packets_out;  	if (packets_out <= tp->reordering && -	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && +	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&  	    !tcp_may_send_now(sk)) {  		/* We have nothing to send. This connection is limited  		 * either by receiver window or by application. @@ -2467,14 +2477,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk)  	tcp_ecn_queue_cwr(tp);  } -static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, -			       int fast_rexmit, int flag) +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, +			       int flag)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int sndcnt = 0;  	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); -	int newly_acked_sacked = prior_unsacked - -				 (tp->packets_out - tp->sacked_out);  	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))  		return; @@ -2492,7 +2500,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,  	} else {  		sndcnt = min(delta, newly_acked_sacked);  	} -	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); +	/* Force a fast retransmit upon entering fast recovery */ +	sndcnt = max(sndcnt, (tp->prr_out ? 
0 : 1));  	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;  } @@ -2537,7 +2546,7 @@ static void tcp_try_keep_open(struct sock *sk)  	}  } -static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) +static void tcp_try_to_open(struct sock *sk, int flag)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -2551,8 +2560,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)  	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {  		tcp_try_keep_open(sk); -	} else { -		tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);  	}  } @@ -2662,7 +2669,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)  /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are   * recovered or spurious. Otherwise retransmits more on partial ACKs.   */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +			     int *rexmit)  {  	struct tcp_sock *tp = tcp_sk(sk);  	bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2684,10 +2692,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)  				tp->frto = 0; /* Step 3.a. loss was real */  		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {  			tp->high_seq = tp->snd_nxt; -			__tcp_push_pending_frames(sk, tcp_current_mss(sk), -						  TCP_NAGLE_OFF); -			if (after(tp->snd_nxt, tp->high_seq)) -				return; /* Step 2.b */ +			/* Step 2.b. Try send new data (but deferred until cwnd +			 * is updated in tcp_ack()). Otherwise fall back to +			 * the conventional recovery. 
+			 */ +			if (tcp_send_head(sk) && +			    after(tcp_wnd_end(tp), tp->snd_nxt)) { +				*rexmit = REXMIT_NEW; +				return; +			}  			tp->frto = 0;  		}  	} @@ -2706,12 +2719,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)  		else if (flag & FLAG_SND_UNA_ADVANCED)  			tcp_reset_reno_sack(tp);  	} -	tcp_xmit_retransmit_queue(sk); +	*rexmit = REXMIT_LOST;  }  /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked, -				 const int prior_unsacked, int flag) +static bool tcp_try_undo_partial(struct sock *sk, const int acked)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -2726,10 +2738,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,  		 * can undo. Otherwise we clock out new packets but do not  		 * mark more packets lost or retransmit more.  		 */ -		if (tp->retrans_out) { -			tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); +		if (tp->retrans_out)  			return true; -		}  		if (!tcp_any_retrans_done(sk))  			tp->retrans_stamp = 0; @@ -2748,21 +2758,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,   * taking into account both packets sitting in receiver's buffer and   * packets lost by network.   * - * Besides that it does CWND reduction, when packet loss is detected - * and changes state of machine. + * Besides that it updates the congestion state when packet loss or ECN + * is detected. But it does not reduce the cwnd, it is done by the + * congestion control later.   *   * It does _not_ decide what to send, it is made in function   * tcp_xmit_retransmit_queue().   
*/  static void tcp_fastretrans_alert(struct sock *sk, const int acked, -				  const int prior_unsacked, -				  bool is_dupack, int flag) +				  bool is_dupack, int *ack_flag, int *rexmit)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	int fast_rexmit = 0, flag = *ack_flag;  	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&  				    (tcp_fackets_out(tp) > tp->reordering)); -	int fast_rexmit = 0;  	if (WARN_ON(!tp->packets_out && tp->sacked_out))  		tp->sacked_out = 0; @@ -2809,8 +2819,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  	/* Use RACK to detect loss */  	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && -	    tcp_rack_mark_lost(sk)) +	    tcp_rack_mark_lost(sk)) {  		flag |= FLAG_LOST_RETRANS; +		*ack_flag |= FLAG_LOST_RETRANS; +	}  	/* E. Process state. */  	switch (icsk->icsk_ca_state) { @@ -2819,7 +2831,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  			if (tcp_is_reno(tp) && is_dupack)  				tcp_add_reno_sack(sk);  		} else { -			if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) +			if (tcp_try_undo_partial(sk, acked))  				return;  			/* Partial ACK arrived. Force fast retransmit. 
*/  			do_lost = tcp_is_reno(tp) || @@ -2831,7 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  		}  		break;  	case TCP_CA_Loss: -		tcp_process_loss(sk, flag, is_dupack); +		tcp_process_loss(sk, flag, is_dupack, rexmit);  		if (icsk->icsk_ca_state != TCP_CA_Open &&  		    !(flag & FLAG_LOST_RETRANS))  			return; @@ -2848,7 +2860,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  			tcp_try_undo_dsack(sk);  		if (!tcp_time_to_recover(sk, flag)) { -			tcp_try_to_open(sk, flag, prior_unsacked); +			tcp_try_to_open(sk, flag);  			return;  		} @@ -2870,8 +2882,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  	if (do_lost)  		tcp_update_scoreboard(sk, fast_rexmit); -	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); -	tcp_xmit_retransmit_queue(sk); +	*rexmit = REXMIT_LOST;  }  /* Kathleen Nichols' algorithm for tracking the minimum value of @@ -3096,7 +3107,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,   * arrived at the other end.   
*/  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, -			       u32 prior_snd_una, +			       u32 prior_snd_una, int *acked,  			       struct tcp_sacktag_state *sack)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3154,10 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  				flag |= FLAG_ORIG_SACK_ACKED;  		} -		if (sacked & TCPCB_SACKED_ACKED) +		if (sacked & TCPCB_SACKED_ACKED) {  			tp->sacked_out -= acked_pcount; -		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) -			tcp_rack_advance(tp, &skb->skb_mstamp, sacked); +		} else if (tcp_is_sack(tp)) { +			tp->delivered += acked_pcount; +			if (!tcp_skb_spurious_retrans(tp, skb)) +				tcp_rack_advance(tp, &skb->skb_mstamp, sacked); +		}  		if (sacked & TCPCB_LOST)  			tp->lost_out -= acked_pcount; @@ -3266,6 +3280,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  		}  	}  #endif +	*acked = pkts_acked;  	return flag;  } @@ -3299,21 +3314,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)  /* Decide wheather to run the increase function of congestion control. */  static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)  { -	if (tcp_in_cwnd_reduction(sk)) -		return false; -  	/* If reordering is high then always grow cwnd whenever data is  	 * delivered regardless of its ordering. Otherwise stay conservative  	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/  	 * new SACK or ECE mark may first advance cwnd here and later reduce  	 * cwnd in tcp_fastretrans_alert() based on more states.  	 
*/ -	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) +	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)  		return flag & FLAG_FORWARD_PROGRESS;  	return flag & FLAG_DATA_ACKED;  } +/* The "ultimate" congestion control function that aims to replace the rigid + * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). + * It's called toward the end of processing an ACK with precise rate + * information. All transmission or retransmission are delayed afterwards. + */ +static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, +			     int flag) +{ +	if (tcp_in_cwnd_reduction(sk)) { +		/* Reduce cwnd if state mandates */ +		tcp_cwnd_reduction(sk, acked_sacked, flag); +	} else if (tcp_may_raise_cwnd(sk, flag)) { +		/* Advance cwnd if state allows */ +		tcp_cong_avoid(sk, ack, acked_sacked); +	} +	tcp_update_pacing_rate(sk); +} +  /* Check that window update is acceptable.   * The function assumes that snd_una<=ack<=snd_next.   */ @@ -3509,6 +3539,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)  		icsk->icsk_ca_ops->in_ack_event(sk, flags);  } +/* Congestion control has updated the cwnd already. So if we're in + * loss recovery then now we do any new sends (for FRTO) or + * retransmits (for CA_Loss or CA_recovery) that make sense. + */ +static void tcp_xmit_recovery(struct sock *sk, int rexmit) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (rexmit == REXMIT_NONE) +		return; + +	if (unlikely(rexmit == REXMIT_NEW)) { +		__tcp_push_pending_frames(sk, tcp_current_mss(sk), +					  TCP_NAGLE_OFF); +		if (after(tp->snd_nxt, tp->high_seq)) +			return; +		tp->frto = 0; +	} +	tcp_xmit_retransmit_queue(sk); +} +  /* This routine deals with incoming acks, but not outgoing ones. 
*/  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  { @@ -3521,8 +3572,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	bool is_dupack = false;  	u32 prior_fackets;  	int prior_packets = tp->packets_out; -	const int prior_unsacked = tp->packets_out - tp->sacked_out; +	u32 prior_delivered = tp->delivered;  	int acked = 0; /* Number of packets newly acked */ +	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */  	sack_state.first_sackt.v64 = 0; @@ -3611,23 +3663,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  		goto no_queue;  	/* See if we can take anything off of the retransmit queue. */ -	acked = tp->packets_out; -	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, +	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,  				    &sack_state); -	acked -= tp->packets_out;  	if (tcp_ack_is_dubious(sk, flag)) {  		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); -		tcp_fastretrans_alert(sk, acked, prior_unsacked, -				      is_dupack, flag); +		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);  	}  	if (tp->tlp_high_seq)  		tcp_process_tlp_ack(sk, ack, flag); -	/* Advance cwnd if state allows */ -	if (tcp_may_raise_cwnd(sk, flag)) -		tcp_cong_avoid(sk, ack, acked); -  	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {  		struct dst_entry *dst = __sk_dst_get(sk);  		if (dst) @@ -3636,14 +3681,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	if (icsk->icsk_pending == ICSK_TIME_RETRANS)  		tcp_schedule_loss_probe(sk); -	tcp_update_pacing_rate(sk); +	tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); +	tcp_xmit_recovery(sk, rexmit);  	return 1;  no_queue:  	/* If data was DSACKed, see if we can undo a cwnd reduction. 
*/  	if (flag & FLAG_DSACKING_ACK) -		tcp_fastretrans_alert(sk, acked, prior_unsacked, -				      is_dupack, flag); +		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);  	/* If this ack opens up a zero window, clear backoff.  It was  	 * being used to time the probes, and is probably far higher than  	 * it needs to be for normal retransmission. @@ -3666,8 +3711,8 @@ old_ack:  	if (TCP_SKB_CB(skb)->sacked) {  		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,  						&sack_state); -		tcp_fastretrans_alert(sk, acked, prior_unsacked, -				      is_dupack, flag); +		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); +		tcp_xmit_recovery(sk, rexmit);  	}  	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -3998,7 +4043,7 @@ void tcp_reset(struct sock *sk)   *   *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.   */ -static void tcp_fin(struct sock *sk) +void tcp_fin(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -5512,6 +5557,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,  	tp->syn_data_acked = tp->syn_data;  	if (tp->syn_data_acked)  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + +	tcp_fastopen_add_skb(sk, synack); +  	return false;  } @@ -6118,9 +6166,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,  	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;  	const char *msg = "Dropping request";  	bool want_cookie = false; +	struct net *net = sock_net(sk);  #ifdef CONFIG_SYN_COOKIES -	if (sysctl_tcp_syncookies) { +	if (net->ipv4.sysctl_tcp_syncookies) {  		msg = "Sending cookies";  		want_cookie = true;  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6129,7 +6178,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);  	if (!queue->synflood_warned && -	    sysctl_tcp_syncookies != 2 && +	    net->ipv4.sysctl_tcp_syncookies 
!= 2 &&  	    xchg(&queue->synflood_warned, 1) == 0)  		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",  			proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -6162,6 +6211,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,  	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;  	struct tcp_options_received tmp_opt;  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	struct sock *fastopen_sk = NULL;  	struct dst_entry *dst = NULL;  	struct request_sock *req; @@ -6172,7 +6222,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,  	 * limitations, they conserve resources and peer is  	 * evidently real one.  	 */ -	if ((sysctl_tcp_syncookies == 2 || +	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||  	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {  		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);  		if (!want_cookie) @@ -6238,7 +6288,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,  			}  		}  		/* Kill the following clause, if you dislike this way. */ -		else if (!sysctl_tcp_syncookies && +		else if (!net->ipv4.sysctl_tcp_syncookies &&  			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <  			  (sysctl_max_syn_backlog >> 2)) &&  			 !tcp_peer_is_proven(req, dst, false, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487ac67059e2..ad450509029b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -81,7 +81,7 @@  #include <linux/proc_fs.h>  #include <linux/seq_file.h> -#include <linux/crypto.h> +#include <crypto/hash.h>  #include <linux/scatterlist.h>  int sysctl_tcp_tw_reuse __read_mostly; @@ -319,8 +319,6 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)  	/* ICMPs are not backlogged, hence we cannot get  	 * an established socket here.  	 
*/ -	WARN_ON(req->sk); -  	if (seq != tcp_rsk(req)->snt_isn) {  		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);  	} else if (abort) { @@ -642,8 +640,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)  		 * Incoming packet is checked with md5 hash with finding key,  		 * no RST generated if md5 hash doesn't match.  		 */ -		sk1 = __inet_lookup_listener(net, -					     &tcp_hashinfo, ip_hdr(skb)->saddr, +		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, +					     ip_hdr(skb)->saddr,  					     th->source, ip_hdr(skb)->daddr,  					     ntohs(th->source), inet_iif(skb));  		/* don't send rst if it can't find key */ @@ -865,7 +863,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)  	kfree(inet_rsk(req)->opt);  } -  #ifdef CONFIG_TCP_MD5SIG  /*   * RFC2385 MD5 checksumming requires a mapping of @@ -1039,21 +1036,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,  	bp->len = cpu_to_be16(nbytes);  	sg_init_one(&sg, bp, sizeof(*bp)); -	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); +	return crypto_ahash_update(hp->md5_req);  }  static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,  			       __be32 daddr, __be32 saddr, const struct tcphdr *th)  {  	struct tcp_md5sig_pool *hp; -	struct hash_desc *desc; +	struct ahash_request *req;  	hp = tcp_get_md5sig_pool();  	if (!hp)  		goto clear_hash_noput; -	desc = &hp->md5_desc; +	req = hp->md5_req; -	if (crypto_hash_init(desc)) +	if (crypto_ahash_init(req))  		goto clear_hash;  	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))  		goto clear_hash; @@ -1061,7 +1059,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,  		goto clear_hash;  	if (tcp_md5_hash_key(hp, key))  		goto clear_hash; -	if (crypto_hash_final(desc, md5_hash)) +	ahash_request_set_crypt(req, NULL, md5_hash, 0); +	if 
(crypto_ahash_final(req))  		goto clear_hash;  	tcp_put_md5sig_pool(); @@ -1079,7 +1078,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,  			const struct sk_buff *skb)  {  	struct tcp_md5sig_pool *hp; -	struct hash_desc *desc; +	struct ahash_request *req;  	const struct tcphdr *th = tcp_hdr(skb);  	__be32 saddr, daddr; @@ -1095,9 +1094,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,  	hp = tcp_get_md5sig_pool();  	if (!hp)  		goto clear_hash_noput; -	desc = &hp->md5_desc; +	req = hp->md5_req; -	if (crypto_hash_init(desc)) +	if (crypto_ahash_init(req))  		goto clear_hash;  	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) @@ -1108,7 +1107,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,  		goto clear_hash;  	if (tcp_md5_hash_key(hp, key))  		goto clear_hash; -	if (crypto_hash_final(desc, md5_hash)) +	ahash_request_set_crypt(req, NULL, md5_hash, 0); +	if (crypto_ahash_final(req))  		goto clear_hash;  	tcp_put_md5sig_pool(); @@ -1587,7 +1587,8 @@ int tcp_v4_rcv(struct sk_buff *skb)  	TCP_SKB_CB(skb)->sacked	 = 0;  lookup: -	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); +	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, +			       th->dest);  	if (!sk)  		goto no_tcp_socket; @@ -1650,7 +1651,7 @@ process:  	sk_incoming_cpu_update(sk);  	bh_lock_sock_nested(sk); -	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); +	tcp_segs_in(tcp_sk(sk), skb);  	ret = 0;  	if (!sock_owned_by_user(sk)) {  		if (!tcp_prequeue(sk, skb)) @@ -1703,7 +1704,8 @@ do_time_wait:  	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {  	case TCP_TW_SYN: {  		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), -							&tcp_hashinfo, +							&tcp_hashinfo, skb, +							__tcp_hdrlen(th),  							iph->saddr, th->source,  							iph->daddr, th->dest,  							inet_iif(skb)); @@ -2395,6 +2397,16 @@ static int __net_init 
tcp_sk_init(struct net *net)  	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;  	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; +	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +	net->ipv4.sysctl_tcp_syncookies = 1; +	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; +	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; +	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; +	net->ipv4.sysctl_tcp_orphan_retries = 0; +	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; +	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; +  	return 0;  fail:  	tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a726d7853ce5..7b7eec439906 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk)  	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct dst_entry *dst = __sk_dst_get(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	struct tcp_metrics_block *tm;  	unsigned long rtt;  	u32 val; @@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk)  		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {  			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);  			if (val < tp->reordering && -			    tp->reordering != sysctl_tcp_reordering) +			    tp->reordering != net->ipv4.sysctl_tcp_reordering)  				tcp_metric_set(tm, TCP_METRIC_REORDERING,  					       tp->reordering);  		} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9b02af2139d3..acb366dd61e6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -27,9 +27,6 @@  #include <net/inet_common.h>  #include <net/xfrm.h> -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); -  int sysctl_tcp_abort_on_overflow __read_mostly;  struct inet_timewait_death_row tcp_death_row = { @@ -815,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, 
 	int ret = 0;  	int state = child->sk_state; -	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); +	tcp_segs_in(tcp_sk(child), skb);  	if (!sock_owned_by_user(child)) {  		ret = tcp_rcv_state_process(child, skb);  		/* Wakeup parent, send SIGIO */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9864a2dbadce..773083b7f1e9 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,  		th->fin = th->psh = 0;  		th->check = newcheck; -		if (skb->ip_summed != CHECKSUM_PARTIAL) +		if (skb->ip_summed == CHECKSUM_PARTIAL) +			gso_reset_checksum(skb, ~th->check); +		else  			th->check = gso_make_checksum(skb, ~th->check);  		seq += mss; @@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,  		      skb->data_len);  	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +  				(__force u32)delta)); -	if (skb->ip_summed != CHECKSUM_PARTIAL) +	if (skb->ip_summed == CHECKSUM_PARTIAL) +		gso_reset_checksum(skb, ~th->check); +	else  		th->check = gso_make_checksum(skb, ~th->check);  out:  	return segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fda379cd600d..7d2dc015cd19 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;  /* By default, RFC2861 behavior.  
*/  int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; -EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); -  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  			   int push_one, gfp_t gfp); @@ -1006,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	if (likely(tcb->tcp_flags & TCPHDR_ACK))  		tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); -	if (skb->len != tcp_header_size) +	if (skb->len != tcp_header_size) {  		tcp_event_data_sent(tp, sk); +		tp->data_segs_out += tcp_skb_pcount(skb); +	}  	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, @@ -3476,6 +3475,7 @@ void tcp_send_probe0(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	unsigned long probe_max;  	int err; @@ -3489,7 +3489,7 @@ void tcp_send_probe0(struct sock *sk)  	}  	if (err <= 0) { -		if (icsk->icsk_backoff < sysctl_tcp_retries2) +		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)  			icsk->icsk_backoff++;  		icsk->icsk_probes_out++;  		probe_max = TCP_RTO_MAX; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index ebf5ff57526e..f6c50af24a64 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n)  {  	const struct tcp_log *p  		= tcp_probe.log + tcp_probe.tail; -	struct timespec tv -		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); +	struct timespec64 ts +		= ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));  	return scnprintf(tbuf, n,  			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", -			(unsigned long)tv.tv_sec, -			(unsigned long)tv.tv_nsec, +			(unsigned long)ts.tv_sec, +			(unsigned long)ts.tv_nsec,  			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,  			p->snd_cwnd, p->ssthresh, 
p->snd_wnd, p->srtt, p->rcv_wnd);  } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a4730a28b220..49bc474f8e35 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,11 +22,6 @@  #include <linux/gfp.h>  #include <net/tcp.h> -int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; -int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; -int sysctl_tcp_orphan_retries __read_mostly;  int sysctl_tcp_thin_linear_timeouts __read_mostly;  static void tcp_write_err(struct sock *sk) @@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)  /* Calculate maximal number or retries on an orphaned socket. */  static int tcp_orphan_retries(struct sock *sk, bool alive)  { -	int retries = sysctl_tcp_orphan_retries; /* May be zero. */ +	int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */  	/* We know from an ICMP that something is wrong. */  	if (sk->sk_err_soft && !alive) @@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	int retry_until;  	bool do_reset, syn_set = false; @@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk)  				NET_INC_STATS_BH(sock_net(sk),  						 LINUX_MIB_TCPFASTOPENACTIVEFAIL);  		} -		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; +		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;  		syn_set = true;  	} else { -		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { +		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {  			/* Some middle-boxes may black-hole Fast Open _after_  			 * the handshake. 
Therefore we conservatively disable  			 * Fast Open on this path on recurring timeouts with @@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk)  			if (tp->syn_data_acked &&  			    tp->bytes_acked <= tp->rx_opt.mss_clamp) {  				tcp_fastopen_cache_set(sk, 0, NULL, true, 0); -				if (icsk->icsk_retransmits == sysctl_tcp_retries1) +				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)  					NET_INC_STATS_BH(sock_net(sk),  							 LINUX_MIB_TCPFASTOPENACTIVEFAIL);  			} @@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk)  			dst_negative_advice(sk);  		} -		retry_until = sysctl_tcp_retries2; +		retry_until = net->ipv4.sysctl_tcp_retries2;  		if (sock_flag(sk, SOCK_DEAD)) {  			const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk)  		 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)  		goto abort; -	max_probes = sysctl_tcp_retries2; +	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;  	if (sock_flag(sk, SOCK_DEAD)) {  		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	int max_retries = icsk->icsk_syn_retries ? 
: -	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ +	    sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */  	struct request_sock *req;  	req = tcp_sk(sk)->fastopen_rsk; @@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)  void tcp_retransmit_timer(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	struct net *net = sock_net(sk);  	struct inet_connection_sock *icsk = inet_csk(sk);  	if (tp->fastopen_rsk) { @@ -490,7 +487,7 @@ out_reset_timer:  		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);  	}  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); -	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) +	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))  		__sk_dst_reset(sk);  out:; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 95d2f198017e..08eed5e16df0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port);   * match_wildcard == false: addresses must be exactly the same, i.e.   
*                          0.0.0.0 only equals to 0.0.0.0   */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, -				bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, +			 bool match_wildcard)  {  	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -848,32 +848,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,  {  	struct udphdr *uh = udp_hdr(skb); -	if (nocheck) +	if (nocheck) {  		uh->check = 0; -	else if (skb_is_gso(skb)) +	} else if (skb_is_gso(skb)) {  		uh->check = ~udp_v4_check(len, saddr, daddr, 0); -	else if (skb_dst(skb) && skb_dst(skb)->dev && -		 (skb_dst(skb)->dev->features & -		  (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - -		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - +	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { +		uh->check = 0; +		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); +		if (uh->check == 0) +			uh->check = CSUM_MANGLED_0; +	} else {  		skb->ip_summed = CHECKSUM_PARTIAL;  		skb->csum_start = skb_transport_header(skb) - skb->head;  		skb->csum_offset = offsetof(struct udphdr, check);  		uh->check = ~udp_v4_check(len, saddr, daddr, 0); -	} else { -		__wsum csum; - -		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - -		uh->check = 0; -		csum = skb_checksum(skb, 0, len, 0); -		uh->check = udp_v4_check(len, saddr, daddr, csum); -		if (uh->check == 0) -			uh->check = CSUM_MANGLED_0; - -		skb->ip_summed = CHECKSUM_UNNECESSARY;  	}  }  EXPORT_SYMBOL(udp_set_csum); @@ -2082,10 +2070,14 @@ void udp_v4_early_demux(struct sk_buff *skb)  		if (!in_dev)  			return; -		ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, -				       iph->protocol); -		if (!ours) -			return; +		/* we are supposed to accept bcast packets */ +		if (skb->pkt_type == PACKET_MULTICAST) { +			ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, +					       iph->protocol); +			if (!ours) +				return; +		} +  		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, 
iph->daddr,  						   uh->source, iph->saddr, dif);  	} else if (skb->pkt_type == PACKET_HOST) { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4c519c1dc161..0ed2dafb7cc4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -32,42 +32,65 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,  					     netdev_features_t features),  	__be16 new_protocol, bool is_ipv6)  { +	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); +	bool remcsum, need_csum, offload_csum, ufo;  	struct sk_buff *segs = ERR_PTR(-EINVAL); +	struct udphdr *uh = udp_hdr(skb);  	u16 mac_offset = skb->mac_header; -	int mac_len = skb->mac_len; -	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);  	__be16 protocol = skb->protocol; -	netdev_features_t enc_features; +	u16 mac_len = skb->mac_len;  	int udp_offset, outer_hlen; -	unsigned int oldlen; -	bool need_csum = !!(skb_shinfo(skb)->gso_type & -			    SKB_GSO_UDP_TUNNEL_CSUM); -	bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); -	bool offload_csum = false, dont_encap = (need_csum || remcsum); - -	oldlen = (u16)~skb->len; +	__wsum partial;  	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))  		goto out; +	/* Adjust partial header checksum to negate old length. +	 * We cannot rely on the value contained in uh->len as it is +	 * possible that the actual value exceeds the boundaries of the +	 * 16 bit length field due to the header being added outside of an +	 * IP or IPv6 frame that was already limited to 64K - 1. +	 */ +	partial = csum_sub(csum_unfold(uh->check), +			   (__force __wsum)htonl(skb->len)); + +	/* setup inner skb. 
*/  	skb->encapsulation = 0; +	SKB_GSO_CB(skb)->encap_level = 0;  	__skb_pull(skb, tnl_hlen);  	skb_reset_mac_header(skb);  	skb_set_network_header(skb, skb_inner_network_offset(skb));  	skb->mac_len = skb_inner_network_offset(skb);  	skb->protocol = new_protocol; + +	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);  	skb->encap_hdr_csum = need_csum; + +	remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);  	skb->remcsum_offload = remcsum; +	ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); +  	/* Try to offload checksum if possible */  	offload_csum = !!(need_csum && -			  ((skb->dev->features & NETIF_F_HW_CSUM) || -			   (skb->dev->features & (is_ipv6 ? -			    NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); +			  (skb->dev->features & +			   (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : +				      (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); + +	features &= skb->dev->hw_enc_features; + +	/* The only checksum offload we care about from here on out is the +	 * outer one so strip the existing checksum feature flags and +	 * instead set the flag based on our outer checksum offload value. +	 */ +	if (remcsum || ufo) { +		features &= ~NETIF_F_CSUM_MASK; +		if (!need_csum || offload_csum) +			features |= NETIF_F_HW_CSUM; +	}  	/* segment inner packet. */ -	enc_features = skb->dev->hw_enc_features & features; -	segs = gso_inner_segment(skb, enc_features); +	segs = gso_inner_segment(skb, features);  	if (IS_ERR_OR_NULL(segs)) {  		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,  				     mac_len); @@ -78,17 +101,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,  	udp_offset = outer_hlen - tnl_hlen;  	skb = segs;  	do { -		struct udphdr *uh; -		int len; -		__be32 delta; +		__be16 len; -		if (dont_encap) { -			skb->encapsulation = 0; +		if (remcsum)  			skb->ip_summed = CHECKSUM_NONE; -		} else { -			/* Only set up inner headers if we might be offloading -			 * inner checksum. 
-			 */ + +		/* Set up inner headers if we are offloading inner checksum */ +		if (skb->ip_summed == CHECKSUM_PARTIAL) {  			skb_reset_inner_headers(skb);  			skb->encapsulation = 1;  		} @@ -96,43 +115,27 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,  		skb->mac_len = mac_len;  		skb->protocol = protocol; -		skb_push(skb, outer_hlen); +		__skb_push(skb, outer_hlen);  		skb_reset_mac_header(skb);  		skb_set_network_header(skb, mac_len);  		skb_set_transport_header(skb, udp_offset); -		len = skb->len - udp_offset; +		len = htons(skb->len - udp_offset);  		uh = udp_hdr(skb); -		uh->len = htons(len); +		uh->len = len;  		if (!need_csum)  			continue; -		delta = htonl(oldlen + len); +		uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len)); -		uh->check = ~csum_fold((__force __wsum) -				       ((__force u32)uh->check + -					(__force u32)delta)); -		if (offload_csum) { -			skb->ip_summed = CHECKSUM_PARTIAL; -			skb->csum_start = skb_transport_header(skb) - skb->head; -			skb->csum_offset = offsetof(struct udphdr, check); -		} else if (remcsum) { -			/* Need to calculate checksum from scratch, -			 * inner checksums are never when doing -			 * remote_checksum_offload. 
-			 */ - -			skb->csum = skb_checksum(skb, udp_offset, -						 skb->len - udp_offset, -						 0); -			uh->check = csum_fold(skb->csum); -			if (uh->check == 0) -				uh->check = CSUM_MANGLED_0; -		} else { +		if (skb->encapsulation || !offload_csum) {  			uh->check = gso_make_checksum(skb, ~uh->check); -  			if (uh->check == 0)  				uh->check = CSUM_MANGLED_0; +		} else { +			skb->ip_summed = CHECKSUM_PARTIAL; +			skb->csum_start = skb_transport_header(skb) - skb->head; +			skb->csum_offset = offsetof(struct udphdr, check);  		}  	} while ((skb = skb->next));  out: @@ -235,6 +238,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,  	skb->ip_summed = CHECKSUM_NONE; +	/* If there is no outer header we can fake a checksum offload +	 * due to the fact that we have already done the checksum in +	 * software prior to segmenting the frame. +	 */ +	if (!skb->encap_hdr_csum) +		features |= NETIF_F_HW_CSUM; +  	/* Fragment the skb. IP headers of the fragments are updated in  	 * inet_gso_segment()  	 */ @@ -302,14 +312,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,  	unsigned int off = skb_gro_offset(skb);  	int flush = 1; -	if (NAPI_GRO_CB(skb)->udp_mark || +	if (NAPI_GRO_CB(skb)->encap_mark ||  	    (skb->ip_summed != CHECKSUM_PARTIAL &&  	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&  	     !NAPI_GRO_CB(skb)->csum_valid))  		goto out; -	/* mark that this skb passed once through the udp gro layer */ -	NAPI_GRO_CB(skb)->udp_mark = 1; +	/* mark that this skb passed once through the tunnel gro layer */ +	NAPI_GRO_CB(skb)->encap_mark = 1;  	rcu_read_lock();  	uo_priv = rcu_dereference(udp_offload_base); | 
