summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c12
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/cipso_ipv4.c13
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/esp4.c4
-rw-r--r--net/ipv4/fib_frontend.c7
-rw-r--r--net/ipv4/fib_rules.c4
-rw-r--r--net/ipv4/fou_core.c32
-rw-r--r--net/ipv4/fou_nl.c4
-rw-r--r--net/ipv4/icmp.c39
-rw-r--r--net/ipv4/inet_connection_sock.c42
-rw-r--r--net/ipv4/inet_diag.c570
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/inet_hashtables.c108
-rw-r--r--net/ipv4/inet_timewait_sock.c15
-rw-r--r--net/ipv4/ip_fragment.c6
-rw-r--r--net/ipv4/ip_gre.c4
-rw-r--r--net/ipv4/ip_input.c40
-rw-r--r--net/ipv4/ip_options.c5
-rw-r--r--net/ipv4/ip_output.c8
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ipmr.c9
-rw-r--r--net/ipv4/netfilter.c9
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c4
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c58
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c5
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c4
-rw-r--r--net/ipv4/nexthop.c49
-rw-r--r--net/ipv4/ping.c68
-rw-r--r--net/ipv4/proc.c65
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/raw_diag.c10
-rw-r--r--net/ipv4/route.c38
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c19
-rw-r--r--net/ipv4/tcp.c105
-rw-r--r--net/ipv4/tcp_ao.c9
-rw-r--r--net/ipv4/tcp_bpf.c5
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_diag.c461
-rw-r--r--net/ipv4/tcp_fastopen.c7
-rw-r--r--net/ipv4/tcp_input.c395
-rw-r--r--net/ipv4/tcp_ipv4.c89
-rw-r--r--net/ipv4/tcp_metrics.c8
-rw-r--r--net/ipv4/tcp_minisocks.c80
-rw-r--r--net/ipv4/tcp_offload.c4
-rw-r--r--net/ipv4/tcp_output.c332
-rw-r--r--net/ipv4/tcp_timer.c6
-rw-r--r--net/ipv4/udp.c171
-rw-r--r--net/ipv4/udp_diag.c10
-rw-r--r--net/ipv4/udp_offload.c4
-rw-r--r--net/ipv4/udp_tunnel_core.c3
-rw-r--r--net/ipv4/udp_tunnel_nic.c2
-rw-r--r--net/ipv4/xfrm4_policy.c4
57 files changed, 1800 insertions, 1186 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 76e38092cd8a..3109c5ec38f3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -102,6 +102,7 @@
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
+#include <net/psp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
@@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+ psp_sk_assoc_free(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -1393,14 +1395,10 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (!skb->encapsulation || encap) {
- udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
- fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap));
- /* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
- goto out;
- }
+ if (!skb->encapsulation || encap)
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 5cfc1c939673..833f2cf97178 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -170,7 +170,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 740af8541d2f..709021197e1c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1715,8 +1715,7 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
- unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options *opt = (struct ip_options *)optbuf;
+ struct inet_skb_parm parm;
int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
@@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
* so we can not use icmp_send and IPCB here.
*/
- memset(opt, 0, sizeof(struct ip_options));
- opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
+ memset(&parm, 0, sizeof(parm));
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
- res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+ res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
if (gateway)
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c47d3828d4f6..942a887bf089 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -340,14 +340,13 @@ static void inetdev_destroy(struct in_device *in_dev)
static int __init inet_blackhole_dev_init(void)
{
- int err = 0;
+ struct in_device *in_dev;
rtnl_lock();
- if (!inetdev_init(blackhole_netdev))
- err = -ENOMEM;
+ in_dev = inetdev_init(blackhole_netdev);
rtnl_unlock();
- return err;
+ return PTR_ERR_OR_ZERO(in_dev);
}
late_initcall(inet_blackhole_dev_init);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f14a41ee4aa1..2c922afadb8f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
dport = encap->encap_dport;
spin_unlock_bh(&x->lock);
- sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
- dport, x->props.saddr.a4, sport, 0);
+ sk = inet_lookup_established(net, x->id.daddr.a4, dport,
+ x->props.saddr.a4, sport, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6e1b94796f67..1dab44e13d3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))),
+ .flowi4_dscp = ip4h_dscp(ip_hdr(skb)),
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
@@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
@@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
struct flowi4 fl4 = {
.flowi4_mark = frn->fl_mark,
.daddr = frn->fl_addr,
- .flowi4_tos = frn->fl_tos & INET_DSCP_MASK,
+ .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos),
.flowi4_scope = frn->fl_scope,
};
struct fib_table *tb;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index fa58d6620ed6..51f0193092f0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
* to mask the upper three DSCP bits prior to matching to maintain
* legacy behavior.
*/
- if (r->dscp_full &&
- (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask)
+ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask)
return 0;
else if (!r->dscp_full && r->dscp &&
!fib_dscp_masked_match(r->dscp, fl4))
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index 3e30745e2c09..3970b6b7ace5 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -228,21 +228,27 @@ drop:
return 0;
}
+static const struct net_offload *fou_gro_ops(const struct sock *sk,
+ int proto)
+{
+ const struct net_offload __rcu **offloads;
+
+ /* FOU doesn't allow IPv4 on IPv6 sockets. */
+ offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads;
+ return rcu_dereference(offloads[proto]);
+}
+
static struct sk_buff *fou_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload __rcu **offloads;
struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
- u8 proto;
if (!fou)
goto out;
- proto = fou->protocol;
-
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
* header to the outer L3 tunnel header, or we are simply
@@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, fou->protocol);
if (!ops || !ops->callbacks.gro_receive)
goto out;
@@ -268,10 +273,8 @@ out:
static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
int nhoff)
{
- const struct net_offload __rcu **offloads;
struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
- u8 proto;
int err;
if (!fou) {
@@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
goto out;
}
- proto = fou->protocol;
-
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, fou->protocol);
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) {
err = -ENOSYS;
goto out;
@@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload __rcu **offloads;
const struct net_offload *ops;
struct sk_buff *pp = NULL;
struct sk_buff *p;
@@ -450,8 +449,7 @@ next_proto:
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, proto);
if (!ops || !ops->callbacks.gro_receive)
goto out;
@@ -467,7 +465,6 @@ out:
static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
- const struct net_offload __rcu **offloads;
const struct net_offload *ops;
unsigned int guehlen = 0;
u8 proto;
@@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
return err;
}
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, proto);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out;
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 3d9614609b2d..506260b4a4dc 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -18,9 +18,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
- [FOU_ATTR_LOCAL_V6] = { .len = 16, },
+ [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
- [FOU_ATTR_PEER_V6] = { .len = 16, },
+ [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, },
[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
};
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2ffe73ea644f..1b7fb5d935ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -72,6 +72,7 @@
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -318,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
return true;
/* No rate limit on loopback */
- dev = dst_dev(dst);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dev && (dev->flags & IFF_LOOPBACK))
goto out;
- rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
l3mdev_master_ifindex_rcu(dev));
rc = inet_peer_xrlim_allow(peer,
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
- rcu_read_unlock();
out:
+ rcu_read_unlock();
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
else
@@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb)));
+ fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
@@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
- fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
@@ -544,14 +545,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
goto relookup_failed;
}
/* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- skb_dst_set(skb_in, NULL);
+ orefdst = skb_dstref_steal(skb_in);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
dscp, rt2->dst.dev) ? -EINVAL : 0;
dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ /* steal dst entry from skb_in, don't drop refcnt */
+ skb_dstref_steal(skb_in);
+ skb_dstref_restore(skb_in, orefdst);
}
if (err)
@@ -592,7 +594,7 @@ relookup_failed:
*/
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
- const struct ip_options *opt)
+ const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
@@ -708,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
- dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+ dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
+ inet_iif(skb_in));
if (dev)
saddr = inet_select_addr(dev, iph->saddr,
@@ -723,7 +726,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ &parm->opt))
goto out_unlock;
@@ -797,14 +801,16 @@ EXPORT_SYMBOL(__icmp_send);
void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct sk_buff *cloned_skb = NULL;
- struct ip_options opts = { 0 };
enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct inet_skb_parm parm;
struct nf_conn *ct;
__be32 orig_ip;
+ memset(&parm, 0, sizeof(parm));
ct = nf_ct_get(skb_in, &ctinfo);
- if (!ct || !(ct->status & IPS_SRC_NAT)) {
- __icmp_send(skb_in, type, code, info, &opts);
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
+ __icmp_send(skb_in, type, code, info, &parm);
return;
}
@@ -818,8 +824,9 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
goto out;
orig_ip = ip_hdr(skb_in)->saddr;
- ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
- __icmp_send(skb_in, type, code, info, &opts);
+ dir = CTINFO2DIR(ctinfo);
+ ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ __icmp_send(skb_in, type, code, info, &parm);
ip_hdr(skb_in)->saddr = orig_ip;
out:
consume_skb(cloned_skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1e2df51427fe..cdd1e12aac8c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -423,7 +423,7 @@ success:
}
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
- struct sock *sk)
+ const struct sock *sk)
{
if (tb->fastreuseport <= 0)
return 0;
@@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
ipv6_only_sock(sk), true, false);
}
-void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
- struct sock *sk)
+void inet_csk_update_fastreuse(const struct sock *sk,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
@@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
tb->fastreuseport = 0;
}
}
+
+ tb2->fastreuse = tb->fastreuse;
+ tb2->fastreuseport = tb->fastreuseport;
}
/* Obtain a reference to a local port for the given sock,
@@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
}
success:
- inet_csk_update_fastreuse(tb, sk);
+ inet_csk_update_fastreuse(sk, tb, tb2);
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, tb2, port);
@@ -706,9 +710,9 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
spin_unlock_bh(&queue->fastopenq.lock);
}
-out:
release_sock(sk);
- if (newsk && mem_cgroup_sockets_enabled) {
+
+ if (mem_cgroup_sockets_enabled) {
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
int amt = 0;
@@ -718,7 +722,7 @@ out:
lock_sock(newsk);
mem_cgroup_sk_alloc(newsk);
- if (newsk->sk_memcg) {
+ if (mem_cgroup_from_sk(newsk)) {
/* The socket has not been accepted yet, no need
* to look at newsk->sk_wmem_queued.
*/
@@ -727,23 +731,22 @@ out:
}
if (amt)
- mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+ mem_cgroup_sk_charge(newsk, amt, gfp);
kmem_cache_charge(newsk, gfp);
release_sock(newsk);
}
+
if (req)
reqsk_put(req);
- if (newsk)
- inet_init_csk_locks(newsk);
-
+ inet_init_csk_locks(newsk);
return newsk;
+
out_err:
- newsk = NULL;
- req = NULL;
+ release_sock(sk);
arg->err = error;
- goto out;
+ return NULL;
}
EXPORT_SYMBOL(inet_csk_accept);
@@ -1297,12 +1300,19 @@ void inet_csk_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
- this_cpu_dec(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_dec();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+void inet_csk_prepare_for_destroy_sock(struct sock *sk)
+{
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ tcp_orphan_count_inc();
+}
+
/* This function allows to force a closure of a socket after the call to
* tcp_create_openreq_child().
*/
@@ -1370,7 +1380,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
sock_orphan(child);
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2fa53b16fe77..f0b6c5a411a2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -20,9 +20,6 @@
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_timewait_sock.h>
-#include <net/inet6_hashtables.h>
#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
@@ -74,54 +71,29 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
- r->idiag_family = sk->sk_family;
+ r->idiag_family = READ_ONCE(sk->sk_family);
- r->id.idiag_sport = htons(sk->sk_num);
- r->id.idiag_dport = sk->sk_dport;
- r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+ r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+ r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
sock_diag_save_cookie(sk, r->id.idiag_cookie);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ if (r->idiag_family == AF_INET6) {
+ data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+ data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
} else
#endif
{
memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = sk->sk_daddr;
+ r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+ r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
}
}
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
-static size_t inet_sk_attr_size(struct sock *sk,
- const struct inet_diag_req_v2 *req,
- bool net_admin)
-{
- const struct inet_diag_handler *handler;
- size_t aux = 0;
-
- rcu_read_lock();
- handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
- DEBUG_NET_WARN_ON_ONCE(!handler);
- if (handler && handler->idiag_get_aux_size)
- aux = handler->idiag_get_aux_size(sk, net_admin);
- rcu_read_unlock();
-
- return nla_total_size(sizeof(struct tcp_info))
- + nla_total_size(sizeof(struct inet_diag_msg))
- + inet_diag_msg_attrs_size()
- + nla_total_size(sizeof(struct inet_diag_meminfo))
- + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
- + nla_total_size(TCP_CA_NAME_MAX)
- + nla_total_size(sizeof(struct tcpvegas_info))
- + aux
- + 64;
-}
-
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
struct user_namespace *user_ns,
@@ -313,17 +285,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
- r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
}
@@ -422,183 +394,6 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_twsk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct inet_timewait_sock *tw = inet_twsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
- sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
-
- inet_diag_msg_common_fill(r, sk);
- r->idiag_retrans = 0;
-
- r->idiag_state = READ_ONCE(tw->tw_substate);
- r->idiag_timer = 3;
- tmo = tw->tw_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- tw->tw_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct request_sock *reqsk = inet_reqsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
- cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- inet_diag_msg_common_fill(r, sk);
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = reqsk->num_retrans;
-
- BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
- offsetof(struct sock, sk_cookie));
-
- tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- u16 nlmsg_flags, bool net_admin)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
- net_admin);
-}
-
-struct sock *inet_diag_find_one_icsk(struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct inet_diag_req_v2 *req)
-{
- struct sock *sk;
-
- rcu_read_lock();
- if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
- req->id.idiag_dport, req->id.idiag_src[0],
- req->id.idiag_sport, req->id.idiag_if);
-#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
- if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
- ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
- req->id.idiag_dport, req->id.idiag_src[3],
- req->id.idiag_sport, req->id.idiag_if);
- else
- sk = inet6_lookup(net, hashinfo, NULL, 0,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
- }
-#endif
- else {
- rcu_read_unlock();
- return ERR_PTR(-EINVAL);
- }
- rcu_read_unlock();
- if (!sk)
- return ERR_PTR(-ENOENT);
-
- if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
- sock_gen_put(sk);
- return ERR_PTR(-ENOENT);
- }
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
-
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *req)
-{
- struct sk_buff *in_skb = cb->skb;
- bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
- struct sock *sk;
- int err;
-
- sk = inet_diag_find_one_icsk(net, hashinfo, req);
- if (IS_ERR(sk))
- return PTR_ERR(sk);
-
- rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
- if (!rep) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
- if (err < 0) {
- WARN_ON(err == -EMSGSIZE);
- nlmsg_free(rep);
- goto out;
- }
- err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
-
-out:
- if (sk)
- sock_gen_put(sk);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-
static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
int hdrlen,
@@ -785,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
+ if (entry->family == AF_INET6) {
entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
entry->daddr = sk->sk_v6_daddr.s6_addr32;
} else
@@ -796,31 +591,36 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
}
}
-int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
if (!bc)
return 1;
- entry.family = sk->sk_family;
+ entry.family = READ_ONCE(sk->sk_family);
entry_fill_addrs(&entry, sk);
- entry.sport = inet->inet_num;
- entry.dport = ntohs(inet->inet_dport);
- entry.ifindex = sk->sk_bound_dev_if;
- entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
- if (sk_fullsock(sk))
- entry.mark = READ_ONCE(sk->sk_mark);
- else if (sk->sk_state == TCP_NEW_SYN_RECV)
- entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
- else if (sk->sk_state == TCP_TIME_WAIT)
- entry.mark = inet_twsk(sk)->tw_mark;
- else
- entry.mark = 0;
+ entry.sport = READ_ONCE(inet->inet_num);
+ entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+ entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+ if (cb_data->userlocks_needed)
+ entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+ if (cb_data->mark_needed) {
+ if (sk_fullsock(sk))
+ entry.mark = READ_ONCE(sk->sk_mark);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ entry.mark = inet_twsk(sk)->tw_mark;
+ else
+ entry.mark = 0;
+ }
#ifdef CONFIG_SOCK_CGROUP_DATA
- entry.cgroup_id = sk_fullsock(sk) ?
- cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+ if (cb_data->cgroup_needed)
+ entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
#endif
return inet_diag_bc_run(bc, &entry);
@@ -920,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
}
#endif
-static int inet_diag_bc_audit(const struct nlattr *attr,
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
const struct sk_buff *skb)
{
- bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const struct nlattr *attr = cb_data->inet_diag_nla_bc;
const void *bytecode, *bc;
int bytecode_len, len;
+ bool net_admin;
+
+ if (!attr)
+ return 0;
- if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
return -EINVAL;
+ net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
bytecode = bc = nla_data(attr);
len = bytecode_len = nla_len(attr);
@@ -961,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return -EPERM;
if (!valid_markcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->mark_needed = true;
break;
#ifdef CONFIG_SOCK_CGROUP_DATA
case INET_DIAG_BC_CGROUP_COND:
if (!valid_cgroupcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->cgroup_needed = true;
break;
#endif
case INET_DIAG_BC_AUTO:
+ cb_data->userlocks_needed = true;
+ fallthrough;
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
@@ -992,280 +801,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
-static void twsk_build_assert(void)
-{
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
- offsetof(struct sock, sk_family));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
- offsetof(struct inet_sock, inet_num));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
- offsetof(struct inet_sock, inet_dport));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
- offsetof(struct inet_sock, inet_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
- offsetof(struct inet_sock, inet_daddr));
-
-#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
- offsetof(struct sock, sk_v6_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
- offsetof(struct sock, sk_v6_daddr));
-#endif
-}
-
-void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r)
-{
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
- struct inet_diag_dump_data *cb_data = cb->data;
- struct net *net = sock_net(skb->sk);
- u32 idiag_states = r->idiag_states;
- int i, num, s_i, s_num;
- struct nlattr *bc;
- struct sock *sk;
-
- bc = cb_data->inet_diag_nla_bc;
- if (idiag_states & TCPF_SYN_RECV)
- idiag_states |= TCPF_NEW_SYN_RECV;
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
- goto skip_listen_ht;
-
- for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
- struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
-
- num = 0;
- ilb = &hashinfo->lhash2[i];
-
- if (hlist_nulls_empty(&ilb->nulls_head)) {
- s_num = 0;
- continue;
- }
- spin_lock(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->nulls_head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_listen;
-
- if (r->id.idiag_sport != inet->inet_sport &&
- r->id.idiag_sport)
- goto next_listen;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_listen;
-
- if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
- cb, r, NLM_F_MULTI,
- net_admin) < 0) {
- spin_unlock(&ilb->lock);
- goto done;
- }
-
-next_listen:
- ++num;
- }
- spin_unlock(&ilb->lock);
-
- s_num = 0;
- }
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
-/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
- * with bh disabled.
- */
-#define SKARR_SZ 16
-
- /* Dump bound but inactive (not listening, connecting, etc.) sockets */
- if (cb->args[0] == 1) {
- if (!(idiag_states & TCPF_BOUND_INACTIVE))
- goto skip_bind_ht;
-
- for (i = s_i; i < hashinfo->bhash_size; i++) {
- struct inet_bind_hashbucket *ibb;
- struct inet_bind2_bucket *tb2;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
-resume_bind_walk:
- num = 0;
- accum = 0;
- ibb = &hashinfo->bhash2[i];
-
- if (hlist_empty(&ibb->chain)) {
- s_num = 0;
- continue;
- }
- spin_lock_bh(&ibb->lock);
- inet_bind_bucket_for_each(tb2, &ibb->chain) {
- if (!net_eq(ib2_net(tb2), net))
- continue;
-
- sk_for_each_bound(sk, &tb2->owners) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_bind;
-
- if (sk->sk_state != TCP_CLOSE ||
- !inet->inet_num)
- goto next_bind;
-
- if (r->sdiag_family != AF_UNSPEC &&
- r->sdiag_family != sk->sk_family)
- goto next_bind;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_bind;
-
- sock_hold(sk);
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- goto pause_bind_walk;
-next_bind:
- num++;
- }
- }
-pause_bind_walk:
- spin_unlock_bh(&ibb->lock);
-
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = inet_sk_diag_fill(sk_arr[idx],
- NULL, skb, cb,
- r, NLM_F_MULTI,
- net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_put(sk_arr[idx]);
- }
- if (res < 0)
- goto done;
-
- cond_resched();
-
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto resume_bind_walk;
- }
-
- s_num = 0;
- }
-skip_bind_ht:
- cb->args[0] = 2;
- s_i = num = s_num = 0;
- }
-
- if (!(idiag_states & ~TCPF_LISTEN))
- goto out;
-
- for (i = s_i; i <= hashinfo->ehash_mask; i++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[i];
- spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct hlist_nulls_node *node;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
- if (hlist_nulls_empty(&head->chain))
- continue;
-
- if (i > s_i)
- s_num = 0;
-
-next_chunk:
- num = 0;
- accum = 0;
- spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &head->chain) {
- int state;
-
- if (!net_eq(sock_net(sk), net))
- continue;
- if (num < s_num)
- goto next_normal;
- state = (sk->sk_state == TCP_TIME_WAIT) ?
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
- if (!(idiag_states & (1 << state)))
- goto next_normal;
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
- r->id.idiag_sport)
- goto next_normal;
- if (r->id.idiag_dport != sk->sk_dport &&
- r->id.idiag_dport)
- goto next_normal;
- twsk_build_assert();
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_normal;
-
- if (!refcount_inc_not_zero(&sk->sk_refcnt))
- goto next_normal;
-
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- break;
-next_normal:
- ++num;
- }
- spin_unlock_bh(lock);
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, cb, r,
- NLM_F_MULTI, net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_gen_put(sk_arr[idx]);
- }
- if (res < 0)
- break;
- cond_resched();
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto next_chunk;
- }
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
-out:
- ;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
-
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
@@ -1319,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
kfree(cb_data);
return err;
}
- nla = cb_data->inet_diag_nla_bc;
- if (nla) {
- err = inet_diag_bc_audit(nla, skb);
- if (err) {
- kfree(cb_data);
- return err;
- }
+ err = inet_diag_bc_audit(cb_data, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
}
nla = cb_data->inet_diag_nla_bpf_stgs;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 470ab17ceb51..025895eb6ec5 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -183,7 +183,7 @@ static void fqdir_work_fn(struct work_struct *work)
rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
if (llist_add(&fqdir->free_list, &fqdir_free_list))
- queue_delayed_work(system_wq, &fqdir_free_work, HZ);
+ queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ);
}
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..b7024e3d9ac3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk)
sk->sk_daddr, sk->sk_dport);
}
+static bool sk_is_connect_bind(const struct sock *sk)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk(sk)->tw_connect_bind;
+ else
+ return sk->sk_userlocks & SOCK_CONNECT_BIND;
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
*/
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
+ const struct inet_bind2_bucket *tb2;
+
if (hlist_empty(&tb->bhash2)) {
hlist_del_rcu(&tb->node);
kfree_rcu(tb, rcu);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
+ if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
+ return;
}
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
}
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
@@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
#else
tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
+ tb2->fastreuse = 0;
+ tb2->fastreuseport = 0;
INIT_HLIST_HEAD(&tb2->owners);
hlist_add_head(&tb2->node, &head->chain);
hlist_add_head(&tb2->bhash_node, &tb->bhash2);
@@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
+ const struct sock *sk;
+
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
__hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ sk_for_each_bound(sk, &tb->owners) {
+ if (!sk_is_connect_bind(sk))
+ return;
}
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
}
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
@@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk)
tb = inet_csk(sk)->icsk_bind_hash;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
+ sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
spin_lock(&head2->lock);
if (inet_csk(sk)->icsk_bind2_hash) {
@@ -277,7 +312,7 @@ bhash2_find:
}
}
if (update_fastreuse)
- inet_csk_update_fastreuse(tb, child);
+ inet_csk_update_fastreuse(child, tb, tb2);
inet_bind_hash(child, tb, tb2, port);
spin_unlock(&head2->lock);
spin_unlock(&head->lock);
@@ -425,19 +460,18 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net,
}
struct sock *__inet_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
struct sock *result = NULL;
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
- hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
saddr, sport, daddr, hnum, dif,
inet_ehashfn);
@@ -445,6 +479,7 @@ struct sock *__inet_lookup_listener(const struct net *net,
goto done;
}
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -490,21 +525,22 @@ void sock_edemux(struct sk_buff *skb)
EXPORT_SYMBOL(sock_edemux);
struct sock *__inet_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const u16 hnum,
- const int dif, const int sdif)
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif, const int sdif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const struct hlist_nulls_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
@@ -579,8 +615,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -707,7 +742,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
@@ -739,15 +774,18 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
-int __inet_hash(struct sock *sk, struct sock *osk)
+int inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_listen_hashbucket *ilb2;
int err = 0;
+ if (sk->sk_state == TCP_CLOSE)
+ return 0;
+
if (sk->sk_state != TCP_LISTEN) {
local_bh_disable();
- inet_ehash_nolisten(sk, osk, NULL);
+ inet_ehash_nolisten(sk, NULL, NULL);
local_bh_enable();
return 0;
}
@@ -772,17 +810,7 @@ unlock:
return err;
}
-EXPORT_IPV6_MOD(__inet_hash);
-
-int inet_hash(struct sock *sk)
-{
- int err = 0;
-
- if (sk->sk_state != TCP_CLOSE)
- err = __inet_hash(sk, NULL);
-
- return err;
-}
+EXPORT_IPV6_MOD(inet_hash);
void inet_unhash(struct sock *sk)
{
@@ -800,11 +828,6 @@ void inet_unhash(struct sock *sk)
* avoid circular locking dependency on PREEMPT_RT.
*/
spin_lock(&ilb2->lock);
- if (sk_unhashed(sk)) {
- spin_unlock(&ilb2->lock);
- return;
- }
-
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
@@ -815,10 +838,6 @@ void inet_unhash(struct sock *sk)
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
- if (sk_unhashed(sk)) {
- spin_unlock_bh(lock);
- return;
- }
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -966,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family,
if (!tb2) {
tb2 = new_tb2;
inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
+ if (sk_is_connect_bind(sk)) {
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
}
inet_csk(sk)->icsk_bind2_hash = tb2;
sk_add_bind_node(sk, &tb2->owners);
@@ -1136,6 +1159,8 @@ ok:
head2, tb, sk);
if (!tb2)
goto error;
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
}
/* Here we want to add a little bit of randomness to the next source
@@ -1148,6 +1173,7 @@ ok:
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
+ sk->sk_userlocks |= SOCK_CONNECT_BIND;
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 875ff923a8ed..c96d61d08854 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -15,7 +15,8 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
-
+#include <net/tcp.h>
+#include <net/psp.h>
/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
@@ -74,7 +75,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
- twsk_destructor((struct sock *)tw);
+
+ tcp_twsk_destructor((struct sock *)tw);
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
@@ -206,10 +208,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
+ tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
timer_setup(&tw->tw_timer, tw_timer_handler, 0);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ tw->tw_validate_xmit_skb = NULL;
+#endif
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -218,6 +224,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
refcount_set(&tw->tw_refcnt, 0);
__module_get(tw->tw_prot->owner);
+ psp_twsk_init(tw, sk);
}
return tw;
@@ -329,13 +336,13 @@ restart:
TCPF_NEW_SYN_RECV))
continue;
- if (refcount_read(&sock_net(sk)->ns.count))
+ if (check_net(sock_net(sk)))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
continue;
- if (refcount_read(&sock_net(sk)->ns.count)) {
+ if (check_net(sock_net(sk))) {
sock_gen_put(sk);
goto restart;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b2584cce90ae..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -476,14 +476,16 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
- int vif = l3mdev_master_ifindex_rcu(dev);
+ struct net_device *dev;
struct ipq *qp;
+ int vif;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
int ret, refs = 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f5b9004d6938..761a53c6a89a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -28,6 +28,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
+#include <net/flow.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -44,7 +45,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>
-#include <net/inet_dscp.h>
/*
Problems & solutions
@@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr)) {
struct flowi4 fl4 = {
.flowi4_oif = t->parms.link,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_dscp = ip4h_dscp(&t->parms.iph),
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_proto = IPPROTO_GRE,
.saddr = t->parms.iph.saddr,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fc323994b1fa..273578579a6b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -263,10 +263,11 @@ int ip_local_deliver(struct sk_buff *skb)
}
EXPORT_SYMBOL(ip_local_deliver);
-static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
+static inline enum skb_drop_reason
+ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
- struct ip_options *opt;
const struct iphdr *iph;
+ struct ip_options *opt;
/* It looks as overkill, because not all
IP options require packet mangling.
@@ -277,7 +278,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
*/
if (skb_cow(skb, skb_headroom(skb))) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
- goto drop;
+ return SKB_DROP_REASON_NOMEM;
}
iph = ip_hdr(skb);
@@ -286,7 +287,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
if (ip_options_compile(dev_net(dev), opt, skb)) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
- goto drop;
+ return SKB_DROP_REASON_IP_INHDR;
}
if (unlikely(opt->srr)) {
@@ -298,17 +299,15 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
net_info_ratelimited("source route option %pI4 -> %pI4\n",
&iph->saddr,
&iph->daddr);
- goto drop;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
}
if (ip_options_rcv_srr(skb, dev))
- goto drop;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
- return false;
-drop:
- return true;
+ return SKB_NOT_DROPPED_YET;
}
static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
@@ -319,7 +318,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
}
int tcp_v4_early_demux(struct sk_buff *skb);
-int udp_v4_early_demux(struct sk_buff *skb);
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
@@ -335,7 +334,6 @@ static int ip_rcv_finish_core(struct net *net,
goto drop_error;
}
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
@@ -354,7 +352,6 @@ static int ip_rcv_finish_core(struct net *net,
drop_reason = udp_v4_early_demux(skb);
if (unlikely(drop_reason))
goto drop_error;
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
@@ -372,7 +369,6 @@ static int ip_rcv_finish_core(struct net *net,
ip4h_dscp(iph), dev);
if (unlikely(drop_reason))
goto drop_error;
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
} else {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -391,8 +387,11 @@ static int ip_rcv_finish_core(struct net *net,
}
#endif
- if (iph->ihl > 5 && ip_rcv_options(skb, dev))
- goto drop;
+ if (iph->ihl > 5) {
+ drop_reason = ip_rcv_options(skb, dev);
+ if (drop_reason)
+ goto drop;
+ }
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
@@ -587,9 +586,13 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
+ struct sk_buff *skb)
{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (fib4_has_custom_rules(net) ||
+ ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_zeronet(iph->daddr) ||
IPCB(skb)->flags & IPSKB_MULTIPATH)
return NULL;
@@ -618,8 +621,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head)
dst = skb_dst(skb);
if (curr_dst != dst) {
- hint = ip_extract_route_hint(net, skb,
- dst_rtable(dst)->rt_type);
+ hint = ip_extract_route_hint(net, skb);
/* dispatch old sublist */
if (!list_empty(&sublist))
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index e3321932bec0..be8815ce3ac2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- orefdst = skb->_skb_refdst;
- skb_dst_set(skb, NULL);
+ orefdst = skb_dstref_steal(skb);
err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
dev) ? -EINVAL : 0;
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
skb_dst_drop(skb);
- skb->_skb_refdst = orefdst;
+ skb_dstref_restore(skb, orefdst);
return -EINVAL;
}
refdst_drop(orefdst);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 84e7f8a2f50f..5ca97ede979c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -63,6 +63,7 @@
#include <linux/stat.h>
#include <linux/init.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -83,6 +84,7 @@
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
+#include <net/psp.h>
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -485,7 +487,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
inet_sk_init_flowi4(inet, fl4);
/* sctp_v4_xmit() uses its own DSCP value */
- fl4->flowi4_tos = tos & INET_DSCP_MASK;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
@@ -1664,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- if (orig_sk)
+ if (orig_sk) {
skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
+ psp_reply_set_decrypted(nskb);
+ }
if (transmit_time)
nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index cc9915543637..2e61ac137128 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
@@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e86a8a862c41..ca9eaee4c2ef 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -42,6 +42,7 @@
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1904,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1957,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, rt->dst.dev,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
@@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = (rt_is_output_route(rt) ?
skb->dev->ifindex : 0),
.flowi4_iif = (rt_is_output_route(rt) ?
@@ -2301,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
guard(rcu)();
- dev = rt->dst.dev;
+ dev = dst_dev_rcu(&rt->dst);
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto mc_output;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 0565f001120d..ce310eb779e0 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -11,10 +11,10 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
@@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
*/
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
- skb_dst_set(skb, NULL);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 2c438b140e88..7dc9772fe2d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -14,6 +14,7 @@ config NF_DEFRAG_IPV4
config IP_NF_IPTABLES_LEGACY
tristate "Legacy IP tables support"
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default m if NETFILTER_XTABLES_LEGACY
help
iptables is a legacy packet classifier.
@@ -326,6 +327,7 @@ endif # IP_NF_IPTABLES
config IP_NF_ARPTABLES
tristate "Legacy ARPTABLES support"
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default n
help
arptables is a legacy packet classifier.
@@ -343,6 +345,7 @@ config IP_NF_ARPFILTER
select IP_NF_ARPTABLES
select NETFILTER_FAMILY_ARP
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
help
ARP packet filtering defines a table `filter', which has a series of
rules for simple ARP packet filtering at local input and
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index a27782d7653e..6d9bf5106868 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -8,8 +8,8 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
-#include <net/inet_dscp.h>
#include <linux/ip.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ flow.flowi4_dscp = ip4h_dscp(iph);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ed08fb78cfa8..9a773502f10a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -12,10 +12,10 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/checksum.h>
+#include <net/flow.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
fl4.flowi4_oif = oif;
fl4.daddr = gw->s_addr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 87fd945a0d27..fae4aa4a5f09 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,6 +12,15 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl);
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth);
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook);
+
static int nf_reject_iphdr_validate(struct sk_buff *skb)
{
struct iphdr *iph;
@@ -71,6 +80,27 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *tp, _type;
+ int thoff;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return false;
+
+ thoff = skb_network_offset(skb) + sizeof(*iph);
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmphdr, type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMP_DEST_UNREACH;
+}
+
struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
@@ -91,6 +121,10 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
return NULL;
+ /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */
+ if (nf_skb_is_icmp_unreach(oldskb))
+ return NULL;
+
/* RFC says return as much as we can without exceeding 576 bytes. */
len = min_t(unsigned int, 536, oldskb->len);
@@ -136,8 +170,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
-const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *_oth, int hook)
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
{
const struct tcphdr *oth;
@@ -163,11 +198,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
return oth;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
-struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int ttl)
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -188,10 +222,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
return niph;
}
-EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
-void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
- const struct tcphdr *oth)
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
{
struct iphdr *niph = ip_hdr(nskb);
struct tcphdr *tcph;
@@ -218,7 +251,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
nskb->csum_start = (unsigned char *)tcph - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
{
@@ -247,8 +279,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
if (!oth)
return;
- if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
- nf_reject_fill_skb_dst(oldskb) < 0)
+ if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0)
return;
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -321,8 +352,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
if (iph->frag_off & htons(IP_OFFSET))
return;
- if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
- nf_reject_fill_skb_dst(skb_in) < 0)
+ if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0)
return;
if (skb_csum_unnecessary(skb_in) ||
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index a1350fc25838..5080fa5fbf6a 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo,
- skb, doff, saddr, sport, daddr, dport,
+ return inet_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 73e66a088e25..041c3f37f237 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, hinfo, skb,
+ sk = inet_lookup_listener(net, skb,
ip_hdrlen(skb) + __tcp_hdrlen(hp),
saddr, sport, daddr, dport,
in->ifindex, 0);
@@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, hinfo, saddr, sport,
+ sk = inet_lookup_established(net, saddr, sport,
daddr, dport, in->ifindex);
break;
default:
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 7e7c49535e3f..82af6cd76d13 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -10,7 +10,7 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (priv->flags & NFTA_FIB_F_MARK)
fl4.flowi4_mark = pkt->skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
if (priv->flags & NFTA_FIB_F_DADDR) {
fl4.daddr = iph->daddr;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 29118c43ebf5..7b9d70f9b31c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
{
struct nh_grp_entry *nhge, *tmp;
+ /* If there is nothing to do, let's avoid the costly call to
+ * synchronize_net()
+ */
+ if (list_empty(&nh->grp_list))
+ return;
+
list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
remove_nh_grp_entry(net, nhge, nlinfo);
@@ -2399,6 +2405,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
return -EINVAL;
}
+ if (!list_empty(&old->grp_list) &&
+ rtnl_dereference(new->nh_info)->fdb_nh !=
+ rtnl_dereference(old->nh_info)->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group");
+ return -EINVAL;
+ }
+
err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
if (err)
return err;
@@ -3511,12 +3524,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb,
int err;
s_idx = ctx->idx;
- for (node = rb_first(root); node; node = rb_next(node)) {
+
+ /* If this is not the first invocation, ctx->idx will contain the id of
+ * the last nexthop we processed. Instead of starting from the very
+ * first element of the red/black tree again and linearly skipping the
+ * (potentially large) set of nodes with an id smaller than s_idx, walk
+ * the tree and find the left-most node whose id is >= s_idx. This
+ * provides an efficient O(log n) starting point for the dump
+ * continuation.
+ */
+ if (s_idx != 0) {
+ struct rb_node *tmp = root->rb_node;
+
+ node = NULL;
+ while (tmp) {
+ struct nexthop *nh;
+
+ nh = rb_entry(tmp, struct nexthop, rb_node);
+ if (nh->id < s_idx) {
+ tmp = tmp->rb_right;
+ } else {
+ /* Track current candidate and keep looking on
+ * the left side to find the left-most
+ * (smallest id) that is still >= s_idx.
+ */
+ node = tmp;
+ tmp = tmp->rb_left;
+ }
+ }
+ } else {
+ node = rb_first(root);
+ }
+
+ for (; node; node = rb_next(node)) {
struct nexthop *nh;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh->id < s_idx)
- continue;
ctx->idx = nh->id;
err = nh_cb(skb, cb, nh, data);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 031df4c19fcc..5321c5801c64 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -56,9 +56,7 @@ struct ping_table {
static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
-EXPORT_SYMBOL_GPL(pingv6_ops);
-
-static u16 ping_port_rover;
+EXPORT_IPV6_MOD_GPL(pingv6_ops);
static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
@@ -67,7 +65,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
pr_debug("hash(%u) = %u\n", num, res);
return res;
}
-EXPORT_SYMBOL_GPL(ping_hash);
static inline struct hlist_head *ping_hashslot(struct ping_table *table,
struct net *net, unsigned int num)
@@ -77,6 +74,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table,
int ping_get_port(struct sock *sk, unsigned short ident)
{
+ struct net *net = sock_net(sk);
struct inet_sock *isk, *isk2;
struct hlist_head *hlist;
struct sock *sk2 = NULL;
@@ -84,15 +82,16 @@ int ping_get_port(struct sock *sk, unsigned short ident)
isk = inet_sk(sk);
spin_lock(&ping_table.lock);
if (ident == 0) {
+ u16 result = net->ipv4.ping_port_rover + 1;
u32 i;
- u16 result = ping_port_rover + 1;
for (i = 0; i < (1L << 16); i++, result++) {
if (!result)
- result++; /* avoid zero */
- hlist = ping_hashslot(&ping_table, sock_net(sk),
- result);
+ continue; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, net, result);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
if (isk2->inet_num == result)
@@ -100,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
}
/* found */
- ping_port_rover = ident = result;
+ net->ipv4.ping_port_rover = ident = result;
break;
next_port:
;
@@ -108,8 +107,10 @@ next_port:
if (i >= (1L << 16))
goto fail;
} else {
- hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ hlist = ping_hashslot(&ping_table, net, ident);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
/* BUG? Why is this reuse and not reuseaddr? ping.c
@@ -129,7 +130,7 @@ next_port:
pr_debug("was not hashed\n");
sk_add_node_rcu(sk, hlist);
sock_set_flag(sk, SOCK_RCU_FREE);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
}
spin_unlock(&ping_table.lock);
return 0;
@@ -138,15 +139,7 @@ fail:
spin_unlock(&ping_table.lock);
return -EADDRINUSE;
}
-EXPORT_SYMBOL_GPL(ping_get_port);
-
-int ping_hash(struct sock *sk)
-{
- pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
- BUG(); /* "Please do not press this button again." */
-
- return 0;
-}
+EXPORT_IPV6_MOD_GPL(ping_get_port);
void ping_unhash(struct sock *sk)
{
@@ -161,7 +154,7 @@ void ping_unhash(struct sock *sk)
}
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_unhash);
+EXPORT_IPV6_MOD_GPL(ping_unhash);
/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
@@ -188,6 +181,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
}
sk_for_each_rcu(sk, hslot) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
isk = inet_sk(sk);
pr_debug("iterate\n");
@@ -279,7 +274,7 @@ out_release_group:
put_group_info(group_info);
return ret;
}
-EXPORT_SYMBOL_GPL(ping_init_sock);
+EXPORT_IPV6_MOD_GPL(ping_init_sock);
void ping_close(struct sock *sk, long timeout)
{
@@ -289,7 +284,7 @@ void ping_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-EXPORT_SYMBOL_GPL(ping_close);
+EXPORT_IPV6_MOD_GPL(ping_close);
static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
@@ -467,7 +462,7 @@ out:
pr_debug("ping_v4_bind -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_bind);
+EXPORT_IPV6_MOD_GPL(ping_bind);
/*
* Is this a supported type of ICMP message?
@@ -600,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
out:
return;
}
-EXPORT_SYMBOL_GPL(ping_err);
+EXPORT_IPV6_MOD_GPL(ping_err);
/*
* Copy and checksum an ICMP Echo packet from user space into a buffer
@@ -630,7 +625,7 @@ int ping_getfrag(void *from, char *to,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_getfrag);
+EXPORT_IPV6_MOD_GPL(ping_getfrag);
static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
struct flowi4 *fl4)
@@ -691,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -936,7 +931,7 @@ out:
pr_debug("ping_recvmsg -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_recvmsg);
+EXPORT_IPV6_MOD_GPL(ping_recvmsg);
static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
struct sk_buff *skb)
@@ -957,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
-EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);
/*
@@ -985,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
return SKB_DROP_REASON_NO_SOCKET;
}
-EXPORT_SYMBOL_GPL(ping_rcv);
+EXPORT_IPV6_MOD_GPL(ping_rcv);
struct proto ping_prot = {
.name = "PING",
@@ -1002,13 +997,12 @@ struct proto ping_prot = {
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
.put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
-EXPORT_SYMBOL(ping_prot);
+EXPORT_IPV6_MOD(ping_prot);
#ifdef CONFIG_PROC_FS
@@ -1073,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL_GPL(ping_seq_start);
+EXPORT_IPV6_MOD_GPL(ping_seq_start);
static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -1092,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL_GPL(ping_seq_next);
+EXPORT_IPV6_MOD_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
__releases(ping_table.lock)
{
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_seq_stop);
+EXPORT_IPV6_MOD_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
int bucket)
@@ -1119,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
@@ -1150,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net)
if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
sizeof(struct ping_iter_state)))
return -ENOMEM;
+
+ net->ipv4.ping_port_rover = get_random_u16();
return 0;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 65b0d0ab0084..974afc4ecbe2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
- SNMP_MIB_SENTINEL
};
/* Following items are displayed in /proc/net/netstat */
@@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
- SNMP_MIB_SENTINEL
};
static const struct {
@@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_udp_list[] = {
@@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_net_list[] = {
@@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
- SNMP_MIB_SENTINEL
};
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
@@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq)
*/
static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
+ const int cnt = ARRAY_SIZE(snmp4_ipstats_list);
+ u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)];
struct net *net = seq->private;
- u64 buff64[IPSTATS_MIB_MAX];
int i;
- memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(buff64, 0, sizeof(buff64));
seq_puts(seq, "Ip: Forwarding DefaultTTL");
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
@@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
- net->mib.ip_statistics,
- offsetof(struct ipstats_mib, syncp));
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %llu", buff64[i]);
return 0;
@@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
{
+ const int udp_cnt = ARRAY_SIZE(snmp4_udp_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list);
unsigned long buff[TCPUDP_MIB_MAX];
struct net *net = seq->private;
int i;
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, tcp_cnt * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
- net->mib.tcp_statistics);
- for (i = 0; snmp4_tcp_list[i].name; i++) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list,
+ tcp_cnt,
+ net->mib.tcp_statistics);
+ for (i = 0; i < tcp_cnt; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld", buff[i]);
@@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
seq_printf(seq, " %lu", buff[i]);
}
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udp_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udplite_statistics);
- for (i = 0; snmp4_udp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udplite_statistics);
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
@@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
- const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1;
- const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1;
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list);
struct net *net = seq->private;
unsigned long *buff;
int i;
@@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
GFP_KERNEL);
if (buff) {
- snmp_get_cpu_field_batch(buff, snmp4_net_list,
- net->mib.net_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt,
+ net->mib.net_statistics);
for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
} else {
@@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
u64 *buff64 = (u64 *)buff;
memset(buff64, 0, ip_cnt * sizeof(u64));
- snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list,
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt,
net->mib.ip_statistics,
offsetof(struct ipstats_mib, syncp));
for (i = 0; i < ip_cnt; i++)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1d2c89d63cc7..d54ebb7df966 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb,
if (atomic_read(&sk->sk_rmem_alloc) >=
READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
continue;
}
@@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
@@ -1045,7 +1046,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
0, 0L, 0,
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0, sock_i_ino(sp),
- refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+ refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp));
}
static int raw_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index cc793bd8de25..943e5998e0ad 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
@@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
struct hlist_head *hlist;
struct sock *sk = NULL;
- struct nlattr *bc;
if (IS_ERR(hashinfo))
return;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
goto out_unlock;
next:
num++;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f639a2ae881a..6d27d3610c1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
@@ -413,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
const void *daddr)
{
const struct rtable *rt = container_of(dst, struct rtable, dst);
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev;
struct neighbour *n;
rcu_read_lock();
-
+ dev = dst_dev_rcu(dst);
if (likely(rt->rt_gw_family == AF_INET)) {
n = ip_neigh_gw4(dev, rt->rt_gw4);
} else if (rt->rt_gw_family == AF_INET6) {
@@ -1026,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1221,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
+ struct inet_skb_parm parm;
struct net_device *dev;
- struct ip_options opt;
int res;
/* Recompile ip options since IPCB may not be valid anymore.
@@ -1232,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
return;
- memset(&opt, 0, sizeof(opt));
+ memset(&parm, 0, sizeof(parm));
if (ip_hdr(skb)->ihl > 5) {
if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
return;
- opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
- res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
+ res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
}
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
}
static void ipv4_link_failure(struct sk_buff *skb)
@@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = rt->dst.dev->ifindex,
.flowi4_iif = skb->dev->ifindex,
.flowi4_mark = skb->mark,
@@ -1326,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
net->ipv4.ip_rt_min_advmss);
rcu_read_unlock();
@@ -2210,7 +2211,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_source;
}
- if (rt->rt_type != RTN_LOCAL)
+ if (!(rt->rt_flags & RTCF_LOCAL))
goto skip_validate_source;
reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
@@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_oif = 0;
fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
@@ -2575,12 +2576,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
!netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
- if (ipv4_is_lbcast(fl4->daddr))
+ if (ipv4_is_lbcast(fl4->daddr)) {
type = RTN_BROADCAST;
- else if (ipv4_is_multicast(fl4->daddr))
+
+ /* reset fi to prevent gateway resolution */
+ fi = NULL;
+ } else if (ipv4_is_multicast(fl4->daddr)) {
type = RTN_MULTICAST;
- else if (ipv4_is_zeronet(fl4->daddr))
+ } else if (ipv4_is_zeronet(fl4->daddr)) {
return ERR_PTR(-EINVAL);
+ }
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
@@ -2690,7 +2695,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos &= INET_DSCP_MASK;
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -3333,7 +3337,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eb0819463fae..569befcf021b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <net/secure_seq.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/route.h>
static siphash_aligned_key_t syncookie_secret[2];
@@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_request_sock *ireq;
struct net *net = sock_net(sk);
+ struct tcp_request_sock *treq;
struct request_sock *req;
struct sock *ret = sk;
struct flowi4 fl4;
@@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
@@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
if (!req->syncookie)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
+ treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a43010d726f..24dbc603cc44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
+static int tcp_ecn_mode_max = 2;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -728,9 +729,27 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_ecn_mode_max,
+ },
+ {
+ .procname = "tcp_ecn_option",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
.maxlen = sizeof(u8),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc55..7949d16506a4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,11 +270,14 @@
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/psp.h>
#include <net/sock.h>
#include <net/rstreason.h>
@@ -412,6 +415,22 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
return rate64;
}
+#ifdef CONFIG_TCP_MD5SIG
+void tcp_md5_destruct_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->md5sig_info) {
+
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+ }
+}
+EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
+#endif
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -687,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
@@ -2818,9 +2838,9 @@ found_ok_skb:
err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
used);
- if (err <= 0) {
+ if (err < 0) {
if (!copied)
- copied = -EFAULT;
+ copied = err;
break;
}
@@ -3099,8 +3119,8 @@ bool tcp_check_oom(const struct sock *sk, int shift)
void __tcp_close(struct sock *sk, long timeout)
{
+ bool data_was_unread = false;
struct sk_buff *skb;
- int data_was_unread = 0;
int state;
WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
@@ -3118,13 +3138,14 @@ void __tcp_close(struct sock *sk, long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- len--;
- data_was_unread += len;
- __kfree_skb(skb);
+ end_seq--;
+ if (after(end_seq, tcp_sk(sk)->copied_seq))
+ data_was_unread = true;
+ tcp_eat_recv_skb(sk, skb);
}
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
@@ -3195,7 +3216,7 @@ adjudge_to_death:
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);
- this_cpu_inc(tcp_orphan_count);
+ tcp_orphan_count_inc();
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -3327,6 +3348,7 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int old_state = sk->sk_state;
+ struct request_sock *req;
u32 seq;
if (old_state != TCP_CLOSE)
@@ -3376,7 +3398,7 @@ int tcp_disconnect(struct sock *sk, int flags)
WRITE_ONCE(tp->write_seq, seq);
icsk->icsk_backoff = 0;
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
@@ -3389,6 +3411,11 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
+ tp->accecn_fail_mode = 0;
+ tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -3442,6 +3469,10 @@ int tcp_disconnect(struct sock *sk, int flags)
/* Clean up fastopen related fields */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
tcp_free_fastopen_req(tp);
inet_clear_bit(DEFER_CONNECT, sk);
tp->fastopen_client_fail = 0;
@@ -3760,7 +3791,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val)
if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
return -EINVAL;
- tcp_sk(sk)->rx_opt.user_mss = val;
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
return 0;
}
@@ -3890,15 +3921,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
return 0;
}
+ case TCP_MAXSEG:
+ return tcp_sock_set_maxseg(sk, val);
}
sockopt_lock_sock(sk);
switch (optname) {
- case TCP_MAXSEG:
- err = tcp_sock_set_maxseg(sk, val);
- break;
-
case TCP_NODELAY:
__tcp_sock_set_nodelay(sk, val);
break;
@@ -4137,6 +4166,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
unsigned long rate;
u32 now;
u64 rate64;
@@ -4263,6 +4295,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
+ info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
+ info->tcpi_received_ce = tp->received_ce;
+ info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
+ info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+ info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
+ info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
+ info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
+ info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
+
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -4348,7 +4390,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
- nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
+ READ_ONCE(inet_csk(sk)->icsk_retransmits));
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
@@ -4383,6 +4426,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
+ int user_mss;
int val, len;
if (copy_from_sockptr(&len, optlen, sizeof(int)))
@@ -4396,9 +4440,10 @@ int do_tcp_getsockopt(struct sock *sk, int level,
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (tp->rx_opt.user_mss &&
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss &&
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- val = tp->rx_opt.user_mss;
+ val = user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
break;
@@ -5056,7 +5101,9 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
+#endif
/* TXRX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
@@ -5067,11 +5114,9 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32);
/* RX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
@@ -5082,12 +5127,6 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
-#if IS_ENABLED(CONFIG_TLS_DEVICE)
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77);
-#else
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
-#endif
/* TX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
@@ -5101,11 +5140,11 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
@@ -5120,15 +5159,13 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
- /* 32bit arches with 8byte alignment on u64 fields might need padding
- * before tcp_clock_cache.
- */
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4);
-
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
@@ -5139,12 +5176,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
}
void __init tcp_init(void)
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index bbb8d5f0eae7..34b8450829d0 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head)
kfree_sensitive(key);
}
-static void tcp_ao_info_free_rcu(struct rcu_head *head)
+static void tcp_ao_info_free(struct tcp_ao_info *ao)
{
- struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu);
struct tcp_ao_key *key;
struct hlist_node *n;
@@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
if (!twsk)
tcp_ao_sk_omem_free(sk, ao);
- call_rcu(&ao->rcu, tcp_ao_info_free_rcu);
+ tcp_ao_info_free(ao);
}
void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
@@ -1178,7 +1177,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
if (!ao)
return;
- WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
ao->rcv_sne = 0;
hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ba581785adb4..a268e1595b22 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -408,8 +408,11 @@ more_data:
if (!psock->cork) {
psock->cork = kzalloc(sizeof(*psock->cork),
GFP_ATOMIC | __GFP_NOWARN);
- if (!psock->cork)
+ if (!psock->cork) {
+ sk_msg_free(sk, msg);
+ *copied = 0;
return -ENOMEM;
+ }
}
memcpy(psock->cork, msg, sizeof(*msg));
return 0;
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ba4d98e510e0..fbad6c35dee9 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk)
/* We silently fall back to window = 1 if allocation fails. */
if (window > 1)
ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
ca->rtt_seq = tp->snd_nxt;
ca->shadow_wnd = tcp_snd_cwnd(tp);
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 45e174b8cd22..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -12,6 +12,9 @@
#include <linux/tcp.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_timewait_sock.h>
#include <net/netlink.h>
#include <net/tcp.h>
@@ -174,27 +177,465 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
size += ulp_ops->get_info_size(sk, net_admin);
}
}
- return size;
+
+ return size
+ + nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+static int tcp_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT);
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = READ_ONCE(tw->tw_substate);
+ r->idiag_timer = 3;
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ tw->tw_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct request_sock *reqsk = inet_reqsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ u16 nlmsg_flags, bool net_admin)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct inet_hashinfo *hinfo;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct net *net = sock_net(skb->sk);
+ u32 idiag_states = r->idiag_states;
+ struct inet_hashinfo *hashinfo;
+ int i, num, s_i, s_num;
+ struct sock *sk;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+ goto skip_listen_ht;
+
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+ ilb = &hashinfo->lhash2[i];
+
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
+ spin_unlock(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+
+ s_num = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
- inet_diag_dump_icsk(hinfo, skb, cb, r);
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+next_chunk:
+ num = 0;
+ accum = 0;
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
+ if (!(idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_normal;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_gen_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ break;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+
+static struct sock *tcp_diag_find_one_icsk(struct net *net,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sock *sk;
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ } else {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ rcu_read_unlock();
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
}
static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- struct inet_hashinfo *hinfo;
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ bool net_admin;
+ int err;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ net = sock_net(in_skb->sk);
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
- return inet_diag_dump_one_icsk(hinfo, cb, req);
+ net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+ rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+ return err;
}
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -202,13 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
struct net *net = sock_net(in_skb->sk);
- struct inet_hashinfo *hinfo;
struct sock *sk;
int err;
- hinfo = net->ipv4.tcp_death_row.hashinfo;
- sk = inet_diag_find_one_icsk(net, hinfo, req);
-
+ sk = tcp_diag_find_one_icsk(net, req);
if (IS_ERR(sk))
return PTR_ERR(sk);
@@ -226,7 +664,6 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_get_aux = tcp_diag_get_aux,
- .idiag_get_aux_size = tcp_diag_get_aux_size,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
#ifdef CONFIG_INET_DIAG_DESTROY
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f1884f0c9e52..7d945a527daf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
}
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
- dst = sk_dst_get(sk);
- dev = dst ? dst_dev(dst) : NULL;
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
- dst_release(dst);
+ rcu_read_unlock();
}
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71b76e98371a..b44fdc309633 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,8 +70,10 @@
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
+#include <linux/bitops.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
@@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk)
(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
-static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
-{
- if (tcp_ecn_mode_rfc3168(tp))
- tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
-}
-
-static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
-{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-
- /* If the sender is telling us it has entered CWR, then its
- * cwnd may be very low (even just 1 packet), so we should ACK
- * immediately.
- */
- if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-}
-
-static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
-{
- tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-}
-
static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
+ /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_data_ecn_check() when the ECN codepoint of
+ * received TCP data contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ u32 delivered_bytes, int flag)
{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ u8 estimate_ecnfield = tp->est_ecnfield;
+ bool ambiguous_ecn_bytes_incr = false;
+ bool first_changed = false;
+ unsigned int optlen;
+ bool order1, res;
+ unsigned int i;
+ u8 *ptr;
-static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ if (tcp_accecn_opt_fail_recv(tp))
+ return false;
-static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
- return true;
- return false;
+ if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1) {
+ u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ }
+ return false;
+ }
+
+ if (estimate_ecnfield) {
+ u8 ecnfield = estimate_ecnfield - 1;
+
+ tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
+ return true;
+ }
+ return false;
+ }
+
+ ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
+ optlen = ptr[1] - 2;
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return false;
+ order1 = (ptr[0] == TCPOPT_ACCECN1);
+ ptr += 2;
+
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+ }
+
+ res = !!estimate_ecnfield;
+ for (i = 0; i < 3; i++) {
+ u32 init_offset;
+ u8 ecnfield;
+ s32 delta;
+ u32 *cnt;
+
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ break;
+
+ ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
+ init_offset = tcp_accecn_field_init_offset(ecnfield);
+ cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
+ delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
+ if (delta && delta < 0) {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ if (delta && ecnfield != estimate_ecnfield) {
+ if (!first_changed) {
+ tp->est_ecnfield = ecnfield;
+ first_changed = true;
+ } else {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ }
+
+ optlen -= TCPOLEN_ACCECN_PERFIELD;
+ ptr += TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (ambiguous_ecn_bytes_incr)
+ tp->est_ecnfield = 0;
+
+ return res;
}
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
@@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
- if (ece_ack)
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}
+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int flag)
+{
+ u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta, d_ceb;
+ bool opt_deltas_valid;
+ u32 corrected_ace;
+
+ /* Reordered ACK or uncertain due to lack of data to send and ts */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ opt_deltas_valid = tcp_accecn_process_option(tp, skb,
+ delivered_bytes, flag);
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ if (opt_deltas_valid) {
+ d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
+ if (!d_ceb)
+ return delta;
+
+ if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
+ (tcp_is_sack(tp) ||
+ ((1 << inet_csk(sk)->icsk_ca_state) &
+ (TCPF_CA_Open | TCPF_CA_CWR)))) {
+ u32 est_d_cep;
+
+ if (delivered_bytes <= d_ceb)
+ return safe_delta;
+
+ est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
+ delivered_pkts,
+ delivered_bytes);
+ return min(safe_delta,
+ delta +
+ (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
+ }
+
+ if (d_ceb > delta * tp->mss_cache)
+ return safe_delta;
+ if (d_ceb <
+ safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
+ return delta;
+ }
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts,
+ delivered_bytes, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -744,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
}
}
-static void tcp_rcvbuf_grow(struct sock *sk)
+void tcp_rcvbuf_grow(struct sock *sk)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -1030,6 +1177,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
u32 reord;
u32 sack_delivered;
+ u32 delivered_bytes;
int flag;
unsigned int mss_now;
struct rate_sample *rate;
@@ -1391,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount,
+ int dup_sack, int pcount, u32 plen,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1451,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
+ state->delivered_bytes += plen;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1487,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
- start_seq, end_seq, dup_sack, pcount,
+ start_seq, end_seq, dup_sack, pcount, skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
@@ -1772,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
+ skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
@@ -2569,7 +2719,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
if (tcp_is_non_sack_preventing_reopen(sk))
return true;
if (frto_undo || tcp_is_sack(tp)) {
@@ -3280,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
+ /* snd_una delta covers these skbs */
+ sack->delivered_bytes -= skb->len;
} else if (tcp_is_sack(tp)) {
tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
@@ -3376,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
}
+
+ sack->delivered_bytes = (skb ?
+ TCP_SKB_CB(skb)->seq : tp->snd_una) -
+ prior_snd_una;
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
@@ -3645,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
+static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 flags = 0;
+
+ if (accecn_reflector)
+ flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ __tcp_send_ack(sk, tp->rcv_nxt, flags);
+}
+
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
{
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -3676,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk)
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, accecn_reflector);
}
}
@@ -3787,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3795,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE)
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+
+ if (flag & FLAG_ECE) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
+ }
return delivered;
}
@@ -3817,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
sack_state.sack_delivered = 0;
+ sack_state.delivered_bytes = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3837,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
@@ -3851,7 +4024,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
- icsk->icsk_retransmits = 0;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
@@ -3912,8 +4085,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
- WRITE_ONCE(sk->sk_err_soft, 0);
- icsk->icsk_probes_out = 0;
+ if (READ_ONCE(sk->sk_err_soft))
+ WRITE_ONCE(sk->sk_err_soft, 0);
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
@@ -3924,6 +4098,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -3948,7 +4128,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3957,12 +4138,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3983,7 +4169,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4083,6 +4269,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->accecn = 0;
opt_rx->saw_unknown = 0;
while (length > 0) {
@@ -4174,6 +4361,12 @@ void tcp_parse_options(const struct net *net,
ptr, th->syn, foc, false);
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ /* Save offset of AccECN option in TCP header */
+ opt_rx->accecn = (ptr - 2) - (__u8 *)th;
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
@@ -4234,11 +4427,14 @@ static bool tcp_fast_parse_options(const struct net *net,
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
- if (tcp_parse_aligned_timestamp(tp, th))
+ if (tcp_parse_aligned_timestamp(tp, th)) {
+ tp->rx_opt.accecn = 0;
return true;
+ }
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
@@ -4830,7 +5026,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
noinline_for_tracing static void
tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
sk_skb_reason_drop(sk, skb, reason);
}
@@ -4890,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
/* Check if this incoming skb can be added to socket receive queues
* while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NIC use 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
*/
static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
{
- unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+ unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
- return new_mem <= sk->sk_rcvbuf;
+ return rmem + skb->len <= sk->sk_rcvbuf;
}
static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
@@ -5871,6 +6078,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool accecn_reflector = false;
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
@@ -5968,7 +6176,7 @@ step1:
if (tp->syn_fastopen && !tp->data_segs_in &&
sk->sk_state == TCP_ESTABLISHED)
tcp_fastopen_active_disable(sk);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5979,6 +6187,16 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (tcp_ecn_mode_accecn(tp)) {
+ accecn_reflector = true;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -5988,7 +6206,7 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, accecn_reflector);
SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
@@ -6060,6 +6278,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -6114,6 +6333,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
flag |= __tcp_replace_ts_recent(tp,
delta);
+ tcp_ecn_received_counters(sk, skb, 0);
+
/* We know that such packets are checksummed
* on entry.
*/
@@ -6162,6 +6383,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb,
+ len - tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -6202,6 +6425,8 @@ validate:
return;
step5:
+ tcp_ecn_received_counters_payload(sk, skb);
+
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
if ((int)reason < 0) {
reason = -reason;
@@ -6297,7 +6522,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
- if (mss == tp->rx_opt.user_mss) {
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
@@ -6452,7 +6677,9 @@ consume:
* state to ESTABLISHED..."
*/
- tcp_ecn_rcv_synack(tp, th);
+ if (tcp_ecn_mode_any(tp))
+ tcp_ecn_rcv_synack(sk, skb, th,
+ TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
@@ -6524,7 +6751,7 @@ consume:
TCP_DELACK_MAX, false);
goto consume;
}
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -6583,7 +6810,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -6636,7 +6863,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
tcp_try_undo_recovery(sk);
tcp_update_rto_time(tp);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
/* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
* retrans_stamp but don't enter CA_Loss, so in case that happened we
* need to zero retrans_stamp here to prevent spurious
@@ -6765,7 +6992,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
/* accept old ack during closing */
if ((int)reason < 0) {
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
reason = -reason;
goto discard;
}
@@ -6812,9 +7039,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
+ if (tcp_ecn_mode_accecn(tp))
+ tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
tcp_fast_path_on(tp);
if (sk->sk_shutdown & SEND_SHUTDOWN)
tcp_shutdown(sk, SEND_SHUTDOWN);
+
break;
case TCP_FIN_WAIT1: {
@@ -6984,6 +7214,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
bool ect, ecn_ok;
u32 ecn_ok_dst;
+ if (tcp_accecn_syn_requested(th) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ inet_rsk(req)->ecn_ok = 1;
+ tcp_rsk(req)->accecn_ok = 1;
+ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ return;
+ }
+
if (!th_ecn)
return;
@@ -6991,7 +7230,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
- if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ if (((!ect || th->res1 || th->ae) && ecn_ok) ||
+ tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
@@ -7009,6 +7249,11 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
+ tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_rsk(req)->accecn_fail_mode = 0;
+ tcp_rsk(req)->syn_ect_rcv = 0;
+ tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -7048,8 +7293,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
- xchg(&queue->synflood_warned, 1) == 0) {
+ if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) {
+ WRITE_ONCE(queue->synflood_warned, 1);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
proto, inet6_rcv_saddr(sk),
@@ -7117,7 +7362,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
return 0;
}
- mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
if (!mss)
mss = af_ops->mss_clamp;
@@ -7131,7 +7376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
{
struct tcp_fastopen_cookie foc = { .len = -1 };
struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
@@ -7182,7 +7427,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84d3d556ed80..b1fcf3e4e1ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -65,6 +65,7 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
@@ -74,6 +75,7 @@
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -292,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
- inet_csk(sk)->icsk_ext_hdr_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (inet_opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
@@ -506,8 +508,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
struct sock *sk;
int err;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->daddr, th->dest, iph->saddr,
+ sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
@@ -823,8 +824,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
@@ -1191,7 +1191,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
- const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
@@ -1204,6 +1204,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
tos = READ_ONCE(inet_sk(sk)->tos);
@@ -1505,9 +1506,9 @@ void tcp_clear_md5_list(struct sock *sk)
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
- hlist_del_rcu(&key->node);
+ hlist_del(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
- kfree_rcu(key, rcu);
+ kfree(key);
}
}
@@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
enum skb_drop_reason reason;
struct sock *rsk;
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst;
@@ -1968,6 +1973,7 @@ csum_err:
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
@@ -1992,8 +1998,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
if (sk) {
@@ -2070,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
(TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
!tcp_skb_can_collapse_rx(tail, skb) ||
thtail->doff != th->doff ||
- memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
+ /* prior to PSP Rx policy check, retain exact PSP metadata */
+ psp_skb_coalesce_diff(tail, skb))
goto no_coalesce;
__skb_pull(skb, hdrlen);
@@ -2236,8 +2243,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
- sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th), th->source,
+ sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -2258,7 +2264,7 @@ lookup:
&iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (unlikely(drop_reason)) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -2403,7 +2409,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -2426,9 +2432,7 @@ do_time_wait:
&drop_reason);
switch (tw_status) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(net,
- net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
@@ -2441,6 +2445,10 @@ do_time_wait:
__this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
fallthrough;
@@ -2459,7 +2467,6 @@ do_time_wait:
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
@@ -2501,6 +2508,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
.ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
+
+static void tcp4_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -2516,23 +2530,12 @@ static int tcp_v4_init_sock(struct sock *sk)
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+ sk->sk_destruct = tcp4_destruct_sock;
#endif
return 0;
}
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_info *md5sig;
-
- md5sig = container_of(head, struct tcp_md5sig_info, rcu);
- kfree(md5sig);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
@@ -2569,19 +2572,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list, if any */
- if (tp->md5sig_info) {
- struct tcp_md5sig_info *md5sig;
-
- md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
- tcp_clear_md5_list(sk);
- call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
- rcu_assign_pointer(tp->md5sig_info, NULL);
- }
-#endif
- tcp_ao_destroy_sock(sk, false);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
@@ -2958,9 +2948,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -3524,7 +3514,6 @@ struct proto tcp_prot = {
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
.memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
@@ -3583,7 +3572,9 @@ fallback:
static int __net_init tcp_sk_init(struct net *net)
{
- net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
+ net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03c068ea27b6..45b6ecd16412 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
struct net *net;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net)
spin_lock_bh(&tcp_metrics_lock);
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
- !refcount_read(&tm_net(tm)->ns.count);
+ !check_net(tm_net(tm));
if (match) {
rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2994c9222c9c..2ec8c6f1cdcc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,9 +20,11 @@
*/
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -103,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
struct tcp_options_received tmp_opt;
+ enum skb_drop_reason psp_drop;
bool paws_reject = false;
int ts_recent_stamp;
+ /* Instead of dropping immediately, wait to see what value is
+ * returned. We will accept a non psp-encapsulated syn in the
+ * case where TCP_TW_SYN is returned.
+ */
+ psp_drop = psp_twsk_rx_policy_check(tw, skb);
+
tmp_opt.saw_tstamp = 0;
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
@@ -123,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
+ if (psp_drop)
+ goto out_put;
+
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
@@ -193,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
+ if (psp_drop)
+ goto out_put;
+
if (th->rst) {
/* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
@@ -246,6 +261,9 @@ kill:
return TCP_TW_SYN;
}
+ if (psp_drop)
+ goto out_put;
+
if (paws_reject) {
*drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
@@ -264,6 +282,8 @@ kill:
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
+
+out_put:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -377,31 +397,22 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
EXPORT_SYMBOL(tcp_time_wait);
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_key *key;
-
- key = container_of(head, struct tcp_md5sig_key, rcu);
- kfree(key);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
if (static_branch_unlikely(&tcp_md5_needed.key)) {
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key)
- call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
+ if (twsk->tw_md5_key) {
+ kfree(twsk->tw_md5_key);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+ }
}
#endif
tcp_ao_destroy_sock(sk, true);
+ psp_twsk_assoc_free(inet_twsk(sk));
}
-EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list)
{
@@ -461,12 +472,26 @@ void tcp_openreq_init_rwin(struct request_sock *req,
ireq->rcv_wscale = rcv_wscale;
}
-static void tcp_ecn_openreq_child(struct tcp_sock *tp,
- const struct request_sock *req)
+static void tcp_ecn_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
{
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ const struct tcp_request_sock *treq = tcp_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (treq->accecn_ok) {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_snt = treq->syn_ect_snt;
+ tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
+ tcp_ecn_received_counters_payload(sk, skb);
+ } else {
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
+ }
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -631,7 +656,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
+ tcp_ecn_openreq_child(newsk, req, skb);
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
@@ -674,6 +699,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool own_req;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
@@ -851,6 +877,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
+ tcp_rsk(req)->saw_accecn_opt = saw_opt;
+ if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+ u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+ tcp_rsk(req)->accecn_fail_mode |= fail_mode;
+ }
+ }
+
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index be5c2294610e..2cb93da93abc 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet_get_iif_sdif(skb, &iif, &sdif);
iph = skb_gro_network_header(skb);
net = dev_net_rcu(skb->dev);
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
@@ -485,6 +484,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
+ BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index caf11920a878..bb3576ac0ad7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -38,8 +38,10 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
+#include <net/psp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -319,60 +321,6 @@ static u16 tcp_select_window(struct sock *sk)
return new_win;
}
-/* Packet ECN state for a SYN-ACK */
-static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
- if (tcp_ecn_disabled(tp))
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
- else if (tcp_ca_needs_ecn(sk) ||
- tcp_bpf_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
-}
-
-/* Packet ECN state for a SYN. */
-static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
- tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
-
- if (!use_ecn) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
- use_ecn = true;
- }
-
- tp->ecn_flags = 0;
-
- if (use_ecn) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
- tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
- if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
- INET_ECN_xmit(sk);
- }
-}
-
-static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
-{
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
- /* tp->ecn_flags are cleared at a later point in time when
- * SYN ACK is ultimatively being received.
- */
- TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
-}
-
-static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
-{
- if (inet_rsk(req)->ecn_ok)
- th->ece = 1;
-}
-
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
@@ -381,7 +329,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_ecn_mode_rfc3168(tp)) {
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (!tcp_accecn_ace_fail_recv(tp))
+ INET_ECN_xmit(sk);
+ tcp_accecn_set_ace(tp, skb, th);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
@@ -403,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags)
+static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk,
+ u32 seq, u16 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
TCP_SKB_CB(skb)->tcp_flags = flags;
tcp_skb_pcount_set(skb, 1);
+ psp_enqueue_set_decrypted(sk, skb);
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -430,6 +388,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_SMC BIT(9)
#define OPTION_MPTCP BIT(10)
#define OPTION_AO BIT(11)
+#define OPTION_ACCECN BIT(12)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -451,6 +410,8 @@ struct tcp_out_options {
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 num_accecn_fields:7, /* number of AccECN fields needed */
+ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
@@ -648,6 +609,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
return ptr;
}
+/* Initial values for AccECN option, ordered is based on ECN field bits
+ * similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
+ */
+static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -666,6 +632,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
struct tcp_out_options *opts,
struct tcp_key *key)
{
+ u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
+ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
__be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
@@ -701,15 +669,75 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
+ if (OPTION_ACCECN & options) {
+ const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes :
+ tp->received_ecn_bytes;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ u32 e0b;
+ u32 e1b;
+ u32 ceb;
+ u8 len;
+
+ e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
+ e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
+ ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
+ len = TCPOLEN_ACCECN_BASE +
+ opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
+
+ if (opts->num_accecn_fields == 2) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ } else if (opts->num_accecn_fields == 1) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ leftover_highbyte = e1b & 0xff;
+ leftover_lowbyte = TCPOPT_NOP;
+ } else if (opts->num_accecn_fields == 0) {
+ leftover_highbyte = TCPOPT_ACCECN1;
+ leftover_lowbyte = len;
+ } else if (opts->num_accecn_fields == 3) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ *ptr++ = htonl(((e0b & 0xffffff) << 8) |
+ TCPOPT_NOP);
+ }
+ if (tp) {
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
+ }
+
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_WSCALE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
+ u8 highbyte = TCPOPT_NOP;
+
+ /* Do not split the leftover 2-byte to fit into a single
+ * NOP, i.e., replace this NOP only when 1 byte is leftover
+ * within leftover_highbyte.
+ */
+ if (unlikely(leftover_highbyte != TCPOPT_NOP &&
+ leftover_lowbyte == TCPOPT_NOP)) {
+ highbyte = leftover_highbyte;
+ leftover_highbyte = TCPOPT_NOP;
+ }
+ *ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
@@ -720,11 +748,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
@@ -733,6 +763,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
}
tp->rx_opt.dsack = 0;
+ } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
+ leftover_lowbyte != TCPOPT_NOP)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_NOP << 8) |
+ TCPOPT_NOP);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
@@ -813,6 +851,80 @@ static void mptcp_set_option_cond(const struct request_sock *req,
}
}
+static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
+{
+ /* How much there's room for combining with the alignment padding? */
+ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
+ OPTION_SACK_ADVERTISE)
+ return 2;
+ else if (opts->options & OPTION_WSCALE)
+ return 1;
+ return 0;
+}
+
+/* Calculates how long AccECN option will fit to @remaining option space.
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+ int remaining)
+{
+ int size = TCP_ACCECN_MAXSIZE;
+ int sack_blocks_reduce = 0;
+ int max_combine_saving;
+ int rem = remaining;
+ int align_size;
+
+ if (opts->use_synack_ecn_bytes)
+ max_combine_saving = tcp_synack_options_combine_saving(opts);
+ else
+ max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ while (opts->num_accecn_fields >= required) {
+ /* Pad to dword if cannot combine */
+ if ((size & 0x3) > max_combine_saving)
+ align_size = ALIGN(size, 4);
+ else
+ align_size = ALIGN_DOWN(size, 4);
+
+ if (rem >= align_size) {
+ size = align_size;
+ break;
+ } else if (opts->num_accecn_fields == required &&
+ opts->num_sack_blocks > 2 &&
+ required > 0) {
+ /* Try to fit the option by removing one SACK block */
+ opts->num_sack_blocks--;
+ sack_blocks_reduce++;
+ rem = rem + TCPOLEN_SACK_PERBLOCK;
+
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ size = TCP_ACCECN_MAXSIZE;
+ continue;
+ }
+
+ opts->num_accecn_fields--;
+ size -= TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (sack_blocks_reduce > 0) {
+ if (opts->num_accecn_fields >= required)
+ size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK;
+ else
+ opts->num_sack_blocks += sack_blocks_reduce;
+ }
+ if (opts->num_accecn_fields < required)
+ return 0;
+
+ opts->options |= OPTION_ACCECN;
+ return size;
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -895,6 +1007,20 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ /* Simultaneous open SYN/ACK needs AccECN option but not SYN.
+ * It is attempted to negotiate the use of AccECN also on the first
+ * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs"
+ * of AccECN draft.
+ */
+ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+ tcp_ecn_mode_accecn(tp) &&
+ inet_csk(sk)->icsk_retransmits < 2 &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE)) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -912,6 +1038,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_request_sock *treq = tcp_rsk(req);
if (tcp_key_is_md5(key)) {
opts->options |= OPTION_MD5;
@@ -974,6 +1101,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ if (treq->accecn_ok &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
synack_type, opts, &remaining);
@@ -1030,17 +1164,32 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
- if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
- TCPOLEN_SACK_PERBLOCK))
- return size;
+ if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK)) {
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+ } else {
+ opts->num_sack_blocks = 0;
+ }
- opts->num_sack_blocks =
- min_t(unsigned int, eff_sacks,
- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
- TCPOLEN_SACK_PERBLOCK);
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
- size += TCPOLEN_SACK_BASE_ALIGNED +
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -1437,7 +1586,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
opts.hash_location);
if (err) {
- kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED);
return -ENOMEM;
}
}
@@ -1510,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
/* Advance write_seq and place onto the write_queue. */
WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
@@ -2750,6 +2900,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* AccECN option beacon depends on mstamp, it may change mss */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
@@ -3402,7 +3557,10 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback */
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check covers both
+ * cases.
+ */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
@@ -3578,9 +3736,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
sk_memory_allocated_add(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_charge_skmem(sk->sk_memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3625,7 +3782,7 @@ void tcp_send_fin(struct sock *sk)
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
- tcp_init_nondata_skb(skb, tp->write_seq,
+ tcp_init_nondata_skb(skb, sk, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
@@ -3653,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
@@ -3891,6 +4048,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u16 user_mss;
u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
@@ -3903,8 +4061,9 @@ static void tcp_connect_init(struct sock *sk)
tcp_ao_connect_init(sk);
/* If user gave his TCP_MAXSEG, record it to clamp */
- if (tp->rx_opt.user_mss)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss)
+ tp->rx_opt.mss_clamp = user_mss;
tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
@@ -3955,7 +4114,7 @@ static void tcp_connect_init(struct sock *sk)
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
tcp_clear_retrans(tp);
}
@@ -4148,7 +4307,7 @@ int tcp_connect(struct sock *sk)
/* SYN eats a sequence byte, write_seq updated by
* tcp_connect_queue_skb().
*/
- tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN);
+ tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
tp->retrans_stamp = tcp_time_stamp_ts(tp);
tcp_connect_queue_skb(sk, buff);
@@ -4273,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
+ tcp_init_nondata_skb(buff, sk,
+ tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
@@ -4319,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
@@ -4393,13 +4553,13 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
return;
}
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
if (err <= 0) {
if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
@@ -4437,7 +4597,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
tcp_sk_rw(sk)->total_retrans++;
}
trace_tcp_retransmit_synack(sk, req);
- req->num_retrans++;
+ WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
}
return res;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a207877270fb..2dd73a4e8e51 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk)
int max_probes;
if (tp->packets_out || !skb) {
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
return;
}
@@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk)
tp->total_rto_recoveries++;
tp->rto_stamp = tcp_time_stamp_ms(tp);
}
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
tp->total_rto++;
}
@@ -839,7 +839,7 @@ static void tcp_keepalive_timer(struct timer_list *t)
goto out;
}
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cc3ce0f762ec..95241093b7f0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -68,7 +68,7 @@
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
- * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
* James Chapman : Add L2TP encapsulation type.
*/
@@ -509,7 +509,7 @@ rescore:
/* compute_score is too long of a function to be
* inlined, and calling it again here yields
- * measureable overhead for some
+ * measurable overhead for some
* workloads. Work around it by jumping
* backwards to rescore 'result'.
*/
@@ -1685,31 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}
-/* Idea of busylocks is to let producers grab an extra spinlock
- * to relieve pressure on the receive_queue spinlock shared by consumer.
- * Under flood, this means that only one producer can be in line
- * trying to acquire the receive_queue spinlock.
- * These busylock can be allocated on a per cpu manner, instead of a
- * per socket one (that would consume a cache line per socket)
- */
-static int udp_busylocks_log __read_mostly;
-static spinlock_t *udp_busylocks __read_mostly;
-
-static spinlock_t *busylock_acquire(void *ptr)
-{
- spinlock_t *busy;
-
- busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
- spin_lock(busy);
- return busy;
-}
-
-static void busylock_release(spinlock_t *busy)
-{
- if (busy)
- spin_unlock(busy);
-}
-
static int udp_rmem_schedule(struct sock *sk, int size)
{
int delta;
@@ -1724,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size)
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
+ struct udp_prod_queue *udp_prod_queue;
+ struct sk_buff *next, *to_drop = NULL;
+ struct llist_node *ll_list;
unsigned int rmem, rcvbuf;
- spinlock_t *busy = NULL;
int size, err = -ENOMEM;
+ int total_size = 0;
+ int q_size = 0;
+ int dropcount;
+ int nb = 0;
rmem = atomic_read(&sk->sk_rmem_alloc);
rcvbuf = READ_ONCE(sk->sk_rcvbuf);
size = skb->truesize;
+ udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
+
+ rmem += atomic_read(&udp_prod_queue->rmem_alloc);
+
/* Immediately drop when the receive queue is full.
* Cast to unsigned int performs the boundary check for INT_MAX.
*/
@@ -1739,8 +1724,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (rcvbuf > INT_MAX >> 1)
goto drop;
- /* Always allow at least one packet for small buffer. */
- if (rmem > rcvbuf)
+ /* Accept the packet if queue is empty. */
+ if (rmem)
goto drop;
}
@@ -1753,42 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
size = skb->truesize;
- busy = busylock_acquire(sk);
}
udp_set_dev_scratch(skb);
- atomic_add(size, &sk->sk_rmem_alloc);
+ atomic_add(size, &udp_prod_queue->rmem_alloc);
+
+ if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
+ return 0;
+
+ dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0;
spin_lock(&list->lock);
- err = udp_rmem_schedule(sk, size);
- if (err) {
- spin_unlock(&list->lock);
- goto uncharge_drop;
- }
- sk_forward_alloc_add(sk, -size);
+ ll_list = llist_del_all(&udp_prod_queue->ll_root);
- /* no need to setup a destructor, we will explicitly release the
- * forward allocated memory on dequeue
- */
- sock_skb_set_dropcount(sk, skb);
+ ll_list = llist_reverse_order(ll_list);
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ size = udp_skb_truesize(skb);
+ total_size += size;
+ err = udp_rmem_schedule(sk, size);
+ if (unlikely(err)) {
+ /* Free the skbs outside of locked section. */
+ skb->next = to_drop;
+ to_drop = skb;
+ continue;
+ }
+
+ q_size += size;
+ sk_forward_alloc_add(sk, -size);
+
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
+ */
+ SOCK_SKB_CB(skb)->dropcount = dropcount;
+ nb++;
+ __skb_queue_tail(list, skb);
+ }
+
+ atomic_add(q_size, &sk->sk_rmem_alloc);
- __skb_queue_tail(list, skb);
spin_unlock(&list->lock);
- if (!sock_flag(sk, SOCK_DEAD))
- INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Multiple threads might be blocked in recvmsg(),
+ * using prepare_to_wait_exclusive().
+ */
+ while (nb) {
+ INDIRECT_CALL_1(sk->sk_data_ready,
+ sock_def_readable, sk);
+ nb--;
+ }
+ }
- busylock_release(busy);
- return 0;
+ if (unlikely(to_drop)) {
+ for (nb = 0; to_drop != NULL; nb++) {
+ skb = to_drop;
+ to_drop = skb->next;
+ skb_mark_not_on_list(skb);
+ /* TODO: update SNMP values. */
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+ }
+ numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ }
-uncharge_drop:
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
+
+ return 0;
drop:
- atomic_inc(&sk->sk_drops);
- busylock_release(busy);
+ udp_drops_inc(sk);
return err;
}
EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
@@ -1806,6 +1826,7 @@ void udp_destruct_common(struct sock *sk)
kfree_skb(skb);
}
udp_rmem_release(sk, total, 0, true);
+ kfree(up->udp_prod_queue);
}
EXPORT_IPV6_MOD_GPL(udp_destruct_common);
@@ -1817,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk)
int udp_init_sock(struct sock *sk)
{
- udp_lib_init_sock(sk);
+ int res = udp_lib_init_sock(sk);
+
sk->sk_destruct = udp_destruct_sock;
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
- return 0;
+ return res;
}
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
@@ -1828,6 +1850,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
sk_peek_offset_bwd(sk, len);
+ if (!skb_shared(skb)) {
+ if (unlikely(udp_skb_has_head_state(skb)))
+ skb_release_head_state(skb);
+ skb_attempt_defer_free(skb);
+ return;
+ }
+
if (!skb_unref(skb))
return;
@@ -1852,7 +1881,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
IS_UDPLITE(sk));
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__skb_unlink(skb, rcvq);
*total += skb->truesize;
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
@@ -2008,7 +2037,7 @@ try_again:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
__UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
goto try_again;
}
@@ -2078,7 +2107,7 @@ try_again:
if (unlikely(err)) {
if (!peeking) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
@@ -2449,7 +2478,7 @@ csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -2534,7 +2563,7 @@ start_lookup:
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
@@ -2609,7 +2638,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return 0;
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
@@ -2807,7 +2836,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
-int udp_v4_early_demux(struct sk_buff *skb)
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct in_device *in_dev = NULL;
@@ -2821,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return 0;
+ return SKB_NOT_DROPPED_YET;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
@@ -2830,12 +2859,12 @@ int udp_v4_early_demux(struct sk_buff *skb)
in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
- return 0;
+ return SKB_NOT_DROPPED_YET;
ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
iph->protocol);
if (!ours)
- return 0;
+ return SKB_NOT_DROPPED_YET;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr,
@@ -2846,7 +2875,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
}
if (!sk)
- return 0;
+ return SKB_NOT_DROPPED_YET;
skb->sk = sk;
DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
@@ -2873,7 +2902,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
ip4h_dscp(iph),
skb->dev, in_dev, &itag);
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
}
int udp_rcv(struct sk_buff *skb)
@@ -3386,7 +3415,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
int udp4_seq_show(struct seq_file *seq, void *v)
@@ -3994,7 +4023,6 @@ static void __init bpf_iter_register(void)
void __init udp_init(void)
{
unsigned long limit;
- unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -4003,15 +4031,6 @@ void __init udp_init(void)
sysctl_udp_mem[1] = limit;
sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
- /* 16 spinlocks per cpu */
- udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
- udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
- GFP_KERNEL);
- if (!udp_busylocks)
- panic("UDP: failed to alloc udp_busylocks\n");
- for (i = 0; i < (1U << udp_busylocks_log); i++)
- spin_lock_init(udp_busylocks + i);
-
if (register_pernet_subsys(&udp_sysctl_ops))
panic("UDP: failed to init sysctl parameters.\n");
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 38cb3a28e4ed..6e491c720c90 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -16,9 +16,9 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
@@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
- struct nlattr *bc;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 5128e2a5b00a..19d0b5b09ffa 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -217,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
!need_ipsec &&
@@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
inet_gro_compute_pseudo);
skip:
- NAPI_GRO_CB(skb)->is_ipv6 = 0;
-
if (static_branch_unlikely(&udp_encap_needed_key))
sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index fce945f23069..54386e06a813 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -4,6 +4,7 @@
#include <linux/socket.h>
#include <linux/kernel.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
#include <net/inet_dscp.h>
@@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
fl4.saddr = key->u.ipv4.src;
fl4.fl4_dport = dport;
fl4.fl4_sport = sport;
- fl4.flowi4_tos = tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
fl4.flowi4_flags = key->flow_flags;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index ff66db48453c..944b3cf25468 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -930,7 +930,7 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
err = udp_tunnel_nic_register(dev);
if (err)
- netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err);
+ netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err);
return notifier_from_errno(err);
}
/* All other events will need the udp_tunnel_nic state */
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 7fb6205619e7..58faf1ddd2b1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -14,7 +14,7 @@
#include <linux/inetdevice.h>
#include <net/dst.h>
#include <net/xfrm.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/l3mdev.h>
@@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = params->daddr->a4;
- fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp);
+ fl4->flowi4_dscp = params->dscp;
fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net,
params->oif);
fl4->flowi4_mark = params->mark;