Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                    4
-rw-r--r--  net/ipv4/af_inet.c                 22
-rw-r--r--  net/ipv4/arp.c                      6
-rw-r--r--  net/ipv4/datagram.c                 4
-rw-r--r--  net/ipv4/fou_nl.c                   1
-rw-r--r--  net/ipv4/fou_nl.h                   1
-rw-r--r--  net/ipv4/icmp.c                   191
-rw-r--r--  net/ipv4/inet_connection_sock.c    56
-rw-r--r--  net/ipv4/inet_diag.c                8
-rw-r--r--  net/ipv4/inet_hashtables.c          8
-rw-r--r--  net/ipv4/inet_timewait_sock.c      35
-rw-r--r--  net/ipv4/ip_input.c                 4
-rw-r--r--  net/ipv4/ipconfig.c                 3
-rw-r--r--  net/ipv4/ipip.c                    25
-rw-r--r--  net/ipv4/ping.c                     8
-rw-r--r--  net/ipv4/raw.c                      3
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c         29
-rw-r--r--  net/ipv4/tcp.c                    125
-rw-r--r--  net/ipv4/tcp_input.c               72
-rw-r--r--  net/ipv4/tcp_ipv4.c               154
-rw-r--r--  net/ipv4/tcp_lp.c                   7
-rw-r--r--  net/ipv4/tcp_minisocks.c            8
-rw-r--r--  net/ipv4/tcp_offload.c             27
-rw-r--r--  net/ipv4/tcp_output.c              38
-rw-r--r--  net/ipv4/tcp_timer.c               26
-rw-r--r--  net/ipv4/udp.c                      6
-rw-r--r--  net/ipv4/udp_tunnel_core.c          4
27 files changed, 531 insertions, 344 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 12850a277251..b71c22475c51 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -760,9 +760,7 @@ config TCP_AO
config TCP_MD5SIG
bool "TCP: MD5 Signature Option support (RFC2385)"
- select CRYPTO
- select CRYPTO_MD5
- select TCP_SIGPOOL
+ select CRYPTO_LIB_MD5
help
RFC2385 specifies a method of giving MD5 protection to TCP sessions.
Its main (only?) use is to protect BGP sessions between core routers
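The Kconfig change above moves TCP-MD5 off the crypto_ahash sigpool and onto the synchronous MD5 library. A minimal sketch of that library API, assuming CRYPTO_LIB_MD5 provides crypto/md5.h with struct md5_ctx (the same md5_init()/md5_update()/md5_final() calls appear verbatim in the tcp.c and tcp_ipv4.c hunks below); the helper name is hypothetical:

#include <crypto/md5.h>

/* Hypothetical helper, not part of this patch: digest a flat buffer the
 * same way the converted TCP-MD5 code digests headers, skb data and key.
 */
static void md5_digest_buf(const u8 *data, size_t len,
			   u8 out[MD5_DIGEST_SIZE])
{
	struct md5_ctx ctx;

	md5_init(&ctx);			/* no pool, no request, no -ENOMEM */
	md5_update(&ctx, data, len);	/* may be called repeatedly */
	md5_final(&ctx, out);		/* writes the 16-byte digest */
}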
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3109c5ec38f3..08d811f11896 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -441,7 +441,7 @@ int inet_release(struct socket *sock)
}
EXPORT_SYMBOL(inet_release);
-int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
u32 flags = BIND_WITH_LOCK;
int err;
@@ -464,13 +464,13 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return __inet_bind(sk, uaddr, addr_len, flags);
}
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
return inet_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind);
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
@@ -567,7 +567,7 @@ out:
return err;
}
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -623,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
@@ -741,7 +741,7 @@ sock_error:
}
EXPORT_SYMBOL(__inet_stream_connect);
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
int err;
@@ -755,6 +755,11 @@ EXPORT_SYMBOL(inet_stream_connect);
void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
{
+ if (mem_cgroup_sockets_enabled) {
+ mem_cgroup_sk_alloc(newsk);
+ __sk_charge(newsk, GFP_KERNEL);
+ }
+
sock_rps_record_flow(newsk);
WARN_ON(!((1 << newsk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
@@ -768,6 +773,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
newsock->state = SS_CONNECTED;
}
+EXPORT_SYMBOL_GPL(__inet_accept);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
@@ -813,7 +819,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
}
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
CGROUP_INET4_GETPEERNAME);
} else {
__be32 addr = inet->inet_rcv_saddr;
@@ -821,7 +827,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
addr = inet->inet_saddr;
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
CGROUP_INET4_GETSOCKNAME);
}
release_sock(sk);
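Across this series the protocol-level bind/connect hooks take struct sockaddr_unsized rather than struct sockaddr. A hedged caller-side sketch, mirroring the udp_tunnel_core.c hunk at the end of this diff; the function name is made up for illustration:

#include <linux/net.h>
#include <linux/in.h>

/* Illustrative only: in-kernel callers cast a concrete sockaddr to the
 * unsized view; the callee validates addr_len before reading anything
 * past sa_family.
 */
static int bind_v4_example(struct socket *sock, __be32 addr, __be16 port)
{
	struct sockaddr_in sin = {
		.sin_family	= AF_INET,
		.sin_port	= port,
		.sin_addr	= { .s_addr = addr },
	};

	return kernel_bind(sock, (struct sockaddr_unsized *)&sin,
			   sizeof(sin));
}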
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 833f2cf97178..7f3863daaa40 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1189,7 +1189,7 @@ static int arp_req_get(struct net *net, struct arpreq *r)
read_lock_bh(&neigh->lock);
memcpy(r->arp_ha.sa_data, neigh->ha,
- min(dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
+ min(dev->addr_len, sizeof(r->arp_ha.sa_data)));
r->arp_flags = arp_state_to_flags(neigh);
read_unlock_bh(&neigh->lock);
@@ -1217,10 +1217,10 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN, 0);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
return err;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index c2b2cda1a7e5..1614593b6d72 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,7 +16,7 @@
#include <net/tcp_states.h>
#include <net/sock_reuseport.h>
-int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -84,7 +84,7 @@ out:
}
EXPORT_SYMBOL(__ip4_datagram_connect);
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
int res;
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 506260b4a4dc..7a99639204b1 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/fou.yaml */
/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#include <net/netlink.h>
#include <net/genetlink.h>
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
index 63a6c4ed803d..438342dc8507 100644
--- a/net/ipv4/fou_nl.h
+++ b/net/ipv4/fou_nl.h
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/fou.yaml */
/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#ifndef _LINUX_FOU_GEN_H
#define _LINUX_FOU_GEN_H
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1b7fb5d935ed..4abbec2f47ef 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,6 +582,185 @@ relookup_failed:
return ERR_PTR(err);
}
+struct icmp_ext_iio_addr4_subobj {
+ __be16 afi;
+ __be16 reserved;
+ __be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp_ext_iio_addr4_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return 0;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+ ipv4_is_multicast(ifa->ifa_address))
+ continue;
+ return ifa->ifa_address;
+ }
+
+ return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ data = icmp_ext_iio_addr4_find(dev);
+ if (data) {
+ struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+ addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+ addr4_subobj->afi = htons(ICMP_AFI_IP);
+ addr4_subobj->addr4 = data;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+ unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return NULL;
+ }
+
+ ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 32-bit words (RFC 4884). */
+ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
+
/*
* Send an ICMP message in response to a situation
*
@@ -601,6 +780,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct ipcm_cookie ipc;
struct flowi4 fl4;
__be32 saddr;
@@ -770,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- icmp_param.data_len = skb_in->len - icmp_param.offset;
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ parm->iif);
+ if (ext_skb)
+ icmp_param.skb = ext_skb;
+
+ icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
@@ -785,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+ if (ext_skb)
+ consume_skb(ext_skb);
ende:
ip_rt_put(rt);
out_unlock:
@@ -1502,6 +1690,7 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_errors_extension_mask = 0;
net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
net->ipv4.sysctl_icmp_msgs_burst = 50;
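icmp_ext_append() above records the original-datagram length in 32-bit words in icmph->un.reserved[1], as RFC 4884 requires. A hedged receive-side sketch of the resulting layout; this helper is hypothetical and assumes a linearized ICMP payload:

/* Hypothetical receiver: find the extension structure that follows the
 * ICMP_EXT_ORIG_DGRAM_MIN_LEN bytes of quoted original datagram.
 */
static struct icmp_ext_hdr *icmp_ext_find(struct icmphdr *icmph,
					  unsigned int payload_len)
{
	unsigned int orig_len = icmph->un.reserved[1] * 4;

	if (!orig_len ||
	    orig_len + sizeof(struct icmp_ext_hdr) > payload_len)
		return NULL;	/* no RFC 4884 extension structure */

	return (struct icmp_ext_hdr *)((u8 *)(icmph + 1) + orig_len);
}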
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index cdd1e12aac8c..97d57c52b9ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -712,31 +712,6 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
release_sock(sk);
- if (mem_cgroup_sockets_enabled) {
- gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
- int amt = 0;
-
- /* atomically get the memory usage, set and charge the
- * newsk->sk_memcg.
- */
- lock_sock(newsk);
-
- mem_cgroup_sk_alloc(newsk);
- if (mem_cgroup_from_sk(newsk)) {
- /* The socket has not been accepted yet, no need
- * to look at newsk->sk_wmem_queued.
- */
- amt = sk_mem_pages(newsk->sk_forward_alloc +
- atomic_read(&newsk->sk_rmem_alloc));
- }
-
- if (amt)
- mem_cgroup_sk_charge(newsk, amt, gfp);
- kmem_cache_charge(newsk, gfp);
-
- release_sock(newsk);
- }
-
if (req)
reqsk_put(req);
@@ -762,9 +737,9 @@ void inet_csk_init_xmit_timers(struct sock *sk,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+ timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0);
timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
- timer_setup(&sk->sk_timer, keepalive_handler, 0);
+ timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
@@ -775,9 +750,9 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
smp_store_release(&icsk->icsk_pending, 0);
smp_store_release(&icsk->icsk_ack.pending, 0);
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->tcp_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
- sk_stop_timer(sk, &sk->sk_timer);
+ sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
}
void inet_csk_clear_xmit_timers_sync(struct sock *sk)
@@ -790,9 +765,9 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk)
smp_store_release(&icsk->icsk_pending, 0);
smp_store_release(&icsk->icsk_ack.pending, 0);
- sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer);
sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
- sk_stop_timer_sync(sk, &sk->sk_timer);
+ sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
}
struct dst_entry *inet_csk_route_req(const struct sock *sk,
@@ -910,7 +885,6 @@ reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
sk_tx_queue_clear(req_to_sk(req));
req->saved_syn = NULL;
req->syncookie = 0;
- req->timeout = 0;
req->num_timeout = 0;
req->num_retrans = 0;
req->sk = NULL;
@@ -938,7 +912,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
ireq->ireq_family = sk_listener->sk_family;
- req->timeout = TCP_TIMEOUT_INIT;
}
return req;
@@ -1121,16 +1094,18 @@ static void reqsk_timer_handler(struct timer_list *t)
young <<= 1;
}
}
+
syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
&expire, &resend);
- req->rsk_ops->syn_ack_timeout(req);
+ tcp_syn_ack_timeout(req);
+
if (!expire &&
(!resend ||
!tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
- mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
+ mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
if (!nreq)
return;
@@ -1167,8 +1142,7 @@ drop:
reqsk_put(oreq);
}
-static bool reqsk_queue_hash_req(struct request_sock *req,
- unsigned long timeout)
+static bool reqsk_queue_hash_req(struct request_sock *req)
{
bool found_dup_sk = false;
@@ -1176,8 +1150,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req,
return false;
/* The timer needs to be setup after a successful insertion. */
+ req->timeout = tcp_timeout_init((struct sock *)req);
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
- mod_timer(&req->rsk_timer, jiffies + timeout);
+ mod_timer(&req->rsk_timer, jiffies + req->timeout);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
@@ -1187,10 +1162,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req,
return true;
}
-bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req)
{
- if (!reqsk_queue_hash_req(req, timeout))
+ if (!reqsk_queue_hash_req(req))
return false;
inet_csk_reqsk_queue_added(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index f0b6c5a411a2..3f5b1418a610 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -287,17 +287,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 1;
r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
- jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
- jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
- } else if (timer_pending(&sk->sk_timer)) {
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
- jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
+ jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies);
}
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b7024e3d9ac3..f5826ec4bcaa 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -720,8 +720,11 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
spin_lock(lock);
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
- ret = sk_nulls_del_node_init_rcu(osk);
- } else if (found_dup_sk) {
+ ret = sk_nulls_replace_node_init_rcu(osk, sk);
+ goto unlock;
+ }
+
+ if (found_dup_sk) {
*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
if (*found_dup_sk)
ret = false;
@@ -730,6 +733,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ret)
__sk_nulls_add_node_rcu(sk, list);
+unlock:
spin_unlock(lock);
return ret;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c96d61d08854..d4c781a0667f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -88,12 +88,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
-static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
- struct hlist_nulls_head *list)
-{
- hlist_nulls_add_head_rcu(&tw->tw_node, list);
-}
-
static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
{
__inet_twsk_schedule(tw, timeo, false);
@@ -113,13 +107,12 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead, *bhead2;
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet->num != 0 MUST be bound in
- binding cache, even if it is closed.
+ /* Put TW into bind hash. Original socket stays there too.
+ * Note, that any socket with inet->num != 0 MUST be bound in
+ * binding cache, even if it is closed.
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
@@ -141,19 +134,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
spin_lock(lock);
- /* Step 2: Hash TW into tcp ehash chain */
- inet_twsk_add_node_rcu(tw, &ehead->chain);
-
- /* Step 3: Remove SK from hash chain */
- if (__sk_nulls_del_node_init_rcu(sk))
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-
- /* Ensure above writes are committed into memory before updating the
- * refcount.
- * Provides ordering vs later refcount_inc().
- */
- smp_wmb();
/* tw_refcnt is set to 3 because we have :
* - one reference for bhash chain.
* - one reference for ehash chain.
@@ -163,6 +143,15 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
*/
refcount_set(&tw->tw_refcnt, 3);
+ /* Ensure tw_refcnt has been set before tw is published.
+ * smp_wmb() provides the necessary memory barrier to enforce this
+ * ordering.
+ */
+ smp_wmb();
+
+ hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
inet_twsk_schedule(tw, timeo);
spin_unlock(lock);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 273578579a6b..19d3141dad1f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,8 @@
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <net/dst_metadata.h>
+#include <net/udp.h>
+#include <net/tcp.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -317,8 +319,6 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
ip_hdr(hint)->tos == iph->tos;
}
-int tcp_v4_early_demux(struct sk_buff *skb);
-enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 22a7889876c1..019408d3ca2c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1690,7 +1690,8 @@ static int __init ic_proto_name(char *name)
*v = 0;
if (kstrtou8(client_id, 0, dhcp_client_identifier))
pr_debug("DHCP: Invalid client identifier type\n");
- strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ strscpy(dhcp_client_identifier + 1, v + 1,
+ sizeof(dhcp_client_identifier) - 1);
*v = ',';
}
return 1;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3e03af073a1c..ff95b1b9908e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
return ip_tunnel_ctl(dev, p, cmd);
}
+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ip_tunnel *tunnel = netdev_priv(ctx->dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ struct rtable *rt;
+
+ rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ path->type = DEV_PATH_TUN;
+ path->tun.src_v4.s_addr = tiph->saddr;
+ path->tun.dst_v4.s_addr = tiph->daddr;
+ path->tun.l3_proto = IPPROTO_IPIP;
+ path->dev = ctx->dev;
+
+ ctx->dev = rt->dst.dev;
+ ip_rt_put(rt);
+
+ return 0;
+}
+
static const struct net_device_ops ipip_netdev_ops = {
.ndo_init = ipip_tunnel_init,
.ndo_uninit = ip_tunnel_uninit,
@@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_tunnel_ctl = ipip_tunnel_ctl,
+ .ndo_fill_forward_path = ipip_fill_forward_path,
};
#define IPIP_FEATURES (NETIF_F_SG | \
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 5321c5801c64..ad56588107cc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -286,7 +286,7 @@ void ping_close(struct sock *sk, long timeout)
}
EXPORT_IPV6_MOD_GPL(ping_close);
-static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
/* This check is replicated from __ip4_datagram_connect() and
@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
- struct sockaddr *uaddr, int addr_len)
+ struct sockaddr_unsized *uaddr, int addr_len)
{
struct net *net = sock_net(sk);
if (sk->sk_family == AF_INET) {
@@ -387,7 +387,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
return 0;
}
-static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr)
{
if (saddr->sa_family == AF_INET) {
struct inet_sock *isk = inet_sk(sk);
@@ -407,7 +407,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
* Moreover, we don't allow binding to multi- and broadcast addresses.
*/
-int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct inet_sock *isk = inet_sk(sk);
unsigned short snum;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d54ebb7df966..5998c4cc6f47 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -697,7 +697,8 @@ static void raw_destroy(struct sock *sk)
}
/* This gets rid of all the nasties in af_inet. -DaveM */
-static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 24dbc603cc44..a1a50a5c80dc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -675,6 +677,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_ONE
},
{
+ .procname = "icmp_errors_extension_mask",
+ .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmp_errors_extension_mask_all,
+ },
+ {
.procname = "icmp_ratelimit",
.data = &init_net.ipv4.sysctl_icmp_ratelimit,
.maxlen = sizeof(int),
@@ -1332,6 +1343,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dou8vec_minmax,
},
{
+ .procname = "tcp_rcvbuf_low_rtt",
+ .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+ {
.procname = "tcp_tso_win_divisor",
.data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
.maxlen = sizeof(u8),
@@ -1441,6 +1461,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
+ .procname = "tcp_comp_sack_rtt_percent",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+ {
.procname = "tcp_comp_sack_slack_ns",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
.maxlen = sizeof(unsigned long),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8a18aeca7ab0..f035440c475a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -243,7 +243,7 @@
#define pr_fmt(fmt) "TCP: " fmt
-#include <crypto/hash.h>
+#include <crypto/md5.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -253,7 +253,6 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
-#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
@@ -425,7 +424,6 @@ void tcp_md5_destruct_sock(struct sock *sk)
tcp_clear_md5_list(sk);
kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
}
}
EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
@@ -928,7 +926,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
@@ -1062,7 +1061,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
}
}
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
- err = __inet_stream_connect(sk->sk_socket, uaddr,
+ err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr,
msg->msg_namelen, flags, 1);
/* fastopen_req could already be freed in __inet_stream_connect
* if the connection times out or gets rst
@@ -1557,8 +1556,10 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied)
time_to_ack = true;
}
}
- if (time_to_ack)
+ if (time_to_ack) {
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
+ }
}
void tcp_cleanup_rbuf(struct sock *sk, int copied)
@@ -2586,7 +2587,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
if (err)
goto out;
- atomic_long_inc(&niov->pp_ref_count);
+ atomic_long_inc(&niov->desc.pp_ref_count);
tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
sent += copy;
@@ -3583,9 +3584,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
-static void tcp_enable_tx_delay(void)
+static void tcp_enable_tx_delay(struct sock *sk, int val)
{
- if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+ if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
static int __tcp_tx_delay_enabled = 0;
if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
@@ -3593,6 +3597,22 @@ static void tcp_enable_tx_delay(void)
pr_info("TCP_TX_DELAY enabled\n");
}
}
+ /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+ * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+ * This is best effort.
+ */
+ if (delta && sk->sk_state == TCP_ESTABLISHED) {
+ s64 srtt = (s64)tp->srtt_us + delta;
+
+ tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+ /* Note: does not deal with non zero icsk_backoff */
+ tcp_set_rto(sk);
+
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+ tcp_update_pacing_rate(sk);
+ }
}
/* When set indicates to always queue non-full frames. Later the user clears
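The delta in tcp_enable_tx_delay() is shifted left by 3 because tp->srtt_us stores eight times the smoothed RTT. A worked example with illustrative numbers:

/*
 * Raising tcp_tx_delay from 0 to 100 us on a flow whose smoothed RTT is
 * 500 us (tp->srtt_us == 8 * 500 == 4000):
 *
 *	delta	= (100 - 0) << 3	= 800
 *	srtt_us	= 4000 + 800		= 4800	(600 us smoothed RTT)
 *
 * tcp_set_rto() and tcp_update_pacing_rate() then work from the new
 * estimate, while rtt_min is reset and re-learned.
 */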
@@ -4119,8 +4139,12 @@ ao_parse:
tp->recvmsg_inq = val;
break;
case TCP_TX_DELAY:
- if (val)
- tcp_enable_tx_delay();
+ /* tp->srtt_us is u32, and is shifted by 3 */
+ if (val < 0 || val >= (1U << (31 - 3))) {
+ err = -EINVAL;
+ break;
+ }
+ tcp_enable_tx_delay(sk, val);
WRITE_ONCE(tp->tcp_tx_delay, val);
break;
default:
@@ -4815,52 +4839,45 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
EXPORT_IPV6_MOD(tcp_getsockopt);
#ifdef CONFIG_TCP_MD5SIG
-int tcp_md5_sigpool_id = -1;
-EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id);
-
-int tcp_md5_alloc_sigpool(void)
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+ unsigned int header_len)
{
- size_t scratch_size;
- int ret;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+ unsigned int i;
- scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr);
- ret = tcp_sigpool_alloc_ahash("md5", scratch_size);
- if (ret >= 0) {
- /* As long as any md5 sigpool was allocated, the return
- * id would stay the same. Re-write the id only for the case
- * when previously all MD5 keys were deleted and this call
- * allocates the first MD5 key, which may return a different
- * sigpool id than was used previously.
- */
- WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */
- return 0;
- }
- return ret;
-}
+ md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);
-void tcp_md5_release_sigpool(void)
-{
- tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id));
-}
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ u32 p_off, p_len, copied;
+ const void *vaddr;
+ struct page *p;
-void tcp_md5_add_sigpool(void)
-{
- tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id));
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ md5_update(ctx, vaddr + p_off, p_len);
+ kunmap_local(vaddr);
+ }
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ tcp_md5_hash_skb_data(ctx, frag_iter, 0);
}
+EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
-int tcp_md5_hash_key(struct tcp_sigpool *hp,
- const struct tcp_md5sig_key *key)
+void tcp_md5_hash_key(struct md5_ctx *ctx,
+ const struct tcp_md5sig_key *key)
{
u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
- struct scatterlist sg;
-
- sg_init_one(&sg, key->key, keylen);
- ahash_request_set_crypt(hp->req, &sg, NULL, keylen);
/* We use data_race() because tcp_md5_do_add() might change
* key->key under us
*/
- return data_race(crypto_ahash_update(hp->req));
+ data_race(({ md5_update(ctx, key->key, keylen), 0; }));
}
EXPORT_IPV6_MOD(tcp_md5_hash_key);
@@ -4871,19 +4888,16 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
int family, int l3index, const __u8 *hash_location)
{
/* This gets called for each TCP segment that has TCP-MD5 option.
- * We have 3 drop cases:
- * o No MD5 hash and one expected.
- * o MD5 hash and we're not expecting one.
- * o MD5 hash and its wrong.
+ * We have 2 drop cases:
+ * o An MD5 signature is present, but we're not expecting one.
+ * o The MD5 signature is wrong.
*/
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
u8 newhash[16];
- int genhash;
key = tcp_md5_do_lookup(sk, l3index, saddr, family);
-
- if (!key && hash_location) {
+ if (!key) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
trace_tcp_hash_md5_unexpected(sk, skb);
return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
@@ -4894,11 +4908,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
* IPv4-mapped case.
*/
if (family == AF_INET)
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
else
- genhash = tp->af_specific->calc_md5_hash(newhash, key,
- NULL, skb);
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
+ if (memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
trace_tcp_hash_md5_mismatch(sk, skb);
return SKB_DROP_REASON_TCP_MD5FAILURE;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e4a979b75cc6..198f8a0d37be 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -896,6 +896,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 rcvwin, rcvbuf, cap, oldval;
+ u32 rtt_threshold, rtt_us;
u64 grow;
oldval = tp->rcvq_space.space;
@@ -908,10 +909,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
/* DRS is always one RTT late. */
rcvwin = newval << 1;
- /* slow start: allow the sender to double its rate. */
- grow = (u64)rcvwin * (newval - oldval);
- do_div(grow, oldval);
- rcvwin += grow << 1;
+ rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+ rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
+ if (rtt_us < rtt_threshold) {
+ /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
+ * It might take a few additional ms to reach 'line rate',
+ * but will avoid sk_rcvbuf inflation and poor cache use.
+ */
+ grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
+ } else {
+ /* slow start: allow the sender to double its rate. */
+ grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
+ }
+ rcvwin += grow;
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
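The new low-RTT branch scales window growth by rtt_us/rtt_threshold instead of letting the sender double its rate. Illustrative numbers, assuming the USEC_PER_MSEC default that tcp_sk_init() sets below:

/*
 * rcvwin = 1 MB, rtt_us = 200, rtt_threshold = 1000 (1 ms):
 *
 *	grow = 1 MB * 200 / 1000 = 200 KB, so rcvwin ends near 1.2 MB.
 *
 * On a path above the threshold, had newval doubled oldval, the
 * slow-start branch would have added (rcvwin << 1) * 1 = 2 MB instead.
 */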
@@ -937,9 +947,15 @@ void tcp_rcv_space_adjust(struct sock *sk)
trace_tcp_rcv_space_adjust(sk);
- tcp_mstamp_refresh(tp);
+ if (unlikely(!tp->rcv_rtt_est.rtt_us))
+ return;
+
+ /* We do not refresh tp->tcp_mstamp here.
+ * Some platforms have expensive ktime_get() implementations.
+ * Using the last cached value is enough for DRS.
+ */
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
- if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3))
return;
/* Number of bytes copied to user in last RTT */
@@ -1102,7 +1118,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->srtt_us = max(1U, srtt);
}
-static void tcp_update_pacing_rate(struct sock *sk)
+void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
@@ -1139,7 +1155,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -5887,7 +5903,9 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned long rtt, delay;
+ struct net *net = sock_net(sk);
+ unsigned long rtt;
+ u64 delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5906,7 +5924,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
* Defer the ack until tcp_release_cb().
*/
if (sock_owned_by_user_nocheck(sk) &&
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
return;
}
@@ -5921,7 +5939,7 @@ send_now:
}
if (!tcp_is_sack(tp) ||
- tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
+ tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
goto send_now;
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
@@ -5936,18 +5954,26 @@ send_now:
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
- /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+ /* compress ack timer : comp_sack_rtt_percent of rtt,
+ * but no more than tcp_comp_sack_delay_ns.
+ */
rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
- delay = min_t(unsigned long,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
- rtt * (NSEC_PER_USEC >> 3)/20);
+ /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
+ * ->
+ * delay = rtt * 1.25 * comp_sack_rtt_percent
+ */
+ delay = (u64)(rtt + (rtt >> 2)) *
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
+
+ delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
+
sock_hold(sk);
hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
HRTIMER_MODE_REL_PINNED_SOFT);
}
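The (rtt + (rtt >> 2)) * percent form folds the >> 3 descaling, the usec-to-nsec conversion and the /100 into a single multiply. A worked check against the exact formula, using the new 33 percent default:

/*
 * rtt = 8000 (scaled, i.e. a 1 ms receive RTT), percent = 33:
 *
 *	exact:	(8000 >> 3) * NSEC_PER_USEC * 33 / 100	= 330000 ns
 *	above:	(8000 + 2000) * 33			= 330000 ns
 *
 * so the compressed-ACK timer fires after ~33% of the RTT, still capped
 * by tcp_comp_sack_delay_ns.
 */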
@@ -7525,15 +7551,11 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie) {
- req->timeout = tcp_timeout_init((struct sock *)req);
- if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
- req->timeout))) {
- reqsk_free(req);
- dst_release(dst);
- return 0;
- }
-
+ if (!want_cookie &&
+ unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
+ reqsk_free(req);
+ dst_release(dst);
+ return 0;
}
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b1fcf3e4e1ce..f8a9596e8f4d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -53,6 +53,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
+#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
@@ -86,14 +87,13 @@
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th);
+static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
struct inet_hashinfo tcp_hashinfo;
@@ -205,7 +205,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
-static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
/* This check is replicated from tcp_v4_connect() and intended to
@@ -221,7 +221,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
}
/* This will initiate an outgoing connection. */
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_timewait_death_row *tcp_death_row;
@@ -754,7 +754,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
struct tcp_md5sig_key *key = NULL;
unsigned char newhash[16];
struct sock *sk1 = NULL;
- int genhash;
#endif
u64 transmit_time = 0;
struct sock *ctl_sk;
@@ -840,11 +839,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
if (!key)
goto out;
-
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
- if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
goto out;
-
}
if (key) {
@@ -1425,13 +1422,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
- if (tcp_md5_alloc_sigpool())
- return -ENOMEM;
+ if (fips_enabled) {
+ pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
+ return -EOPNOTSUPP;
+ }
- if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
- tcp_md5_release_sigpool();
+ if (tcp_md5sig_info_add(sk, GFP_KERNEL))
return -ENOMEM;
- }
if (!static_branch_inc(&tcp_md5_needed.key)) {
struct tcp_md5sig_info *md5sig;
@@ -1439,7 +1436,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
rcu_assign_pointer(tp->md5sig_info, NULL);
kfree_rcu(md5sig, rcu);
- tcp_md5_release_sigpool();
return -EUSERS;
}
}
@@ -1456,12 +1452,9 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
- tcp_md5_add_sigpool();
- if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
- tcp_md5_release_sigpool();
+ if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
return -ENOMEM;
- }
if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
struct tcp_md5sig_info *md5sig;
@@ -1470,7 +1463,6 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
rcu_assign_pointer(tp->md5sig_info, NULL);
kfree_rcu(md5sig, rcu);
- tcp_md5_release_sigpool();
return -EUSERS;
}
}
@@ -1578,66 +1570,44 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
cmd.tcpm_key, cmd.tcpm_keylen);
}
-static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
- __be32 daddr, __be32 saddr,
- const struct tcphdr *th, int nbytes)
+static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
{
- struct tcp4_pseudohdr *bp;
- struct scatterlist sg;
- struct tcphdr *_th;
-
- bp = hp->scratch;
- bp->saddr = saddr;
- bp->daddr = daddr;
- bp->pad = 0;
- bp->protocol = IPPROTO_TCP;
- bp->len = cpu_to_be16(nbytes);
-
- _th = (struct tcphdr *)(bp + 1);
- memcpy(_th, th, sizeof(*th));
- _th->check = 0;
+ struct {
+ struct tcp4_pseudohdr ip;
+ struct tcphdr tcp;
+ } h;
- sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
- ahash_request_set_crypt(hp->req, &sg, NULL,
- sizeof(*bp) + sizeof(*th));
- return crypto_ahash_update(hp->req);
+ h.ip.saddr = saddr;
+ h.ip.daddr = daddr;
+ h.ip.pad = 0;
+ h.ip.protocol = IPPROTO_TCP;
+ h.ip.len = cpu_to_be16(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
- struct tcp_sigpool hp;
+ struct md5_ctx ctx;
- if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
- goto clear_hash_nostart;
-
- if (crypto_ahash_init(hp.req))
- goto clear_hash;
- if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(&hp, key))
- goto clear_hash;
- ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
- if (crypto_ahash_final(hp.req))
- goto clear_hash;
-
- tcp_sigpool_end(&hp);
- return 0;
-
-clear_hash:
- tcp_sigpool_end(&hp);
-clear_hash_nostart:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
- const struct sock *sk,
- const struct sk_buff *skb)
+noinline_for_stack void
+tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
- struct tcp_sigpool hp;
__be32 saddr, daddr;
+ struct md5_ctx ctx;
if (sk) { /* valid for establish/request sockets */
saddr = sk->sk_rcv_saddr;
@@ -1648,30 +1618,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
daddr = iph->daddr;
}
- if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
- goto clear_hash_nostart;
-
- if (crypto_ahash_init(hp.req))
- goto clear_hash;
-
- if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
- goto clear_hash;
- if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(&hp, key))
- goto clear_hash;
- ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
- if (crypto_ahash_final(hp.req))
- goto clear_hash;
-
- tcp_sigpool_end(&hp);
- return 0;
-
-clear_hash:
- tcp_sigpool_end(&hp);
-clear_hash_nostart:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
@@ -1709,7 +1660,6 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
- .syn_ack_timeout = tcp_syn_ack_timeout,
};
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
@@ -2919,13 +2869,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk_timeout(icsk);
+ timer_expires = tcp_timeout_expires(sk);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk_timeout(icsk);
- } else if (timer_pending(&sk->sk_timer)) {
+ timer_expires = tcp_timeout_expires(sk);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
- timer_expires = sk->sk_timer.expires;
+ timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
@@ -3616,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_adv_win_scale = 1;
net->ipv4.sysctl_tcp_frto = 2;
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+ net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
@@ -3643,8 +3594,9 @@ static int __net_init tcp_sk_init(struct net *net)
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
- net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
+ net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 52fe17167460..976b56644a8a 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -23,9 +23,9 @@
* Original Author:
* Aleksandar Kuzmanovic <akuzma@northwestern.edu>
* Available from:
- * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf
* Original implementation for 2.4.19:
- * http://www-ece.rice.edu/networks/TCP-LP/
+ * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm
*
* 2.6.x module Authors:
* Wong Hoi Sing, Edison <hswong3i@gmail.com>
@@ -113,6 +113,8 @@ static void tcp_lp_init(struct sock *sk)
/**
* tcp_lp_cong_avoid
* @sk: socket to avoid congesting
+ * @ack: current ack sequence number
+ * @acked: number of ACKed packets
*
* Implementation of cong_avoid.
* Will only call newReno CA when away from inference.
@@ -261,6 +263,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
/**
* tcp_lp_pkts_acked
* @sk: socket requiring congestion avoidance calculations
+ * @sample: ACK sample containing timing and rate information
*
* Implementation of pkts_acked.
* Deal with active drop under Early Congestion Indication.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2ec8c6f1cdcc..bd5462154f97 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -312,7 +312,6 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
return;
if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
goto out_free;
- tcp_md5_add_sigpool();
}
return;
out_free:
@@ -338,7 +337,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
- tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
tw->tw_mark = sk->sk_mark;
tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -406,7 +404,6 @@ void tcp_twsk_destructor(struct sock *sk)
if (twsk->tw_md5_key) {
kfree(twsk->tw_md5_key);
static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
}
}
#endif
@@ -716,7 +713,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() -
+ tcp_reqsk_timeout(req) / HZ;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -755,7 +753,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
!tcp_rtx_synack(sk, req)) {
unsigned long expires = jiffies;
- expires += reqsk_timeout(req, TCP_RTO_MAX);
+ expires += tcp_reqsk_timeout(req);
if (!fastopen)
mod_timer_pending(&req->rsk_timer, expires);
else
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2cb93da93abc..fdda18b1abda 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -282,33 +282,6 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
return NULL;
}
-struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
-{
- unsigned int thlen, hlen, off;
- struct tcphdr *th;
-
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*th);
- th = skb_gro_header(skb, hlen, off);
- if (unlikely(!th))
- return NULL;
-
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- return NULL;
-
- hlen = off + thlen;
- if (!skb_gro_may_pull(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- return NULL;
- }
-
- skb_gro_pull(skb, thlen);
-
- return th;
-}
-
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct tcphdr *th)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b94efb3050d2..479afb714bdf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -40,6 +40,7 @@
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/mptcp.h>
+#include <net/smc.h>
#include <net/proto_memory.h>
#include <net/psp.h>
@@ -802,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
mptcp_options_write(th, ptr, tp, opts);
}
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+ tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+ /* re-check syn_smc */
+ if (tp->syn_smc &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
}
static void smc_set_option_cond(const struct tcp_sock *tp,
- const struct inet_request_sock *ireq,
+ struct inet_request_sock *ireq,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc && ireq->smc_ok) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+ ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+ /* re-check smc_ok */
+ if (ireq->smc_ok &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
@@ -3743,12 +3746,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
delta = size - sk->sk_forward_alloc;
if (delta <= 0)
return;
+
amt = sk_mem_pages(delta);
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
- sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2dd73a4e8e51..160080c9021d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -458,7 +458,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
struct tcp_sock *tp = tcp_sk(sk);
int max_retries;
- req->rsk_ops->syn_ack_timeout(req);
+ tcp_syn_ack_timeout(req);
/* Add one more retry for fastopen.
* Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
@@ -510,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
* and tp->rcv_tstamp might very well have been written recently.
* rcv_delta can thus be negative.
*/
- rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp;
+ rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
if (rcv_delta <= timeout)
return false;
@@ -697,9 +697,9 @@ void tcp_write_timer_handler(struct sock *sk)
!icsk->icsk_pending)
return;
- if (time_after(icsk_timeout(icsk), jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- icsk_timeout(icsk));
+ if (time_after(tcp_timeout_expires(sk), jiffies)) {
+ sk_reset_timer(sk, &sk->tcp_retransmit_timer,
+ tcp_timeout_expires(sk));
return;
}
tcp_mstamp_refresh(tcp_sk(sk));
@@ -725,12 +725,10 @@ void tcp_write_timer_handler(struct sock *sk)
static void tcp_write_timer(struct timer_list *t)
{
- struct inet_connection_sock *icsk =
- timer_container_of(icsk, t, icsk_retransmit_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
+ struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);
/* Avoid locking the socket when there is no pending event. */
- if (!smp_load_acquire(&icsk->icsk_pending))
+ if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
goto out;
bh_lock_sock(sk);
@@ -752,16 +750,15 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
-EXPORT_IPV6_MOD(tcp_syn_ack_timeout);
void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+ sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
}
static void tcp_delete_keepalive_timer(struct sock *sk)
{
- sk_stop_timer(sk, &sk->sk_timer);
+ sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -778,8 +775,9 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer(struct timer_list *t)
{
- struct sock *sk = timer_container_of(sk, t, sk_timer);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk =
+ timer_container_of(icsk, t, icsk_keepalive_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 30dfbf73729d..ffe074cb5865 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2159,7 +2159,8 @@ csum_copy_err:
goto try_again;
}
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
/* This check is replicated from __ip4_datagram_connect() and
* intended to prevent BPF program called below from accessing bytes
@@ -2172,7 +2173,8 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
EXPORT_IPV6_MOD(udp_pre_connect);
-static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
int res;
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 54386e06a813..b1f667c52cb2 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -29,7 +29,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
- err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr,
sizeof(udp_addr));
if (err < 0)
goto error;
@@ -38,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->peer_ip;
udp_addr.sin_port = cfg->peer_udp_port;
- err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+ err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr,
sizeof(udp_addr), 0);
if (err < 0)
goto error;