Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--   net/ipv4/tcp.c | 84
1 file changed, 73 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5319ebe2452..6ce03a9adb4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
- * TCP splice context
- */
-struct tcp_splice_state {
- struct pipe_inode_info *pipe;
- size_t len;
- unsigned int flags;
-};
-
-/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the __sk_mem_schedule() is of this nature: accounting
@@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
struct sk_buff *skb = tcp_write_queue_tail(sk);
u32 tsflags = sockc->tsflags;
+ if (unlikely(!skb))
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
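With the change above, tcp_tx_timestamp() falls back to the last skb on the retransmit queue when the write queue tail is already gone, so transmit timestamps requested by the application are still attached. A minimal user-space sketch of requesting such timestamps (not part of the patch; error handling omitted):

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* Ask for software TX timestamps, including ACK timestamps, on an
 * already-connected TCP socket "fd". Reports are read back later from
 * the socket error queue (MSG_ERRQUEUE).
 */
static void enable_tx_timestamps(int fd)
{
	unsigned int val = SOF_TIMESTAMPING_TX_ACK |
			   SOF_TIMESTAMPING_SOFTWARE |
			   SOF_TIMESTAMPING_OPT_ID;

	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}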
@@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
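tcp_stream_memory_free() above is what makes TCP_NOTSENT_LOWAT effective for polling: the socket reports writable only while the unsent backlog (write_seq - snd_nxt) is below the limit, and below half the limit when called from sk_stream_write_space() (wake == 1), mirroring sock_def_write_space(). A minimal user-space sketch pairing the option with edge-triggered epoll (not part of the patch; error handling omitted):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/socket.h>

/* Cap unsent bytes at 128 KB (an arbitrary example threshold) and watch
 * for EPOLLOUT; the socket then only reports writable while the unsent
 * backlog stays under the limit.
 */
static void limit_unsent(int fd, int epfd)
{
	int lowat = 128 * 1024;
	struct epoll_event ev = {
		.events = EPOLLOUT | EPOLLET,
		.data.fd = fd,
	};

	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}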
@@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
__tcp_push_pending_frames(sk, mss_now, nonagle);
}
-static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
{
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
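tcp_splice_data_recv() is the per-skb callback behind tcp_splice_read(); the patch removes the local tcp_splice_state definition and drops the static qualifier from the callback. From user space the whole path is driven by splice(2). A minimal sketch (not part of the patch; error handling omitted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Move up to 64 KB of received data from a TCP socket straight into a
 * pipe without copying through user space; in the kernel this ends up
 * in tcp_splice_read() -> tcp_splice_data_recv().
 */
static ssize_t drain_to_pipe(int sock_fd, int pipe_wr)
{
	return splice(sock_fd, NULL, pipe_wr, NULL, 64 * 1024,
		      SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
}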
@@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_IPV6_MOD(tcp_splice_read);
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int delta, amt;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
+ return;
+
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
+}
+
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
{
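sk_forced_mem_schedule() above charges memory even past the normal limits so a FIN can always be queued; the shortfall is rounded up to whole pages before being added to sk_forward_alloc and charged to the memcg. A standalone sketch of that rounding, assuming sk_mem_pages() rounds a byte count up to full pages (kernel types omitted):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1 << PAGE_SHIFT)

/* Round a byte shortfall up to whole pages, as sk_mem_pages() is assumed
 * to do, then report how much forward allocation that grants.
 */
static int mem_pages(int bytes)
{
	return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	int delta = 100;		/* a 100-byte shortfall ...        */
	int amt = mem_pages(delta);	/* ... still reserves one page     */

	printf("%d page(s), %d bytes of forward_alloc\n",
	       amt, amt << PAGE_SHIFT);	/* prints "1 page(s), 4096 bytes" */
	return 0;
}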
@@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
return err;
}
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
+
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct net_devmem_dmabuf_binding *binding = NULL;
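In tcp_rate_check_app_limited() above, all four conditions must hold before the flow is marked application-limited, and the stored value uses the GNU "x ?: 1" shorthand so it can never be zero (zero means "not app-limited"). A tiny sketch of that idiom (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned int delivered = 0, in_flight = 0;
	/* "x ?: 1" is GNU C for "x ? x : 1", so the result is never 0. */
	unsigned int app_limited = (delivered + in_flight) ?: 1;

	printf("app_limited = %u\n", app_limited);	/* prints 1 */
	return 0;
}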
@@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
tp->accecn_opt_tstamp = 0;
+ tp->pkts_acked_ewma = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ if (tcp_ecn_disabled(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED;
+ else if (tcp_ecn_mode_rfc3168(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168;
+ else if (tcp_ecn_mode_accecn(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN;
+ else if (tcp_ecn_mode_pending(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING;
info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
info->tcpi_received_ce = tp->received_ce;
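The new tcpi_ecn_mode reporting is consumed through the usual TCP_INFO getsockopt(); seeing the field from user space needs uapi headers that carry this series. A minimal sketch (not part of the patch; error handling omitted):

#include <linux/tcp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Fetch struct tcp_info for a connected socket. With headers carrying
 * this series, info.tcpi_ecn_mode could then be compared against the
 * TCPI_ECN_MODE_* values added above.
 */
static void dump_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("tcp_info: %u bytes returned\n", (unsigned int)len);
}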
@@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
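The CACHELINE_ASSERT_GROUP_MEMBER() lines keep the hot receive-path fields, now including pkts_acked_ewma, packed into the intended tcp_sock cache-line group at build time. A rough standalone analogue of the idea, using standard offsetof-based static assertions on a made-up struct (not the kernel macro):

#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-in for the tcp_sock_write_rx group. */
struct rx_hot_group {
	unsigned long first_tx_mstamp;
	unsigned long delivered_mstamp;
	unsigned long bytes_acked;
	unsigned long pkts_acked_ewma;
};

/* Break the build if the group outgrows one 64-byte cache line. */
static_assert(sizeof(struct rx_hot_group) <= 64,
	      "rx hot group must fit a single cache line");
static_assert(offsetof(struct rx_hot_group, pkts_acked_ewma) < 64,
	      "pkts_acked_ewma must stay in the rx hot group");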