summaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c275
1 files changed, 239 insertions, 36 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8d3233667418..cf1852b99963 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -224,9 +226,6 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
do_div(grow, oldval);
rcvwin += grow << 1;
- if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
- rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
-
cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
@@ -350,9 +349,6 @@ merge_right:
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
- /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
- if (sk->sk_socket)
- mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
}
static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
@@ -1164,8 +1160,9 @@ struct mptcp_sendmsg_info {
bool data_lock_held;
};
-static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
- u64 data_seq, int avail_size)
+static size_t mptcp_check_allowed_size(const struct mptcp_sock *msk,
+ struct sock *ssk, u64 data_seq,
+ size_t avail_size)
{
u64 window_end = mptcp_wnd_end(msk);
u64 mptcp_snd_wnd;
@@ -1174,7 +1171,7 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s
return avail_size;
mptcp_snd_wnd = window_end - data_seq;
- avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
+ avail_size = min(mptcp_snd_wnd, avail_size);
if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
@@ -1518,7 +1515,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!ssk || !sk_stream_memory_free(ssk))
return NULL;
- burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
if (!burst)
return ssk;
@@ -1995,6 +1992,17 @@ do_error:
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
+static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ /* avoid the indirect call, we know the destructor is sock_rfree */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, skb->truesize);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ skb_attempt_defer_free(skb);
+}
+
static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
@@ -2049,13 +2057,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
break;
}
- /* avoid the indirect call, we know the destructor is sock_rfree */
- skb->destructor = NULL;
- skb->sk = NULL;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk_mem_uncharge(sk, skb->truesize);
- __skb_unlink(skb, &sk->sk_receive_queue);
- skb_attempt_defer_free(skb);
+ mptcp_eat_recv_skb(sk, skb);
}
if (copied >= len)
@@ -2066,6 +2068,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
return copied;
}
+static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvspace_init = 1;
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
*
* Only difference: Use highest rtt estimate of the subflows in use.
@@ -2088,8 +2105,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.copied += copied;
- mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
- time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+ mstamp = mptcp_stamp();
+ time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
rtt_us = msk->rcvq_space.rtt_us;
if (rtt_us && time < (rtt_us >> 3))
@@ -2119,6 +2136,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
@@ -3040,6 +3058,7 @@ static int mptcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
+ sk->sk_write_space = sk_stream_write_space;
return 0;
}
@@ -3549,6 +3568,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
__mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
+ msk->rcvq_space.time = mptcp_stamp();
if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
__mptcp_subflow_fully_established(msk, subflow, mp_opt);
@@ -3558,23 +3578,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
return nsk;
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
-{
- const struct tcp_sock *tp = tcp_sk(ssk);
-
- msk->rcvspace_init = 1;
- msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
-
- msk->rcvq_space.time = tp->tcp_mstamp;
-
- /* initial rcv_space offering made to peer */
- msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
- TCP_INIT_CWND * tp->advmss);
- if (msk->rcvq_space.space == 0)
- msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-}
-
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -3763,6 +3766,7 @@ void mptcp_finish_connect(struct sock *ssk)
* accessing the field below
*/
WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp());
mptcp_pm_new_connection(msk, ssk, 0);
}
@@ -4312,6 +4316,201 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
+static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ u32 offset;
+
+ if (!list_empty(&msk->backlog_list))
+ mptcp_move_skbs(sk);
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ offset = MPTCP_SKB_CB(skb)->offset;
+ if (offset < skb->len) {
+ *off = offset;
+ return skb;
+ }
+ mptcp_eat_recv_skb(sk, skb);
+ }
+ return NULL;
+}
+
+/*
+ * Note:
+ * - It is assumed that the socket was locked by the caller.
+ */
+static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ int copied = 0;
+ u32 offset;
+
+ msk_owned_by_me(msk);
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) {
+ u32 data_len = skb->len - offset;
+ int count;
+ u32 size;
+
+ size = min_t(size_t, data_len, INT_MAX);
+ count = recv_actor(desc, skb, offset, size);
+ if (count <= 0) {
+ if (!copied)
+ copied = count;
+ break;
+ }
+
+ copied += count;
+
+ msk->bytes_consumed += count;
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ break;
+ }
+
+ mptcp_eat_recv_skb(sk, skb);
+ }
+
+ if (noack)
+ goto out;
+
+ mptcp_rcv_space_adjust(msk, copied);
+
+ if (copied > 0) {
+ mptcp_recv_skb(sk, &offset);
+ mptcp_cleanup_rbuf(msk, copied);
+ }
+out:
+ return copied;
+}
+
+static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __mptcp_read_sock(sk, desc, recv_actor, false);
+}
+
+static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
+
+ return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ * mptcp_splice_read - splice data from MPTCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ * Return:
+ * Amount of bytes that have been spliced.
+ *
+ **/
+static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ struct sock *sk = sock->sk;
+ ssize_t spliced = 0;
+ int ret = 0;
+ long timeo;
+
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ lock_sock(sk);
+
+ mptcp_rps_record_subflows(mptcp_sk(sk));
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __mptcp_splice_read(sk, &tss);
+ if (ret < 0) {
+ break;
+ } else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ /* if __mptcp_splice_read() got nothing while we have
+ * an skb in receive queue, we do not want to loop.
+ * This might happen with URG data.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ ret = sk_wait_data(sk, &timeo, NULL);
+ if (ret < 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!tss.len || !timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -4332,6 +4531,8 @@ static const struct proto_ops mptcp_stream_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct inet_protosw mptcp_protosw = {
@@ -4436,6 +4637,8 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct proto mptcp_v6_prot;