From b18afb6f42292a9154102fed7265ae747bbfbc57 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Mon, 15 Jan 2024 12:55:09 -0800
Subject: tcp: Move tcp_ns_to_ts() to tcp.h

We will support arbitrary SYN Cookie with BPF.

When BPF prog validates ACK and kfunc allocates a reqsk, we need to
call tcp_ns_to_ts() to calculate an offset of TSval for later use:

  time
  t0 : Send SYN+ACK
     -> tsval = Initial TSval (Random Number)

  t1 : Recv ACK of 3WHS
     -> tsoff = TSecr - tcp_ns_to_ts(usec_ts_ok, tcp_clock_ns())
              = Initial TSval - t1

  t2 : Send ACK
     -> tsval = t2 + tsoff
              = Initial TSval + (t2 - t1)
              = Initial TSval + Time Delta (x)

  (x) Note that the time delta does not include the initial RTT
      from t0 to t1.

Let's move tcp_ns_to_ts() to tcp.h.

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/r/20240115205514.68364-2-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/net/tcp.h | 9 +++++++++
 1 file changed, 9 insertions(+)
(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd78a1181031..114000e71a46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -577,6 +577,15 @@ static inline u32 tcp_cookie_time(void)
 	return val;
 }
 
+/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
+static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val)
+{
+	if (usec_ts)
+		return div_u64(val, NSEC_PER_USEC);
+
+	return div_u64(val, NSEC_PER_MSEC);
+}
+
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
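For illustration, the TSval arithmetic above can be replayed in userspace.
This is a hedged sketch, not kernel code: tcp_ns_to_ts() is re-implemented
with plain division, and the t1/t2 timestamps are made-up sample values.

    /* Userspace replay of the t0/t1/t2 timeline from the changelog.
     * NSEC_PER_* constants mirror include/linux/time64.h; everything
     * else here is illustrative. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL
    #define NSEC_PER_MSEC 1000000ULL

    static uint64_t tcp_ns_to_ts(bool usec_ts, uint64_t val)
    {
        return val / (usec_ts ? NSEC_PER_USEC : NSEC_PER_MSEC);
    }

    int main(void)
    {
        bool usec_ts_ok = true;
        uint32_t initial_tsval = 0x12345678; /* random TSval sent at t0 */
        uint64_t t1_ns = 2000000000ULL;      /* clock when the 3WHS ACK arrives */
        uint64_t t2_ns = 2500000000ULL;      /* clock when a later ACK is sent */

        /* t1: TSecr echoes the initial TSval, so the offset is recoverable. */
        uint32_t tsoff = initial_tsval - (uint32_t)tcp_ns_to_ts(usec_ts_ok, t1_ns);

        /* t2: clock + tsoff == Initial TSval + (t2 - t1), as in the changelog. */
        uint32_t tsval = (uint32_t)tcp_ns_to_ts(usec_ts_ok, t2_ns) + tsoff;

        printf("tsval at t2 = %u (delta = %u usec)\n",
               tsval, tsval - initial_tsval);
        return 0;
    }

Note how u32 wraparound makes the subtraction safe even when the random
initial TSval is smaller than the clock sample.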
From 95e752b5299fa8c90099f7bc2aa1ee3e2e2c95ab Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Mon, 15 Jan 2024 12:55:10 -0800
Subject: tcp: Move skb_steal_sock() to request_sock.h

We will support arbitrary SYN Cookie with BPF.

If BPF prog validates ACK and kfunc allocates a reqsk, it will be
carried to TCP stack as skb->sk with req->syncookie 1.

In skb_steal_sock(), we need to check inet_reqsk(sk)->syncookie to
see if the reqsk is created by kfunc.  However, inet_reqsk() is not
available in sock.h.

Let's move skb_steal_sock() to request_sock.h.

While at it, we refactor skb_steal_sock() so it returns early if
skb->sk is NULL to minimise the following patch.

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/r/20240115205514.68364-3-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/net/request_sock.h | 28 ++++++++++++++++++++++++++++
 include/net/sock.h         | 25 -------------------------
 2 files changed, 28 insertions(+), 25 deletions(-)
(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 144c39db9898..26c630c40abb 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -83,6 +83,34 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 	return (struct sock *)req;
 }
 
+/**
+ * skb_steal_sock - steal a socket from an sk_buff
+ * @skb: sk_buff to steal the socket from
+ * @refcounted: is set to true if the socket is reference-counted
+ * @prefetched: is set to true if the socket was assigned from bpf
+ */
+static inline struct sock *skb_steal_sock(struct sk_buff *skb,
+					  bool *refcounted, bool *prefetched)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk) {
+		*prefetched = false;
+		*refcounted = false;
+		return NULL;
+	}
+
+	*prefetched = skb_sk_is_prefetched(skb);
+	if (*prefetched)
+		*refcounted = sk_is_refcounted(sk);
+	else
+		*refcounted = true;
+
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	return sk;
+}
+
 static inline struct request_sock *
 reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	    bool attach_listener)
diff --git a/include/net/sock.h b/include/net/sock.h
index a7f815c7cfdf..32a399fdcbb5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2814,31 +2814,6 @@ sk_is_refcounted(struct sock *sk)
 	return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
 }
 
-/**
- * skb_steal_sock - steal a socket from an sk_buff
- * @skb: sk_buff to steal the socket from
- * @refcounted: is set to true if the socket is reference-counted
- * @prefetched: is set to true if the socket was assigned from bpf
- */
-static inline struct sock *
-skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched)
-{
-	if (skb->sk) {
-		struct sock *sk = skb->sk;
-
-		*refcounted = true;
-		*prefetched = skb_sk_is_prefetched(skb);
-		if (*prefetched)
-			*refcounted = sk_is_refcounted(sk);
-		skb->destructor = NULL;
-		skb->sk = NULL;
-		return sk;
-	}
-	*prefetched = false;
-	*refcounted = false;
-	return NULL;
-}
-
 /* Checks if this SKB belongs to an HW offloaded socket
  * and whether any SW fallbacks are required based on dev.
  * Check decrypted mark in case skb_orphan() cleared socket.
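To see what the refactored flow does with its two out-parameters, here is
a runnable userspace rendition. The types are reduced stand-ins, and
sk_is_refcounted() is collapsed into a single flag for illustration (the
real test is !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)).

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct mock_sock { bool rcu_freed; };
    struct mock_skb { struct mock_sock *sk; bool prefetched; };

    /* Userspace rendition of the refactored flow: early return on !sk,
     * then the prefetched case decides whether a refcnt travels along. */
    static struct mock_sock *steal(struct mock_skb *skb, bool *refcounted,
                                   bool *prefetched)
    {
        struct mock_sock *sk = skb->sk;

        if (!sk) {
            *prefetched = false;
            *refcounted = false;
            return NULL;
        }

        *prefetched = skb->prefetched;
        if (*prefetched)
            *refcounted = !sk->rcu_freed;   /* ~ sk_is_refcounted() */
        else
            *refcounted = true;

        skb->sk = NULL;                     /* skb gives up ownership */
        return sk;
    }

    int main(void)
    {
        struct mock_sock sk = { .rcu_freed = true };
        struct mock_skb skb = { .sk = &sk, .prefetched = true };
        bool refcounted, prefetched;

        steal(&skb, &refcounted, &prefetched);
        /* caller contract: sock_put() only if refcounted came back true */
        printf("prefetched=%d refcounted=%d\n", prefetched, refcounted);
        return 0;
    }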
From 8b5ac68fb5ee416537c1214cbacf0ddc4293cce9 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Mon, 15 Jan 2024 12:55:11 -0800
Subject: bpf: tcp: Handle BPF SYN Cookie in skb_steal_sock().

We will support arbitrary SYN Cookie with BPF.

If BPF prog validates ACK and kfunc allocates a reqsk, it will be
carried to TCP stack as skb->sk with req->syncookie 1.  Also, the
reqsk has its listener as req->rsk_listener with no refcnt taken.

When the TCP stack looks up a socket from the skb, we steal
inet_reqsk(skb->sk)->rsk_listener in skb_steal_sock() so that the
skb will be processed in cookie_v[46]_check() with the listener.

Note that we do not clear skb->sk and skb->destructor so that we
can carry the reqsk to cookie_v[46]_check().

Signed-off-by: Kuniyuki Iwashima
Link: https://lore.kernel.org/r/20240115205514.68364-4-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/net/request_sock.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)
(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 26c630c40abb..8839133d6f6b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -101,10 +101,21 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb,
 	}
 
 	*prefetched = skb_sk_is_prefetched(skb);
-	if (*prefetched)
+	if (*prefetched) {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+		if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+			struct request_sock *req = inet_reqsk(sk);
+
+			*refcounted = false;
+			sk = req->rsk_listener;
+			req->rsk_listener = NULL;
+			return sk;
+		}
+#endif
 		*refcounted = sk_is_refcounted(sk);
-	else
+	} else {
 		*refcounted = true;
+	}
 
 	skb->destructor = NULL;
 	skb->sk = NULL;
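To make the new branch concrete, a small runnable userspace model of the
decision logic after this patch follows. The struct and the state
constants are reduced stand-ins, not kernel types.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    enum { TCP_LISTEN = 10, TCP_NEW_SYN_RECV = 12 };   /* stand-in values */

    struct mock_sock {
        int state;
        bool syncookie;
        struct mock_sock *rsk_listener;
    };

    /* Models the stolen-socket decision: a kfunc-allocated reqsk
     * (TCP_NEW_SYN_RECV + syncookie) yields its listener, and
     * *refcounted stays false because the kfunc took no refcnt. */
    static struct mock_sock *steal(struct mock_sock *sk, bool *refcounted)
    {
        if (sk->state == TCP_NEW_SYN_RECV && sk->syncookie) {
            struct mock_sock *listener = sk->rsk_listener;

            sk->rsk_listener = NULL;   /* reqsk drops its pointer */
            *refcounted = false;
            return listener;
        }

        *refcounted = true;            /* ordinary (non-reqsk) socket */
        return sk;
    }

    int main(void)
    {
        struct mock_sock listener = { .state = TCP_LISTEN };
        struct mock_sock reqsk = { .state = TCP_NEW_SYN_RECV,
                                   .syncookie = true,
                                   .rsk_listener = &listener };
        bool refcounted;
        struct mock_sock *sk = steal(&reqsk, &refcounted);

        printf("stole listener: %d, refcounted: %d\n",
               sk == &listener, refcounted);
        return 0;
    }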
From 695751e31a63efd2bbe6779873adf1e4deb00cd5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Mon, 15 Jan 2024 12:55:12 -0800
Subject: bpf: tcp: Handle BPF SYN Cookie in cookie_v[46]_check().

We will support arbitrary SYN Cookie with BPF in the following patch.

If BPF prog validates ACK and kfunc allocates a reqsk, it will be
carried to cookie_v[46]_check() as skb->sk.  If skb->sk is not NULL,
we call cookie_bpf_check().

Then, we clear skb->sk and skb->destructor, which is necessary so
that we do not hold refcnt for the reqsk and the listener.  See the
following patch for details.

After that, we finish initialisation for the remaining fields with
cookie_tcp_reqsk_init().

Note that the server side WScale is set only for non-BPF SYN Cookie.

Signed-off-by: Kuniyuki Iwashima
Link: https://lore.kernel.org/r/20240115205514.68364-5-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/net/tcp.h     | 20 ++++++++++++++++++++
 net/ipv4/syncookies.c | 31 +++++++++++++++++++++++++++----
 net/ipv6/syncookies.c | 13 +++++++++----
 3 files changed, 56 insertions(+), 8 deletions(-)
(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 114000e71a46..dfe99a084a71 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -599,6 +599,26 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 	       dst_feature(dst, RTAX_FEATURE_ECN);
 }
 
+#if IS_ENABLED(CONFIG_BPF)
+static inline bool cookie_bpf_ok(struct sk_buff *skb)
+{
+	return skb->sk;
+}
+
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb);
+#else
+static inline bool cookie_bpf_ok(struct sk_buff *skb)
+{
+	return false;
+}
+
+static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk,
+						    struct sk_buff *skb)
+{
+	return NULL;
+}
+#endif
+
 /* From net/ipv6/syncookies.c */
 int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 981944c22820..be88bf586ff9 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -295,6 +295,24 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_BPF)
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct request_sock *req = inet_reqsk(skb->sk);
+
+	skb->sk = NULL;
+	skb->destructor = NULL;
+
+	if (cookie_tcp_reqsk_init(sk, skb, req)) {
+		reqsk_free(req);
+		req = NULL;
+	}
+
+	return req;
+}
+EXPORT_SYMBOL_GPL(cookie_bpf_check);
+#endif
+
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 					    struct sock *sk, struct sk_buff *skb,
 					    struct tcp_options_received *tcp_opt,
@@ -395,9 +413,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	    !th->ack || th->rst)
 		goto out;
 
-	req = cookie_tcp_check(net, sk, skb);
-	if (IS_ERR(req))
-		goto out;
+	if (cookie_bpf_ok(skb)) {
+		req = cookie_bpf_check(sk, skb);
+	} else {
+		req = cookie_tcp_check(net, sk, skb);
+		if (IS_ERR(req))
+			goto out;
+	}
 
 	if (!req)
 		goto out_drop;
@@ -445,7 +467,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(&rt->dst, RTAX_INITRWND));
 
-	ireq->rcv_wscale = rcv_wscale;
+	if (!req->syncookie)
+		ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
 
 	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index c8d2ca27220c..6b9c69278819 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -182,9 +182,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	    !th->ack || th->rst)
 		goto out;
 
-	req = cookie_tcp_check(net, sk, skb);
-	if (IS_ERR(req))
-		goto out;
+	if (cookie_bpf_ok(skb)) {
+		req = cookie_bpf_check(sk, skb);
+	} else {
+		req = cookie_tcp_check(net, sk, skb);
+		if (IS_ERR(req))
+			goto out;
+	}
 
 	if (!req)
 		goto out_drop;
@@ -247,7 +251,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(dst, RTAX_INITRWND));
 
-	ireq->rcv_wscale = rcv_wscale;
+	if (!req->syncookie)
+		ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok &= cookie_ecn_ok(net, dst);
 
 	ret = tcp_get_cookie_sock(sk, skb, req, dst);
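The dispatch added above to cookie_v[46]_check() is easiest to see as a
truth table; the runnable model below is a reduced stand-in, not kernel
code. It captures the asymmetry: the classic path may return an error
pointer ("not a valid cookie", goto out, keep processing with the
listener), while the BPF path only yields a reqsk or NULL (goto
out_drop). The classic path's alloc-failure NULL is folded out for
brevity.

    #include <stdio.h>

    enum outcome { OUT_REQSK, OUT_DROP, OUT_LISTENER };

    /* Reduced model of the cookie_v[46]_check() dispatch. */
    static enum outcome cookie_dispatch(int bpf_path, int valid)
    {
        if (bpf_path)                 /* cookie_bpf_check() */
            return valid ? OUT_REQSK : OUT_DROP;

        if (!valid)                   /* IS_ERR(req): goto out */
            return OUT_LISTENER;

        return OUT_REQSK;             /* cookie_tcp_check() decoded it */
    }

    int main(void)
    {
        printf("bpf, invalid     -> %d (1 = drop)\n", cookie_dispatch(1, 0));
        printf("classic, invalid -> %d (2 = keep listener)\n", cookie_dispatch(0, 0));
        printf("classic, valid   -> %d (0 = reqsk)\n", cookie_dispatch(0, 1));
        return 0;
    }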
From e472f88891abbc535a5e16a68a104073985f6061 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Mon, 15 Jan 2024 12:55:13 -0800
Subject: bpf: tcp: Support arbitrary SYN Cookie.

This patch adds a new kfunc available at TC hook to support arbitrary
SYN Cookie.

The basic usage is as follows:

    struct bpf_tcp_req_attrs attrs = {
        .mss = mss,
        .wscale_ok = wscale_ok,
        .rcv_wscale = rcv_wscale, /* Server's WScale < 15 */
        .snd_wscale = snd_wscale, /* Client's WScale < 15 */
        .tstamp_ok = tstamp_ok,
        .rcv_tsval = tsval,
        .rcv_tsecr = tsecr, /* Server's Initial TSval */
        .usec_ts_ok = usec_ts_ok,
        .sack_ok = sack_ok,
        .ecn_ok = ecn_ok,
    };

    skc = bpf_skc_lookup_tcp(...);
    sk = (struct sock *)bpf_skc_to_tcp_sock(skc);
    bpf_sk_assign_tcp_reqsk(skb, sk, &attrs, sizeof(attrs));
    bpf_sk_release(skc);

bpf_sk_assign_tcp_reqsk() takes skb, a listener sk, and struct
bpf_tcp_req_attrs, then allocates a reqsk and configures it.  Then,
bpf_sk_assign_tcp_reqsk() links the reqsk with the skb and the
listener.

The notable thing here is that we do not hold refcnt for either the
reqsk or the listener.  To differentiate that, we mark
reqsk->syncookie, which is only used in TX for now.  So, if
reqsk->syncookie is 1 in RX, it means that the reqsk is allocated by
kfunc.

When skb is freed, sock_pfree() checks if reqsk->syncookie is 1, and
in that case, we set reqsk->rsk_listener to NULL before calling
reqsk_free() as the reqsk does not hold a refcnt on the listener.

When the TCP stack looks up a socket from the skb, we steal the
listener from the reqsk in skb_steal_sock() and create a full sk in
cookie_v[46]_check().

The refcnt of the reqsk is finally set to 1 in tcp_get_cookie_sock()
after creating a full sk.

Note that we can extend struct bpf_tcp_req_attrs in the future when
we add a new attribute that is determined in 3WHS.

Signed-off-by: Kuniyuki Iwashima
Link: https://lore.kernel.org/r/20240115205514.68364-6-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/net/tcp.h |  14 +++++++
 net/core/filter.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/core/sock.c   |  14 ++++++-
 3 files changed, 135 insertions(+), 4 deletions(-)
(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dfe99a084a71..451dc1373970 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -600,6 +600,20 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 }
 
 #if IS_ENABLED(CONFIG_BPF)
+struct bpf_tcp_req_attrs {
+	u32 rcv_tsval;
+	u32 rcv_tsecr;
+	u16 mss;
+	u8 rcv_wscale;
+	u8 snd_wscale;
+	u8 ecn_ok;
+	u8 wscale_ok;
+	u8 sack_ok;
+	u8 tstamp_ok;
+	u8 usec_ts_ok;
+	u8 reserved[3];
+};
+
 static inline bool cookie_bpf_ok(struct sk_buff *skb)
 {
 	return skb->sk;
diff --git a/net/core/filter.c b/net/core/filter.c
index 8c9f67c81e22..6a7abbaa50b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11837,6 +11837,103 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
 	return 0;
 }
 
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
+					struct bpf_tcp_req_attrs *attrs, int attrs__sz)
+{
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+	const struct request_sock_ops *ops;
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
+	struct request_sock *req;
+	struct net *net;
+	__u16 min_mss;
+	u32 tsoff = 0;
+
+	if (attrs__sz != sizeof(*attrs) ||
+	    attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
+		return -EINVAL;
+
+	if (!skb_at_tc_ingress(skb))
+		return -EINVAL;
+
+	net = dev_net(skb->dev);
+	if (net != sock_net(sk))
+		return -ENETUNREACH;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ops = &tcp_request_sock_ops;
+		min_mss = 536;
+		break;
+#if IS_BUILTIN(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		ops = &tcp6_request_sock_ops;
+		min_mss = IPV6_MIN_MTU - 60;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
+	    sk_is_mptcp(sk))
+		return -EINVAL;
+
+	if (attrs->mss < min_mss)
+		return -EINVAL;
+
+	if (attrs->wscale_ok) {
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
+			return -EINVAL;
+
+		if (attrs->snd_wscale > TCP_MAX_WSCALE ||
+		    attrs->rcv_wscale > TCP_MAX_WSCALE)
+			return -EINVAL;
+	}
+
+	if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+		return -EINVAL;
+
+	if (attrs->tstamp_ok) {
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+			return -EINVAL;
+
+		tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
+	}
+
+	req = inet_reqsk_alloc(ops, sk, false);
+	if (!req)
+		return -ENOMEM;
+
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+
+	req->rsk_listener = sk;
+	req->syncookie = 1;
+	req->mss = attrs->mss;
+	req->ts_recent = attrs->rcv_tsval;
+
+	ireq->snd_wscale = attrs->snd_wscale;
+	ireq->rcv_wscale = attrs->rcv_wscale;
+	ireq->tstamp_ok = !!attrs->tstamp_ok;
+	ireq->sack_ok = !!attrs->sack_ok;
+	ireq->wscale_ok = !!attrs->wscale_ok;
+	ireq->ecn_ok = !!attrs->ecn_ok;
+
+	treq->req_usec_ts = !!attrs->usec_ts_ok;
+	treq->ts_off = tsoff;
+
+	skb_orphan(skb);
+	skb->sk = req_to_sk(req);
+	skb->destructor = sock_pfree;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 __bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11865,6 +11962,10 @@ BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
 BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
 
+BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
@@ -11880,6 +11981,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
 	.set = &bpf_kfunc_check_set_sock_addr,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11895,8 +12001,9 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
-						&bpf_kfunc_set_sock_addr);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+					       &bpf_kfunc_set_sock_addr);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
 }
 late_initcall(bpf_kfunc_init);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..147fb2656e6b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2582,8 +2582,18 @@ EXPORT_SYMBOL(sock_efree);
 #ifdef CONFIG_INET
 void sock_pfree(struct sk_buff *skb)
 {
-	if (sk_is_refcounted(skb->sk))
-		sock_gen_put(skb->sk);
+	struct sock *sk = skb->sk;
+
+	if (!sk_is_refcounted(sk))
+		return;
+
+	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+		inet_reqsk(sk)->rsk_listener = NULL;
+		reqsk_free(inet_reqsk(sk));
+		return;
+	}
+
+	sock_gen_put(sk);
 }
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
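Finally, a hedged end-to-end sketch of how a tc ingress program might use
the new kfunc, expanding the usage snippet from the changelog. Only
bpf_sk_assign_tcp_reqsk(), the existing helpers (bpf_skc_lookup_tcp(),
bpf_skc_to_tcp_sock(), bpf_sk_release()), and struct bpf_tcp_req_attrs
come from the kernel; my_cookie_valid(), the empty tuple, and the overall
skeleton are illustrative assumptions.

    /* SPDX-License-Identifier: GPL-2.0 */
    /* Illustrative tc-bpf skeleton; cookie parsing/validation is stubbed. */
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    #define TC_ACT_OK 0  /* from linux/pkt_cls.h, redefined for vmlinux.h */

    extern int bpf_sk_assign_tcp_reqsk(struct __sk_buff *skb, struct sock *sk,
                                       struct bpf_tcp_req_attrs *attrs,
                                       int attrs__sz) __ksym;

    /* Placeholder: decode mss/wscale/etc. from the cookie echoed in the
     * ACK of the 3WHS and fill *attrs; returns false if it is not ours. */
    static bool my_cookie_valid(struct __sk_buff *skb,
                                struct bpf_tcp_req_attrs *attrs)
    {
        return false;
    }

    SEC("tc")
    int custom_syncookie(struct __sk_buff *skb)
    {
        struct bpf_tcp_req_attrs attrs = {};
        struct bpf_sock_tuple tuple = {};  /* fill from parsed headers */
        struct bpf_sock *skc;
        struct sock *sk;

        if (!my_cookie_valid(skb, &attrs))
            return TC_ACT_OK;

        skc = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                 BPF_F_CURRENT_NETNS, 0);
        if (!skc)
            return TC_ACT_OK;

        sk = (struct sock *)bpf_skc_to_tcp_sock(skc);
        if (sk)
            /* the kfunc re-checks TCP_LISTEN, mss, wscale, etc. */
            bpf_sk_assign_tcp_reqsk(skb, sk, &attrs, sizeof(attrs));

        bpf_sk_release(skc);
        return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";

On success, the reqsk rides along as skb->sk into the stack, where
skb_steal_sock() hands the listener to the input path and
cookie_v[46]_check() turns the reqsk into a full socket, as the patches
above describe.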