From 4408d55a64677febdcb50d1b44d0dc714ce4187e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:45 +0900 Subject: af_unix: Refactor unix_next_socket(). Currently, unix_next_socket() is overloaded depending on the 2nd argument. If it is NULL, unix_next_socket() returns the first socket in the hash. If not NULL, it returns the next socket in the same hash list or the first socket in the next non-empty hash list. This patch refactors unix_next_socket() into two functions unix_get_first() and unix_get_next(). unix_get_first() newly acquires a lock and returns the first socket in the list. unix_get_next() returns the next socket in a list or releases a lock and falls back to unix_get_first(). In the following patch, bpf iter holds entire sockets in a list and always releases the lock before .show(). It always calls unix_get_first() to acquire a lock in each iteration. So, this patch makes the change easier to follow. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-2-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c19569819866..e1c4082accdb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3240,49 +3240,58 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) return sk; } -static struct sock *unix_next_socket(struct seq_file *seq, - struct sock *sk, - loff_t *pos) +static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct sock *sk; - while (sk > (struct sock *)SEQ_START_TOKEN) { - sk = sk_next(sk); - if (!sk) - goto next_bucket; - if (sock_net(sk) == seq_file_net(seq)) - return sk; - } - - do { + while (bucket < ARRAY_SIZE(unix_socket_table)) { spin_lock(&unix_table_locks[bucket]); + sk = unix_from_bucket(seq, pos); if (sk) return sk; -next_bucket: - spin_unlock(&unix_table_locks[bucket++]); - *pos = set_bucket_offset(bucket, 1); - } while (bucket < ARRAY_SIZE(unix_socket_table)); + spin_unlock(&unix_table_locks[bucket]); + + *pos = set_bucket_offset(++bucket, 1); + } return NULL; } +static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, + loff_t *pos) +{ + unsigned long bucket = get_bucket(*pos); + + for (sk = sk_next(sk); sk; sk = sk_next(sk)) + if (sock_net(sk) == seq_file_net(seq)) + return sk; + + spin_unlock(&unix_table_locks[bucket]); + + *pos = set_bucket_offset(++bucket, 1); + + return unix_get_first(seq, pos); +} + static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table)) - return NULL; - - return unix_next_socket(seq, NULL, pos); + return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return unix_next_socket(seq, v, pos); + + if (v == SEQ_START_TOKEN) + return unix_get_first(seq, pos); + + return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) -- cgit v1.2.3 From 855d8e77ffb05be6e54c34dababccb20318aec00 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:46 +0900 Subject: bpf: af_unix: Use batching algorithm in bpf unix iter. The commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") introduces the batching algorithm to iterate TCP sockets with more consistency. This patch uses the same algorithm to iterate AF_UNIX sockets. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-3-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 177 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e1c4082accdb..d383d5f63b6b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3356,6 +3356,15 @@ static const struct seq_operations unix_seq_ops = { }; #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_unix_iter_state { + struct seq_net_private p; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); @@ -3374,24 +3383,156 @@ static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) + +{ + struct bpf_unix_iter_state *iter = seq->private; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { + if (sock_net(sk) != seq_file_net(seq)) + continue; + + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + + expected++; + } + + spin_unlock(&unix_table_locks[start_sk->sk_hash]); + + return expected; +} + +static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_unix_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static struct sock *bpf_iter_unix_batch(struct seq_file *seq, + loff_t *pos) +{ + struct bpf_unix_iter_state *iter = seq->private; + unsigned int expected; + bool resized = false; + struct sock *sk; + + if (iter->st_bucket_done) + *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + + sk = unix_get_first(seq, pos); + if (!sk) + return NULL; /* Done */ + + expected = bpf_iter_unix_hold_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (!*pos) + return SEQ_START_TOKEN; + + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + return bpf_iter_unix_batch(seq, pos); +} + +static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_unix_iter_state *iter = seq->private; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); + + ++*pos; + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_unix_batch(seq, pos); + + return sk; +} + static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; + bool slow; + int ret; if (v == SEQ_START_TOKEN) return 0; + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return unix_prog_seq_show(prog, &meta, v, uid); + ret = unix_prog_seq_show(prog, &meta, v, uid); +unlock: + unlock_sock_fast(sk, slow); + return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { + struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -3402,12 +3543,13 @@ static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) (void)unix_prog_seq_show(prog, &meta, v, 0); } - unix_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) + bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { - .start = unix_seq_start, - .next = unix_seq_next, + .start = bpf_iter_unix_seq_start, + .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; @@ -3456,11 +3598,39 @@ static struct pernet_operations unix_net_ops = { DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) +#define INIT_BATCH_SZ 16 + +static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_unix_iter_state *iter = priv_data; + int err; + + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; + + err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; +} + +static void bpf_iter_fini_unix(void *priv_data) +{ + struct bpf_unix_iter_state *iter = priv_data; + + bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); +} + static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct seq_net_private), + .init_seq_private = bpf_iter_init_unix, + .fini_seq_private = bpf_iter_fini_unix, + .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static struct bpf_iter_reg unix_reg_info = { -- cgit v1.2.3 From eb7d8f1d9ebc7379f09a51bf4faa35e0bfa7437d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:47 +0900 Subject: bpf: Support bpf_(get|set)sockopt() in bpf unix iter. This patch makes bpf_(get|set)sockopt() available when iterating AF_UNIX sockets. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-4-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d383d5f63b6b..3e0d6281fd1e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3633,6 +3633,20 @@ static const struct bpf_iter_seq_info unix_seq_info = { .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, @@ -3640,6 +3654,7 @@ static struct bpf_iter_reg unix_reg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; -- cgit v1.2.3 From e82025c623e2bf04d162bafceb66a59115814479 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:08:08 +0900 Subject: af_unix: Fix some data-races around unix_sk(sk)->oob_skb. Out-of-band data automatically places a "mark" showing wherein the sequence the out-of-band data would have been. If the out-of-band data implies cancelling everything sent so far, the "mark" is helpful to flush them. When the socket's read pointer reaches the "mark", the ioctl() below sets a non zero value to the arg `atmark`: The out-of-band data is queued in sk->sk_receive_queue as well as ordinary data and also saved in unix_sk(sk)->oob_skb. It can be used to test if the head of the receive queue is the out-of-band data meaning the socket is at the "mark". While testing that, unix_ioctl() reads unix_sk(sk)->oob_skb locklessly. Thus, all accesses to oob_skb need some basic protection to avoid load/store tearing which KCSAN detects when these are called concurrently: - ioctl(fd_a, SIOCATMARK, &atmark, sizeof(atmark)) - send(fd_b_connected_to_a, buf, sizeof(buf), MSG_OOB) BUG: KCSAN: data-race in unix_ioctl / unix_stream_sendmsg write to 0xffff888003d9cff0 of 8 bytes by task 175 on cpu 1: unix_stream_sendmsg (net/unix/af_unix.c:2087 net/unix/af_unix.c:2191) sock_sendmsg (net/socket.c:705 net/socket.c:725) __sys_sendto (net/socket.c:2040) __x64_sys_sendto (net/socket.c:2048) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) read to 0xffff888003d9cff0 of 8 bytes by task 176 on cpu 0: unix_ioctl (net/unix/af_unix.c:3101 (discriminator 1)) sock_do_ioctl (net/socket.c:1128) sock_ioctl (net/socket.c:1242) __x64_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:874 fs/ioctl.c:860 fs/ioctl.c:860) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) value changed: 0xffff888003da0c00 -> 0xffff888003da0d00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 176 Comm: unix_race_oob_i Not tainted 5.17.0-rc5-59529-g83dc4c2af682 #12 Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014 Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c19569819866..0c37e5595aae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2084,7 +2084,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other if (ousk->oob_skb) consume_skb(ousk->oob_skb); - ousk->oob_skb = skb; + WRITE_ONCE(ousk->oob_skb, skb); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); @@ -2602,9 +2602,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) { - u->oob_skb = NULL; - } + if (!(state->flags & MSG_PEEK)) + WRITE_ONCE(u->oob_skb, NULL); unix_state_unlock(sk); @@ -2639,7 +2638,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, skb = NULL; } else if (sock_flag(sk, SOCK_URGINLINE)) { if (!(flags & MSG_PEEK)) { - u->oob_skb = NULL; + WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } } else if (!(flags & MSG_PEEK)) { @@ -3094,11 +3093,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCATMARK: { struct sk_buff *skb; - struct unix_sock *u = unix_sk(sk); int answ = 0; skb = skb_peek(&sk->sk_receive_queue); - if (skb && skb == u->oob_skb) + if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) answ = 1; err = put_user(answ, (int __user *)arg); } -- cgit v1.2.3 From d9a232d435dcc966738b0f414a86f7edf4f4c8c4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:08:09 +0900 Subject: af_unix: Support POLLPRI for OOB. The commit 314001f0bf92 ("af_unix: Add OOB support") introduced OOB for AF_UNIX, but it lacks some changes for POLLPRI. Let's add the missing piece. In the selftest, normal datagrams are sent followed by OOB data, so this commit replaces `POLLIN | POLLPRI` with just `POLLPRI` in the first test case. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ tools/testing/selftests/net/af_unix/test_unix_oob.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0c37e5595aae..1e7ed5829ed5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3137,6 +3137,10 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (READ_ONCE(unix_sk(sk)->oob_skb)) + mask |= EPOLLPRI; +#endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c index 3dece8b29253..b57e91e1c3f2 100644 --- a/tools/testing/selftests/net/af_unix/test_unix_oob.c +++ b/tools/testing/selftests/net/af_unix/test_unix_oob.c @@ -218,10 +218,10 @@ main(int argc, char **argv) /* Test 1: * veriyf that SIGURG is - * delivered and 63 bytes are - * read and oob is '@' + * delivered, 63 bytes are + * read, oob is '@', and POLLPRI works. */ - wait_for_data(pfd, POLLIN | POLLPRI); + wait_for_data(pfd, POLLPRI); read_oob(pfd, &oob); len = read_data(pfd, buf, 1024); if (!signal_recvd || len != 63 || oob != '@') { -- cgit v1.2.3 From 4edf21aa94ee33c75f819f2b6eb6dd52ef8a1628 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:23:08 +0900 Subject: af_unix: Remove unnecessary brackets around CONFIG_AF_UNIX_OOB. Let's remove unnecessary brackets around CONFIG_AF_UNIX_OOB. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220317032308.65372-1-kuniyu@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3e0d6281fd1e..4247c4134f31 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2049,7 +2049,7 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other) { struct unix_sock *ousk = unix_sk(other); @@ -2115,7 +2115,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else @@ -2186,7 +2186,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, sent += size; } -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other); if (err) -- cgit v1.2.3