From d679ae94fdd5d3ab00c35078f5af5f37e068b03d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: list: fix a data-race around ep->rdllist ep_poll() first calls ep_events_available() with no lock held and checks if ep->rdllist is empty by list_empty_careful(), which reads rdllist->prev. Thus all accesses to it need some protection to avoid store/load-tearing. Note INIT_LIST_HEAD_RCU() already has the annotation for both prev and next. Commit bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") added the first lockless ep_events_available(), and commit c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") made some ep_events_available() calls lockless and added single call under a lock, finally commit e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") made the last ep_events_available() lockless. BUG: KCSAN: data-race in do_epoll_wait / do_epoll_wait write to 0xffff88810480c7d8 of 8 bytes by task 1802 on cpu 0: INIT_LIST_HEAD include/linux/list.h:38 [inline] list_splice_init include/linux/list.h:492 [inline] ep_start_scan fs/eventpoll.c:622 [inline] ep_send_events fs/eventpoll.c:1656 [inline] ep_poll fs/eventpoll.c:1806 [inline] do_epoll_wait+0x4eb/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88810480c7d8 of 8 bytes by task 1799 on cpu 1: list_empty_careful include/linux/list.h:329 [inline] ep_events_available fs/eventpoll.c:381 [inline] ep_poll fs/eventpoll.c:1797 [inline] do_epoll_wait+0x279/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff88810480c7d0 -> 0xffff888103c15098 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1799 Comm: syz-fuzzer Tainted: G W 5.17.0-rc7-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Link: https://lkml.kernel.org/r/20220322002653.33865-3-kuniyu@amazon.co.jp Fixes: e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") Fixes: bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") Signed-off-by: Kuniyuki Iwashima Reported-by: syzbot+bdd6e38a1ed5ee58d8bd@syzkaller.appspotmail.com Cc: Al Viro , Andrew Morton Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: "Soheil Hassas Yeganeh" Cc: Davidlohr Bueso Cc: "Sridhar Samudrala" Cc: Alexander Duyck Signed-off-by: Andrew Morton --- include/linux/list.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/list.h') diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..d7d2bfa1a365 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -35,7 +35,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); - list->prev = list; + WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST @@ -306,7 +306,7 @@ static inline int list_empty(const struct list_head *head) static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); - entry->prev = entry; + WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } @@ -326,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return list_is_head(next, head) && (next == head->prev); + return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** -- cgit v1.2.3 From 2fbdf45d7d26361a0c3ec8833fd96edf0f5812da Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 6 May 2022 11:12:57 -0700 Subject: list: Add list_next_entry_circular() and list_prev_entry_circular() Add macros to get the next or previous entries and wraparound if needed. For example, calling list_next_entry_circular() on the last element should return the first element in the list. Signed-off-by: Ricardo Martinez Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/list.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux/list.h') diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..c147eeb2d39d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -563,6 +563,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) +/** + * list_next_entry_circular - get the next element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the last element (return the first element). + * Note, that list is expected to be not empty. + */ +#define list_next_entry_circular(pos, head, member) \ + (list_is_last(&(pos)->member, head) ? \ + list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) + /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor @@ -571,6 +584,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) +/** + * list_prev_entry_circular - get the prev element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the first element (return the last element). + * Note, that list is expected to be not empty. + */ +#define list_prev_entry_circular(pos, head, member) \ + (list_is_first(&(pos)->member, head) ? \ + list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From ad25f5cb39872ca14bcbe00816ae65c22fe04b89 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 21 May 2022 08:45:28 +0100 Subject: rxrpc: Fix locking issue There's a locking issue with the per-netns list of calls in rxrpc. The pieces of code that add and remove a call from the list use write_lock() and the calls procfile uses read_lock() to access it. However, the timer callback function may trigger a removal by trying to queue a call for processing and finding that it's already queued - at which point it has a spare refcount that it has to do something with. Unfortunately, if it puts the call and this reduces the refcount to 0, the call will be removed from the list. Unfortunately, since the _bh variants of the locking functions aren't used, this can deadlock. ================================ WARNING: inconsistent lock state 5.18.0-rc3-build4+ #10 Not tainted -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. ksoftirqd/2/25 [HC0[0]:SC1[1]:HE1:SE0] takes: ffff888107ac4038 (&rxnet->call_lock){+.?.}-{2:2}, at: rxrpc_put_call+0x103/0x14b {SOFTIRQ-ON-W} state was registered at: ... Possible unsafe locking scenario: CPU0 ---- lock(&rxnet->call_lock); lock(&rxnet->call_lock); *** DEADLOCK *** 1 lock held by ksoftirqd/2/25: #0: ffff8881008ffdb0 ((&call->timer)){+.-.}-{0:0}, at: call_timer_fn+0x5/0x23d Changes ======= ver #2) - Changed to using list_next_rcu() rather than rcu_dereference() directly. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- fs/seq_file.c | 32 ++++++++++++++++++++++++++++++++ include/linux/list.h | 10 ++++++++++ include/linux/seq_file.h | 4 ++++ net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/call_object.c | 18 +++++++++--------- net/rxrpc/net_ns.c | 2 +- net/rxrpc/proc.c | 10 ++-------- 8 files changed, 62 insertions(+), 22 deletions(-) (limited to 'include/linux/list.h') diff --git a/fs/seq_file.c b/fs/seq_file.c index 7ab8a58c29b6..9456a2032224 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -931,6 +931,38 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) } EXPORT_SYMBOL(seq_list_next); +struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each_rcu(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} +EXPORT_SYMBOL(seq_list_start_rcu); + +struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start_rcu(head, pos - 1); +} +EXPORT_SYMBOL(seq_list_start_head_rcu); + +struct list_head *seq_list_next_rcu(void *v, struct list_head *head, + loff_t *ppos) +{ + struct list_head *lh; + + lh = list_next_rcu((struct list_head *)v); + ++*ppos; + return lh == head ? NULL : lh; +} +EXPORT_SYMBOL(seq_list_next_rcu); + /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist diff --git a/include/linux/list.h b/include/linux/list.h index c147eeb2d39d..57e8b559cdf6 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -605,6 +605,16 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) +/** + * list_for_each_rcu - Iterate over a list in an RCU-safe fashion + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + !list_is_head(pos, (head)); \ + pos = rcu_dereference(pos->next)) + /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 60820ab511d2..bd023dd38ae6 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -277,6 +277,10 @@ extern struct list_head *seq_list_start_head(struct list_head *head, extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); +extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); + /* * Helpers for iteration over hlist_head-s in seq_files */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 52a23d03d694..dcc0ec0bf3de 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -60,7 +60,7 @@ struct rxrpc_net { struct proc_dir_entry *proc_net; /* Subdir in /proc/net */ u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ - rwlock_t call_lock; /* Lock for ->calls */ + spinlock_t call_lock; /* Lock for ->calls */ atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 8ae59edc2551..99e10eea3732 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -140,9 +140,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxnet = call->rxnet; - write_lock(&rxnet->call_lock); - list_add_tail(&call->link, &rxnet->calls); - write_unlock(&rxnet->call_lock); + spin_lock_bh(&rxnet->call_lock); + list_add_tail_rcu(&call->link, &rxnet->calls); + spin_unlock_bh(&rxnet->call_lock); b->call_backlog[call_head] = call; smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8764a4f19c03..84d0a4109645 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -337,9 +337,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxnet = call->rxnet; - write_lock(&rxnet->call_lock); - list_add_tail(&call->link, &rxnet->calls); - write_unlock(&rxnet->call_lock); + spin_lock_bh(&rxnet->call_lock); + list_add_tail_rcu(&call->link, &rxnet->calls); + spin_unlock_bh(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. */ release_sock(&rx->sk); @@ -633,9 +633,9 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { - write_lock(&rxnet->call_lock); + spin_lock_bh(&rxnet->call_lock); list_del_init(&call->link); - write_unlock(&rxnet->call_lock); + spin_unlock_bh(&rxnet->call_lock); } rxrpc_cleanup_call(call); @@ -707,7 +707,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) _enter(""); if (!list_empty(&rxnet->calls)) { - write_lock(&rxnet->call_lock); + spin_lock_bh(&rxnet->call_lock); while (!list_empty(&rxnet->calls)) { call = list_entry(rxnet->calls.next, @@ -722,12 +722,12 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) rxrpc_call_states[call->state], call->flags, call->events); - write_unlock(&rxnet->call_lock); + spin_unlock_bh(&rxnet->call_lock); cond_resched(); - write_lock(&rxnet->call_lock); + spin_lock_bh(&rxnet->call_lock); } - write_unlock(&rxnet->call_lock); + spin_unlock_bh(&rxnet->call_lock); } atomic_dec(&rxnet->nr_calls); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 34f389975a7d..bb4c25d6df64 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -50,7 +50,7 @@ static __net_init int rxrpc_init_net(struct net *net) rxnet->epoch |= RXRPC_RANDOM_EPOCH; INIT_LIST_HEAD(&rxnet->calls); - rwlock_init(&rxnet->call_lock); + spin_lock_init(&rxnet->call_lock); atomic_set(&rxnet->nr_calls, 1); atomic_set(&rxnet->nr_conns, 1); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 8967201fd8e5..245418943e01 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -26,29 +26,23 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) __acquires(rcu) - __acquires(rxnet->call_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); rcu_read_lock(); - read_lock(&rxnet->call_lock); - return seq_list_start_head(&rxnet->calls, *_pos); + return seq_list_start_head_rcu(&rxnet->calls, *_pos); } static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); - return seq_list_next(v, &rxnet->calls, pos); + return seq_list_next_rcu(v, &rxnet->calls, pos); } static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) - __releases(rxnet->call_lock) __releases(rcu) { - struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); - - read_unlock(&rxnet->call_lock); rcu_read_unlock(); } -- cgit v1.2.3