summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoland McGrath <roland@redhat.com>2003-03-16 16:25:22 -0800
committerChristoph Hellwig <hch@sgi.com>2003-03-16 16:25:22 -0800
commit874f2e4778535550d4cf65055bcb4003604f0e40 (patch)
treeb33ec4ea3761b720b5d274a1d7531d8b546e9274
parente54f721e229c8218e1483775edcf602f1d12063b (diff)
[PATCH] signal fix for wedge on multithreaded core dump
This is a fix made almost a month ago, during the flurry of signal changes. I didn't realize until today that this hadn't made it into 2.5. Sorry about the delay. This fix is necessary to avoid sometimes wedging in uninterruptible sleep when doing a multithreaded core dump triggered by a process signal (kill) rather than a trap. You can reproduce the problem by running your favorite multithreaded program (NPTL) and then using "kill -SEGV" on it. It will often wedge. The actual fix could be just a two-line diff: + if (current->signal->group_exit) + goto dequeue; after the group_exit_task check. That is the fix that has been used in Ingo's backport for weeks and tested heavily (well, as heavily as core dumping ever gets tested, but it's been in our production systems). But I broke the hair out into a separate function. The patch below has the same effect as the two-liner, and no other difference. I have tested 2.5.64 with this patch and it works for me, though I haven't beat on it. The way the wedge happens is that for a core-dump signal group_send_sig_info does a group stop of other threads before the one thread handles the fatal signal. If the fatal thread gets into do_coredump and coredump_wait first, then other threads see the group stop and suspend with SIGKILL pending. All other fatal cases clear group_stop_count, so this is the only way this ever happens. Checking group_exit fixes it. I didn't make do_coredump clear group_stop_count because doing it with the appropriate ordering and locking doesn't fit the organization of that code.
-rw-r--r--kernel/signal.c62
1 files changed, 41 insertions, 21 deletions
diff --git a/kernel/signal.c b/kernel/signal.c
index 49e483f8451e..7f630c0261e0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1460,6 +1460,45 @@ do_signal_stop(int signr)
#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER
+/*
+ * Do appropriate magic when group_stop_count > 0.
+ * The caller checks group_stop_count and must hold
+ * current->sighand->siglock when calling us.
+ *
+ * We return nonzero if we stopped, after releasing the siglock.
+ * We return zero if we still hold the siglock and should look
+ * for another signal without checking group_stop_count again.
+ */
+static inline int handle_group_stop(void)
+{
+ int stop_count;
+
+ if (current->signal->group_exit_task == current) {
+ /*
+ * Group stop is so we can do a core dump.
+ * We are the initiating thread, so get on with it:
+ * clear group_exit_task and return with the siglock
+ * still held so the caller can dequeue a signal.
+ */
+ current->signal->group_exit_task = NULL;
+ return 0;
+ }
+
+ if (current->signal->group_exit)
+ /*
+ * Group stop is so another thread can do a core dump,
+ * or else we are racing against a death signal.
+ * Just punt the stop so we can get the next signal.
+ * group_stop_count is deliberately left untouched here.
+ */
+ return 0;
+
+ /*
+ * There is a group stop in progress. We stop
+ * without any associated signal being in our queue.
+ *
+ * Decrement the group stop count, record the group exit
+ * code as our own exit code, and mark ourselves
+ * TASK_STOPPED, all before dropping the siglock. The
+ * decremented count is then handed to finish_stop().
+ */
+ stop_count = --current->signal->group_stop_count;
+ current->exit_code = current->signal->group_exit_code;
+ set_current_state(TASK_STOPPED);
+ spin_unlock_irq(&current->sighand->siglock);
+ finish_stop(stop_count);
+ return 1;
+}
+
int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs, void *cookie)
{
sigset_t *mask = &current->blocked;
@@ -1469,28 +1508,9 @@ int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs, void *cookie)
struct k_sigaction *ka;
spin_lock_irq(&current->sighand->siglock);
- if (unlikely(current->signal->group_stop_count > 0)) {
- int stop_count;
- if (current->signal->group_exit_task == current) {
- /*
- * Group stop is so we can do a core dump.
- */
- current->signal->group_exit_task = NULL;
- goto dequeue;
- }
- /*
- * There is a group stop in progress. We stop
- * without any associated signal being in our queue.
- */
- stop_count = --current->signal->group_stop_count;
- signr = current->signal->group_exit_code;
- current->exit_code = signr;
- set_current_state(TASK_STOPPED);
- spin_unlock_irq(&current->sighand->siglock);
- finish_stop(stop_count);
+ if (unlikely(current->signal->group_stop_count > 0) &&
+ handle_group_stop())
continue;
- }
- dequeue:
signr = dequeue_signal(current, mask, info);
spin_unlock_irq(&current->sighand->siglock);