From 25f407f0b668f5e4ebd5d13e1fb4306ba6427ead Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 21 Oct 2005 15:03:29 -0700 Subject: [PATCH] Call exit_itimers from do_exit, not __exit_signal When I originally moved exit_itimers into __exit_signal, that was the only place where we could reliably know it was the last thread in the group dying, without races. Since then we've gotten the signal_struct.live counter, and do_exit can reliably do group-wide cleanup work. This patch moves the call to do_exit, where it's made without locks. This avoids the deadlock issues that the old __exit_signal code's comment talks about, and the one that Oleg found recently with process CPU timers. [ This replaces e03d13e985d48ac4885382c9e3b1510c78bd047f, which is why it was just reverted. ] Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 43077732619b..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -843,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code) group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); + exit_itimers(tsk->signal); acct_process(code); } exit_mm(tsk); -- cgit v1.2.3 From 3de463c7d9d58f8cf3395268230cb20a4c15bffa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Oct 2005 14:34:03 +0400 Subject: [PATCH] posix-timers: remove false BUG_ON() from run_posix_cpu_timers() do_exit() clears ->it_##clock##_expires, but nothing prevents another cpu to attach the timer to exiting process after that. After exit_notify() does 'write_unlock_irq(&tasklist_lock)' and before do_exit() calls 'schedule() local timer interrupt can find tsk->exit_state != 0. If that state was EXIT_DEAD (or another cpu does sys_wait4) interrupted task has ->signal == NULL. At this moment exiting task has no pending cpu timers, they were cleaned up in __exit_signal()->posix_cpu_timers_exit{,_group}(), so we can just return from irq. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 -------- kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ 2 files changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..4897977a1f4b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -825,14 +825,6 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; - /* - * Make sure we don't try to process any timer firings - * while we are already exiting. - */ - tsk->it_virt_expires = cputime_zero; - tsk->it_prof_expires = cputime_zero; - tsk->it_sched_expires = 0; - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 30ab39a27736..ccb04683bf18 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1285,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED - BUG_ON(tsk->exit_state); - /* * Double-check with locks held. */ read_lock(&tasklist_lock); - spin_lock(&tsk->sighand->siglock); + if (likely(tsk->signal != NULL)) { + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); + } read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From a362f463a6d316d14daed0f817e151835ce97ff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Oct 2005 09:07:33 -0700 Subject: Revert "remove false BUG_ON() from run_posix_cpu_timers()" This reverts commit 3de463c7d9d58f8cf3395268230cb20a4c15bffa. Roland has another patch that allows us to leave the BUG_ON() in place by just making sure that the condition it tests for really is always true. That goes in next. Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4897977a1f4b..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -825,6 +825,14 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; + /* + * Make sure we don't try to process any timer firings + * while we are already exiting. + */ + tsk->it_virt_expires = cputime_zero; + tsk->it_prof_expires = cputime_zero; + tsk->it_sched_expires = 0; + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 383ba22f0b62..ea1aca5e7c2b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1293,30 +1293,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED + BUG_ON(tsk->exit_state); + /* * Double-check with locks held. */ read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc//status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 1 - fs/exec.c | 1 - fs/proc/task_mmu.c | 23 +++++++++++++++++++++-- include/linux/mm.h | 3 --- include/linux/sched.h | 10 ++++++++++ kernel/exit.c | 5 ++++- kernel/sched.c | 2 -- mm/fremap.c | 4 +++- mm/hugetlb.c | 3 +++ mm/memory.c | 17 +---------------- mm/mmap.c | 4 ++++ mm/mremap.c | 12 ++++++++++-- mm/nommu.c | 15 ++------------- mm/rmap.c | 6 ++++++ 14 files changed, 64 insertions(+), 42 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/compat.c b/fs/compat.c index a719e158e002..8e71cdbecc7c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/exec.c b/fs/exec.c index cefadf5ab83b..9bb55c8cf224 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1207,7 +1207,6 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bccee7cf9ccd..7c89b4549049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,22 +14,41 @@ char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data, text, lib; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. + */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; buffer += sprintf(buffer, + "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n", - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - get_mm_rss(mm) << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); diff --git a/include/linux/mm.h b/include/linux/mm.h index da42093250c3..7d4552fe0864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -/* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(struct task_struct *tsk); - #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/include/linux/sched.h b/include/linux/sched.h index afcaac66cbd5..a9c0b7d26303 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define dec_mm_counter(mm, member) (mm)->_##member-- #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ +} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + typedef unsigned long mm_counter_t; struct mm_struct { diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..79f52b85d6ed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e27..4f26c544d02c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* diff --git a/mm/fremap.c b/mm/fremap.c index 7f08d10ceaff..49719a35769a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto err_unlock; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 094455bcbbf7..ac5f044bf514 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (! ptep) diff --git a/mm/memory.c b/mm/memory.c index a25ee1d3e20a..692ad810263d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); @@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_pfn); -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(struct task_struct *tsk) -{ - if (tsk->mm) { - unsigned long rss = get_mm_rss(tsk->mm); - - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mmap.c b/mm/mmap.c index 8a111792b8db..c43b28457007 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); @@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, @@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mremap.c b/mm/mremap.c index 318eea5467a0..ccf456477020 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long new_pgoff; unsigned long moved_len; unsigned long excess = 0; + unsigned long hiwater_vm; int split = 0; /* @@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* - * if we failed to move page tables we still do total_vm increment - * since do_munmap() will decrement it by old_len == new_len + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). */ + hiwater_vm = mm->hiwater_vm; mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); @@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (excess) { diff --git a/mm/nommu.c b/mm/nommu.c index 599924886eb5..dfb124ffb9be 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) realalloc -= kobjsize(vml); askedalloc -= sizeof(*vml); kfree(vml); + + update_hiwater_vm(mm); mm->total_vm -= len >> PAGE_SHIFT; #ifdef DEBUG @@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } -void update_mem_hiwater(struct task_struct *tsk) -{ - unsigned long rss; - - if (likely(tsk->mm)) { - rss = get_mm_rss(tsk->mm); - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/rmap.c b/mm/rmap.c index f69d5342ce7f..4c52c56c9905 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) if (pte_dirty(pteval)) set_page_dirty(page); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + if (PageAnon(page)) { swp_entry_t entry = { .val = page->private }; /* @@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, if (!pmd_present(*pmd)) goto out_unlock; + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (original_pte = pte = pte_offset_map(pmd, address); address < end; pte++, address += PAGE_SIZE) { -- cgit v1.2.3 From 7407251a0e2ed099e4b12b742b635503e981507c Mon Sep 17 00:00:00 2001 From: Coywolf Qi Hunt Date: Sun, 30 Oct 2005 15:02:47 -0800 Subject: [PATCH] PF_DEAD cleanup The PF_DEAD setting doesn't belong to exit_notify(), move it to a proper place. Signed-off-by: Coywolf Qi Hunt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 79f52b85d6ed..6ef8f7356a74 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ -- cgit v1.2.3 From 7f2a52555998c699a7e89f24636c909d6fc08a60 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Oct 2005 15:02:50 -0800 Subject: [PATCH] wait4 PTRACE_ATTACH race fix Back about a year ago when I last fiddled heavily with the do_wait code, I was thinking too hard about the wrong thing and I now think I introduced a bug whose inverse thought I was fixing. Apparently noone was looking too hard over much shoulder, so as to cite my bogus reasoning at the time. In the race condition when PTRACE_ATTACH is about to steal a child and then the child hits a tracing event (what my_ptrace_child checks for), the real parent does need to set its flag noting it has some eligible live children. Otherwise a spurious ECHILD error is possible, since the child in question is not yet on the ptrace_children list. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 6ef8f7356a74..2d39ccc367e6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1383,6 +1383,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ -- cgit v1.2.3 From b67a1b9e4bf878aa5d4b6b44cb5a251a2f425f0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:44 -0800 Subject: [PATCH] remove hardcoded SEND_SIG_xxx constants This patch replaces hardcoded SEND_SIG_xxx constants with their symbolic names. No changes in affected .o files. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- kernel/signal.c | 35 ++++++++++++++++++++--------------- security/selinux/hooks.c | 4 ++-- 3 files changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 2d39ccc367e6..537394b25e8d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) { @@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died diff --git a/kernel/signal.c b/kernel/signal.c index 9d1512dcf176..1f7b2aaa4a39 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -651,8 +651,9 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || + (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED + && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -789,7 +790,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -801,19 +802,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (info < SEND_SIG_FORCED || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -825,14 +826,15 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 + if (sig >= SIGRTMIN + && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped * the signal. @@ -858,7 +860,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -914,7 +916,7 @@ force_sig_specific(int sig, struct task_struct *t) t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; sigdelset(&t->blocked, sig); recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); + specific_send_sig_info(sig, SEND_SIG_FORCED, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -1050,7 +1052,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -1285,10 +1287,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1308,7 +1313,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1333,13 +1338,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bb62838be496..295ac472faf1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2713,8 +2713,8 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int si if (rc) return rc; - if (info && ((unsigned long)info == 1 || - (unsigned long)info == 2 || SI_FROMKERNEL(info))) + if (info != SEND_SIG_NOINFO && (info == SEND_SIG_PRIV || + info == SEND_SIG_FORCED || SI_FROMKERNEL(info))) return 0; if (!sig) -- cgit v1.2.3