From 4ece58992befbbbc83f27a7c92a0353bdff1e3b6 Mon Sep 17 00:00:00 2001 From: "Prasanna S. Panchamukhi" Date: Mon, 30 Aug 2004 20:32:21 -0700 Subject: [PATCH] kprobes base patch This patch helps developers to trap at almost any kernel code address, specifying a handler routine to be invoked when the breakpoint is hit. Useful for analysing the Linux kernel by collecting debugging information non-disruptively. Employs single-stepping out-of-line to avoid probe misses on SMP and may be especially useful in aiding debugging elusive races and problems on live systems. More elaborate dynamic tracing tools such as DProbes can be built over the kprobes interface. Helps developers to trap at almost any kernel code address, specifying a handler routine to be invoked when the breakpoint is hit. Useful for analysing the Linux kernel by collecting debugging information non-disruptively. Employs single-stepping out-of-line to avoid probe misses on SMP and may be especially useful in aiding debugging elusive races and problems on live systems. More elaborate dynamic tracing tools such as DProbes can be built over the kprobes interface. Sample usage: To place a probe on __blockdev_direct_IO: static int probe_handler(struct kprobe *p, struct pt_regs *) { ... whatever ... } struct kprobe kp = { .addr = __blockdev_direct_IO, .pre_handler = probe_handler }; register_kprobe(&kp); Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/kprobes.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 kernel/kprobes.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 47f98594e9e5..a032595fd58c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_KPROBES) += kprobes.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/kprobes.c b/kernel/kprobes.c new file mode 100644 index 000000000000..7c5fee08665e --- /dev/null +++ b/kernel/kprobes.c @@ -0,0 +1,125 @@ +/* + * Kernel Probes (KProbes) + * kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation (includes suggestions from + * Rusty Russell). + * 2004-Aug Updated by Prasanna S Panchamukhi with + * hlists and exceptions notifier as suggested by Andi Kleen. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; + +unsigned int kprobe_cpu = NR_CPUS; +static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED; + +/* Locks kprobe: irqs must be disabled */ +void lock_kprobes(void) +{ + spin_lock(&kprobe_lock); + kprobe_cpu = smp_processor_id(); +} + +void unlock_kprobes(void) +{ + kprobe_cpu = NR_CPUS; + spin_unlock(&kprobe_lock); +} + +/* You have to be holding the kprobe_lock */ +struct kprobe *get_kprobe(void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + hlist_for_each(node, head) { + struct kprobe *p = hlist_entry(node, struct kprobe, hlist); + if (p->addr == addr) + return p; + } + return NULL; +} + +int register_kprobe(struct kprobe *p) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&kprobe_lock, flags); + INIT_HLIST_NODE(&p->hlist); + if (get_kprobe(p->addr)) { + ret = -EEXIST; + goto out; + } + hlist_add_head(&p->hlist, + &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + arch_prepare_kprobe(p); + p->opcode = *p->addr; + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + out: + spin_unlock_irqrestore(&kprobe_lock, flags); + return ret; +} + +void unregister_kprobe(struct kprobe *p) +{ + unsigned long flags; + spin_lock_irqsave(&kprobe_lock, flags); + *p->addr = p->opcode; + hlist_del(&p->hlist); + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irqrestore(&kprobe_lock, flags); +} + +static struct notifier_block kprobe_exceptions_nb = { + .notifier_call = kprobe_exceptions_notify, +}; + +static int __init init_kprobes(void) +{ + int i, err = 0; + + /* FIXME allocate the probe table, currently defined statically */ + /* initialize all list heads */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&kprobe_table[i]); + + err = register_die_notifier(&kprobe_exceptions_nb); + return err; +} + +__initcall(init_kprobes); + +EXPORT_SYMBOL_GPL(register_kprobe); +EXPORT_SYMBOL_GPL(unregister_kprobe); -- cgit v1.2.3 From b08e758948d781968ba967f669761f8ee81a9fcd Mon Sep 17 00:00:00 2001 From: "Prasanna S. Panchamukhi" Date: Mon, 30 Aug 2004 20:32:33 -0700 Subject: [PATCH] Jumper Probes to provide function arguments A special kprobe type which can be placed on function entry points, and employs a simple mirroring principle to allow seamless access to the arguments of a function being probed. The probe handler routine should have the same prototype as the function being probed. Currently implemented for x86. The way it works is that when the probe is hit, the breakpoint handler simply irets to the probe handler's eip while retaining register and stack state corresponding to the function entry. After it is done, the probe handler calls jprobe_return() which traps again to restore processor state and switch back to the probed function. Linus noted correctly at KS that we need to be careful as gcc assumes that the callee owns arguments. We save and restore enough stack bytes to cover argument space. Sample Usage: static int jip_queue_xmit(struct sk_buff *skb, int ipfragok) { ... whatever ... jprobe_return(); return 0; } struct jprobe jp = { {.addr = (kprobe_opcode_t *) ip_queue_xmit}, .entry = (kprobe_opcode_t *) jip_queue_xmit }; register_jprobe(&jp); Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 78 ++++++++++++++++++++++++++++++++++++++++++++-- include/asm-i386/kprobes.h | 5 +++ include/linux/kprobes.h | 43 +++++++++++++++++++++++-- kernel/kprobes.c | 19 +++++++++++ 4 files changed, 141 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 3e050b79ef68..06197c4c7757 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -16,11 +16,13 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2002 + * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S Kernel * Probes initial implementation ( includes contributions from * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. */ #include @@ -36,6 +38,10 @@ static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; +static struct pt_regs jprobe_saved_regs; +static long *jprobe_saved_esp; +/* copy of the kernel stack at the probe fire time */ +static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; /* * returns non-zero if opcode modifies the interrupt flag. @@ -91,6 +97,11 @@ static inline int kprobe_handler(struct pt_regs *regs) if (p) { disarm_kprobe(p, regs); ret = 1; + } else { + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } } /* If it's not ours, can't be delete race, (we hold lock). */ goto no_kprobe; @@ -121,8 +132,12 @@ static inline int kprobe_handler(struct pt_regs *regs) if (is_IF_modifier(p->opcode)) kprobe_saved_eflags &= ~IF_MASK; - p->pre_handler(p, regs); + if (p->pre_handler(p, regs)) { + /* handler has already set things up, so skip ss setup */ + return 1; + } + ss_probe: prepare_singlestep(p, regs); kprobe_status = KPROBE_HIT_SS; return 1; @@ -273,3 +288,62 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, } return NOTIFY_BAD; } + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + + jprobe_saved_regs = *regs; + jprobe_saved_esp = ®s->esp; + addr = (unsigned long)jprobe_saved_esp; + + /* + * TBD: As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->eip = (unsigned long)(jp->entry); + return 1; +} + +void jprobe_return(void) +{ + preempt_enable_no_resched(); + asm volatile (" xchgl %%ebx,%%esp \n" + " int3 \n"::"b" + (jprobe_saved_esp):"memory"); +} +void jprobe_return_end(void) +{ +}; + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + u8 *addr = (u8 *) (regs->eip - 1); + unsigned long stack_addr = (unsigned long)jprobe_saved_esp; + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if (®s->esp != jprobe_saved_esp) { + struct pt_regs *saved_regs = + container_of(jprobe_saved_esp, struct pt_regs, esp); + printk("current esp %p does not match saved esp %p\n", + ®s->esp, jprobe_saved_esp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + return 1; + } + return 0; +} diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h index 27829d5540e1..566e34a19996 100644 --- a/include/asm-i386/kprobes.h +++ b/include/asm-i386/kprobes.h @@ -32,6 +32,11 @@ struct pt_regs; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc #define MAX_INSN_SIZE 16 +#define MAX_STACK_SIZE 64 +#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \ + (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \ + ? (MAX_STACK_SIZE) \ + : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) /* trap3/1 are intr gates for kprobes. So, restore the status of IF, * if necessary, before executing the original int3/1 (trap) handler. diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8da3214a0973..172b7f421f72 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -18,11 +18,13 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2002 + * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S Kernel * Probes initial implementation ( includes suggestions from * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. */ #include #include @@ -32,7 +34,8 @@ struct kprobe; struct pt_regs; -typedef void (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); +typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); +typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *); typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *, unsigned long flags); typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *, @@ -53,6 +56,10 @@ struct kprobe { * Return 1 if it handled fault, otherwise kernel will see it. */ kprobe_fault_handler_t fault_handler; + /* ... called if breakpoint trap occurs in probe handler. + * Return 1 if it handled break, otherwise kernel will see it. */ + kprobe_break_handler_t break_handler; + /* Saved opcode (which has been replaced with breakpoint) */ kprobe_opcode_t opcode; @@ -60,6 +67,21 @@ struct kprobe { kprobe_opcode_t insn[MAX_INSN_SIZE]; }; +/* + * Special probe type that uses setjmp-longjmp type tricks to resume + * execution at a specified entry with a matching prototype corresponding + * to the probed function - a trick to enable arguments to become + * accessible seamlessly by probe handling logic. + * Note: + * Because of the way compilers allocate stack space for local variables + * etc upfront, regardless of sub-scopes within a function, this mirroring + * principle currently works only for probes placed on function entry points. + */ +struct jprobe { + struct kprobe kp; + kprobe_opcode_t *entry; /* probe handling code to jump to */ +}; + #ifdef CONFIG_KPROBES /* Locks kprobe: irq must be disabled */ void lock_kprobes(void); @@ -73,12 +95,19 @@ static inline int kprobe_running(void) } extern void arch_prepare_kprobe(struct kprobe *p); +extern void show_registers(struct pt_regs *regs); /* Get the kprobe at this addr (if any). Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); +int setjmp_pre_handler(struct kprobe *, struct pt_regs *); +int longjmp_break_handler(struct kprobe *, struct pt_regs *); +int register_jprobe(struct jprobe *p); +void unregister_jprobe(struct jprobe *p); +void jprobe_return(void); + #else static inline int kprobe_running(void) { @@ -91,5 +120,15 @@ static inline int register_kprobe(struct kprobe *p) static inline void unregister_kprobe(struct kprobe *p) { } +static inline int register_jprobe(struct jprobe *p) +{ + return -ENOSYS; +} +static inline void unregister_jprobe(struct jprobe *p) +{ +} +static inline void jprobe_return(void) +{ +} #endif #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7c5fee08665e..9e3dc417c67c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -23,6 +23,8 @@ * Rusty Russell). * 2004-Aug Updated by Prasanna S Panchamukhi with * hlists and exceptions notifier as suggested by Andi Kleen. + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. */ #include #include @@ -106,6 +108,20 @@ static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, }; +int register_jprobe(struct jprobe *jp) +{ + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + + return register_kprobe(&jp->kp); +} + +void unregister_jprobe(struct jprobe *jp) +{ + unregister_kprobe(&jp->kp); +} + static int __init init_kprobes(void) { int i, err = 0; @@ -123,3 +139,6 @@ __initcall(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_jprobe); +EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(jprobe_return); -- cgit v1.2.3 From ca3f74aa7baa11482435275c41ba323e3e2ec7b6 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 30 Aug 2004 20:35:25 -0700 Subject: [PATCH] waitid system call This patch adds a new system call `waitid'. This is a new POSIX call that subsumes the rest of the wait* family and can do some things the older calls cannot. A minor addition is the ability to select what kinds of status to check for with a mask of independent bits, so you can wait for just stops and not terminations, for example. A more significant improvement is the WNOWAIT flag, which allows for polling child status without reaping. This interface fills in a siginfo_t with the same details that a SIGCHLD for the status change has; some of that info (e.g. si_uid) is not available via wait4 or other calls. I've added a new system call that has the parameter conventions of the POSIX function because that seems like the cleanest thing. This patch includes the actual system call table additions for i386 and x86-64; other architectures will need to assign the system call number, and 64-bit ones may need to implement 32-bit compat support for it as I did for x86-64. The new features could instead be provided by some new kludge inventions in the wait4 system call interface (that's what BSD did). If kludges are preferable to adding a system call, I can work up something different. I added a struct rusage field si_rusage to siginfo_t in the SIGCHLD case (this does not affect the size or layout of the struct). This is not part of the POSIX interface, but it makes it so that `waitid' subsumes all the functionality of `wait4'. Future kernel ABIs (new arch's or whatnot) can have only the `waitid' system call and the rest of the wait* family including wait3 and wait4 can be implemented in user space using waitid. There is nothing in user space as yet that would make use of the new field. Most of the new functionality is implemented purely in the waitid system call itself. POSIX also provides for the WCONTINUED flag to report when a child process had been stopped by job control and then resumed with SIGCONT. Corresponding to this, a SIGCHLD is now generated when a child resumes (unless SA_NOCLDSTOP is set), with the value CLD_CONTINUED in siginfo_t.si_code. To implement this, some additional bookkeeping is required in the signal code handling job control stops. The motivation for this work is to make it possible to implement the POSIX semantics of the `waitid' function in glibc completely and correctly. If changing either the system call interface used to accomplish that, or any details of the kernel implementation work, would improve the chances of getting this incorporated, I am more than happy to work through any issues. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/entry.S | 1 + arch/x86_64/ia32/ia32_signal.c | 2 + arch/x86_64/ia32/ia32entry.S | 1 + arch/x86_64/ia32/sys_ia32.c | 19 +++ include/asm-generic/siginfo.h | 3 + include/asm-i386/unistd.h | 3 +- include/asm-ia64/siginfo.h | 1 + include/asm-x86_64/ia32.h | 1 + include/asm-x86_64/ia32_unistd.h | 3 +- include/asm-x86_64/unistd.h | 4 +- include/linux/compat.h | 2 + include/linux/sched.h | 2 + include/linux/syscalls.h | 2 + include/linux/wait.h | 9 ++ kernel/compat.c | 6 +- kernel/exit.c | 241 +++++++++++++++++++++++++++++++++++---- kernel/signal.c | 41 ++++++- kernel/sys.c | 50 ++++---- 18 files changed, 336 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index a3a571771f14..0504da0ce432 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -900,5 +900,6 @@ ENTRY(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ + .long sys_waitid syscall_table_size=(.-sys_call_table) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index b5b77dbf3b40..99b0f71e70c0 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -74,6 +74,8 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); + err |= put_compat_rusage(&from->si_rusage, + &to->si_rusage); default: case __SI_KILL >> 16: err |= __put_user(from->si_uid, &to->si_uid); diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 9fd8d5b3471d..7acd7963b9db 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -586,6 +586,7 @@ ia32_sys_call_table: .quad compat_sys_mq_notify .quad compat_sys_mq_getsetattr .quad quiet_ni_syscall /* reserved for kexec */ + .quad sys32_waitid /* don't forget to change IA32_NR_syscalls */ ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 6a33dbb93b71..96b0c653e216 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -1151,6 +1151,25 @@ asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +asmlinkage long sys32_waitid(int which, compat_pid_t pid, + siginfo_t32 __user *uinfo, int options) +{ + siginfo_t info; + long ret; + mm_segment_t old_fs = get_fs(); + + info.si_signo = 0; + set_fs (KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *) &info, options); + set_fs (old_fs); + + if (ret < 0 || info.si_signo == 0) + return ret; + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return ia32_copy_siginfo_to_user(uinfo, &info); +} + /* * Some system calls that need sign extended arguments. This could be done by a generic wrapper. */ diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index f757d2d51e7f..eaacc06baec3 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -3,6 +3,7 @@ #include #include +#include typedef union sigval { int sival_int; @@ -74,6 +75,7 @@ typedef struct siginfo { int _status; /* exit code */ clock_t _utime; clock_t _stime; + struct rusage _rusage; } _sigchld; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ @@ -105,6 +107,7 @@ typedef struct siginfo { #define si_status _sifields._sigchld._status #define si_utime _sifields._sigchld._utime #define si_stime _sifields._sigchld._stime +#define si_rusage _sifields._sigchld._rusage #define si_value _sifields._rt._sigval #define si_int _sifields._rt._sigval.sival_int #define si_ptr _sifields._rt._sigval.sival_ptr diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index ef936b861404..be8c6ac56c7c 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -289,8 +289,9 @@ #define __NR_mq_notify (__NR_mq_open+4) #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_sys_kexec_load 283 +#define __NR_waitid 284 -#define NR_syscalls 284 +#define NR_syscalls 285 /* user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/asm-ia64/siginfo.h b/include/asm-ia64/siginfo.h index 52cce1d3dad8..cdaff790197f 100644 --- a/include/asm-ia64/siginfo.h +++ b/include/asm-ia64/siginfo.h @@ -56,6 +56,7 @@ typedef struct siginfo { int _status; /* exit code */ clock_t _utime; clock_t _stime; + struct rusage _rusage; } _sigchld; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h index 73319a9a8ca5..e5fb97969428 100644 --- a/include/asm-x86_64/ia32.h +++ b/include/asm-x86_64/ia32.h @@ -115,6 +115,7 @@ typedef struct siginfo32 { int _status; /* exit code */ compat_clock_t _utime; compat_clock_t _stime; + struct compat_rusage _rusage; } _sigchld; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h index ff0309c2ecf4..26e56c27fe18 100644 --- a/include/asm-x86_64/ia32_unistd.h +++ b/include/asm-x86_64/ia32_unistd.h @@ -289,7 +289,8 @@ #define __NR_ia32_mq_notify (__NR_ia32_mq_open+4) #define __NR_ia32_mq_getsetattr (__NR_ia32_mq_open+5) #define __NR_ia32_kexec 283 +#define __NR_ia32_waitid 284 -#define IA32_NR_syscalls 287 /* must be > than biggest syscall! */ +#define IA32_NR_syscalls 285 /* must be > than biggest syscall! */ #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 26e0aa30b534..ae066360d597 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -554,8 +554,10 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify) __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) #define __NR_kexec_load 246 __SYSCALL(__NR_kexec_load, sys_ni_syscall) +#define __NR_waitid (253) +__SYSCALL(__NR_waitid, sys_waitid) -#define __NR_syscall_max __NR_kexec_load +#define __NR_syscall_max __NR_waitid #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/compat.h b/include/linux/compat.h index e45c0f905451..5e58d705ac7c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -79,6 +79,8 @@ struct compat_rusage { compat_long_t ru_nivcsw; }; +extern int put_compat_rusage(const struct rusage *, struct compat_rusage __user *); + struct compat_dirent { u32 d_ino; compat_off_t d_off; diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a36441e2fd7..f89a32a4bf48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -287,6 +287,8 @@ struct signal_struct { /* thread group stop support, overloads group_exit_code too */ int group_stop_count; + /* 1 if group stopped since last SIGCONT, -1 if SIGCONT since report */ + int stop_state; /* POSIX.1b Interval Timers */ struct list_head posix_timers; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fadfc6d7a87e..5ff82aafa59e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -162,6 +162,8 @@ asmlinkage long sys_exit(int error_code); asmlinkage void sys_exit_group(int error_code); asmlinkage long sys_wait4(pid_t pid, unsigned int __user *stat_addr, int options, struct rusage __user *ru); +asmlinkage long sys_waitid(int which, pid_t pid, + struct siginfo __user *infop, int options); asmlinkage long sys_waitpid(pid_t pid, unsigned int __user *stat_addr, int options); asmlinkage long sys_set_tid_address(int __user *tidptr); asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, diff --git a/include/linux/wait.h b/include/linux/wait.h index 98c8a441befa..4417f800a639 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -3,11 +3,20 @@ #define WNOHANG 0x00000001 #define WUNTRACED 0x00000002 +#define WSTOPPED WUNTRACED +#define WEXITED 0x00000004 +#define WCONTINUED 0x00000008 +#define WNOWAIT 0x01000000 /* Don't reap, just poll status. */ #define __WNOTHREAD 0x20000000 /* Don't wait on children of other threads in this group */ #define __WALL 0x40000000 /* Wait on all children, regardless of type */ #define __WCLONE 0x80000000 /* Wait only on non-SIGCHLD children */ +/* First argument to waitid: */ +#define P_ALL 0 +#define P_PID 1 +#define P_PGID 2 + #ifdef __KERNEL__ #include diff --git a/kernel/compat.c b/kernel/compat.c index 481ac0d4bb98..5d0a92498dea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -310,7 +310,7 @@ asmlinkage long compat_sys_getrlimit (unsigned int resource, return ret; } -static long put_compat_rusage(struct compat_rusage __user *ru, struct rusage *r) +int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || @@ -348,7 +348,7 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) if (ret) return ret; - if (put_compat_rusage(ru, &r)) + if (put_compat_rusage(&r, ru)) return -EFAULT; return 0; @@ -374,7 +374,7 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, set_fs (old_fs); if (ret > 0) { - if (put_compat_rusage(ru, &r)) + if (put_compat_rusage(&r, ru)) return -EFAULT; if (stat_addr && put_user(status, stat_addr)) return -EFAULT; diff --git a/kernel/exit.c b/kernel/exit.c index c6ceaaee8a2e..a8c98b4880aa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -957,16 +957,64 @@ static int eligible_child(pid_t pid, int options, task_t *p) return 1; } +static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, + int why, int status, + struct siginfo __user *infop) +{ + int retval = getrusage(p, RUSAGE_BOTH, &infop->si_rusage); + put_task_struct(p); + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + if (!retval) + retval = pid; + return retval; +} + /* * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct rusage __user *ru) +static int wait_task_zombie(task_t *p, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) { unsigned long state; int retval; + int status; + + if (unlikely(noreap)) { + pid_t pid = p->pid; + uid_t uid = p->uid; + int exit_code = p->exit_code; + int why, status; + + if (unlikely(p->state != TASK_ZOMBIE)) + return 0; + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) + return 0; + get_task_struct(p); + read_unlock(&tasklist_lock); + if ((exit_code & 0x7f) == 0) { + why = CLD_EXITED; + status = exit_code >> 8; + } else { + why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + status = exit_code & 0x7f; + } + return wait_noreap_copyout(p, pid, uid, why, status, infop); + } /* * Try to move the task's state to DEAD @@ -977,12 +1025,13 @@ static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct ru BUG_ON(state != TASK_DEAD); return 0; } - if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { /* * This can only happen in a race with a ptraced thread * dying on another processor. */ return 0; + } /* * Now we are sure this task is interesting, and no other @@ -991,12 +1040,32 @@ static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct ru read_unlock(&tasklist_lock); retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - if (!retval && stat_addr) { - if (p->signal->group_exit) - retval = put_user(p->signal->group_exit_code, stat_addr); - else - retval = put_user(p->exit_code, stat_addr); + status = p->signal->group_exit + ? p->signal->group_exit_code : p->exit_code; + if (!retval && stat_addr) + retval = put_user(status, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) { + int why; + + if ((status & 0x7f) == 0) { + why = CLD_EXITED; + status >>= 8; + } else { + why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + status &= 0x7f; + } + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(status, &infop->si_status); } + if (!retval && infop) + retval = put_user(p->pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); if (retval) { p->state = TASK_ZOMBIE; return retval; @@ -1009,8 +1078,9 @@ static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct ru __ptrace_unlink(p); p->state = TASK_ZOMBIE; /* - * If this is not a detached task, notify the parent. If it's - * still not detached after that, don't release it now. + * If this is not a detached task, notify the parent. + * If it's still not detached after that, don't release + * it now. */ if (p->exit_signal != -1) { do_notify_parent(p, p->exit_signal); @@ -1032,9 +1102,9 @@ static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct ru * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(task_t *p, int delayed_group_leader, - unsigned int __user *stat_addr, - struct rusage __user *ru) +static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code; @@ -1057,6 +1127,21 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, */ get_task_struct(p); read_unlock(&tasklist_lock); + + if (unlikely(noreap)) { + pid_t pid = p->pid; + uid_t uid = p->uid; + int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + + exit_code = p->exit_code; + if (unlikely(!exit_code) || + unlikely(p->state > TASK_STOPPED)) + goto bail_ref; + return wait_noreap_copyout(p, pid, uid, + why, (exit_code << 8) | 0x7f, + infop); + } + write_lock_irq(&tasklist_lock); /* @@ -1082,6 +1167,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, * resumed, or it resumed and then died. */ write_unlock_irq(&tasklist_lock); +bail_ref: put_task_struct(p); read_lock(&tasklist_lock); return 0; @@ -1096,6 +1182,20 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) + retval = put_user((short)((p->ptrace & PT_PTRACED) + ? CLD_TRAPPED : CLD_STOPPED), + &infop->si_code); + if (!retval && infop) + retval = put_user(exit_code, &infop->si_status); + if (!retval && infop) + retval = put_user(p->pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); if (!retval) retval = p->pid; put_task_struct(p); @@ -1104,15 +1204,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, return retval; } -asmlinkage long sys_wait4(pid_t pid,unsigned int __user *stat_addr, int options, struct rusage __user *ru) +static long do_wait(pid_t pid, int options, struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; - if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) - return -EINVAL; - add_wait_queue(¤t->wait_chldexit,&wait); repeat: flag = 0; @@ -1138,25 +1236,60 @@ repeat: !(p->ptrace & PT_PTRACED)) continue; retval = wait_task_stopped(p, ret == 2, + (options & WNOWAIT), + infop, stat_addr, ru); if (retval != 0) /* He released the lock. */ - goto end_wait4; + goto end; break; case TASK_ZOMBIE: /* * Eligible but we cannot release it yet: */ if (ret == 2) + goto check_continued; + if (!likely(options & WEXITED)) continue; - retval = wait_task_zombie(p, stat_addr, ru); + retval = wait_task_zombie( + p, (options & WNOWAIT), + infop, stat_addr, ru); if (retval != 0) /* He released the lock. */ - goto end_wait4; + goto end; + break; + case TASK_DEAD: + continue; + default: +check_continued: + if (!unlikely(options & WCONTINUED)) + continue; + if (unlikely(!p->signal)) + continue; + spin_lock_irq(&p->sighand->siglock); + if (p->signal->stop_state < 0) { + pid_t pid; + uid_t uid; + + if (!(options & WNOWAIT)) + p->signal->stop_state = 0; + spin_unlock_irq(&p->sighand->siglock); + pid = p->pid; + uid = p->uid; + get_task_struct(p); + read_unlock(&tasklist_lock); + retval = wait_noreap_copyout(p, pid, + uid, CLD_CONTINUED, + SIGCONT, infop); + BUG_ON(retval == 0); + goto end; + } + spin_unlock_irq(&p->sighand->siglock); break; } } if (!flag) { - list_for_each (_p,&tsk->ptrace_children) { - p = list_entry(_p,struct task_struct,ptrace_list); + list_for_each(_p, &tsk->ptrace_children) { + p = list_entry(_p, struct task_struct, + ptrace_list); if (!eligible_child(pid, options, p)) continue; flag = 1; @@ -1169,24 +1302,84 @@ repeat: if (tsk->signal != current->signal) BUG(); } while (tsk != current); + read_unlock(&tasklist_lock); if (flag) { retval = 0; if (options & WNOHANG) - goto end_wait4; + goto end; retval = -ERESTARTSYS; if (signal_pending(current)) - goto end_wait4; + goto end; schedule(); goto repeat; } retval = -ECHILD; -end_wait4: +end: current->state = TASK_RUNNING; remove_wait_queue(¤t->wait_chldexit,&wait); + if (infop) { + if (retval > 0) + retval = 0; + else { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!retval) + retval = put_user(0, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user(0, &infop->si_code); + if (!retval) + retval = put_user(0, &infop->si_pid); + if (!retval) + retval = put_user(0, &infop->si_uid); + if (!retval) + retval = put_user(0, &infop->si_status); + } + } return retval; } +asmlinkage long sys_waitid(int which, pid_t pid, + struct siginfo __user *infop, int options) +{ + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + switch (which) { + case P_ALL: + pid = -1; + break; + case P_PID: + if (pid <= 0) + return -EINVAL; + break; + case P_PGID: + if (pid <= 0) + return -EINVAL; + pid = -pid; + break; + default: + return -EINVAL; + } + + return do_wait(pid, options, infop, NULL, &infop->si_rusage); +} + +asmlinkage long sys_wait4(pid_t pid, unsigned int *stat_addr, + int options, struct rusage *ru) +{ + if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) + return -EINVAL; + return do_wait(pid, options | WEXITED, NULL, stat_addr, ru); +} + #ifdef __ARCH_WANT_SYS_WAITPID /* diff --git a/kernel/signal.c b/kernel/signal.c index e5d6cbc50c1e..300912082f99 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -26,6 +26,8 @@ #include #include +extern void k_getrusage(struct task_struct *, int, struct rusage *); + /* * SLAB caches for signal bits. */ @@ -660,6 +662,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) * the SIGCHLD was pending on entry to this kill. */ p->signal->group_stop_count = 0; + p->signal->stop_state = 1; if (p->ptrace & PT_PTRACED) do_notify_parent_cldstop(p, p->parent); else @@ -696,6 +699,21 @@ static void handle_stop_signal(int sig, struct task_struct *p) t = next_thread(t); } while (t != p); + + if (p->signal->stop_state > 0) { + /* + * We were in fact stopped, and are now continued. + * Notify the parent with CLD_CONTINUED. + */ + p->signal->stop_state = -1; + p->signal->group_exit_code = 0; + if (p->ptrace & PT_PTRACED) + do_notify_parent_cldstop(p, p->parent); + else + do_notify_parent_cldstop( + p->group_leader, + p->group_leader->real_parent); + } } } @@ -1466,6 +1484,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) /* FIXME: find out whether or not this is supposed to be c*time. */ info.si_utime = tsk->utime; info.si_stime = tsk->stime; + k_getrusage(tsk, RUSAGE_BOTH, &info.si_rusage); status = tsk->exit_code & 0x7f; why = SI_KERNEL; /* shouldn't happen */ @@ -1555,9 +1574,16 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent) /* FIXME: find out whether or not this is supposed to be c*time. */ info.si_utime = tsk->utime; info.si_stime = tsk->stime; + k_getrusage(tsk, RUSAGE_BOTH, &info.si_rusage); - info.si_status = tsk->exit_code & 0x7f; - info.si_code = CLD_STOPPED; + info.si_status = (tsk->signal ? tsk->signal->group_exit_code : + tsk->exit_code) & 0x7f; + if (info.si_status == 0) { + info.si_status = SIGCONT; + info.si_code = CLD_CONTINUED; + } else { + info.si_code = CLD_STOPPED; + } sighand = parent->sighand; spin_lock_irqsave(&sighand->siglock, flags); @@ -1623,14 +1649,17 @@ do_signal_stop(int signr) stop_count = --sig->group_stop_count; current->exit_code = signr; set_current_state(TASK_STOPPED); + if (stop_count == 0) + sig->stop_state = 1; spin_unlock_irq(&sighand->siglock); } else if (thread_group_empty(current)) { /* * Lock must be held through transition to stopped state. */ - current->exit_code = signr; + current->exit_code = current->signal->group_exit_code = signr; set_current_state(TASK_STOPPED); + sig->stop_state = 1; spin_unlock_irq(&sighand->siglock); } else { @@ -1696,6 +1725,8 @@ do_signal_stop(int signr) current->exit_code = signr; set_current_state(TASK_STOPPED); + if (stop_count == 0) + sig->stop_state = 1; spin_unlock_irq(&sighand->siglock); read_unlock(&tasklist_lock); @@ -1736,6 +1767,8 @@ static inline int handle_group_stop(void) * without any associated signal being in our queue. */ stop_count = --current->signal->group_stop_count; + if (stop_count == 0) + current->signal->stop_state = 1; current->exit_code = current->signal->group_exit_code; set_current_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); @@ -2098,6 +2131,8 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); + err |= __copy_to_user(&to->si_rusage, &from->si_rusage, + sizeof(to->si_rusage)); break; case __SI_RT: /* This is not generated by the kernel as of now. */ case __SI_MESGQ: /* But this is */ diff --git a/kernel/sys.c b/kernel/sys.c index 1bbc66a60b8c..28c5555e6352 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1540,37 +1540,43 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * reaped till shortly after the call to getrusage(), in both cases the * task being examined is in a frozen state so the counters won't change. */ -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -{ - struct rusage r; - memset((char *) &r, 0, sizeof(r)); + +void k_getrusage(struct task_struct *p, int who, struct rusage *r) +{ + memset((char *) r, 0, sizeof *r); switch (who) { case RUSAGE_SELF: - jiffies_to_timeval(p->utime, &r.ru_utime); - jiffies_to_timeval(p->stime, &r.ru_stime); - r.ru_nvcsw = p->nvcsw; - r.ru_nivcsw = p->nivcsw; - r.ru_minflt = p->min_flt; - r.ru_majflt = p->maj_flt; + jiffies_to_timeval(p->utime, &r->ru_utime); + jiffies_to_timeval(p->stime, &r->ru_stime); + r->ru_nvcsw = p->nvcsw; + r->ru_nivcsw = p->nivcsw; + r->ru_minflt = p->min_flt; + r->ru_majflt = p->maj_flt; break; case RUSAGE_CHILDREN: - jiffies_to_timeval(p->cutime, &r.ru_utime); - jiffies_to_timeval(p->cstime, &r.ru_stime); - r.ru_nvcsw = p->cnvcsw; - r.ru_nivcsw = p->cnivcsw; - r.ru_minflt = p->cmin_flt; - r.ru_majflt = p->cmaj_flt; + jiffies_to_timeval(p->cutime, &r->ru_utime); + jiffies_to_timeval(p->cstime, &r->ru_stime); + r->ru_nvcsw = p->cnvcsw; + r->ru_nivcsw = p->cnivcsw; + r->ru_minflt = p->cmin_flt; + r->ru_majflt = p->cmaj_flt; break; default: - jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime); - jiffies_to_timeval(p->stime + p->cstime, &r.ru_stime); - r.ru_nvcsw = p->nvcsw + p->cnvcsw; - r.ru_nivcsw = p->nivcsw + p->cnivcsw; - r.ru_minflt = p->min_flt + p->cmin_flt; - r.ru_majflt = p->maj_flt + p->cmaj_flt; + jiffies_to_timeval(p->utime + p->cutime, &r->ru_utime); + jiffies_to_timeval(p->stime + p->cstime, &r->ru_stime); + r->ru_nvcsw = p->nvcsw + p->cnvcsw; + r->ru_nivcsw = p->nivcsw + p->cnivcsw; + r->ru_minflt = p->min_flt + p->cmin_flt; + r->ru_majflt = p->maj_flt + p->cmaj_flt; break; } +} + +int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +{ + struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } -- cgit v1.2.3 From 497c9d6819e984f737ff51489258f213e193d52e Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 30 Aug 2004 20:35:38 -0700 Subject: [PATCH] fix rusage semantics This patch changes the rusage bookkeeping and the semantics of the getrusage and times calls in a couple of ways. The first change is in the c* fields counting dead child processes. POSIX requires that children that have died be counted in these fields when they are reaped by a wait* call, and that if they are never reaped (e.g. because of ignoring SIGCHLD, or exitting yourself first) then they are never counted. These were counted in release_task for all threads. I've changed it so they are counted in wait_task_zombie, i.e. exactly when being reaped. POSIX also specifies for RUSAGE_CHILDREN that the report include the reaped child processes of the calling process, i.e. whole thread group in Linux, not just ones forked by the calling thread. POSIX specifies tms_c[us]time fields in the times call the same way. I've moved the c* fields that contain this information into signal_struct, where the single set of counters accumulates data from any thread in the group that calls wait*. Finally, POSIX specifies getrusage and times as returning cumulative totals for the whole process (aka thread group), not just the calling thread. I've added fields in signal_struct to accumulate the stats of detached threads as they die. The process stats are the sums of these records plus the stats of remaining each live/zombie thread. The times and getrusage calls, and the internal uses for filling in wait4 results and siginfo_t, now iterate over the threads in the thread group and sum up their stats along with the stats recorded for threads already dead and gone. I added a new value RUSAGE_GROUP (-3) for the getrusage system call rather than changing the behavior of the old RUSAGE_SELF (0). POSIX specifies RUSAGE_SELF to mean all threads, so the glibc getrusage call will just translate it to RUSAGE_GROUP for new kernels. I did this thinking that someone somewhere might want the old behavior with an old glibc and a new kernel (it is only different if they are using CLONE_THREAD anyway). However, I've changed the times system call to conform to POSIX as well and did not provide any backward compatibility there. In that case there is nothing easy like a parameter value to use, it would have to be a new system call number. That seems pretty pointless. Given that, I wonder if it is worth bothering to preserve the compatible RUSAGE_SELF behavior by introducing RUSAGE_GROUP instead of just changing RUSAGE_SELF's meaning. Comments? I've done some basic testing on x86 and x86-64, and all the numbers come out right after these fixes. (I have a test program that shows a few Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/osf_sys.c | 16 ++---- arch/mips/kernel/sysirix.c | 4 +- fs/binfmt_elf.c | 25 +++++++-- fs/proc/array.c | 13 +++-- include/linux/resource.h | 1 + include/linux/sched.h | 16 ++++-- kernel/compat.c | 37 +++++++++++-- kernel/exit.c | 38 +++++++++++--- kernel/fork.c | 8 +-- kernel/signal.c | 28 ++++++++-- kernel/sys.c | 124 +++++++++++++++++++++++++++++++++++--------- 11 files changed, 243 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 4761ed9a44c2..b45caf6aff9c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1079,18 +1079,10 @@ osf_getrusage(int who, struct rusage32 __user *ru) r.ru_majflt = current->maj_flt; break; case RUSAGE_CHILDREN: - jiffies_to_timeval32(current->cutime, &r.ru_utime); - jiffies_to_timeval32(current->cstime, &r.ru_stime); - r.ru_minflt = current->cmin_flt; - r.ru_majflt = current->cmaj_flt; - break; - default: - jiffies_to_timeval32(current->utime + current->cutime, - &r.ru_utime); - jiffies_to_timeval32(current->stime + current->cstime, - &r.ru_stime); - r.ru_minflt = current->min_flt + current->cmin_flt; - r.ru_majflt = current->maj_flt + current->cmaj_flt; + jiffies_to_timeval32(current->signal->cutime, &r.ru_utime); + jiffies_to_timeval32(current->signal->cstime, &r.ru_stime); + r.ru_minflt = current->signal->cmin_flt; + r.ru_majflt = current->signal->cmaj_flt; break; } diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 494d1872df32..392c73853fa6 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -810,8 +810,8 @@ asmlinkage int irix_times(struct tms * tbuf) return err; err |= __put_user(current->utime, &tbuf->tms_utime); err |= __put_user(current->stime, &tbuf->tms_stime); - err |= __put_user(current->cutime, &tbuf->tms_cutime); - err |= __put_user(current->cstime, &tbuf->tms_cstime); + err |= __put_user(current->signal->cutime, &tbuf->tms_cutime); + err |= __put_user(current->signal->cstime, &tbuf->tms_cstime); } return err; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 717da940fbfa..0c8ab2a51056 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1179,10 +1179,27 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); prstatus->pr_sid = p->signal->session; - jiffies_to_timeval(p->utime, &prstatus->pr_utime); - jiffies_to_timeval(p->stime, &prstatus->pr_stime); - jiffies_to_timeval(p->cutime, &prstatus->pr_cutime); - jiffies_to_timeval(p->cstime, &prstatus->pr_cstime); + if (p->pid == p->tgid) { + /* + * This is the record for the group leader. Add in the + * cumulative times of previous dead threads. This total + * won't include the time of each live thread whose state + * is included in the core dump. The final total reported + * to our parent process when it calls wait4 will include + * those sums as well as the little bit more time it takes + * this and each other thread to finish dying after the + * core dump synchronization phase. + */ + jiffies_to_timeval(p->utime + p->signal->utime, + &prstatus->pr_utime); + jiffies_to_timeval(p->stime + p->signal->stime, + &prstatus->pr_stime); + } else { + jiffies_to_timeval(p->utime, &prstatus->pr_utime); + jiffies_to_timeval(p->stime, &prstatus->pr_stime); + } + jiffies_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + jiffies_to_timeval(p->signal->cstime, &prstatus->pr_cstime); } static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, diff --git a/fs/proc/array.c b/fs/proc/array.c index 6fbe586a5b8b..40c77fe3ad30 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -310,6 +310,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) int num_threads = 0; struct mm_struct *mm; unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0, cutime = 0, cstime = 0; char tcomm[sizeof(task->comm)]; state = *get_task_state(task); @@ -340,6 +341,10 @@ int proc_pid_stat(struct task_struct *task, char * buffer) } pgid = process_group(task); sid = task->signal->session; + cmin_flt = task->signal->cmin_flt; + cmaj_flt = task->signal->cmaj_flt; + cutime = task->signal->cutime; + cstime = task->signal->cstime; } read_unlock(&tasklist_lock); @@ -368,13 +373,13 @@ int proc_pid_stat(struct task_struct *task, char * buffer) tty_pgrp, task->flags, task->min_flt, - task->cmin_flt, + cmin_flt, task->maj_flt, - task->cmaj_flt, + cmaj_flt, jiffies_to_clock_t(task->utime), jiffies_to_clock_t(task->stime), - jiffies_to_clock_t(task->cutime), - jiffies_to_clock_t(task->cstime), + jiffies_to_clock_t(cutime), + jiffies_to_clock_t(cstime), priority, nice, num_threads, diff --git a/include/linux/resource.h b/include/linux/resource.h index 21a86cb6acdb..3c848324df44 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -17,6 +17,7 @@ #define RUSAGE_SELF 0 #define RUSAGE_CHILDREN (-1) #define RUSAGE_BOTH (-2) /* sys_wait4() uses this */ +#define RUSAGE_GROUP (-3) /* thread group sum + dead threads */ struct rusage { struct timeval ru_utime; /* user time used */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f89a32a4bf48..3aaf8e5f2216 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -301,6 +301,16 @@ struct signal_struct { int leader; struct tty_struct *tty; /* NULL if no tty */ + + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + unsigned long utime, stime, cutime, cstime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; }; /* @@ -495,11 +505,11 @@ struct task_struct { unsigned long it_real_value, it_prof_value, it_virt_value; unsigned long it_real_incr, it_prof_incr, it_virt_incr; struct timer_list real_timer; - unsigned long utime, stime, cutime, cstime; - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ + unsigned long utime, stime; + unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long min_flt, maj_flt; /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; diff --git a/kernel/compat.c b/kernel/compat.c index 5d0a92498dea..8bfae57e7d66 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -160,10 +160,39 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) */ if (tbuf) { struct compat_tms tmp; - tmp.tms_utime = compat_jiffies_to_clock_t(current->utime); - tmp.tms_stime = compat_jiffies_to_clock_t(current->stime); - tmp.tms_cutime = compat_jiffies_to_clock_t(current->cutime); - tmp.tms_cstime = compat_jiffies_to_clock_t(current->cstime); + struct task_struct *tsk = current; + struct task_struct *t; + unsigned long utime, stime, cutime, cstime; + + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime += t->utime; + stime += t->stime; + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + tmp.tms_utime = compat_jiffies_to_clock_t(utime); + tmp.tms_stime = compat_jiffies_to_clock_t(stime); + tmp.tms_cutime = compat_jiffies_to_clock_t(cutime); + tmp.tms_cstime = compat_jiffies_to_clock_t(cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index a8c98b4880aa..494f97d0d059 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -89,12 +89,6 @@ repeat: zap_leader = (leader->exit_signal == -1); } - p->parent->cutime += p->utime + p->cutime; - p->parent->cstime += p->stime + p->cstime; - p->parent->cmin_flt += p->min_flt + p->cmin_flt; - p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; - p->parent->cnvcsw += p->nvcsw + p->cnvcsw; - p->parent->cnivcsw += p->nivcsw + p->cnivcsw; sched_exit(p); write_unlock_irq(&tasklist_lock); spin_unlock(&p->proc_lock); @@ -1033,6 +1027,38 @@ static int wait_task_zombie(task_t *p, int noreap, return 0; } + if (likely(p->real_parent == p->parent) && likely(p->signal)) { + /* + * The resource counters for the group leader are in its + * own task_struct. Those for dead threads in the group + * are in its signal_struct, as are those for the child + * processes it has previously reaped. All these + * accumulate in the parent's signal_struct c* fields. + * + * We don't bother to take a lock here to protect these + * p->signal fields, because they are only touched by + * __exit_signal, which runs with tasklist_lock + * write-locked anyway, and so is excluded here. We do + * need to protect the access to p->parent->signal fields, + * as other threads in the parent group can be right + * here reaping other children at the same time. + */ + spin_lock_irq(&p->parent->sighand->siglock); + p->parent->signal->cutime += + p->utime + p->signal->utime + p->signal->cutime; + p->parent->signal->cstime += + p->stime + p->signal->stime + p->signal->cstime; + p->parent->signal->cmin_flt += + p->min_flt + p->signal->min_flt + p->signal->cmin_flt; + p->parent->signal->cmaj_flt += + p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; + p->parent->signal->cnvcsw += + p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; + p->parent->signal->cnivcsw += + p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; + spin_unlock_irq(&p->parent->sighand->siglock); + } + /* * Now we are sure this task is interesting, and no other * thread can reap it because we set its state to TASK_DEAD. diff --git a/kernel/fork.c b/kernel/fork.c index fed7a0772660..76842cca81c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -559,8 +559,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) int retval; tsk->min_flt = tsk->maj_flt = 0; - tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; + tsk->nvcsw = tsk->nivcsw = 0; tsk->mm = NULL; tsk->active_mm = NULL; @@ -867,6 +866,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; + sig->utime = sig->stime = sig->cutime = sig->cstime = 0; + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + return 0; } @@ -986,7 +989,6 @@ static task_t *copy_process(unsigned long clone_flags, p->real_timer.data = (unsigned long) p; p->utime = p->stime = 0; - p->cutime = p->cstime = 0; p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; diff --git a/kernel/signal.c b/kernel/signal.c index 300912082f99..2dec2c91ed6e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -369,6 +369,22 @@ void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); tsk->signal = NULL; + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime += tsk->utime; + sig->stime += tsk->stime; + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } @@ -663,12 +679,14 @@ static void handle_stop_signal(int sig, struct task_struct *p) */ p->signal->group_stop_count = 0; p->signal->stop_state = 1; + spin_unlock(&p->sighand->siglock); if (p->ptrace & PT_PTRACED) do_notify_parent_cldstop(p, p->parent); else do_notify_parent_cldstop( p->group_leader, p->group_leader->real_parent); + spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); t = p; @@ -707,12 +725,14 @@ static void handle_stop_signal(int sig, struct task_struct *p) */ p->signal->stop_state = -1; p->signal->group_exit_code = 0; + spin_unlock(&p->sighand->siglock); if (p->ptrace & PT_PTRACED) do_notify_parent_cldstop(p, p->parent); else do_notify_parent_cldstop( p->group_leader, p->group_leader->real_parent); + spin_lock(&p->sighand->siglock); } } } @@ -1473,8 +1493,8 @@ void do_notify_parent(struct task_struct *tsk, int sig) if (sig == -1) BUG(); - BUG_ON(tsk->group_leader != tsk && tsk->group_leader->state != TASK_ZOMBIE && !tsk->ptrace); - BUG_ON(tsk->group_leader == tsk && !thread_group_empty(tsk) && !tsk->ptrace); + BUG_ON(!tsk->ptrace && + (tsk->group_leader != tsk || !thread_group_empty(tsk))); info.si_signo = sig; info.si_errno = 0; @@ -1482,8 +1502,8 @@ void do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = tsk->utime; - info.si_stime = tsk->stime; + info.si_utime = tsk->utime + tsk->signal->utime; + info.si_stime = tsk->stime + tsk->signal->stime; k_getrusage(tsk, RUSAGE_BOTH, &info.si_rusage); status = tsk->exit_code & 0x7f; diff --git a/kernel/sys.c b/kernel/sys.c index 28c5555e6352..de206f3df466 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -947,10 +947,39 @@ asmlinkage long sys_times(struct tms __user * tbuf) */ if (tbuf) { struct tms tmp; - tmp.tms_utime = jiffies_to_clock_t(current->utime); - tmp.tms_stime = jiffies_to_clock_t(current->stime); - tmp.tms_cutime = jiffies_to_clock_t(current->cutime); - tmp.tms_cstime = jiffies_to_clock_t(current->cstime); + struct task_struct *tsk = current; + struct task_struct *t; + unsigned long utime, stime, cutime, cstime; + + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime += t->utime; + stime += t->stime; + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + tmp.tms_utime = jiffies_to_clock_t(utime); + tmp.tms_stime = jiffies_to_clock_t(stime); + tmp.tms_cutime = jiffies_to_clock_t(cutime); + tmp.tms_cstime = jiffies_to_clock_t(cstime); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1533,18 +1562,29 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * - * This is SMP safe. Either we are called from sys_getrusage on ourselves - * below (we know we aren't going to exit/disappear and only we change our - * rusage counters), or we are called from wait4() on a process which is - * either stopped or zombied. In the zombied case the task won't get - * reaped till shortly after the call to getrusage(), in both cases the - * task being examined is in a frozen state so the counters won't change. + * This expects to be called with tasklist_lock read-locked or better, + * and the siglock not locked. It may momentarily take the siglock. + * + * When sampling multiple threads for RUSAGE_GROUP, under SMP we might have + * races with threads incrementing their own counters. But since word + * reads are atomic, we either get new values or old values and we don't + * care which for the sums. We always take the siglock to protect reading + * the c* fields from p->signal from races with exit.c updating those + * fields when reaping, so a sample either gets all the additions of a + * given child after it's reaped, or none so this sample is before reaping. */ - void k_getrusage(struct task_struct *p, int who, struct rusage *r) { + struct task_struct *t; + unsigned long flags; + unsigned long utime, stime; + memset((char *) r, 0, sizeof *r); + + if (unlikely(!p->signal)) + return; + switch (who) { case RUSAGE_SELF: jiffies_to_timeval(p->utime, &r->ru_utime); @@ -1555,34 +1595,68 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->maj_flt; break; case RUSAGE_CHILDREN: - jiffies_to_timeval(p->cutime, &r->ru_utime); - jiffies_to_timeval(p->cstime, &r->ru_stime); - r->ru_nvcsw = p->cnvcsw; - r->ru_nivcsw = p->cnivcsw; - r->ru_minflt = p->cmin_flt; - r->ru_majflt = p->cmaj_flt; + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + spin_unlock_irqrestore(&p->sighand->siglock, flags); + jiffies_to_timeval(utime, &r->ru_utime); + jiffies_to_timeval(stime, &r->ru_stime); break; - default: - jiffies_to_timeval(p->utime + p->cutime, &r->ru_utime); - jiffies_to_timeval(p->stime + p->cstime, &r->ru_stime); - r->ru_nvcsw = p->nvcsw + p->cnvcsw; - r->ru_nivcsw = p->nivcsw + p->cnivcsw; - r->ru_minflt = p->min_flt + p->cmin_flt; - r->ru_majflt = p->maj_flt + p->cmaj_flt; + case RUSAGE_GROUP: + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = stime = 0; + goto sum_group; + case RUSAGE_BOTH: + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + sum_group: + utime += p->signal->utime; + stime += p->signal->stime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + t = p; + do { + utime += t->utime; + stime += t->stime; + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + t = next_thread(t); + } while (t != p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + jiffies_to_timeval(utime, &r->ru_utime); + jiffies_to_timeval(stime, &r->ru_stime); break; + default: + BUG(); } } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + read_lock(&tasklist_lock); k_getrusage(p, who, &r); + read_unlock(&tasklist_lock); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } asmlinkage long sys_getrusage(int who, struct rusage __user *ru) { - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN + && who != RUSAGE_GROUP) return -EINVAL; return getrusage(current, who, ru); } -- cgit v1.2.3 From 2c05d9eb31eac02a105f2b05c144272f8b382864 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 30 Aug 2004 20:35:49 -0700 Subject: [PATCH] fix PID hash sizing A 4GB, 4-way Opteron would create the smallest size table (16 entries) because pidhash_init is called before mem_init which is where x86-64 sets up max_pfn. nr_kernel_pages is setup by paging_init, called from setup_arch, which is also where i386 sets up max_pfn. So export nr_kernel_pages, nr_all_pages. Use nr_kernel_pages when sizing the PID hash. This fixes the problem. This also makes the pid hash dependant on the size of ZONE_NORMAL instead of total size of memory. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 3 +++ kernel/pid.c | 2 +- mm/page_alloc.c | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index e038f9a3d0ef..376a5500a7a6 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -67,6 +67,9 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern unsigned long __initdata nr_kernel_pages; +extern unsigned long __initdata nr_all_pages; + extern void *__init alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, diff --git a/kernel/pid.c b/kernel/pid.c index 6ed44f56ca45..11d834532302 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -271,7 +271,7 @@ void switch_exec_pids(task_t *leader, task_t *thread) void __init pidhash_init(void) { int i, j, pidhash_size; - unsigned long megabytes = max_pfn >> (20 - PAGE_SHIFT); + unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); pidhash_shift = max(4, fls(megabytes * 4)); pidhash_shift = min(12, pidhash_shift); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce877b9e223e..327f35787fe2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,8 +55,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; -static unsigned long __initdata nr_kernel_pages; -static unsigned long __initdata nr_all_pages; +unsigned long __initdata nr_kernel_pages; +unsigned long __initdata nr_all_pages; /* * Temporary debugging check for pages not lying within a given zone. -- cgit v1.2.3 From f59ad67e9647b385fd8738b3757ce9be2369bcbc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 30 Aug 2004 20:36:01 -0700 Subject: [PATCH] use hlist for pid hash Use hlists for the PID hashes. This halves the memory footprint of these hashes. No benchmarks, but I think this is a worthy improvement because the hashes are something that would be likely to have significant portions loaded into the cache of every CPU on some workloads. This comes at the "expense" of 1. reintroducing the memory prefetch into the hash traversal loop; 2. adding new pids to the head of the list instead of the tail. I suspect that if this was a big problem then the hash isn't sized well or could benefit from moving hot entries to the head. Also, account for all the pid hashes when reporting hash memory usage. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 2 +- kernel/pid.c | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index 9e7213cb1281..76e61c2b38dd 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -16,7 +16,7 @@ struct pid atomic_t count; struct task_struct *task; struct list_head task_list; - struct list_head hash_chain; + struct hlist_node hash_chain; }; struct pid_link diff --git a/kernel/pid.c b/kernel/pid.c index 11d834532302..8c9b7510174a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -27,7 +27,7 @@ #include #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) -static struct list_head *pid_hash[PIDTYPE_MAX]; +static struct hlist_head *pid_hash[PIDTYPE_MAX]; static int pidhash_shift; int pid_max = PID_MAX_DEFAULT; @@ -148,11 +148,11 @@ failure: fastcall struct pid *find_pid(enum pid_type type, int nr) { - struct list_head *elem, *bucket = &pid_hash[type][pid_hashfn(nr)]; + struct hlist_node *elem; struct pid *pid; - __list_for_each(elem, bucket) { - pid = list_entry(elem, struct pid, hash_chain); + hlist_for_each_entry(pid, elem, + &pid_hash[type][pid_hashfn(nr)], hash_chain) { if (pid->nr == nr) return pid; } @@ -179,7 +179,8 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) INIT_LIST_HEAD(&pid->task_list); pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + hlist_add_head(&pid->hash_chain, + &pid_hash[type][pid_hashfn(nr)]); } list_add_tail(&task->pids[type].pid_chain, &pid->task_list); task->pids[type].pidptr = pid; @@ -198,7 +199,7 @@ static inline int __detach_pid(task_t *task, enum pid_type type) return 0; nr = pid->nr; - list_del(&pid->hash_chain); + hlist_del(&pid->hash_chain); put_task_struct(pid->task); return nr; @@ -277,9 +278,9 @@ void __init pidhash_init(void) pidhash_shift = min(12, pidhash_shift); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order %d: %Zd bytes)\n", + printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct list_head)); + PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); for (i = 0; i < PIDTYPE_MAX; i++) { pid_hash[i] = alloc_bootmem(pidhash_size * @@ -287,7 +288,7 @@ void __init pidhash_init(void) if (!pid_hash[i]) panic("Could not alloc pidhash!\n"); for (j = 0; j < pidhash_size; j++) - INIT_LIST_HEAD(&pid_hash[i][j]); + INIT_HLIST_HEAD(&pid_hash[i][j]); } } -- cgit v1.2.3 From 758638eb8ed5feb31636e83ab6a4ebd0024a2721 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Aug 2004 20:36:51 -0700 Subject: [PATCH] Don't OOPS on stripped modules Don't want to go overboard with the checks, but this is simple and reasonable. Signed-off-by: Rusty Russell (modified) Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index aa91a98ffb0b..8c17a4ce0707 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1538,9 +1538,6 @@ static struct module *load_module(void __user *umod, secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; sechdrs[0].sh_addr = 0; - /* And these should exist, but gcc whinges if we don't init them */ - symindex = strindex = 0; - for (i = 1; i < hdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_NOBITS && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) @@ -1572,6 +1569,13 @@ static struct module *load_module(void __user *umod, } mod = (void *)sechdrs[modindex].sh_addr; + if (symindex == 0) { + printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", + mod->name); + err = -ENOEXEC; + goto free_hdr; + } + /* Optional sections */ exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); -- cgit v1.2.3 From a3db446cfafe2fa0080029bbd916cf04dc1d93bd Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 30 Aug 2004 20:42:19 -0700 Subject: [PATCH] kernel/fork.c add missing unlikely(). Signed-off-by: Luiz Capitulino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 76842cca81c9..9353b2e96700 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1120,7 +1120,7 @@ static task_t *copy_process(unsigned long clone_flags, } SET_LINKS(p); - if (p->ptrace & PT_PTRACED) + if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); attach_pid(p, PIDTYPE_PID, p->pid); -- cgit v1.2.3 From bae835a2d8e115a9b62eac5f001f32fb5a5626d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 31 Aug 2004 04:51:37 -0700 Subject: Annotate sys_wait4() user pointers --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 494f97d0d059..6e196d723d01 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1398,8 +1398,8 @@ asmlinkage long sys_waitid(int which, pid_t pid, return do_wait(pid, options, infop, NULL, &infop->si_rusage); } -asmlinkage long sys_wait4(pid_t pid, unsigned int *stat_addr, - int options, struct rusage *ru) +asmlinkage long sys_wait4(pid_t pid, unsigned int __user *stat_addr, + int options, struct rusage __user *ru) { if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; -- cgit v1.2.3 From 3573026afa5aea2707c2a9ae390d49d4e37ac703 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 1 Sep 2004 03:30:15 -0700 Subject: [KPROBES]: Pass integer addresses, not pointers, to flush_icache_range(). Signed-off-by: David S. Miller --- kernel/kprobes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e3dc417c67c..01436a31c690 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -88,7 +88,8 @@ int register_kprobe(struct kprobe *p) arch_prepare_kprobe(p); p->opcode = *p->addr; *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); out: spin_unlock_irqrestore(&kprobe_lock, flags); return ret; @@ -100,7 +101,8 @@ void unregister_kprobe(struct kprobe *p) spin_lock_irqsave(&kprobe_lock, flags); *p->addr = p->opcode; hlist_del(&p->hlist); - flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); spin_unlock_irqrestore(&kprobe_lock, flags); } -- cgit v1.2.3 From 37cbd41da09cb39c9896c04ed8baac7526bb9547 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 2 Sep 2004 00:42:48 -0700 Subject: [PATCH] fixed pidhashing patch This patch fixes strange and obscure pid implementation in current kernels: - it removes calling of put_task_struct() from detach_pid() under tasklist_lock. This allows to use blocking calls in security_task_free() hooks (in __put_task_struct()). - it saves some space = 5*5 ints = 100 bytes in task_struct - it's smaller and tidy, more straigthforward and doesn't use any knowledge about pids using and assignment. - it removes pid_links and pid_struct doesn't hold reference counters on task_struct. instead, new pid_structs and linked altogether and only one of them is inserted in hash_list. Signed-off-by: Kirill Korotaev (kksx@mail.ru) Signed-off-by: William Irwin Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 30 +++++++----------- fs/fcntl.c | 12 +++---- fs/proc/base.c | 5 ++- include/linux/pid.h | 44 ++++++++++---------------- include/linux/sched.h | 9 +++--- kernel/capability.c | 6 ++-- kernel/exit.c | 26 ++++----------- kernel/fork.c | 5 ++- kernel/pid.c | 88 ++++++++++++++++++++++++--------------------------- kernel/signal.c | 12 +++---- kernel/sys.c | 16 ++++------ 11 files changed, 101 insertions(+), 152 deletions(-) (limited to 'kernel') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 2b28688b36ea..68304e241e90 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -424,7 +424,6 @@ void do_tty_hangup(void *data) struct file * cons_filp = NULL; struct file *filp, *f = NULL; struct task_struct *p; - struct pid *pid; int closecount = 0, n; if (!tty) @@ -495,8 +494,7 @@ void do_tty_hangup(void *data) read_lock(&tasklist_lock); if (tty->session > 0) { - struct list_head *l; - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { + do_each_task_pid(tty->session, PIDTYPE_SID, p) { if (p->signal->tty == tty) p->signal->tty = NULL; if (!p->signal->leader) @@ -505,7 +503,7 @@ void do_tty_hangup(void *data) send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) p->signal->tty_old_pgrp = tty->pgrp; - } + } while_each_task_pid(tty->session, PIDTYPE_SID, p); } read_unlock(&tasklist_lock); @@ -577,8 +575,6 @@ void disassociate_ctty(int on_exit) { struct tty_struct *tty; struct task_struct *p; - struct list_head *l; - struct pid *pid; int tty_pgrp = -1; lock_kernel(); @@ -607,8 +603,9 @@ void disassociate_ctty(int on_exit) tty->pgrp = -1; read_lock(&tasklist_lock); - for_each_task_pid(current->signal->session, PIDTYPE_SID, p, l, pid) + do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { p->signal->tty = NULL; + } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); unlock_kernel(); } @@ -1260,15 +1257,15 @@ static void release_dev(struct file * filp) */ if (tty_closing || o_tty_closing) { struct task_struct *p; - struct list_head *l; - struct pid *pid; read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + do_each_task_pid(tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; + } while_each_task_pid(tty->session, PIDTYPE_SID, p); if (o_tty) - for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) + do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; + } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); } @@ -1638,8 +1635,6 @@ static int fionbio(struct file *file, int __user *p) static int tiocsctty(struct tty_struct *tty, int arg) { - struct list_head *l; - struct pid *pid; task_t *p; if (current->signal->leader && @@ -1662,8 +1657,9 @@ static int tiocsctty(struct tty_struct *tty, int arg) */ read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + do_each_task_pid(tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; + } while_each_task_pid(tty->session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); } else return -EPERM; @@ -1970,8 +1966,6 @@ static void __do_SAK(void *arg) #else struct tty_struct *tty = arg; struct task_struct *p; - struct list_head *l; - struct pid *pid; int session; int i; struct file *filp; @@ -1984,7 +1978,7 @@ static void __do_SAK(void *arg) if (tty->driver->flush_buffer) tty->driver->flush_buffer(tty); read_lock(&tasklist_lock); - for_each_task_pid(session, PIDTYPE_SID, p, l, pid) { + do_each_task_pid(session, PIDTYPE_SID, p) { if (p->signal->tty == tty || session > 0) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): p->signal->session==tty->session\n", @@ -2011,7 +2005,7 @@ static void __do_SAK(void *arg) spin_unlock(&p->files->file_lock); } task_unlock(p); - } + } while_each_task_pid(session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); #endif } diff --git a/fs/fcntl.c b/fs/fcntl.c index 305abb43d0c0..eee115d6a224 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -497,11 +497,9 @@ void send_sigio(struct fown_struct *fown, int fd, int band) send_sigio_to_task(p, fown, fd, band); } } else { - struct list_head *l; - struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + do_each_task_pid(-pid, PIDTYPE_PGID, p) { send_sigio_to_task(p, fown, fd, band); - } + } while_each_task_pid(-pid, PIDTYPE_PGID, p); } read_unlock(&tasklist_lock); out_unlock_fown: @@ -534,11 +532,9 @@ int send_sigurg(struct fown_struct *fown) send_sigurg_to_task(p, fown); } } else { - struct list_head *l; - struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + do_each_task_pid(-pid, PIDTYPE_PGID, p) { send_sigurg_to_task(p, fown); - } + } while_each_task_pid(-pid, PIDTYPE_PGID, p); } read_unlock(&tasklist_lock); out_unlock_fown: diff --git a/fs/proc/base.c b/fs/proc/base.c index c668619e4508..355145ea9468 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -778,10 +778,9 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -static int pid_alive(struct task_struct *p) +static inline int pid_alive(struct task_struct *p) { - BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid); - return atomic_read(&p->pids[PIDTYPE_PID].pid.count); + return p->pids[PIDTYPE_PID].nr != 0; } #define NUMBUF 10 diff --git a/include/linux/pid.h b/include/linux/pid.h index 02fb2d216d2b..5f74c4ab0d11 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -12,35 +12,22 @@ enum pid_type struct pid { - /* Try to keep hash_chain in the same cacheline as nr for find_pid */ - struct hlist_node hash_chain; + /* Try to keep pid_chain in the same cacheline as nr for find_pid */ int nr; - atomic_t count; - struct task_struct *task; - struct list_head task_list; -}; - -struct pid_link -{ - struct list_head pid_chain; - struct pid *pidptr; - struct pid pid; + struct hlist_node pid_chain; + /* list of pids with the same nr, only one of them is in the hash */ + struct list_head pid_list; }; #define pid_task(elem, type) \ - list_entry(elem, struct task_struct, pids[type].pid_chain) + list_entry(elem, struct task_struct, pids[type].pid_list) /* - * attach_pid() and link_pid() must be called with the tasklist_lock + * attach_pid() and detach_pid() must be called with the tasklist_lock * write-held. */ extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr)); -extern void FASTCALL(link_pid(struct task_struct *task, struct pid_link *link, struct pid *pid)); - -/* - * detach_pid() must be called with the tasklist_lock write-held. - */ extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); /* @@ -53,13 +40,16 @@ extern int alloc_pidmap(void); extern void FASTCALL(free_pidmap(int)); extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); -#define for_each_task_pid(who, type, task, elem, pid) \ - if ((pid = find_pid(type, who))) \ - for (elem = pid->task_list.next, \ - prefetch(elem->next), \ - task = pid_task(elem, type); \ - elem != &pid->task_list; \ - elem = elem->next, prefetch(elem->next), \ - task = pid_task(elem, type)) +#define do_each_task_pid(who, type, task) \ + if ((task = find_task_by_pid_type(type, who))) { \ + prefetch((task)->pids[type].pid_list.next); \ + do { + +#define while_each_task_pid(who, type, task) \ + task = pid_task((task)->pids[type].pid_list.next,\ + type); \ + prefetch((task)->pids[type].pid_list.next); \ + } while (hlist_unhashed(&(task)->pids[type].pid_chain));\ + } \ #endif /* _LINUX_PID_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3aaf8e5f2216..5cf33f072140 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -494,7 +494,7 @@ struct task_struct { struct task_struct *group_leader; /* threadgroup leader */ /* PID/PID hash table linkage. */ - struct pid_link pids[PIDTYPE_MAX]; + struct pid pids[PIDTYPE_MAX]; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ @@ -673,7 +673,8 @@ extern struct task_struct init_task; extern struct mm_struct init_mm; -extern struct task_struct *find_task_by_pid(int pid); +#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) +extern struct task_struct *find_task_by_pid_type(int type, int pid); extern void set_special_pids(pid_t session, pid_t pgrp); extern void __set_special_pids(pid_t session, pid_t pgrp); @@ -876,9 +877,7 @@ extern task_t * FASTCALL(next_thread(const task_t *p)); static inline int thread_group_empty(task_t *p) { - struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; - - return pid->task_list.next->next == &pid->task_list; + return list_empty(&p->pids[PIDTYPE_TGID].pid_list); } #define delay_group_leader(p) \ diff --git a/kernel/capability.c b/kernel/capability.c index 1c5c35718450..7e864e2ccf6a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -89,14 +89,12 @@ static inline void cap_set_pg(int pgrp, kernel_cap_t *effective, kernel_cap_t *permitted) { task_t *g, *target; - struct list_head *l; - struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, g, l, pid) { + do_each_task_pid(pgrp, PIDTYPE_PGID, g) { target = g; while_each_thread(g, target) security_capset_set(target, effective, inheritable, permitted); - } + } while_each_task_pid(pgrp, PIDTYPE_PGID, g); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 6e196d723d01..731b9ccd236b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -124,16 +124,15 @@ void unhash_process(struct task_struct *p) int session_of_pgrp(int pgrp) { struct task_struct *p; - struct list_head *l; - struct pid *pid; int sid = -1; read_lock(&tasklist_lock); - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p->signal->session > 0) { sid = p->signal->session; goto out; } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); p = find_task_by_pid(pgrp); if (p) sid = p->signal->session; @@ -154,11 +153,9 @@ out: static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) { struct task_struct *p; - struct list_head *l; - struct pid *pid; int ret = 1; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) @@ -168,7 +165,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) ret = 0; break; } - } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); return ret; /* (sighing) "Often!" */ } @@ -187,10 +184,8 @@ static inline int has_stopped_jobs(int pgrp) { int retval = 0; struct task_struct *p; - struct list_head *l; - struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; @@ -206,7 +201,7 @@ static inline int has_stopped_jobs(int pgrp) retval = 1; break; - } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); return retval; } @@ -849,9 +844,6 @@ asmlinkage long sys_exit(int error_code) task_t fastcall *next_thread(const task_t *p) { - const struct pid_link *link = p->pids + PIDTYPE_TGID; - const struct list_head *tmp, *head = &link->pidptr->task_list; - #ifdef CONFIG_SMP if (!p->sighand) BUG(); @@ -859,11 +851,7 @@ task_t fastcall *next_thread(const task_t *p) !rwlock_is_locked(&tasklist_lock)) BUG(); #endif - tmp = link->pid_chain.next; - if (tmp == head) - tmp = head->next; - - return pid_task(tmp, PIDTYPE_TGID); + return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); } EXPORT_SYMBOL(next_thread); diff --git a/kernel/fork.c b/kernel/fork.c index 9353b2e96700..78db8811c834 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1124,14 +1124,13 @@ static task_t *copy_process(unsigned long clone_flags, __ptrace_link(p, current->parent); attach_pid(p, PIDTYPE_PID, p->pid); + attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { - attach_pid(p, PIDTYPE_TGID, p->tgid); attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); if (p->pid) __get_cpu_var(process_counts)++; - } else - link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); + } nr_threads++; write_unlock_irq(&tasklist_lock); diff --git a/kernel/pid.c b/kernel/pid.c index 8c9b7510174a..57527f0cda5e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -146,74 +146,66 @@ failure: return -1; } -fastcall struct pid *find_pid(enum pid_type type, int nr) +struct pid * fastcall find_pid(enum pid_type type, int nr) { struct hlist_node *elem; struct pid *pid; hlist_for_each_entry(pid, elem, - &pid_hash[type][pid_hashfn(nr)], hash_chain) { + &pid_hash[type][pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; } return NULL; } -void fastcall link_pid(task_t *task, struct pid_link *link, struct pid *pid) -{ - atomic_inc(&pid->count); - list_add_tail(&link->pid_chain, &pid->task_list); - link->pidptr = pid; -} - int fastcall attach_pid(task_t *task, enum pid_type type, int nr) { - struct pid *pid = find_pid(type, nr); - - if (pid) - atomic_inc(&pid->count); - else { - pid = &task->pids[type].pid; - pid->nr = nr; - atomic_set(&pid->count, 1); - INIT_LIST_HEAD(&pid->task_list); - pid->task = task; - get_task_struct(task); - hlist_add_head(&pid->hash_chain, + struct pid *pid, *task_pid; + + task_pid = &task->pids[type]; + pid = find_pid(type, nr); + if (pid == NULL) { + hlist_add_head(&task_pid->pid_chain, &pid_hash[type][pid_hashfn(nr)]); + INIT_LIST_HEAD(&task_pid->pid_list); + } else { + INIT_HLIST_NODE(&task_pid->pid_chain); + list_add_tail(&task_pid->pid_list, &pid->pid_list); } - list_add_tail(&task->pids[type].pid_chain, &pid->task_list); - task->pids[type].pidptr = pid; + task_pid->nr = nr; return 0; } static inline int __detach_pid(task_t *task, enum pid_type type) { - struct pid_link *link = task->pids + type; - struct pid *pid = link->pidptr; + struct pid *pid, *pid_next; int nr; - list_del(&link->pid_chain); - if (!atomic_dec_and_test(&pid->count)) - return 0; - + pid = &task->pids[type]; + if (!hlist_unhashed(&pid->pid_chain)) { + hlist_del(&pid->pid_chain); + if (!list_empty(&pid->pid_list)) { + pid_next = list_entry(pid->pid_list.next, + struct pid, pid_list); + /* insert next pid from pid_list to hash */ + hlist_add_head(&pid_next->pid_chain, + &pid_hash[type][pid_hashfn(pid_next->nr)]); + } + } + list_del(&pid->pid_list); nr = pid->nr; - hlist_del(&pid->hash_chain); - put_task_struct(pid->task); + pid->nr = 0; return nr; } -static void _detach_pid(task_t *task, enum pid_type type) -{ - __detach_pid(task, type); -} - void fastcall detach_pid(task_t *task, enum pid_type type) { - int nr = __detach_pid(task, type); + int nr; + nr = __detach_pid(task, type); if (!nr) return; @@ -223,16 +215,18 @@ void fastcall detach_pid(task_t *task, enum pid_type type) free_pidmap(nr); } -task_t *find_task_by_pid(int nr) +task_t *find_task_by_pid_type(int type, int nr) { - struct pid *pid = find_pid(PIDTYPE_PID, nr); + struct pid *pid; + pid = find_pid(type, nr); if (!pid) return NULL; - return pid_task(pid->task_list.next, PIDTYPE_PID); + + return pid_task(&pid->pid_list, type); } -EXPORT_SYMBOL(find_task_by_pid); +EXPORT_SYMBOL(find_task_by_pid_type); /* * This function switches the PIDs if a non-leader thread calls @@ -241,13 +235,13 @@ EXPORT_SYMBOL(find_task_by_pid); */ void switch_exec_pids(task_t *leader, task_t *thread) { - _detach_pid(leader, PIDTYPE_PID); - _detach_pid(leader, PIDTYPE_TGID); - _detach_pid(leader, PIDTYPE_PGID); - _detach_pid(leader, PIDTYPE_SID); + __detach_pid(leader, PIDTYPE_PID); + __detach_pid(leader, PIDTYPE_TGID); + __detach_pid(leader, PIDTYPE_PGID); + __detach_pid(leader, PIDTYPE_SID); - _detach_pid(thread, PIDTYPE_PID); - _detach_pid(thread, PIDTYPE_TGID); + __detach_pid(thread, PIDTYPE_PID); + __detach_pid(thread, PIDTYPE_TGID); leader->pid = leader->tgid = thread->pid; thread->pid = thread->tgid; diff --git a/kernel/signal.c b/kernel/signal.c index 2dec2c91ed6e..8b05f0b8c2dc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1110,8 +1110,6 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { struct task_struct *p; - struct list_head *l; - struct pid *pid; int retval, success; if (pgrp <= 0) @@ -1119,11 +1117,11 @@ int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) success = 0; retval = -ESRCH; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } @@ -1150,8 +1148,6 @@ int kill_sl_info(int sig, struct siginfo *info, pid_t sid) { int err, retval = -EINVAL; - struct pid *pid; - struct list_head *l; struct task_struct *p; if (sid <= 0) @@ -1159,13 +1155,13 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid) retval = -ESRCH; read_lock(&tasklist_lock); - for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) { + do_each_task_pid(sid, PIDTYPE_SID, p) { if (!p->signal->leader) continue; err = group_send_sig_info(sig, info, p); if (retval) retval = err; - } + } while_each_task_pid(sid, PIDTYPE_SID, p); read_unlock(&tasklist_lock); out: return retval; diff --git a/kernel/sys.c b/kernel/sys.c index de206f3df466..a4b29df201f6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -310,8 +310,6 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) { struct task_struct *g, *p; struct user_struct *user; - struct pid *pid; - struct list_head *l; int error = -EINVAL; if (which > 2 || which < 0) @@ -336,8 +334,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) + do_each_task_pid(who, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); + } while_each_task_pid(who, PIDTYPE_PGID, p); break; case PRIO_USER: if (!who) @@ -371,8 +370,6 @@ out: asmlinkage long sys_getpriority(int which, int who) { struct task_struct *g, *p; - struct list_head *l; - struct pid *pid; struct user_struct *user; long niceval, retval = -ESRCH; @@ -394,11 +391,11 @@ asmlinkage long sys_getpriority(int which, int who) case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) { + do_each_task_pid(who, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } + } while_each_task_pid(who, PIDTYPE_PGID, p); break; case PRIO_USER: if (!who) @@ -1044,12 +1041,11 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (pgid != pid) { struct task_struct *p; - struct pid *pid; - struct list_head *l; - for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid) + do_each_task_pid(pgid, PIDTYPE_PGID, p) { if (p->signal->session == current->signal->session) goto ok_pgid; + } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; } -- cgit v1.2.3