From 55b50278ec233024c2e5be04855d66ebdcebc35e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:37:01 -0700 Subject: [PATCH] real-time enhanced page allocator and throttling From: Robert Love - Let real-time tasks dip further into the reserves than usual in __alloc_pages(). There are a lot of ways to special case this. This patch just cuts z->pages_low in half, before doing the incremental min thing, for real-time tasks. I do not do anything in the low memory slow path. We can be a _lot_ more aggressive if we want. Right now, we just give real-time tasks a little help. - Never ever call balance_dirty_pages() on a real-time task. Where and how exactly we handle this is up for debate. We could, for example, special case real-time tasks inside balance_dirty_pages(). This would allow us to perform some of the work (say, waking up pdflush) but not other work (say, the active throttling). As it stands now, we do the per-processor accounting in balance_dirty_pages_ratelimited() but we never call balance_dirty_pages(). Lots of approaches work. What we want to do is never engage the real-time task in forced writeback. --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 89f1bb28dacd..1c5802ceedae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -179,7 +179,6 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define rt_task(p) ((p)->prio < MAX_RT_PRIO) /* * Default context-switch locking: -- cgit v1.2.3 From feaecce47220d2d8bf6f0d76e09dc623dbf991e0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:37:10 -0700 Subject: [PATCH] Fix setpgid and threads From: Jeremy Fitzhardinge I'm resending my patch to fix this problem. To recap: every task_struct has its own copy of the thread group's pgrp. Only the thread group leader is allowed to change the tgrp's pgrp, but it only updates its own copy of pgrp, while all the other threads in the tgrp use the old value they inherited on creation. This patch simply updates all the other threads' pgrp when the tgrp leader changes pgrp. Ulrich has already expressed reservations about this patch since it is (1) incomplete (it doesn't cover the case of other ids which have similar problems), (2) racy (it doesn't synchronize with other threads looking at the task pgrp, so they could see an inconsistent view) and (3) slow (it takes linear time with respect to the number of threads in the tgrp). My reaction is that (1) it fixes the actual bug I'm encountering in a real program, (2) doesn't really matter for pgrp, since it is mostly an issue with respect to the terminal job-control code (which is even more broken without this patch), and regarding (3), I think there are very few programs which have a large number of threads which change process group id on a regular basis (a heavily multi-threaded job-control shell?). Ulrich also said he has a (proposed?) much better fix, which I've been looking forward to. I'm submitting this patch as a stop-gap fix for a real bug, and perhaps to prompt the improved patch. An alternative fix, at least for pgrp, is to change all references to ->pgrp to group_leader->pgrp. This may be sufficient on its own, but it would be a reasonably intrusive patch (I count 95 instances in 32 files in the 2.6.0-test3-mm3 tree).
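As a rough illustration of the allocator tweak described in the first patch above (the mm/page_alloc.c hunk itself is not part of the quoted diff), here is a minimal, hypothetical user-space model of the watermark check; the struct and function names are stand-ins for the 2.6-era kernel types, not the real ones:

#include <stdbool.h>

/* Hypothetical stand-ins for the kernel's zone and task structures. */
struct zone { long pages_low; long free_pages; };
struct task { int prio; };

#define MAX_RT_PRIO 100	/* priorities below this are real-time */

static bool rt_task(const struct task *p)
{
	return p->prio < MAX_RT_PRIO;
}

/*
 * Sketch of the described check: real-time tasks get to dip further
 * into the reserves because pages_low is cut in half before the
 * incremental-min comparison runs.
 */
static bool zone_can_allocate(const struct zone *z, const struct task *p)
{
	long min = z->pages_low;

	if (rt_task(p))
		min /= 2;	/* give real-time tasks a little help */

	return z->free_pages >= min;
}

The throttling half of that patch needs no new allocator logic at all: per the description, balance_dirty_pages_ratelimited() keeps doing its per-processor accounting but returns without ever calling balance_dirty_pages() for a real-time task.

For the setpgid patch, note that the diff below actually implements the "alternative fix" the message describes: ->pgrp becomes __pgrp, only the group leader's copy is ever written, and every reader goes through a new accessor. Stripped of its surroundings, the pattern is:

#include <sys/types.h>	/* pid_t, for a stand-alone build */

/* Condensed from the include/linux/sched.h hunk below. */
struct task_struct {
	pid_t __pgrp;	/* accessed via process_group() */
	struct task_struct *group_leader;
	/* ... many other fields elided ... */
};

static inline pid_t process_group(struct task_struct *tsk)
{
	return tsk->group_leader->__pgrp;
}

Since every thread shares one group_leader, sys_setpgid() only has to update that single copy and the change is visible to the whole thread group at once.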
--- arch/h8300/kernel/signal.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/m68knommu/kernel/signal.c | 2 +- arch/mips/kernel/irixelf.c | 2 +- arch/mips/kernel/irixsig.c | 2 +- arch/mips/kernel/sysirix.c | 4 ++-- arch/sparc64/solaris/misc.c | 4 ++-- drivers/char/n_tty.c | 6 +++--- drivers/char/rocket.c | 2 +- drivers/char/tty_io.c | 10 +++++----- drivers/isdn/i4l/isdn_tty.c | 3 ++- fs/autofs/autofs_i.h | 2 +- fs/autofs/inode.c | 4 ++-- fs/autofs/root.c | 4 ++-- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/inode.c | 4 ++-- fs/autofs4/root.c | 4 ++-- fs/binfmt_elf.c | 4 ++-- fs/coda/upcall.c | 2 +- fs/devfs/base.c | 6 +++--- fs/proc/array.c | 2 +- include/linux/sched.h | 9 +++++++-- kernel/exit.c | 24 ++++++++++++------------ kernel/fork.c | 2 +- kernel/pid.c | 4 ++-- kernel/signal.c | 4 ++-- kernel/sys.c | 17 +++++++++-------- 27 files changed, 70 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 8db25723b077..727b420fadca 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -593,7 +593,7 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs *regs) continue; case SIGTSTP: case SIGTTIN: case SIGTTOU: - if (is_orphaned_pgrp(current->pgrp)) + if (is_orphaned_pgrp(process_group(current))) continue; /* FALLTHRU */ diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 54951a69f4cc..92c280c7016d 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1082,7 +1082,7 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs *regs) continue; case SIGTSTP: case SIGTTIN: case SIGTTOU: - if (is_orphaned_pgrp(current->pgrp)) + if (is_orphaned_pgrp(process_group(current))) continue; /* FALLTHRU */ diff --git a/arch/m68knommu/kernel/signal.c b/arch/m68knommu/kernel/signal.c index 4271a229b62c..d24df6f29517 100644 --- a/arch/m68knommu/kernel/signal.c +++ b/arch/m68knommu/kernel/signal.c @@ -841,7 +841,7 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs *regs) continue; case SIGTSTP: case SIGTTIN: case SIGTTOU: - if (is_orphaned_pgrp(current->pgrp)) + if (is_orphaned_pgrp(process_group(current))) continue; /* FALLTHRU */ diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 31fbd57c7021..f925c6514aad 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -1130,7 +1130,7 @@ static int irix_core_dump(long signr, struct pt_regs * regs, struct file *file) prstatus.pr_sighold = current->blocked.sig[0]; psinfo.pr_pid = prstatus.pr_pid = current->pid; psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid; - psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp; + psinfo.pr_pgrp = prstatus.pr_pgrp = process_group(current); psinfo.pr_sid = prstatus.pr_sid = current->session; prstatus.pr_utime.tv_sec = CT_TO_SECS(current->utime); prstatus.pr_utime.tv_usec = CT_TO_USECS(current->utime); diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c index 85cc88d32b7d..9ed7b1e75a39 100644 --- a/arch/mips/kernel/irixsig.c +++ b/arch/mips/kernel/irixsig.c @@ -582,7 +582,7 @@ repeat: p = list_entry(_p,struct task_struct,sibling); if ((type == P_PID) && p->pid != pid) continue; - if ((type == P_PGID) && p->pgrp != pid) + if ((type == P_PGID) && process_group(p) != pid) continue; if ((p->exit_signal != SIGCHLD)) continue; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 022aa036efef..cadafba927ac 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -803,11 +803,11 @@ asmlinkage 
int irix_setpgrp(int flags) printk("[%s:%d] setpgrp(%d) ", current->comm, current->pid, flags); #endif if(!flags) - error = current->pgrp; + error = process_group(current); else error = sys_setsid(); #ifdef DEBUG_PROCGRPS - printk("returning %d\n", current->pgrp); + printk("returning %d\n", process_group(current)); #endif return error; diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 024b74c9d8b5..ea7b2c439653 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -392,7 +392,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) switch (cmd) { case 0: /* getpgrp */ - return current->pgrp; + return process_group(current); case 1: /* setpgrp */ { int (*sys_setpgid)(pid_t,pid_t) = @@ -403,7 +403,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) ret = sys_setpgid(0, 0); if (ret) return ret; current->tty = NULL; - return current->pgrp; + return process_group(current); } case 2: /* getsid */ { diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index 6cc938bbbf30..44d551ba1091 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -977,11 +977,11 @@ do_it_again: if (file->f_op->write != redirected_tty_write && current->tty == tty) { if (tty->pgrp <= 0) printk("read_chan: tty->pgrp <= 0!\n"); - else if (current->pgrp != tty->pgrp) { + else if (process_group(current) != tty->pgrp) { if (is_ignored(SIGTTIN) || - is_orphaned_pgrp(current->pgrp)) + is_orphaned_pgrp(process_group(current))) return -EIO; - kill_pg(current->pgrp, SIGTTIN, 1); + kill_pg(process_group(current), SIGTTIN, 1); return -ERESTARTSYS; } } diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index 8f969928c68b..50fab08d460f 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -956,7 +956,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp) * Info->count is now 1; so it's safe to sleep now. 
*/ info->session = current->session; - info->pgrp = current->pgrp; + info->pgrp = process_group(current); if ((info->flags & ROCKET_INITIALIZED) == 0) { cp = &info->channel; diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 8d1f8c6d3c23..bbd84871219e 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -325,13 +325,13 @@ int tty_check_change(struct tty_struct * tty) printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); return 0; } - if (current->pgrp == tty->pgrp) + if (process_group(current) == tty->pgrp) return 0; if (is_ignored(SIGTTOU)) return 0; - if (is_orphaned_pgrp(current->pgrp)) + if (is_orphaned_pgrp(process_group(current))) return -EIO; - (void) kill_pg(current->pgrp,SIGTTOU,1); + (void) kill_pg(process_group(current), SIGTTOU, 1); return -ERESTARTSYS; } @@ -1406,7 +1406,7 @@ got_driver: task_unlock(current); current->tty_old_pgrp = 0; tty->session = current->session; - tty->pgrp = current->pgrp; + tty->pgrp = process_group(current); } return 0; } @@ -1580,7 +1580,7 @@ static int tiocsctty(struct tty_struct *tty, int arg) task_unlock(current); current->tty_old_pgrp = 0; tty->session = current->session; - tty->pgrp = current->pgrp; + tty->pgrp = process_group(current); return 0; } diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 996e8c5a9612..390b0f484d26 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1989,7 +1989,8 @@ modem_write_profile(atemu * m) memcpy(m->pmsn, m->msn, ISDN_MSNLEN); memcpy(m->plmsn, m->lmsn, ISDN_LMSNLEN); if ((get_isdn_dev())->profd) - kill_pg_info(SIGIO, SEND_SIG_PRIV, (get_isdn_dev())->profd->pgrp); + kill_pg_info(SIGIO, SEND_SIG_PRIV, + process_group((get_isdn_dev())->profd)); } static struct tty_operations modem_ops = { diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 412e4dcb9e40..6171431272dc 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -123,7 +123,7 @@ static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) filesystem without "magic".) */ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || current->pgrp == sbi->oz_pgrp; + return sbi->catatonic || process_group(current) == sbi->oz_pgrp; } /* Hash operations */ diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 76bb0d61dd91..398cade9442d 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -51,7 +51,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, pid *uid = current->uid; *gid = current->gid; - *pgrp = current->pgrp; + *pgrp = process_group(current); *minproto = *maxproto = AUTOFS_PROTO_VERSION; @@ -129,7 +129,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) sbi->magic = AUTOFS_SBI_MAGIC; sbi->catatonic = 0; sbi->exp_timeout = 0; - sbi->oz_pgrp = current->pgrp; + sbi->oz_pgrp = process_group(current); autofs_initialize_hash(&sbi->dirhash); sbi->queues = NULL; memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 546ac2f9af87..74ad37a84efc 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -213,7 +213,7 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr oz_mode = autofs_oz_mode(sbi); DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", - current->pid, current->pgrp, sbi->catatonic, oz_mode)); + current->pid, process_group(current), sbi->catatonic, oz_mode)); /* * Mark the dentry incomplete, but add it. 
This is needed so @@ -527,7 +527,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, { struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); - DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,current->pgrp)); + DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,process_group(current))); if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 16a3ab2f6d05..0a4b95417122 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -113,7 +113,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || current->pgrp == sbi->oz_pgrp; + return sbi->catatonic || process_group(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 35647d8eca94..f7cb9de42d95 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -101,7 +101,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *uid = current->uid; *gid = current->gid; - *pgrp = current->pgrp; + *pgrp = process_group(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -192,7 +192,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->magic = AUTOFS_SBI_MAGIC; sbi->catatonic = 0; sbi->exp_timeout = 0; - sbi->oz_pgrp = current->pgrp; + sbi->oz_pgrp = process_group(current); sbi->sb = s; sbi->version = 0; sbi->queues = NULL; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 49f9f4d3b406..971dfa29b387 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -255,7 +255,7 @@ static struct dentry *autofs4_root_lookup(struct inode *dir, struct dentry *dent lock_kernel(); oz_mode = autofs4_oz_mode(sbi); DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", - current->pid, current->pgrp, sbi->catatonic, oz_mode)); + current->pid, process_group(current), sbi->catatonic, oz_mode)); /* * Mark the dentry incomplete, but add it. This is needed so @@ -518,7 +518,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", - cmd,arg,sbi,current->pgrp)); + cmd,arg,sbi,process_group(current))); if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ffcef47900fd..dfdead4fcb63 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1077,7 +1077,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_sighold = p->blocked.sig[0]; prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; - prstatus->pr_pgrp = p->pgrp; + prstatus->pr_pgrp = process_group(p); prstatus->pr_sid = p->session; jiffies_to_timeval(p->utime, &prstatus->pr_utime); jiffies_to_timeval(p->stime, &prstatus->pr_stime); @@ -1105,7 +1105,7 @@ static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; - psinfo->pr_pgrp = p->pgrp; + psinfo->pr_pgrp = process_group(p); psinfo->pr_sid = p->session; i = p->state ? 
ffz(~p->state) + 1 : 0; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index a513a4b823a9..5bce9b860d52 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -54,7 +54,7 @@ static void *alloc_upcall(int opcode, int size) inp->ih.opcode = opcode; inp->ih.pid = current->pid; - inp->ih.pgid = current->pgrp; + inp->ih.pgid = process_group(current); #ifdef CODA_FS_OLD_API memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); inp->ih.cred.cr_fsuid = current->fsuid; diff --git a/fs/devfs/base.c b/fs/devfs/base.c index dd03e0397cc2..73894d9af19c 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -1334,7 +1334,7 @@ static int is_devfsd_or_child (struct fs_info *fs_info) struct task_struct *p = current; if (p == fs_info->devfsd_task) return (TRUE); - if (p->pgrp == fs_info->devfsd_pgrp) return (TRUE); + if (process_group(p) == fs_info->devfsd_pgrp) return (TRUE); read_lock(&tasklist_lock); for ( ; p != &init_task; p = p->real_parent) { @@ -2744,8 +2744,8 @@ static int devfsd_ioctl (struct inode *inode, struct file *file, } fs_info->devfsd_task = current; spin_unlock (&lock); - fs_info->devfsd_pgrp = (current->pgrp == current->pid) ? - current->pgrp : 0; + fs_info->devfsd_pgrp = (process_group(current) == current->pid) ? + process_group(current) : 0; fs_info->devfsd_file = file; fs_info->devfsd_info = kmalloc (sizeof *fs_info->devfsd_info, GFP_KERNEL); diff --git a/fs/proc/array.c b/fs/proc/array.c index a7a3bdaac4ec..4b9ec914d60f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -341,7 +341,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) task->comm, state, ppid, - task->pgrp, + process_group(task), task->session, tty_nr, tty_pgrp, diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9aba07d508..9878dbef500a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -362,7 +362,7 @@ struct task_struct { unsigned long personality; int did_exec:1; pid_t pid; - pid_t pgrp; + pid_t __pgrp; /* Accessed via process_group() */ pid_t tty_old_pgrp; pid_t session; pid_t tgid; @@ -377,7 +377,7 @@ struct task_struct { struct task_struct *parent; /* parent process */ struct list_head children; /* list of my children */ struct list_head sibling; /* linkage in my parent's children list */ - struct task_struct *group_leader; + struct task_struct *group_leader; /* threadgroup leader */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; @@ -463,6 +463,11 @@ struct task_struct { siginfo_t *last_siginfo; /* For ptrace use. 
*/ }; +static inline pid_t process_group(struct task_struct *tsk) +{ + return tsk->group_leader->__pgrp; +} + extern void __put_task_struct(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) #define put_task_struct(tsk) \ diff --git a/kernel/exit.c b/kernel/exit.c index b6174f82adf9..c565fd69d559 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -152,7 +152,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) continue; - if (p->real_parent->pgrp != pgrp + if (process_group(p->real_parent) != pgrp && p->real_parent->session == p->session) { ret = 0; break; @@ -247,9 +247,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) curr->session = session; attach_pid(curr, PIDTYPE_SID, session); } - if (curr->pgrp != pgrp) { + if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); - curr->pgrp = pgrp; + curr->group_leader->__pgrp = pgrp; attach_pid(curr, PIDTYPE_PGID, pgrp); } } @@ -508,9 +508,9 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) * than we are, and it was the only connection * outside, so the child pgrp is now orphaned. */ - if ((p->pgrp != father->pgrp) && + if ((process_group(p) != process_group(father)) && (p->session == father->session)) { - int pgrp = p->pgrp; + int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, (void *)1, pgrp); @@ -618,12 +618,12 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; - if ((t->pgrp != tsk->pgrp) && + if ((process_group(t) != process_group(tsk)) && (t->session == tsk->session) && - will_become_orphaned_pgrp(tsk->pgrp, tsk) && - has_stopped_jobs(tsk->pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, tsk->pgrp); - __kill_pg_info(SIGCONT, (void *)1, tsk->pgrp); + will_become_orphaned_pgrp(process_group(tsk), tsk) && + has_stopped_jobs(process_group(tsk))) { + __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); + __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); } /* Let father know we died @@ -813,10 +813,10 @@ static int eligible_child(pid_t pid, int options, task_t *p) if (p->pid != pid) return 0; } else if (!pid) { - if (p->pgrp != current->pgrp) + if (process_group(p) != process_group(current)) return 0; } else if (pid != -1) { - if (p->pgrp != -pid) + if (process_group(p) != -pid) return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index 37d79b4e16e6..50535a16c71e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1004,7 +1004,7 @@ struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PID, p->pid); if (thread_group_leader(p)) { attach_pid(p, PIDTYPE_TGID, p->tgid); - attach_pid(p, PIDTYPE_PGID, p->pgrp); + attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->session); if (p->pid) __get_cpu_var(process_counts)++; diff --git a/kernel/pid.c b/kernel/pid.c index 00413e3967b9..713f54eaeda9 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -250,13 +250,13 @@ void switch_exec_pids(task_t *leader, task_t *thread) attach_pid(thread, PIDTYPE_PID, thread->pid); attach_pid(thread, PIDTYPE_TGID, thread->tgid); - attach_pid(thread, PIDTYPE_PGID, thread->pgrp); + attach_pid(thread, PIDTYPE_PGID, leader->__pgrp); attach_pid(thread, PIDTYPE_SID, thread->session); list_add_tail(&thread->tasks, &init_task.tasks); attach_pid(leader, PIDTYPE_PID, leader->pid); attach_pid(leader, PIDTYPE_TGID, leader->tgid); - attach_pid(leader, PIDTYPE_PGID, leader->pgrp); 
+ attach_pid(leader, PIDTYPE_PGID, leader->__pgrp); attach_pid(leader, PIDTYPE_SID, leader->session); } diff --git a/kernel/signal.c b/kernel/signal.c index 72333be1fd42..852da1a009da 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1139,7 +1139,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) static int kill_something_info(int sig, struct siginfo *info, int pid) { if (!pid) { - return kill_pg_info(sig, info, current->pgrp); + return kill_pg_info(sig, info, process_group(current)); } else if (pid == -1) { int retval = 0, count = 0; struct task_struct * p; @@ -1798,7 +1798,7 @@ relock: /* signals can be posted during this window */ - if (is_orphaned_pgrp(current->pgrp)) + if (is_orphaned_pgrp(process_group(current))) goto relock; spin_lock_irq(&current->sighand->siglock); diff --git a/kernel/sys.c b/kernel/sys.c index 02b5a12dfd59..9c6da1d16d9a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -290,7 +290,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) break; case PRIO_PGRP: if (!who) - who = current->pgrp; + who = process_group(current); for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) error = set_one_prio(p, niceval, error); break; @@ -346,7 +346,7 @@ asmlinkage long sys_getpriority(int which, int who) break; case PRIO_PGRP: if (!who) - who = current->pgrp; + who = process_group(current); for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) { niceval = 20 - task_nice(p); if (niceval > retval) @@ -979,11 +979,12 @@ ok_pgid: if (err) goto out; - if (p->pgrp != pgid) { + if (process_group(p) != pgid) { detach_pid(p, PIDTYPE_PGID); - p->pgrp = pgid; + p->group_leader->__pgrp = pgid; attach_pid(p, PIDTYPE_PGID, pgid); } + err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ @@ -994,7 +995,7 @@ out: asmlinkage long sys_getpgid(pid_t pid) { if (!pid) { - return current->pgrp; + return process_group(current); } else { int retval; struct task_struct *p; @@ -1006,7 +1007,7 @@ asmlinkage long sys_getpgid(pid_t pid) if (p) { retval = security_task_getpgid(p); if (!retval) - retval = p->pgrp; + retval = process_group(p); } read_unlock(&tasklist_lock); return retval; @@ -1016,7 +1017,7 @@ asmlinkage long sys_getpgid(pid_t pid) asmlinkage long sys_getpgrp(void) { /* SMP - assuming writes are word atomic this is fine */ - return current->pgrp; + return process_group(current); } asmlinkage long sys_getsid(pid_t pid) @@ -1059,7 +1060,7 @@ asmlinkage long sys_setsid(void) __set_special_pids(current->pid, current->pid); current->tty = NULL; current->tty_old_pgrp = 0; - err = current->pgrp; + err = process_group(current); out: write_unlock_irq(&tasklist_lock); return err; -- cgit v1.2.3 From f221af36348b0231dded895a5c8248660ff3bbc0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:38:00 -0700 Subject: [PATCH] scheduler infrastructure From: Ingo Molnar The attached scheduler patch (against test2-mm2) adds the scheduling infrastructure items discussed on lkml. I got good feedback - and while I don't expect it to solve all problems, it does solve a number of bad ones: - test_starve.c code from David Mosberger - thud.c making the system unusable due to unfairness - fair/accurate sleep average based on a fine-grained clock - audio skipping way too easily Other changes in sched-test2-mm2-A3: - ia64 sched_clock() code, from David Mosberger. - migration thread startup without relying on implicit scheduling behavior.
While the current 2.6 code is correct (due to the cpu-up code adding CPUs one by one), it's also fragile - and this code cannot be carried over into the 2.4 backports. So adding this method would clean up the startup and would make it easier to have 2.4 backports. And here's the original changelog for the scheduler changes: - cycle accuracy (nanosec resolution) timekeeping within the scheduler. This fixes a number of audio artifacts (skipping) I've reproduced. I don't think we can get away without going to cycle accuracy - reading the cycle counter adds some overhead, but it's acceptable. The first nanosec-accuracy patch was done by Mike Galbraith - this patch is different but similar in nature. I went further in also changing the sleep_avg to be of nanosec resolution. - more fine-grained timeslices: there's now a timeslice 'sub unit' of 50 usecs (TIMESLICE_GRANULARITY) - CPU hogs on the same priority level will round-robin with this unit. This change is intended to make gaming latencies shorter. - include scheduling latency in sleep bonus calculation. This change extends the sleep-average calculation to the period of time a task spends on the runqueue but doesn't get scheduled yet, right after wakeup. Note that tasks that were preempted (i.e. not woken up) and are still on the runqueue do not get this benefit. This change closes one of the last holes in the dynamic priority estimation; it should result in interactive tasks getting more priority under heavy load. This change also fixes the test-starve.c testcase from David Mosberger. The TSC-based scheduler clock is disabled on ia32 NUMA platforms (i.e. platforms that have unsynched TSCs for sure). Those platforms should provide the proper code to rely on the TSC in a global way. (No such infrastructure exists at the moment - the monotonic TSC-based clock doesn't deal with TSC offsets either, as far as I can tell.) --- arch/i386/kernel/smpboot.c | 4 +- arch/i386/kernel/timers/timer_tsc.c | 24 ++++ fs/proc/array.c | 5 +- include/linux/sched.h | 5 +- kernel/fork.c | 2 +- kernel/sched.c | 222 +++++++++++++++++++++++++++--------- 6 files changed, 201 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index a78d1cc88a1f..2d034f0374cc 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -915,13 +915,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); printk("task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + cache_decay_ticks); } /* diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index af912f4d2cf1..f8b3c918a955 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -127,6 +127,30 @@ static unsigned long long monotonic_clock_tsc(void) return base + cycles_2_ns(this_offset - last_offset); } +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * In the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs.
+ */ +#ifndef CONFIG_NUMA + if (unlikely(!cpu_has_tsc)) +#endif + return (unsigned long long)jiffies * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + static void mark_offset_tsc(void) { unsigned long lost,delay; diff --git a/fs/proc/array.c b/fs/proc/array.c index 4b9ec914d60f..bf39eb5013b0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -154,13 +154,16 @@ static inline char * task_state(struct task_struct *p, char *buffer) read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), p->tgid, + get_task_state(p), + (p->sleep_avg/1024)*100/(1000000000/1024), + p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, diff --git a/include/linux/sched.h b/include/linux/sched.h index 9878dbef500a..d2d412d96bea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -342,7 +342,8 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; - unsigned long last_run; + unsigned long long timestamp; + int activated; unsigned long policy; cpumask_t cpus_allowed; @@ -506,6 +507,8 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) } #endif +extern unsigned long long sched_clock(void); + #ifdef CONFIG_NUMA extern void sched_balance_exec(void); extern void node_nr_running_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index 50535a16c71e..38badc50bebc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -925,7 +925,7 @@ struct task_struct *copy_process(unsigned long clone_flags, */ p->first_time_slice = 1; current->time_slice >>= 1; - p->last_run = jiffies; + p->timestamp = sched_clock(); if (!current->time_slice) { /* * This case is rare, it happens when the parent has only diff --git a/kernel/sched.c b/kernel/sched.c index 1c5802ceedae..966dfe516ec5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -68,13 +68,15 @@ */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 +#define TIMESLICE_GRANULARITY (HZ/40 ?: 1) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 #define PARENT_PENALTY 100 #define EXIT_WEIGHT 3 #define PRIO_BONUS_RATIO 25 #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) +#define MAX_SLEEP_AVG (1*1000000000) +#define STARVATION_LIMIT HZ #define NODE_THRESHOLD 125 /* @@ -115,6 +117,11 @@ #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio || \ + ((p)->prio == (rq)->curr->prio && \ + (p)->time_slice > (rq)->curr->time_slice * 2)) + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. 
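/*
 * Aside (not part of the patch): the SleepAVG line added to
 * fs/proc/array.c above computes a percentage from the now
 * nanosecond-resolution sleep_avg.  A straight sleep_avg * 100 would
 * overflow a 32-bit unsigned long (sleep_avg can reach 10^9), so both
 * operands are pre-divided by 1024 to keep the intermediate product
 * within 32 bits.  A hypothetical stand-alone helper showing the same
 * arithmetic:
 */
static unsigned long sleep_avg_percent(unsigned long sleep_avg_ns)
{
	/* equals sleep_avg_ns * 100 / 10^9, to within rounding */
	return (sleep_avg_ns / 1024) * 100 / (1000000000 / 1024);
}
/*
 * The effective_prio() hunk just below applies the same /1024 scaling
 * when turning sleep_avg into a dynamic-priority bonus.
 */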
@@ -318,8 +325,8 @@ static int effective_prio(task_t *p) if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100; + bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) @@ -338,24 +345,24 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) nr_running_inc(rq); } -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) - */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void recalc_task_prio(task_t *p, unsigned long long now) { - long sleep_time = jiffies - p->last_run - 1; + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > MAX_SLEEP_AVG) + sleep_time = MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; if (sleep_time > 0) { - int sleep_avg; + unsigned long long sleep_avg; /* * This code gives a bonus to interactive tasks. * * The boost works by updating the 'average sleep time' - * value here, based on ->last_run. The more time a task + * value here, based on ->timestamp. The more time a task * spends sleeping, the higher the average gets - and the * higher the priority boost gets as well. */ @@ -374,6 +381,37 @@ static inline void activate_task(task_t *p, runqueue_t *rq) p->prio = effective_prio(p); } } +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long long now = sched_clock(); + + recalc_task_prio(p, now); + + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue time, + * but it will be weighted down: + */ + p->activated = 1; + p->timestamp = now; + __activate_task(p, rq); } @@ -500,7 +538,7 @@ repeat_lock_task: __activate_task(p, rq); else { activate_task(p, rq); - if (p->prio < rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } success = 1; @@ -549,8 +587,8 @@ void wake_up_forked_process(task_t * p) * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. */ - current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; - p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY; + p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -591,8 +629,7 @@ void sched_exit(task_t * p) * the sleep_avg of the parent as well. 
*/ if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + - p->sleep_avg) / (EXIT_WEIGHT + 1); + p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1); } /** @@ -994,13 +1031,8 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (p->prio < this_rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, this_rq)) set_need_resched(); - else { - if (p->prio == this_rq->curr->prio && - p->time_slice > this_rq->curr->time_slice) - set_need_resched(); - } } /* @@ -1017,12 +1049,14 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) runqueue_t *busiest; prio_array_t *array; struct list_head *head, *curr; + unsigned long long now; task_t *tmp; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; + now = sched_clock(); /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1063,7 +1097,7 @@ skip_queue: */ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \ + ((idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\ !task_running(rq, p) && \ cpu_isset(this_cpu, (p)->cpus_allowed)) @@ -1180,8 +1214,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) + (jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT))) /* * This function gets called by the timer code, with HZ frequency. @@ -1231,14 +1264,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) spin_lock(&rq->lock); /* * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. */ - if (p->sleep_avg) - p->sleep_avg--; if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -1262,12 +1292,33 @@ void scheduler_tick(int user_ticks, int sys_ticks) p->time_slice = task_timeslice(p); p->first_time_slice = 0; + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. 
+ */ + if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1286,6 +1337,8 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; + unsigned long long now; + unsigned long run_time; int idx; /* @@ -1306,7 +1359,11 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - prev->last_run = jiffies; + now = sched_clock(); + if (likely(now - prev->timestamp < MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = MAX_SLEEP_AVG; spin_lock_irq(&rq->lock); /* @@ -1356,12 +1413,30 @@ pick_next_task: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (next->activated) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + next->activated = 0; + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg < 0) + prev->sleep_avg = 0; + prev->timestamp = now; + if (likely(prev != next)) { + next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -1601,6 +1676,7 @@ void set_user_nice(task_t *p, long nice) unsigned long flags; prio_array_t *array; runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -1609,6 +1685,12 @@ void set_user_nice(task_t *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ if (rt_task(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; @@ -1616,16 +1698,20 @@ void set_user_nice(task_t *p, long nice) array = p->array; if (array) dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio = NICE_TO_PRIO(nice); + p->prio += delta; + if (array) { enqueue_task(p, array); /* - * If the task is running and lowered its priority, - * or increased its priority then reschedule its CPU: + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) + if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -2382,6 +2468,12 @@ static void move_task_away(struct task_struct *p, int dest_cpu) local_irq_restore(flags); } +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -2391,20 +2483,21 @@ static int migration_thread(void * data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - int cpu = (long) data; + migration_startup_t *startup = data; + int cpu = startup->cpu; runqueue_t *rq; int ret; + startup->task = current; + complete(&startup->startup_done); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + BUG_ON(smp_processor_id() != cpu); + daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - /* - * Either we are running on the right CPU, or there's a a - * migration thread on this CPU, guaranteed (we're started - * serially). - */ - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); @@ -2440,13 +2533,30 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + migration_startup_t startup; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + + printk("Starting migration thread for cpu %li\n", cpu); + + startup.cpu = cpu; + startup.task = NULL; + init_completion(&startup.startup_done); + + kernel_thread(migration_thread, &startup, CLONE_KERNEL); + wait_for_completion(&startup.startup_done); + wait_task_inactive(startup.task); + + startup.task->thread_info->cpu = cpu; + startup.task->cpus_allowed = cpumask_of_cpu(cpu); + + wake_up_process(startup.task); + + while (!cpu_rq(cpu)->migration_thread) yield(); + break; } return NOTIFY_OK; -- cgit v1.2.3 From 875ee1e15641e6662264f9279b12cba1f1d3c4fc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:38:16 -0700 Subject: [PATCH] CPU scheduler balancing fix From: Nick Piggin The patch changes the imbalance required before a balance to 25% from 50% - as the comments intend. It also changes a case where the balancing wouldn't be done if the imbalance was >= 25% but only 1 task difference. 
The downside of the second change is that one task may bounce from one cpu to another for some loads. This will only bounce once every 200ms, so it shouldn't be a big problem. (Benchmarking results are basically a wash - SDET is increased maybe 0.5%.) --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 966dfe516ec5..38fa1ac0f82b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -995,10 +995,10 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, if (likely(!busiest)) goto out; - *imbalance = (max_load - nr_running) / 2; + *imbalance = max_load - nr_running; /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && (*imbalance < (max_load + 3)/4)) { + if (!idle && ((*imbalance)*4 < max_load)) { busiest = NULL; goto out; } @@ -1008,7 +1008,7 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, * Make sure nothing changed since we checked the * runqueue length. */ - if (busiest->nr_running <= nr_running + 1) { + if (busiest->nr_running <= nr_running) { spin_unlock(&busiest->lock); busiest = NULL; } @@ -1057,6 +1057,12 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) goto out; now = sched_clock(); + /* + * We only want to steal a number of tasks equal to 1/2 the imbalance, + * otherwise we'll just shift the imbalance to the new queue: + */ + imbalance /= 2; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to -- cgit v1.2.3 From 2cf13d58dbc43af21ad54ccbf1081f0757818b71 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:38:24 -0700 Subject: [PATCH] CPU scheduler interactivity changes From: Con Kolivas Interactivity scheduler tweaks on top of Ingo's A3 interactivity patch. Interactive credit added to task struct to find truly interactive tasks and treat them differently. Extra #defines included as helpers for conversion to/from nanosecond timing, to work out an average timeslice for nice 0 tasks, and the effective dynamic priority bonuses that will be given to tasks. MAX_SLEEP_AVG modified to change dynamic priority by one for a nice 0 task sleeping or running for one full timeslice. CREDIT_LIMIT is the number of times a task earns sleep_avg over MAX_SLEEP_AVG before it is considered HIGH_CREDIT (truly interactive); -CREDIT_LIMIT is LOW_CREDIT. TIMESLICE_GRANULARITY is modified to be more frequent for more interactive tasks (10 ms for the top 2 dynamic priorities, then halving for each priority below that) and less frequent per extra cpu. JUST_INTERACTIVE_SLEEP logic created to be a sleep_avg consistent with giving a task enough dynamic priority to remain on the active array. Task preemption of equal priority tasks is dropped as requeuing with TIMESLICE_GRANULARITY makes this unnecessary. Dynamic priority bonus simplified. User tasks that sleep a long time and are not waking from uninterruptible sleep are sought and categorised as idle. Their sleep avg is limited in its rise to prevent them becoming high priority and suddenly turning into cpu hogs. Bonus for sleeping is proportionately higher the lower the dynamic priority of a task is; this allows for very rapid escalation to interactive status. Tasks that are LOW_CREDIT are limited in rise per sleep to one priority level.
Non HIGH_CREDIT tasks waking from uninterruptible sleep are sought to detect cpu hogs waiting on I/O, and their sleep_avg rise is limited to just interactive state to prevent cpu bound tasks from becoming interactive during I/O wait. Tasks that earn sleep_avg over MAX_SLEEP_AVG get interactive credits. The on-runqueue bonus is not given to non HIGH_CREDIT tasks waking from uninterruptible sleep. Forked tasks and their parents get sleep_avg limited to the minimum necessary to maintain their effective dynamic priority, thus preventing repeated forking from being a way to get highly interactive while not penalising them noticeably otherwise. CAN_MIGRATE_TASK cleaned up and modified to work with nanosecond timestamps. Reverted Ingo's A3 starvation limit change - it was making interactive tasks suffer more under increasing load. If a cpu is grossly overloaded and everyone is going to starve, it may as well run interactive tasks preferentially. Task requeuing is limited to interactive tasks only (cpu bound tasks don't need low latency and derive benefit from longer timeslices), and they must have at least TIMESLICE_GRANULARITY remaining. HIGH_CREDIT tasks get penalised less sleep_avg the more interactive they are, thus keeping them interactive for bursts; but if they become sustained cpu hogs they will slide increasingly rapidly down the dynamic priority scale. Tasks that run out of sleep_avg, are still using up cpu time, and are not yet high or low credit get penalised interactive credits, to determine LOW_CREDIT tasks (cpu bound ones). --- include/linux/sched.h | 1 + kernel/sched.c | 252 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 189 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2d412d96bea..1618ae7f42d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -342,6 +342,7 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; + long interactive_credit; unsigned long long timestamp; int activated; diff --git a/kernel/sched.c b/kernel/sched.c index 38fa1ac0f82b..875a7dcd2e55 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -14,6 +14,7 @@ * an array-switch method of distributing timeslices * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas.
*/ #include @@ -58,6 +59,14 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ + (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) /* * These are the 'tuning knobs' of the scheduler: @@ -68,16 +77,18 @@ */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) -#define TIMESLICE_GRANULARITY (HZ/40 ?: 1) #define ON_RUNQUEUE_WEIGHT 30 #define CHILD_PENALTY 95 #define PARENT_PENALTY 100 #define EXIT_WEIGHT 3 #define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (1*1000000000) -#define STARVATION_LIMIT HZ +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) #define NODE_THRESHOLD 125 +#define CREDIT_LIMIT 100 /* * If a task is 'interactive' then we reinsert it in the active @@ -107,6 +118,19 @@ * too hard. */ +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + #define SCALE(v1,v1_max,v2_max) \ (v1) * (v2_max) / (v1_max) @@ -117,10 +141,18 @@ #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) +#define JUST_INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) + +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) + #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio || \ - ((p)->prio == (rq)->curr->prio && \ - (p)->time_slice > (rq)->curr->time_slice * 2)) + ((p)->prio < (rq)->curr->prio) /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] @@ -325,8 +357,7 @@ static int effective_prio(task_t *p) if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100; - bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) @@ -350,37 +381,75 @@ static void recalc_task_prio(task_t *p, unsigned long long now) unsigned long long __sleep_time = now - p->timestamp; unsigned long sleep_time; - if (__sleep_time > MAX_SLEEP_AVG) - sleep_time = MAX_SLEEP_AVG; + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; else sleep_time = (unsigned long)__sleep_time; - if (sleep_time > 0) { - unsigned long long sleep_avg; - + if (likely(sleep_time > 0)) { /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a task - * spends sleeping, the higher the average gets - and the - * higher the priority boost gets as well. 
+ * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. */ - sleep_avg = p->sleep_avg + sleep_time; + if (p->mm && p->activated != -1 && + sleep_time > JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + AVG_TIMESLICE); + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - /* - * 'Overflow' bonus ticks go to the waker as well, so the - * ticks are not lost. This has the effect of further - * boosting tasks that are related to maximum-interactive - * tasks. - */ - if (sleep_avg > MAX_SLEEP_AVG) - sleep_avg = MAX_SLEEP_AVG; - if (p->sleep_avg != sleep_avg) { - p->sleep_avg = sleep_avg; - p->prio = effective_prio(p); + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. + */ + if (LOW_CREDIT(p) && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = + JIFFIES_TO_NS(task_timeslice(p)); + + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O + */ + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ + if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = + JUST_INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG){ + p->sleep_avg = NS_MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } } } + + p->prio = effective_prio(p); } /* @@ -396,20 +465,26 @@ static inline void activate_task(task_t *p, runqueue_t *rq) recalc_task_prio(p, now); /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else - /* - * Normal first-time wakeups get a credit too for on-runqueue time, - * but it will be weighted down: + * This checks to make sure it's not an uninterruptible task + * that is now waking up. */ - p->activated = 1; + if (!p->activated){ + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. 
So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: + */ + p->activated = 1; + } p->timestamp = now; __activate_task(p, rq); @@ -532,8 +607,14 @@ repeat_lock_task: task_rq_unlock(rq, &flags); goto repeat_lock_task; } - if (old_state == TASK_UNINTERRUPTIBLE) + if (old_state == TASK_UNINTERRUPTIBLE){ rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } if (sync) __activate_task(p, rq); else { @@ -587,8 +668,14 @@ void wake_up_forked_process(task_t * p) * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. */ - current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY; - p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY; + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->interactive_credit = 0; + p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -629,7 +716,9 @@ void sched_exit(task_t * p) * the sleep_avg of the parent as well. */ if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1); + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); } /** @@ -1035,6 +1124,29 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t set_need_resched(); } +/* + * Previously: + * + * #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + * ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \ + * cache_decay_ticks)) && !task_running(rq, p) && \ + * cpu_isset(this_cpu, (p)->cpus_allowed)) + */ + +static inline int +can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +{ + unsigned long delta = sched_clock() - tsk->timestamp; + + if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + return 0; + if (task_running(rq, tsk)) + return 0; + if (!cpu_isset(this_cpu, tsk->cpus_allowed)) + return 0; + return 1; +} + /* * Current runqueue is empty, or rebalance tick: if there is an * inbalance (current runqueue is too short) then pull from @@ -1049,14 +1161,12 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) runqueue_t *busiest; prio_array_t *array; struct list_head *head, *curr; - unsigned long long now; task_t *tmp; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; - now = sched_clock(); /* * We only want to steal a number of tasks equal to 1/2 the imbalance, * otherwise we'll just shift the imbalance to the new queue: @@ -1102,14 +1212,9 @@ skip_queue: * 3) are cache-hot on their current CPU. 
*/ -#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\ - !task_running(rq, p) && \ - cpu_isset(this_cpu, (p)->cpus_allowed)) - curr = curr->prev; - if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { if (curr != head) goto skip_queue; idx++; @@ -1220,7 +1325,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT))) + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) /* * This function gets called by the timer code, with HZ frequency. @@ -1317,9 +1423,15 @@ void scheduler_tick(int user_ticks, int sys_ticks) * requeue this task to the end of the list on this priority * level, which is in essence a round-robin of tasks with * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. */ - if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) && - (p->array == rq->active)) { + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -1366,10 +1478,19 @@ need_resched: release_kernel_lock(prev); now = sched_clock(); - if (likely(now - prev->timestamp < MAX_SLEEP_AVG)) + if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp; else - run_time = MAX_SLEEP_AVG; + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks with interactive credits get charged less run_time + * at high sleep_avg to delay them losing their interactive + * status + */ + if (HIGH_CREDIT(prev)) + run_time /= (CURRENT_BONUS(prev) ? : 1); + spin_lock_irq(&rq->lock); /* @@ -1419,26 +1540,29 @@ pick_next_task: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (next->activated) { + if (next->activated > 0) { unsigned long long delta = now - next->timestamp; if (next->activated == 1) delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - next->activated = 0; array = next->array; dequeue_task(next, array); recalc_task_prio(next, next->timestamp + delta); enqueue_task(next, array); } + next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg < 0) + if ((long)prev->sleep_avg <= 0){ prev->sleep_avg = 0; + if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) + prev->interactive_credit--; + } prev->timestamp = now; if (likely(prev != next)) { -- cgit v1.2.3 From d6dbfa23be4b356ae815f498f26c99367696135b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:38:31 -0700 Subject: [PATCH] might_sleep diagnostics might_sleep() can be triggered by either local interrupts being disabled or by elevated preempt count. Disambiguate them. 
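As a sketch of what the two conditions mean (illustrative only, not code from the patch): in_atomic() reflects an elevated preempt count, while irqs_disabled() reflects the local IRQ-enable flag, and either one makes sleeping invalid. A hypothetical helper, assuming only the 2.6-era in_atomic(), irqs_disabled(), preempt_count() and printk():

	/* Hypothetical diagnostic helper; kernel context assumed. */
	static void explain_might_sleep(const char *file, int line)
	{
		if (in_atomic())
			printk(KERN_ERR "%s:%d: elevated preempt count (%d)\n",
			       file, line, preempt_count());
		if (irqs_disabled())
			printk(KERN_ERR "%s:%d: local interrupts disabled\n",
			       file, line);
	}
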
--- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 875a7dcd2e55..f4b866b8cb44 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2809,6 +2809,8 @@ void __might_sleep(char *file, int line) prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); dump_stack(); } #endif -- cgit v1.2.3 From 98badc086ec9fd7e568dff59eb31277d21c88192 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:40:51 -0700 Subject: [PATCH] remove /proc/config_build_info From: Zwane Mwaikambo The same info is already available in /proc/version. --- kernel/configs.c | 46 ++-------------------------------------------- scripts/mkconfigs | 12 ------------ 2 files changed, 2 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index 6a5c0c9d9176..57f54451edbe 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -47,7 +47,7 @@ /**************************************************/ /* globals and useful constants */ -static const char IKCONFIG_VERSION[] = "0.6"; +static const char IKCONFIG_VERSION[] __initdata = "0.7"; static ssize_t ikconfig_read_current(struct file *file, char __user *buf, @@ -72,32 +72,6 @@ static struct file_operations ikconfig_file_ops = { .read = ikconfig_read_current, }; - -/***************************************************/ -/* build_info_show: let people read the info */ -/* we have on the tools used to build this kernel */ - -static int build_info_show(struct seq_file *seq, void *v) -{ - seq_printf(seq, - "Kernel: %s\nCompiler: %s\nVersion_in_Makefile: %s\n", - ikconfig_build_info, LINUX_COMPILER, UTS_RELEASE); - return 0; -} - -static int build_info_open(struct inode *inode, struct file *file) -{ - return single_open(file, build_info_show, PDE(inode)->data); -} - -static struct file_operations build_info_file_ops = { - .owner = THIS_MODULE, - .open = build_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /***************************************************/ /* ikconfig_init: start up everything we need to */ @@ -112,26 +86,12 @@ static int __init ikconfig_init(void) entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, &proc_root); if (!entry) - goto leave; + return -ENOMEM; entry->proc_fops = &ikconfig_file_ops; entry->size = kernel_config_data_size; - /* create the "build_info" file */ - entry = create_proc_entry("config_build_info", - S_IFREG | S_IRUGO, &proc_root); - if (!entry) - goto leave_gz; - entry->proc_fops = &build_info_file_ops; - return 0; - -leave_gz: - /* remove the file from proc */ - remove_proc_entry("config.gz", &proc_root); - -leave: - return -ENOMEM; } /***************************************************/ @@ -139,9 +99,7 @@ leave: static void __exit ikconfig_cleanup(void) { - /* remove the files */ remove_proc_entry("config.gz", &proc_root); - remove_proc_entry("config_build_info", &proc_root); } module_init(ikconfig_init); diff --git a/scripts/mkconfigs b/scripts/mkconfigs index fc9c3dd978bc..a3166274ebc3 100755 --- a/scripts/mkconfigs +++ b/scripts/mkconfigs @@ -25,12 +25,6 @@ # - Retain lines that begin with "# CONFIG_" # - lines that use double-quotes must \\-escape-quote them - -kernel_version() -{ - KERNVER="`grep VERSION $1 | head -1 | cut -f3 -d' '`.`grep PATCHLEVEL $1 | head -1 | cut -f3 -d' '`.`grep SUBLEVEL $1 | head -1 | cut -f3 -d' 
'``grep EXTRAVERSION $1 | head -1 | cut -f3 -d' '`" -} - if [ $# -lt 2 ] then echo "Usage: `basename $0` " @@ -66,12 +60,6 @@ echo \ * */" -echo "#ifdef CONFIG_IKCONFIG_PROC" -echo "static char const ikconfig_build_info[] =" -echo " \"`uname -s` `uname -r` `uname -v` `uname -m`\";" -echo "#endif" -echo -kernel_version $makefile echo "static char const ikconfig_config[] __attribute__((unused)) = " echo "\"CONFIG_BEGIN=n\\n\\" echo "`cat $config | sed 's/\"/\\\\\"/g' | grep "^#\? \?CONFIG_" | awk '{ print $0 "\\\\n\\\\" }' `" -- cgit v1.2.3 From e9d256288940c58e349ab7c2b6620c1ae3ad5a5a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:41:21 -0700 Subject: [PATCH] Export new char dev functions From: Jonathan Corbet Nobody told me that the failure to export these (like their block counterparts) was anything but an oversight; modules will not be able to use larger device numbers without them. So...this patch exports the new char device functions. --- fs/char_dev.c | 15 +++++++++++++++ kernel/ksyms.c | 2 -- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/char_dev.c b/fs/char_dev.c index 44303f85e624..48f5a6791bec 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -445,3 +445,18 @@ void __init chrdev_init(void) kset_register(&kset_dynamic); cdev_map = kobj_map_init(base_probe, &cdev_subsys); } + + +/* Let modules do char dev stuff */ +EXPORT_SYMBOL(register_chrdev_region); +EXPORT_SYMBOL(unregister_chrdev_region); +EXPORT_SYMBOL(alloc_chrdev_region); +EXPORT_SYMBOL(cdev_init); +EXPORT_SYMBOL(cdev_alloc); +EXPORT_SYMBOL(cdev_get); +EXPORT_SYMBOL(cdev_put); +EXPORT_SYMBOL(cdev_del); +EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_unmap); +EXPORT_SYMBOL(register_chrdev); +EXPORT_SYMBOL(unregister_chrdev); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 9f61a0496c2a..9da2940ac0e6 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -348,8 +348,6 @@ EXPORT_SYMBOL(lock_page); EXPORT_SYMBOL(unlock_page); /* device registration */ -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); EXPORT_SYMBOL(register_blkdev); EXPORT_SYMBOL(unregister_blkdev); EXPORT_SYMBOL(tty_register_driver); -- cgit v1.2.3 From 58e1dd1e026c459bd53ad1de7b92c00b5887f0a9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:41:47 -0700 Subject: [PATCH] Incorrect value for SIGRTMAX From: Corey Minyard I was having a problem with signals with POSIX timers, and it turns out that the value of SIGRTMAX is incorrect. Remember that there is no signal 0, so the signals should go from 1-_NSIG. However, SIGRTMAX is defined as (_NSIG-1) in all architectures. The following patch fixes this. This define is only used in drivers/usb/core/devio.c and kernel/posix-timers.c, and both are incorrect without this fix. There's also no check for zero in posix-timers.c, that fix is part of the diff. Also, shouldn't do_sigaction() use this value instead of _NSIG? It's not a big deal, but some architectures have different values for _NSIG and SIGRTMAX. 
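To make the off-by-one concrete, here is a small illustrative check (not code from the patch; the values assume the common _NSIG == 64 layout, with NSIG_DEMO standing in for _NSIG). There is no signal 0, so valid signals run from 1 to _NSIG, and the old definition silently rejects the highest real-time signal:

	#include <stdio.h>

	#define NSIG_DEMO     64              /* stand-in for _NSIG */
	#define SIGRTMAX_OLD  (NSIG_DEMO - 1) /* 63: excludes the last signal */
	#define SIGRTMAX_NEW  NSIG_DEMO       /* 64: signals are 1.._NSIG */

	/* A valid signal number is 1..rtmax; there is no signal 0. */
	static int sig_valid(int sig, int rtmax)
	{
		return sig >= 1 && sig <= rtmax;
	}

	int main(void)
	{
		printf("signal 64 valid (old): %d\n",
		       sig_valid(64, SIGRTMAX_OLD));  /* prints 0 */
		printf("signal 64 valid (new): %d\n",
		       sig_valid(64, SIGRTMAX_NEW));  /* prints 1 */
		return 0;
	}
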
--- include/asm-alpha/signal.h | 2 +- include/asm-arm/signal.h | 2 +- include/asm-arm26/signal.h | 2 +- include/asm-cris/signal.h | 2 +- include/asm-h8300/signal.h | 2 +- include/asm-i386/signal.h | 2 +- include/asm-ia64/signal.h | 2 +- include/asm-m68k/signal.h | 2 +- include/asm-m68knommu/signal.h | 2 +- include/asm-mips/signal.h | 2 +- include/asm-parisc/signal.h | 2 +- include/asm-ppc/signal.h | 2 +- include/asm-ppc64/signal.h | 2 +- include/asm-s390/signal.h | 2 +- include/asm-sh/signal.h | 2 +- include/asm-sparc/signal.h | 2 +- include/asm-sparc64/signal.h | 2 +- include/asm-v850/signal.h | 2 +- include/asm-x86_64/signal.h | 2 +- kernel/posix-timers.c | 1 + 20 files changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/asm-alpha/signal.h b/include/asm-alpha/signal.h index 27282f335de4..e24248f01340 100644 --- a/include/asm-alpha/signal.h +++ b/include/asm-alpha/signal.h @@ -71,7 +71,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-arm/signal.h b/include/asm-arm/signal.h index eb59aa539b9c..6d623e24c0fb 100644 --- a/include/asm-arm/signal.h +++ b/include/asm-arm/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG #define SIGSWI 32 diff --git a/include/asm-arm26/signal.h b/include/asm-arm26/signal.h index e166988dbd5c..6f62e51a2e5a 100644 --- a/include/asm-arm26/signal.h +++ b/include/asm-arm26/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG #define SIGSWI 32 diff --git a/include/asm-cris/signal.h b/include/asm-cris/signal.h index 1335bf27d8e2..3f187ec4800a 100644 --- a/include/asm-cris/signal.h +++ b/include/asm-cris/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-h8300/signal.h b/include/asm-h8300/signal.h index 460d8a6f69f4..6612725c2297 100644 --- a/include/asm-h8300/signal.h +++ b/include/asm-h8300/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-i386/signal.h b/include/asm-i386/signal.h index 80b7d5fdef9b..e3397cd6f77b 100644 --- a/include/asm-i386/signal.h +++ b/include/asm-i386/signal.h @@ -70,7 +70,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-ia64/signal.h b/include/asm-ia64/signal.h index f98d3bb65a92..5744b56571ba 100644 --- a/include/asm-ia64/signal.h +++ b/include/asm-ia64/signal.h @@ -50,7 +50,7 @@ /* These should not be considered constants from userland. 
*/ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-m68k/signal.h b/include/asm-m68k/signal.h index dfb43563e1cc..8d9c02cafc01 100644 --- a/include/asm-m68k/signal.h +++ b/include/asm-m68k/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-m68knommu/signal.h b/include/asm-m68knommu/signal.h index 4b4f4b4058f1..486cbb0dc088 100644 --- a/include/asm-m68knommu/signal.h +++ b/include/asm-m68knommu/signal.h @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-mips/signal.h b/include/asm-mips/signal.h index 40d73cc2a1e0..994987db61be 100644 --- a/include/asm-mips/signal.h +++ b/include/asm-mips/signal.h @@ -59,7 +59,7 @@ typedef unsigned long old_sigset_t; /* at least 32 bits */ /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-parisc/signal.h b/include/asm-parisc/signal.h index 5ab02aef6dea..cd4beefef333 100644 --- a/include/asm-parisc/signal.h +++ b/include/asm-parisc/signal.h @@ -42,7 +42,7 @@ /* These should not be considered constants from userland. */ #define SIGRTMIN 37 -#define SIGRTMAX (_NSIG-1) /* it's 44 under HP/UX */ +#define SIGRTMAX _NSIG /* it's 44 under HP/UX */ /* * SA_FLAGS values: diff --git a/include/asm-ppc/signal.h b/include/asm-ppc/signal.h index f692baff3a44..b0528fcbe985 100644 --- a/include/asm-ppc/signal.h +++ b/include/asm-ppc/signal.h @@ -61,7 +61,7 @@ typedef struct { /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-ppc64/signal.h b/include/asm-ppc64/signal.h index 97ed18c9bb60..d0baf44cc89a 100644 --- a/include/asm-ppc64/signal.h +++ b/include/asm-ppc64/signal.h @@ -57,7 +57,7 @@ typedef struct { /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-s390/signal.h b/include/asm-s390/signal.h index fe2263b3ee0d..f273cdcd1cf6 100644 --- a/include/asm-s390/signal.h +++ b/include/asm-s390/signal.h @@ -78,7 +78,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-sh/signal.h b/include/asm-sh/signal.h index 2f8118eb0cb5..51a5f0691ee5 100644 --- a/include/asm-sh/signal.h +++ b/include/asm-sh/signal.h @@ -57,7 +57,7 @@ typedef struct { /* These should not be considered constants from userland. 
*/ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-sparc/signal.h b/include/asm-sparc/signal.h index 4e455b7add4a..6813baae27c6 100644 --- a/include/asm-sparc/signal.h +++ b/include/asm-sparc/signal.h @@ -89,7 +89,7 @@ #define _NSIG_WORDS (__NEW_NSIG / _NSIG_BPW) #define SIGRTMIN 32 -#define SIGRTMAX (__NEW_NSIG - 1) +#define SIGRTMAX __NEW_NSIG #if defined(__KERNEL__) || defined(__WANT_POSIX1B_SIGNALS__) #define _NSIG __NEW_NSIG diff --git a/include/asm-sparc64/signal.h b/include/asm-sparc64/signal.h index 63ac2e389367..f2101925bb74 100644 --- a/include/asm-sparc64/signal.h +++ b/include/asm-sparc64/signal.h @@ -89,7 +89,7 @@ #define _NSIG_WORDS (__NEW_NSIG / _NSIG_BPW) #define SIGRTMIN 32 -#define SIGRTMAX (__NEW_NSIG - 1) +#define SIGRTMAX __NEW_NSIG #if defined(__KERNEL__) || defined(__WANT_POSIX1B_SIGNALS__) #define _NSIG __NEW_NSIG diff --git a/include/asm-v850/signal.h b/include/asm-v850/signal.h index c33b1db71749..407db875899c 100644 --- a/include/asm-v850/signal.h +++ b/include/asm-v850/signal.h @@ -71,7 +71,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/include/asm-x86_64/signal.h b/include/asm-x86_64/signal.h index bd1f350f1ba6..21c4bf716666 100644 --- a/include/asm-x86_64/signal.h +++ b/include/asm-x86_64/signal.h @@ -77,7 +77,7 @@ typedef unsigned long sigset_t; /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX (_NSIG-1) +#define SIGRTMAX _NSIG /* * SA_FLAGS values: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d9be410a9e62..64940545cb84 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -344,6 +344,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event) return NULL; if ((event->sigev_notify & ~SIGEV_NONE & MIPS_SIGEV) && + event->sigev_signo && ((unsigned) (event->sigev_signo > SIGRTMAX))) return NULL; -- cgit v1.2.3 From 1cfc080ae153e2dccf12b965c6e2da8cc996f18f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 21 Sep 2003 01:42:31 -0700 Subject: [PATCH] Handle init_new_context failures From: Anton Blanchard If init_new_context fails we definitely do not want to call mmput, because that will call destroy_context against an uninitialised context. Instead we should back out what we did in init_mm. Fixes some weird failures on ppc64 when running a fork bomb. --- kernel/fork.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 38badc50bebc..f2d3115483da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -305,7 +305,7 @@ out: return retval; fail_nomem: retval = -ENOMEM; - fail: +fail: vm_unacct_memory(charge); goto out; } @@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) goto fail_nomem; if (init_new_context(tsk,mm)) - goto free_pt; + goto fail_nocontext; retval = dup_mmap(mm, oldmm); if (retval) @@ -514,6 +514,15 @@ free_pt: mmput(mm); fail_nomem: return retval; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + mm_free_pgd(mm); + free_mm(mm); + return retval; } static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) -- cgit v1.2.3
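To summarise the unwinding rule the patch establishes: only call mmput() (and therefore destroy_context()) once init_new_context() has actually succeeded; before that point, free the mm by hand. A condensed, illustrative sketch follows; it borrows the fork.c helper names (init_new_context(), dup_mmap(), mm_free_pgd(), free_mm(), mmput()) but is not a drop-in copy_mm():

	/* Illustrative only: the unwind order for a half-built mm. */
	static int copy_mm_sketch(struct task_struct *tsk, struct mm_struct *mm)
	{
		int retval;

		if (init_new_context(tsk, mm)) {
			/*
			 * Context never initialised: tear down by hand,
			 * since mmput() would run destroy_context().
			 */
			mm_free_pgd(mm);
			free_mm(mm);
			return -ENOMEM;
		}

		retval = dup_mmap(mm, current->mm);
		if (retval) {
			/* The context exists now, so full mmput() is safe. */
			mmput(mm);
			return retval;
		}
		return 0;
	}
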