Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/configs.c       |  46
 -rw-r--r--  kernel/exit.c          |  24
 -rw-r--r--  kernel/fork.c          |  17
 -rw-r--r--  kernel/ksyms.c         |   2
 -rw-r--r--  kernel/pid.c           |   4
 -rw-r--r--  kernel/posix-timers.c  |   1
 -rw-r--r--  kernel/sched.c         | 391
 -rw-r--r--  kernel/signal.c        |   4
 -rw-r--r--  kernel/sys.c           |  17
 9 files changed, 357 insertions(+), 149 deletions(-)
diff --git a/kernel/configs.c b/kernel/configs.c
index 6a5c0c9d9176..57f54451edbe 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -47,7 +47,7 @@
 /**************************************************/
 /* globals and useful constants                   */
 
-static const char IKCONFIG_VERSION[] = "0.6";
+static const char IKCONFIG_VERSION[] __initdata = "0.7";
 
 static ssize_t
 ikconfig_read_current(struct file *file, char __user *buf,
@@ -72,32 +72,6 @@ static struct file_operations ikconfig_file_ops = {
        .read = ikconfig_read_current,
 };
 
-
-/***************************************************/
-/* build_info_show: let people read the info       */
-/* we have on the tools used to build this kernel  */
-
-static int build_info_show(struct seq_file *seq, void *v)
-{
-       seq_printf(seq,
-                  "Kernel: %s\nCompiler: %s\nVersion_in_Makefile: %s\n",
-                  ikconfig_build_info, LINUX_COMPILER, UTS_RELEASE);
-       return 0;
-}
-
-static int build_info_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, build_info_show, PDE(inode)->data);
-}
-
-static struct file_operations build_info_file_ops = {
-       .owner = THIS_MODULE,
-       .open = build_info_open,
-       .read = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
-};
-
 /***************************************************/
 /* ikconfig_init: start up everything we need to */
@@ -112,26 +86,12 @@ static int __init ikconfig_init(void)
        entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
                                  &proc_root);
        if (!entry)
-               goto leave;
+               return -ENOMEM;
 
        entry->proc_fops = &ikconfig_file_ops;
        entry->size = kernel_config_data_size;
 
-       /* create the "build_info" file */
-       entry = create_proc_entry("config_build_info",
-                                 S_IFREG | S_IRUGO, &proc_root);
-       if (!entry)
-               goto leave_gz;
-       entry->proc_fops = &build_info_file_ops;
-
        return 0;
-
-leave_gz:
-       /* remove the file from proc */
-       remove_proc_entry("config.gz", &proc_root);
-
-leave:
-       return -ENOMEM;
 }
 
 /***************************************************/
@@ -139,9 +99,7 @@ leave:
 
 static void __exit ikconfig_cleanup(void)
 {
-       /* remove the files */
        remove_proc_entry("config.gz", &proc_root);
-       remove_proc_entry("config_build_info", &proc_root);
 }
 
 module_init(ikconfig_init);
diff --git a/kernel/exit.c b/kernel/exit.c
index b6174f82adf9..c565fd69d559 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -152,7 +152,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
                    || p->state >= TASK_ZOMBIE
                    || p->real_parent->pid == 1)
                        continue;
-               if (p->real_parent->pgrp != pgrp
+               if (process_group(p->real_parent) != pgrp
                    && p->real_parent->session == p->session) {
                        ret = 0;
                        break;
@@ -247,9 +247,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
                curr->session = session;
                attach_pid(curr, PIDTYPE_SID, session);
        }
-       if (curr->pgrp != pgrp) {
+       if (process_group(curr) != pgrp) {
                detach_pid(curr, PIDTYPE_PGID);
-               curr->pgrp = pgrp;
+               curr->group_leader->__pgrp = pgrp;
                attach_pid(curr, PIDTYPE_PGID, pgrp);
        }
 }
@@ -508,9 +508,9 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
         * than we are, and it was the only connection
         * outside, so the child pgrp is now orphaned.
         */
-       if ((p->pgrp != father->pgrp) &&
+       if ((process_group(p) != process_group(father)) &&
            (p->session == father->session)) {
-               int pgrp = p->pgrp;
+               int pgrp = process_group(p);
 
                if (will_become_orphaned_pgrp(pgrp, NULL) &&
                    has_stopped_jobs(pgrp)) {
                        __kill_pg_info(SIGHUP, (void *)1, pgrp);
@@ -618,12 +618,12 @@ static void exit_notify(struct task_struct *tsk)
 
        t = tsk->real_parent;
 
-       if ((t->pgrp != tsk->pgrp) &&
+       if ((process_group(t) != process_group(tsk)) &&
            (t->session == tsk->session) &&
-           will_become_orphaned_pgrp(tsk->pgrp, tsk) &&
-           has_stopped_jobs(tsk->pgrp)) {
-               __kill_pg_info(SIGHUP, (void *)1, tsk->pgrp);
-               __kill_pg_info(SIGCONT, (void *)1, tsk->pgrp);
+           will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+           has_stopped_jobs(process_group(tsk))) {
+               __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+               __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
        }
 
        /* Let father know we died
@@ -813,10 +813,10 @@ static int eligible_child(pid_t pid, int options, task_t *p)
                if (p->pid != pid)
                        return 0;
        } else if (!pid) {
-               if (p->pgrp != current->pgrp)
+               if (process_group(p) != process_group(current))
                        return 0;
        } else if (pid != -1) {
-               if (p->pgrp != -pid)
+               if (process_group(p) != -pid)
                        return 0;
        }
diff --git a/kernel/fork.c b/kernel/fork.c
index 37d79b4e16e6..f2d3115483da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -305,7 +305,7 @@ out:
        return retval;
 fail_nomem:
        retval = -ENOMEM;
- fail:
+fail:
        vm_unacct_memory(charge);
        goto out;
 }
@@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
                goto fail_nomem;
 
        if (init_new_context(tsk,mm))
-               goto free_pt;
+               goto fail_nocontext;
 
        retval = dup_mmap(mm, oldmm);
        if (retval)
@@ -514,6 +514,15 @@ free_pt:
        mmput(mm);
 fail_nomem:
        return retval;
+
+fail_nocontext:
+       /*
+        * If init_new_context() failed, we cannot use mmput() to free the mm
+        * because it calls destroy_context()
+        */
+       mm_free_pgd(mm);
+       free_mm(mm);
+       return retval;
 }
 
 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -925,7 +934,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->first_time_slice = 1;
        current->time_slice >>= 1;
-       p->last_run = jiffies;
+       p->timestamp = sched_clock();
        if (!current->time_slice) {
                /*
                 * This case is rare, it happens when the parent has only
@@ -1004,7 +1013,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
                attach_pid(p, PIDTYPE_PID, p->pid);
                if (thread_group_leader(p)) {
                        attach_pid(p, PIDTYPE_TGID, p->tgid);
-                       attach_pid(p, PIDTYPE_PGID, p->pgrp);
+                       attach_pid(p, PIDTYPE_PGID, process_group(p));
                        attach_pid(p, PIDTYPE_SID, p->session);
                        if (p->pid)
                                __get_cpu_var(process_counts)++;
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 9f61a0496c2a..9da2940ac0e6 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -348,8 +348,6 @@ EXPORT_SYMBOL(lock_page);
 EXPORT_SYMBOL(unlock_page);
 
 /* device registration */
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
 EXPORT_SYMBOL(register_blkdev);
 EXPORT_SYMBOL(unregister_blkdev);
 EXPORT_SYMBOL(tty_register_driver);
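Throughout this commit — exit.c and fork.c above, pid.c, signal.c and sys.c below — direct reads of p->pgrp give way to a process_group() accessor, while writes now go through the thread-group leader's __pgrp field. A minimal sketch of what such an accessor looks like, inferred from the write sites in this patch (curr->group_leader->__pgrp = pgrp) rather than copied verbatim from include/linux/sched.h:

static inline pid_t process_group(struct task_struct *tsk)
{
        /* The process group now lives only on the thread-group leader,
         * so every reader must indirect through it.  Hedged sketch: the
         * real definition in the header may differ in form. */
        return tsk->group_leader->__pgrp;
}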
diff --git a/kernel/pid.c b/kernel/pid.c
index 00413e3967b9..713f54eaeda9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -250,13 +250,13 @@ void switch_exec_pids(task_t *leader, task_t *thread)
 
        attach_pid(thread, PIDTYPE_PID, thread->pid);
        attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-       attach_pid(thread, PIDTYPE_PGID, thread->pgrp);
+       attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
        attach_pid(thread, PIDTYPE_SID, thread->session);
        list_add_tail(&thread->tasks, &init_task.tasks);
 
        attach_pid(leader, PIDTYPE_PID, leader->pid);
        attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-       attach_pid(leader, PIDTYPE_PGID, leader->pgrp);
+       attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
        attach_pid(leader, PIDTYPE_SID, leader->session);
 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d9be410a9e62..64940545cb84 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -344,6 +344,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event)
                return NULL;
 
        if ((event->sigev_notify & ~SIGEV_NONE & MIPS_SIGEV) &&
+           event->sigev_signo &&
            ((unsigned) (event->sigev_signo > SIGRTMAX)))
                return NULL;
diff --git a/kernel/sched.c b/kernel/sched.c
index 9dc251a8d8a5..b35f717d1b58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -14,6 +14,7 @@
  *             an array-switch method of distributing timeslices
  *             and per-CPU runqueues.  Cleanups and useful suggestions
  *             by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03 Interactivity tuning by Con Kolivas.
  */
 
 #include <linux/mm.h>
@@ -59,6 +60,14 @@
 #define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
+#define AVG_TIMESLICE  (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
+                       (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
+
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)    ((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)    ((TIME) * (1000000000 / HZ))
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -69,14 +78,18 @@
  */
 #define MIN_TIMESLICE          ( 10 * HZ / 1000)
 #define MAX_TIMESLICE          (200 * HZ / 1000)
-#define CHILD_PENALTY          50
+#define ON_RUNQUEUE_WEIGHT     30
+#define CHILD_PENALTY          95
 #define PARENT_PENALTY         100
 #define EXIT_WEIGHT            3
 #define PRIO_BONUS_RATIO       25
+#define MAX_BONUS              (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
 #define INTERACTIVE_DELTA      2
-#define MAX_SLEEP_AVG          (10*HZ)
-#define STARVATION_LIMIT       (10*HZ)
+#define MAX_SLEEP_AVG          (AVG_TIMESLICE * MAX_BONUS)
+#define STARVATION_LIMIT       (MAX_SLEEP_AVG)
+#define NS_MAX_SLEEP_AVG       (JIFFIES_TO_NS(MAX_SLEEP_AVG))
 #define NODE_THRESHOLD         125
+#define CREDIT_LIMIT           100
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -106,6 +119,19 @@
  * too hard.
 */
 
+#define CURRENT_BONUS(p) \
+       (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+               MAX_SLEEP_AVG)
+
+#ifdef CONFIG_SMP
+#define TIMESLICE_GRANULARITY(p)       (MIN_TIMESLICE * \
+               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+                       num_online_cpus())
+#else
+#define TIMESLICE_GRANULARITY(p)       (MIN_TIMESLICE * \
+               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+#endif
+
 #define SCALE(v1,v1_max,v2_max) \
        (v1) * (v2_max) / (v1_max)
 
@@ -116,6 +142,19 @@
 #define TASK_INTERACTIVE(p) \
        ((p)->prio <= (p)->static_prio - DELTA(p))
 
+#define JUST_INTERACTIVE_SLEEP(p) \
+       (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+               (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+
+#define HIGH_CREDIT(p) \
+       ((p)->interactive_credit > CREDIT_LIMIT)
+
+#define LOW_CREDIT(p) \
+       ((p)->interactive_credit < -CREDIT_LIMIT)
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+       ((p)->prio < (rq)->curr->prio)
+
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
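A quick sanity check of the new tuning constants. The small standalone program below evaluates the macros added above; HZ = 1000 and the stock 2.6 priority layout (MAX_RT_PRIO = 100, MAX_PRIO = 140, hence MAX_USER_PRIO = 40) are this example's assumptions, not values taken from the patch:

/* Worked example of the interactivity tuning knobs from this commit.
 * Assumes HZ=1000 and the stock priority layout; both are illustrative. */
#include <stdio.h>

#define HZ              1000                    /* assumed, arch-dependent */
#define MAX_RT_PRIO     100                     /* assumed stock value */
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO   (MAX_PRIO - MAX_RT_PRIO)
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)

#define MIN_TIMESLICE   ( 10 * HZ / 1000)
#define MAX_TIMESLICE   (200 * HZ / 1000)
#define PRIO_BONUS_RATIO 25
#define MAX_BONUS       (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * \
                        (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
#define MAX_SLEEP_AVG   (AVG_TIMESLICE * MAX_BONUS)
#define JIFFIES_TO_NS(t) ((t) * (1000000000 / HZ))

int main(void)
{
        printf("MAX_BONUS        = %d\n", MAX_BONUS);            /* 10 */
        printf("AVG_TIMESLICE    = %d jiffies\n", AVG_TIMESLICE); /* 102 */
        printf("MAX_SLEEP_AVG    = %d jiffies\n", MAX_SLEEP_AVG); /* 1020 */
        printf("NS_MAX_SLEEP_AVG = %lld ns\n",
               (long long)JIFFIES_TO_NS(MAX_SLEEP_AVG));  /* ~1.02 seconds */
        return 0;
}

So sleep_avg, now tracked in nanoseconds, saturates at roughly one second under these assumptions, and the bonus scale runs from 0 to 10.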
@@ -180,7 +219,6 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
-#define rt_task(p)             ((p)->prio < MAX_RT_PRIO)
 
 /*
  * Default context-switch locking:
@@ -320,8 +358,7 @@ static int effective_prio(task_t *p)
        if (rt_task(p))
                return p->prio;
 
-       bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-                       MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+       bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
        prio = p->static_prio - bonus;
        if (prio < MAX_RT_PRIO)
@@ -340,6 +377,82 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
        nr_running_inc(rq);
 }
 
+static void recalc_task_prio(task_t *p, unsigned long long now)
+{
+       unsigned long long __sleep_time = now - p->timestamp;
+       unsigned long sleep_time;
+
+       if (__sleep_time > NS_MAX_SLEEP_AVG)
+               sleep_time = NS_MAX_SLEEP_AVG;
+       else
+               sleep_time = (unsigned long)__sleep_time;
+
+       if (likely(sleep_time > 0)) {
+               /*
+                * User tasks that sleep a long time are categorised as
+                * idle and will get just interactive status to stay active &
+                * prevent them suddenly becoming cpu hogs and starving
+                * other processes.
+                */
+               if (p->mm && p->activated != -1 &&
+                       sleep_time > JUST_INTERACTIVE_SLEEP(p)){
+                               p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+                                               AVG_TIMESLICE);
+                               if (!HIGH_CREDIT(p))
+                                       p->interactive_credit++;
+               } else {
+                       /*
+                        * The lower the sleep avg a task has the more
+                        * rapidly it will rise with sleep time.
+                        */
+                       sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+
+                       /*
+                        * Tasks with low interactive_credit are limited to
+                        * one timeslice worth of sleep avg bonus.
+                        */
+                       if (LOW_CREDIT(p) &&
+                               sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
+                                       sleep_time =
+                                               JIFFIES_TO_NS(task_timeslice(p));
+
+                       /*
+                        * Non high_credit tasks waking from uninterruptible
+                        * sleep are limited in their sleep_avg rise as they
+                        * are likely to be cpu hogs waiting on I/O
+                        */
+                       if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){
+                               if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p))
+                                       sleep_time = 0;
+                               else if (p->sleep_avg + sleep_time >=
+                                       JUST_INTERACTIVE_SLEEP(p)){
+                                               p->sleep_avg =
+                                                       JUST_INTERACTIVE_SLEEP(p);
+                                               sleep_time = 0;
+                                       }
+                       }
+
+                       /*
+                        * This code gives a bonus to interactive tasks.
+                        *
+                        * The boost works by updating the 'average sleep time'
+                        * value here, based on ->timestamp. The more time a task
+                        * spends sleeping, the higher the average gets - and the
+                        * higher the priority boost gets as well.
+                        */
+                       p->sleep_avg += sleep_time;
+
+                       if (p->sleep_avg > NS_MAX_SLEEP_AVG){
+                               p->sleep_avg = NS_MAX_SLEEP_AVG;
+                               if (!HIGH_CREDIT(p))
+                                       p->interactive_credit++;
+                       }
+               }
+       }
+
+       p->prio = effective_prio(p);
+}
+
 /*
  * activate_task - move a task to the runqueue and do priority recalculation
  *
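The scaling step in recalc_task_prio() means the emptier a task's sleep_avg, the faster it refills. A worked example under the assumed constants from the program above (HZ = 1000, MAX_BONUS = 10): a task with sleep_avg of 0 has CURRENT_BONUS of 0, so a 10 ms sleep is multiplied by (10 - 0) and credits 100 ms worth of sleep_avg, while a task already at bonus 9 has the same 10 ms counted only once. Feeding the result into effective_prio(), a nice-0 task (static_prio 120) with a saturated sleep_avg gets bonus = 10 - 5 = 5 and runs at dynamic priority 115; one with an empty sleep_avg gets bonus = 0 - 5 and runs at 125.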
@@ -348,34 +461,33 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-       long sleep_time = jiffies - p->last_run - 1;
+       unsigned long long now = sched_clock();
 
-       if (sleep_time > 0) {
-               int sleep_avg;
+       recalc_task_prio(p, now);
 
+       /*
+        * This checks to make sure it's not an uninterruptible task
+        * that is now waking up.
+        */
+       if (!p->activated){
                /*
-                * This code gives a bonus to interactive tasks.
-                *
-                * The boost works by updating the 'average sleep time'
-                * value here, based on ->last_run. The more time a task
-                * spends sleeping, the higher the average gets - and the
-                * higher the priority boost gets as well.
+                * Tasks which were woken up by interrupts (ie. hw events)
+                * are most likely of interactive nature. So we give them
+                * the credit of extending their sleep time to the period
+                * of time they spend on the runqueue, waiting for execution
+                * on a CPU, first time around:
                 */
-               sleep_avg = p->sleep_avg + sleep_time;
-
+               if (in_interrupt())
+                       p->activated = 2;
+               else
                /*
-                * 'Overflow' bonus ticks go to the waker as well, so the
-                * ticks are not lost. This has the effect of further
-                * boosting tasks that are related to maximum-interactive
-                * tasks.
+                * Normal first-time wakeups get a credit too for on-runqueue
+                * time, but it will be weighted down:
                 */
-               if (sleep_avg > MAX_SLEEP_AVG)
-                       sleep_avg = MAX_SLEEP_AVG;
-               if (p->sleep_avg != sleep_avg) {
-                       p->sleep_avg = sleep_avg;
-                       p->prio = effective_prio(p);
+                       p->activated = 1;
                }
-       }
+       p->timestamp = now;
+
        __activate_task(p, rq);
 }
@@ -496,13 +608,19 @@ repeat_lock_task:
                        task_rq_unlock(rq, &flags);
                        goto repeat_lock_task;
                }
-               if (old_state == TASK_UNINTERRUPTIBLE)
+               if (old_state == TASK_UNINTERRUPTIBLE){
                        rq->nr_uninterruptible--;
+                       /*
+                        * Tasks on involuntary sleep don't earn
+                        * sleep_avg beyond just interactive state.
+                        */
+                       p->activated = -1;
+               }
                if (sync)
                        __activate_task(p, rq);
                else {
                        activate_task(p, rq);
-                       if (p->prio < rq->curr->prio)
+                       if (TASK_PREEMPTS_CURR(p, rq))
                                resched_task(rq->curr);
                }
                success = 1;
@@ -551,8 +669,14 @@ void wake_up_forked_process(task_t * p)
         * and children as well, to keep max-interactive tasks
         * from forking tasks that are max-interactive.
         */
-       current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-       p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+       current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+               PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+       p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+               CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+       p->interactive_credit = 0;
+
        p->prio = effective_prio(p);
        set_task_cpu(p, smp_processor_id());
 
@@ -593,8 +717,9 @@ void sched_exit(task_t * p)
         * the sleep_avg of the parent as well.
         */
        if (p->sleep_avg < p->parent->sleep_avg)
-               p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-                       p->sleep_avg) / (EXIT_WEIGHT + 1);
+               p->parent->sleep_avg = p->parent->sleep_avg /
+               (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+               (EXIT_WEIGHT + 1);
 }
 
 /**
@@ -960,10 +1085,10 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
        if (likely(!busiest))
                goto out;
 
-       *imbalance = (max_load - nr_running) / 2;
+       *imbalance = max_load - nr_running;
 
        /* It needs an at least ~25% imbalance to trigger balancing. */
-       if (!idle && (*imbalance < (max_load + 3)/4)) {
+       if (!idle && ((*imbalance)*4 < max_load)) {
                busiest = NULL;
                goto out;
        }
@@ -973,7 +1098,7 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
         * Make sure nothing changed since we checked the
         * runqueue length.
         */
-       if (busiest->nr_running <= nr_running + 1) {
+       if (busiest->nr_running <= nr_running) {
                spin_unlock(&busiest->lock);
                busiest = NULL;
        }
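The imbalance change is easier to see with numbers. Suppose the busiest queue holds 9 tasks and the local one 5. Before this patch the imbalance was (9 - 5) / 2 = 2, which failed the trigger (9 + 3) / 4 = 3 and no balancing happened. Now the full difference of 4 is kept, the trigger 4 * 4 >= 9 passes, and load_balance() halves the figure afterwards (see the imbalance /= 2 hunk below), so 2 tasks are actually pulled.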
@@ -996,13 +1121,31 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
-       if (p->prio < this_rq->curr->prio)
+       if (TASK_PREEMPTS_CURR(p, this_rq))
                set_need_resched();
-       else {
-               if (p->prio == this_rq->curr->prio &&
-                               p->time_slice > this_rq->curr->time_slice)
-                       set_need_resched();
-       }
+}
+
+/*
+ * Previously:
+ *
+ * #define CAN_MIGRATE_TASK(p,rq,this_cpu)     \
+ *     ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \
+ *             cache_decay_ticks)) && !task_running(rq, p) && \
+ *                     cpu_isset(this_cpu, (p)->cpus_allowed))
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+       unsigned long delta = sched_clock() - tsk->timestamp;
+
+       if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
+               return 0;
+       if (task_running(rq, tsk))
+               return 0;
+       if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+               return 0;
+       return 1;
 }
 
 /*
@@ -1026,6 +1169,12 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
                goto out;
 
        /*
+        * We only want to steal a number of tasks equal to 1/2 the imbalance,
+        * otherwise we'll just shift the imbalance to the new queue:
+        */
+       imbalance /= 2;
+
+       /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
         * be cache-cold, thus switching CPUs has the least effect
@@ -1064,14 +1213,9 @@ skip_queue:
         * 3) are cache-hot on their current CPU.
         */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)                                        \
-       ((idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&    \
-               !task_running(rq, p) &&                                 \
-                       cpu_isset(this_cpu, (p)->cpus_allowed))
-
        curr = curr->prev;
 
-       if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+       if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1233,14 +1377,11 @@ void scheduler_tick(int user_ticks, int sys_ticks)
        spin_lock(&rq->lock);
        /*
         * The task was running during this tick - update the
-        * time slice counter and the sleep average. Note: we
-        * do not update a thread's priority until it either
-        * goes to sleep or uses up its timeslice. This makes
-        * it possible for interactive tasks to use up their
-        * timeslices at their highest priority levels.
+        * time slice counter. Note: we do not update a thread's
+        * priority until it either goes to sleep or uses up its
+        * timeslice. This makes it possible for interactive tasks
+        * to use up their timeslices at their highest priority levels.
         */
-       if (p->sleep_avg)
-               p->sleep_avg--;
        if (unlikely(rt_task(p))) {
                /*
                 * RR tasks need a special form of timeslice management.
@@ -1264,12 +1405,39 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                p->time_slice = task_timeslice(p);
                p->first_time_slice = 0;
+               if (!rq->expired_timestamp)
+                       rq->expired_timestamp = jiffies;
                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-                       if (!rq->expired_timestamp)
-                               rq->expired_timestamp = jiffies;
                        enqueue_task(p, rq->expired);
                } else
                        enqueue_task(p, rq->active);
+       } else {
+               /*
+                * Prevent a too long timeslice allowing a task to monopolize
+                * the CPU. We do this by splitting up the timeslice into
+                * smaller pieces.
+                *
+                * Note: this does not mean the task's timeslices expire or
+                * get lost in any way, they just might be preempted by
+                * another task of equal priority. (one with higher
+                * priority would have preempted this task already.) We
+                * requeue this task to the end of the list on this priority
+                * level, which is in essence a round-robin of tasks with
+                * equal priority.
+                *
+                * This only applies to tasks in the interactive
+                * delta range with at least TIMESLICE_GRANULARITY to requeue.
+                */
+               if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+                       p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+                       (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+                       (p->array == rq->active)) {
+
+                       dequeue_task(p, rq->active);
+                       set_tsk_need_resched(p);
+                       p->prio = effective_prio(p);
+                       enqueue_task(p, rq->active);
+               }
        }
 out_unlock:
        spin_unlock(&rq->lock);
@@ -1288,6 +1456,8 @@ asmlinkage void schedule(void)
        runqueue_t *rq;
        prio_array_t *array;
        struct list_head *queue;
+       unsigned long long now;
+       unsigned long run_time;
        int idx;
 
        /*
@@ -1308,7 +1478,20 @@ need_resched:
        rq = this_rq();
 
        release_kernel_lock(prev);
-       prev->last_run = jiffies;
+       now = sched_clock();
+       if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+               run_time = now - prev->timestamp;
+       else
+               run_time = NS_MAX_SLEEP_AVG;
+
+       /*
+        * Tasks with interactive credits get charged less run_time
+        * at high sleep_avg to delay them losing their interactive
+        * status
+        */
+       if (HIGH_CREDIT(prev))
+               run_time /= (CURRENT_BONUS(prev) ? : 1);
+
        spin_lock_irq(&rq->lock);
 
        /*
@@ -1358,12 +1541,33 @@ pick_next_task:
        queue = array->queue + idx;
        next = list_entry(queue->next, task_t, run_list);
 
+       if (next->activated > 0) {
+               unsigned long long delta = now - next->timestamp;
+
+               if (next->activated == 1)
+                       delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+               array = next->array;
+               dequeue_task(next, array);
+               recalc_task_prio(next, next->timestamp + delta);
+               enqueue_task(next, array);
+       }
+       next->activated = 0;
 switch_tasks:
        prefetch(next);
        clear_tsk_need_resched(prev);
        RCU_qsctr(task_cpu(prev))++;
 
+       prev->sleep_avg -= run_time;
+       if ((long)prev->sleep_avg <= 0){
+               prev->sleep_avg = 0;
+               if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
+                       prev->interactive_credit--;
+       }
+       prev->timestamp = now;
+
        if (likely(prev != next)) {
+               next->timestamp = now;
                rq->nr_switches++;
                rq->curr = next;
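Plugging the assumed constants from the earlier example into TIMESLICE_GRANULARITY (HZ = 1000, MIN_TIMESLICE = 10 jiffies, MAX_BONUS = 10) on a uniprocessor build: a task at maximum bonus (CURRENT_BONUS = 10) gets MIN_TIMESLICE << 0 = 10 ms and is requeued among its equal-priority peers every 10 ms, while a task at bonus 6 gets MIN_TIMESLICE << 3 = 80 ms; on SMP the figure is further multiplied by num_online_cpus(). The more interactive the task, the finer the round-robin slicing.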
@@ -1603,6 +1807,7 @@ void set_user_nice(task_t *p, long nice)
        unsigned long flags;
        prio_array_t *array;
        runqueue_t *rq;
+       int old_prio, new_prio, delta;
 
        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                return;
@@ -1611,6 +1816,12 @@ void set_user_nice(task_t *p, long nice)
         * the task might be in the middle of scheduling on another CPU.
         */
        rq = task_rq_lock(p, &flags);
+       /*
+        * The RT priorities are set via setscheduler(), but we still
+        * allow the 'normal' nice value to be set - but as expected
+        * it wont have any effect on scheduling until the task is
+        * not SCHED_NORMAL:
+        */
        if (rt_task(p)) {
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
@@ -1618,16 +1829,20 @@ void set_user_nice(task_t *p, long nice)
        array = p->array;
        if (array)
                dequeue_task(p, array);
+
+       old_prio = p->prio;
+       new_prio = NICE_TO_PRIO(nice);
+       delta = new_prio - old_prio;
        p->static_prio = NICE_TO_PRIO(nice);
-       p->prio = NICE_TO_PRIO(nice);
+       p->prio += delta;
+
        if (array) {
                enqueue_task(p, array);
                /*
-                * If the task is running and lowered its priority,
-                * or increased its priority then reschedule its CPU:
+                * If the task increased its priority or is running and
+                * lowered its priority, then reschedule its CPU:
                 */
-               if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-                                               task_running(rq, p))
+               if (delta < 0 || (delta > 0 && task_running(rq, p)))
                        resched_task(rq->curr);
        }
 out_unlock:
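A worked pass through the new set_user_nice() arithmetic: for a SCHED_NORMAL task at nice 0 whose dynamic priority currently sits at 115 (static 120 minus a bonus of 5), renicing to +10 computes new_prio = 130 and delta = 15, so p->prio becomes 130; because delta > 0, the CPU is rescheduled only if the task is currently running. Renicing the same task to -10 instead gives new_prio = 110 and delta = -5, and the negative delta forces a resched_task() unconditionally.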
@@ -2384,6 +2599,12 @@ static void move_task_away(struct task_struct *p, int dest_cpu)
        local_irq_restore(flags);
 }
 
+typedef struct {
+       int cpu;
+       struct completion startup_done;
+       task_t *task;
+} migration_startup_t;
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by bumping thread off CPU then 'pushing' onto
@@ -2393,20 +2614,21 @@ static int migration_thread(void * data)
 {
        /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
        struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
-       int cpu = (long) data;
+       migration_startup_t *startup = data;
+       int cpu = startup->cpu;
        runqueue_t *rq;
        int ret;
 
+       startup->task = current;
+       complete(&startup->startup_done);
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule();
+
+       BUG_ON(smp_processor_id() != cpu);
+
        daemonize("migration/%d", cpu);
        set_fs(KERNEL_DS);
 
-       /*
-        * Either we are running on the right CPU, or there's a a
-        * migration thread on this CPU, guaranteed (we're started
-        * serially).
-        */
-       set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
        ret = setscheduler(0, SCHED_FIFO, &param);
 
        rq = this_rq();
@@ -2445,13 +2667,30 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                          void *hcpu)
 {
+       long cpu = (long) hcpu;
+       migration_startup_t startup;
+
        switch (action) {
        case CPU_ONLINE:
-               printk("Starting migration thread for cpu %li\n",
-                      (long)hcpu);
-               kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
-               while (!cpu_rq((long)hcpu)->migration_thread)
+
+               printk("Starting migration thread for cpu %li\n", cpu);
+
+               startup.cpu = cpu;
+               startup.task = NULL;
+               init_completion(&startup.startup_done);
+
+               kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+               wait_for_completion(&startup.startup_done);
+               wait_task_inactive(startup.task);
+
+               startup.task->thread_info->cpu = cpu;
+               startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+               wake_up_process(startup.task);
+
+               while (!cpu_rq(cpu)->migration_thread)
                        yield();
+
                break;
        }
        return NOTIFY_OK;
@@ -2574,6 +2813,8 @@ void __might_sleep(char *file, int line)
                prev_jiffy = jiffies;
                printk(KERN_ERR "Debug: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
+               printk("in_atomic():%d, irqs_disabled():%d\n",
+                       in_atomic(), irqs_disabled());
                dump_stack();
        }
 #endif
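The new migration-thread startup is a handshake: the child announces itself through a completion, sleeps, the parent pins it to the target CPU, then wakes it. A rough userspace analogue of that pattern, with POSIX semaphores standing in for the kernel's completion and wake_up_process() — the names and the semaphore substitution are this sketch's assumptions, not kernel API:

/* Userspace sketch of the migration-thread startup handshake.
 * Build with: cc -pthread handshake.c */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

typedef struct {
        int cpu;            /* target CPU, filled in by the parent */
        sem_t announced;    /* ~ struct completion startup_done */
        sem_t go;           /* ~ the wake_up_process() after setup */
} startup_t;

static void *worker(void *data)
{
        startup_t *s = data;

        sem_post(&s->announced);  /* ~ complete(&startup->startup_done) */
        sem_wait(&s->go);         /* ~ TASK_UNINTERRUPTIBLE + schedule() */
        printf("worker: parent bound me to cpu %d\n", s->cpu);
        return NULL;
}

int main(void)
{
        startup_t s = { .cpu = -1 };
        pthread_t t;

        sem_init(&s.announced, 0, 0);
        sem_init(&s.go, 0, 0);

        pthread_create(&t, NULL, worker, &s);
        sem_wait(&s.announced);   /* ~ wait_for_completion() */
        s.cpu = 1;                /* ~ setting thread_info->cpu/cpus_allowed */
        sem_post(&s.go);          /* ~ wake_up_process(startup.task) */

        pthread_join(t, NULL);
        return 0;
}

The point of sleeping before daemonize() is that the parent can safely rewrite the child's CPU binding while it is guaranteed inactive, which the old serialized set_cpus_allowed() approach could not promise.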
diff --git a/kernel/signal.c b/kernel/signal.c
index 72333be1fd42..852da1a009da 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1139,7 +1139,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 static int kill_something_info(int sig, struct siginfo *info, int pid)
 {
        if (!pid) {
-               return kill_pg_info(sig, info, current->pgrp);
+               return kill_pg_info(sig, info, process_group(current));
        } else if (pid == -1) {
                int retval = 0, count = 0;
                struct task_struct * p;
@@ -1798,7 +1798,7 @@ relock:
                        /* signals can be posted during this window */
 
-                       if (is_orphaned_pgrp(current->pgrp))
+                       if (is_orphaned_pgrp(process_group(current)))
                                goto relock;
 
                        spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/sys.c b/kernel/sys.c
index b172afa53be1..9eda26d6745c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -290,7 +290,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
                        break;
                case PRIO_PGRP:
                        if (!who)
-                               who = current->pgrp;
+                               who = process_group(current);
                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid)
                                error = set_one_prio(p, niceval, error);
                        break;
@@ -346,7 +346,7 @@ asmlinkage long sys_getpriority(int which, int who)
                        break;
                case PRIO_PGRP:
                        if (!who)
-                               who = current->pgrp;
+                               who = process_group(current);
                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) {
                                niceval = 20 - task_nice(p);
                                if (niceval > retval)
@@ -982,11 +982,12 @@ ok_pgid:
        if (err)
                goto out;
 
-       if (p->pgrp != pgid) {
+       if (process_group(p) != pgid) {
                detach_pid(p, PIDTYPE_PGID);
-               p->pgrp = pgid;
+               p->group_leader->__pgrp = pgid;
                attach_pid(p, PIDTYPE_PGID, pgid);
        }
+
        err = 0;
 out:
        /* All paths lead to here, thus we are safe. -DaveM */
@@ -997,7 +998,7 @@ out:
 asmlinkage long sys_getpgid(pid_t pid)
 {
        if (!pid) {
-               return current->pgrp;
+               return process_group(current);
        } else {
                int retval;
                struct task_struct *p;
@@ -1009,7 +1010,7 @@ asmlinkage long sys_getpgid(pid_t pid)
                if (p) {
                        retval = security_task_getpgid(p);
                        if (!retval)
-                               retval = p->pgrp;
+                               retval = process_group(p);
                }
                read_unlock(&tasklist_lock);
                return retval;
@@ -1019,7 +1020,7 @@ asmlinkage long sys_getpgid(pid_t pid)
 asmlinkage long sys_getpgrp(void)
 {
        /* SMP - assuming writes are word atomic this is fine */
-       return current->pgrp;
+       return process_group(current);
 }
 
 asmlinkage long sys_getsid(pid_t pid)
@@ -1062,7 +1063,7 @@ asmlinkage long sys_setsid(void)
        __set_special_pids(current->pid, current->pid);
        current->tty = NULL;
        current->tty_old_pgrp = 0;
-       err = current->pgrp;
+       err = process_group(current);
 out:
        write_unlock_irq(&tasklist_lock);
        return err;
