Diffstat (limited to 'kernel')
-rw-r--r--   kernel/Makefile             |   2
-rw-r--r--   kernel/fork.c               |   2
-rw-r--r--   kernel/posix-cpu-timers.c   | 288
-rw-r--r--   kernel/posix-timers.c       |  10
-rw-r--r--   kernel/sched.c              |  33
-rw-r--r--   kernel/signal.c             |   1
6 files changed, 331 insertions, 5 deletions
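
A note for reading the patch: the CPUCLOCK_PID / CPUCLOCK_PERTHREAD / CPUCLOCK_WHICH macros used throughout posix-cpu-timers.c, and the "(clock) < 0" test added to the CLOCK_DISPATCH macros in posix-timers.c, rely on a clockid_t encoding defined in include/linux/posix-timers.h, which lies outside the 'kernel' directory and so is not part of this diffstat. The following is an illustrative sketch of that encoding, reconstructed for orientation rather than quoted from the patch:

/*
 * Illustrative sketch only -- not one of the hunks below. The PID is
 * stored bit-inverted above three flag bits, so every CPU-clock id is
 * negative; that is what the "(clock) < 0" dispatch test keys off.
 *
 *   bits 0-1: which clock (CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED)
 *   bit  2  : set for a per-thread clock, clear for a whole process
 *   bits 3..: ~pid, with pid 0 meaning "the calling task"
 */
#define CPUCLOCK_PID(clock)        ((pid_t) ~((clock) >> 3))
#define CPUCLOCK_PERTHREAD(clock)  (((clock) & (clockid_t) 4) != 0)
#define CPUCLOCK_WHICH(clock)      ((clock) & (clockid_t) 3)

#define CPUCLOCK_PROF   0
#define CPUCLOCK_VIRT   1
#define CPUCLOCK_SCHED  2
#define CPUCLOCK_MAX    3

#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
        ((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
#define MAKE_THREAD_CPUCLOCK(tid, clock) \
        MAKE_PROCESS_CPUCLOCK((tid), (clock) | 4)

Under this encoding, PROCESS_CLOCK in the new file expands to (~0 << 3) | CPUCLOCK_SCHED = -6: a negative id naming the caller's own process-wide SCHED clock, with no task lookup needed.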
diff --git a/kernel/Makefile b/kernel/Makefile
index d680ace0fdda..0ac3efc9d071 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
 
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
diff --git a/kernel/fork.c b/kernel/fork.c
index f6b929e69f5b..bc0633f9730a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -749,6 +749,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+	sig->sched_time = 0;
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -877,6 +878,7 @@ static task_t *copy_process(unsigned long clone_flags,
 
 	p->utime = cputime_zero;
 	p->stime = cputime_zero;
+	p->sched_time = 0;
 	p->rchar = 0;		/* I/O counter: bytes read */
 	p->wchar = 0;		/* I/O counter: bytes written */
 	p->syscr = 0;		/* I/O counter: read syscalls */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
new file mode 100644
index 000000000000..fdc54f75aa15
--- /dev/null
+++ b/kernel/posix-cpu-timers.c
@@ -0,0 +1,288 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <asm/uaccess.h>
+#include <linux/errno.h>
+
+union cpu_time_count {
+	cputime_t cpu;
+	unsigned long long sched;
+};
+
+static int check_clock(clockid_t which_clock)
+{
+	int error = 0;
+	struct task_struct *p;
+	const pid_t pid = CPUCLOCK_PID(which_clock);
+
+	if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+		return -EINVAL;
+
+	if (pid == 0)
+		return 0;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
+		   p->tgid != current->tgid : p->tgid != pid)) {
+		error = -EINVAL;
+	}
+	read_unlock(&tasklist_lock);
+
+	return error;
+}
+
+static void sample_to_timespec(clockid_t which_clock,
+			       union cpu_time_count cpu,
+			       struct timespec *tp)
+{
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		tp->tv_sec = div_long_long_rem(cpu.sched,
+					       NSEC_PER_SEC, &tp->tv_nsec);
+	} else {
+		cputime_to_timespec(cpu.cpu, tp);
+	}
+}
+
+static inline cputime_t prof_ticks(struct task_struct *p)
+{
+	return cputime_add(p->utime, p->stime);
+}
+static inline cputime_t virt_ticks(struct task_struct *p)
+{
+	return p->utime;
+}
+static inline unsigned long long sched_ns(struct task_struct *p)
+{
+	return (p == current) ? current_sched_time(p) : p->sched_time;
+}
+
+int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	int error = check_clock(which_clock);
+	if (!error) {
+		tp->tv_sec = 0;
+		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
+		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+			/*
+			 * If sched_clock is using a cycle counter, we
+			 * don't have any idea of its true resolution
+			 * exported, but it is much more than 1s/HZ.
+			 */
+			tp->tv_nsec = 1;
+		}
+	}
+	return error;
+}
+
+int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+{
+	/*
+	 * You can never reset a CPU clock, but we check for other errors
+	 * in the call before failing with EPERM.
+	 */
+	int error = check_clock(which_clock);
+	if (error == 0) {
+		error = -EPERM;
+	}
+	return error;
+}
+
+
+/*
+ * Sample a per-thread clock for the given task.
+ */
+static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+			    union cpu_time_count *cpu)
+{
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = prof_ticks(p);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = virt_ticks(p);
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = sched_ns(p);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_clock_sample_group(clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	struct task_struct *t = p;
+	unsigned long flags;
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
+		do {
+			cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
+			t = next_thread(t);
+		} while (t != p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		break;
+	case CPUCLOCK_VIRT:
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		cpu->cpu = p->signal->utime;
+		do {
+			cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
+			t = next_thread(t);
+		} while (t != p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		break;
+	case CPUCLOCK_SCHED:
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		cpu->sched = p->signal->sched_time;
+		/* Add in each other live thread. */
+		while ((t = next_thread(t)) != p) {
+			cpu->sched += t->sched_time;
+		}
+		if (p->tgid == current->tgid) {
+			/*
+			 * We're sampling ourselves, so include the
+			 * cycles not yet banked.  We still omit
+			 * other threads running on other CPUs,
+			 * so the total can always be behind as
+			 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
+			 */
+			cpu->sched += current_sched_time(current);
+		} else {
+			cpu->sched += p->sched_time;
+		}
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		break;
+	}
+	return 0;
+}
+
+
+int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	const pid_t pid = CPUCLOCK_PID(which_clock);
+	int error = -EINVAL;
+	union cpu_time_count rtn;
+
+	if (pid == 0) {
+		/*
+		 * Special case constant value for our own clocks.
+		 * We don't have to do any lookup to find ourselves.
+		 */
+		if (CPUCLOCK_PERTHREAD(which_clock)) {
+			/*
+			 * Sampling just ourselves we can do with no locking.
+			 */
+			error = cpu_clock_sample(which_clock,
+						 current, &rtn);
+		} else {
+			read_lock(&tasklist_lock);
+			error = cpu_clock_sample_group(which_clock,
+						       current, &rtn);
+			read_unlock(&tasklist_lock);
+		}
+	} else {
+		/*
+		 * Find the given PID, and validate that the caller
+		 * should be able to see it.
+		 */
+		struct task_struct *p;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (p) {
+			if (CPUCLOCK_PERTHREAD(which_clock)) {
+				if (p->tgid == current->tgid) {
+					error = cpu_clock_sample(which_clock,
+								 p, &rtn);
+				}
+			} else if (p->tgid == pid && p->signal) {
+				error = cpu_clock_sample_group(which_clock,
+							       p, &rtn);
+			}
+		}
+		read_unlock(&tasklist_lock);
+	}
+
+	if (error)
+		return error;
+	sample_to_timespec(which_clock, rtn, tp);
+	return 0;
+}
+
+/*
+ * These can't be called, since timer_create never works.
+ */
+int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+			struct itimerspec *old, struct itimerspec *new)
+{
+	BUG();
+	return -EINVAL;
+}
+int posix_cpu_timer_del(struct k_itimer *timer)
+{
+	BUG();
+	return -EINVAL;
+}
+void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *spec)
+{
+	BUG();
+}
+
+
+#define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+
+static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
+}
+static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
+}
+static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
+}
+static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_get(THREAD_CLOCK, tp);
+}
+
+
+static __init int init_posix_cpu_timers(void)
+{
+	struct k_clock process = {
+		.clock_getres = process_cpu_clock_getres,
+		.clock_get = process_cpu_clock_get,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = do_posix_clock_notimer_create,
+		.nsleep = do_posix_clock_nonanosleep,
+	};
+	struct k_clock thread = {
+		.clock_getres = thread_cpu_clock_getres,
+		.clock_get = thread_cpu_clock_get,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = do_posix_clock_notimer_create,
+		.nsleep = do_posix_clock_nonanosleep,
+	};
+
+	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+
+	return 0;
+}
+__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d04a2f17e395..09b2d6b4634f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -200,13 +200,15 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
 
 #ifdef CLOCK_DISPATCH_DIRECT
 #define CLOCK_DISPATCH(clock, call, arglist) \
-	((*posix_clocks[clock].call) arglist)
+	((clock) < 0 ? posix_cpu_##call arglist : \
+	 (*posix_clocks[clock].call) arglist)
 #define DEFHOOK(name) if (clock->name == NULL) clock->name = common_##name
 #define COMMONDEFN static
 #else
 #define CLOCK_DISPATCH(clock, call, arglist) \
-	(posix_clocks[clock].call != NULL \
-	 ? (*posix_clocks[clock].call) arglist : common_##call arglist)
+	((clock) < 0 ? posix_cpu_##call arglist : \
+	 (posix_clocks[clock].call != NULL \
+	  ? (*posix_clocks[clock].call) arglist : common_##call arglist))
 #define DEFHOOK(name)	(void) 0	/* Nothing here. */
 #define COMMONDEFN static inline
 #endif
@@ -277,6 +279,8 @@ static inline void common_default_hooks(struct k_clock *clock)
  */
 static inline int invalid_clockid(clockid_t which_clock)
 {
+	if (which_clock < 0)	/* CPU clock, posix_cpu_* will check it */
+		return 0;
 	if ((unsigned) which_clock >= MAX_CLOCKS)
 		return 1;
 	if (posix_clocks[which_clock].clock_getres != NULL)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0888acbe3f66..8176366cfd8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2242,6 +2242,32 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+				    unsigned long long now)
+{
+	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+	p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+	unsigned long long ns;
+	unsigned long flags;
+	local_irq_save(flags);
+	ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+	ns = tsk->sched_time + (sched_clock() - ns);
+	local_irq_restore(flags);
+	return ns;
+}
+
+/*
  * We place interactive tasks back into the active array, if possible.
  *
  * To guarantee that this does not starve expired tasks we ignore the
@@ -2419,8 +2445,11 @@ void scheduler_tick(void)
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
+	unsigned long long now = sched_clock();
+
+	update_cpu_clock(p, rq, now);
 
-	rq->timestamp_last_tick = sched_clock();
+	rq->timestamp_last_tick = now;
 
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
@@ -2804,6 +2833,8 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
+	update_cpu_clock(prev, rq, now);
+
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9e87ab3f8f21..3f1df438d23c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -381,6 +381,7 @@ void __exit_signal(struct task_struct *tsk)
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
+		sig->sched_time += tsk->sched_time;
 		spin_unlock(&sighand->siglock);
 		sig = NULL;	/* Marker for below. */
 	}
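
Once applied, the two clocks the patch registers are reachable from userspace through the standard POSIX clock calls. A minimal test sketch, assuming a glibc that routes these calls to the kernel (link with -lrt on systems of that vintage):

#include <stdio.h>
#include <time.h>

/* Burn a little CPU so the clocks have something to show. */
static volatile unsigned long sink;

int main(void)
{
	struct timespec res, proc_ts, thread_ts;
	unsigned long i;

	for (i = 0; i < 50000000UL; i++)
		sink += i;

	/* The patch reports 1 ns for these SCHED-based clocks. */
	if (clock_getres(CLOCK_PROCESS_CPUTIME_ID, &res) == 0)
		printf("resolution: %ld ns\n", res.tv_nsec);

	/* CPU time consumed by the whole thread group ... */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc_ts) == 0)
		printf("process: %ld.%09ld s\n",
		       (long) proc_ts.tv_sec, proc_ts.tv_nsec);

	/* ... and by the calling thread alone. */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_ts) == 0)
		printf("thread:  %ld.%09ld s\n",
		       (long) thread_ts.tv_sec, thread_ts.tv_nsec);

	return 0;
}

Setting either clock is refused (their clock_set hook is do_posix_clock_nosettime), and timer_create against them is refused as well (do_posix_clock_notimer_create). That is why the posix_cpu_timer_set/del/get stubs in the new file can simply BUG(): with timer creation blocked, they are unreachable.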
