From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- include/linux/sched.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e..844af5b12cb2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1479,6 +1479,14 @@ struct task_struct { short il_next; short pref_node_fork; #endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + int numa_migrate_seq; + unsigned int numa_scan_period; + u64 node_stamp; /* migration stamp */ + struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ + struct rcu_head rcu; /* @@ -1553,6 +1561,14 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages); +#else +static inline void task_numa_fault(int node, int pages) +{ +} +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1990,6 +2006,10 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_settle_count; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; -- cgit v1.2.3 From 6e5fb223e89dbe5cb5c563f8d4a4a0a7d62455a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:45 +0200 Subject: mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate Previously, to probe the working set of a task, we'd use a very simple and crude method: mark all of its address space PROT_NONE. That method has various (obvious) disadvantages: - it samples the working set at dissimilar rates, giving some tasks a sampling quality advantage over others. - creates performance problems for tasks with very large working sets - over-samples processes with large address spaces but which only very rarely execute Improve that method by keeping a rotating offset into the address space that marks the current position of the scan, and advance it by a constant rate (in a CPU cycles execution proportional manner). If the offset reaches the last mapped address of the mm then it then it starts over at the first address. The per-task nature of the working set sampling functionality in this tree allows such constant rate, per task, execution-weight proportional sampling of the working set, with an adaptive sampling interval/frequency that goes from once per 100ms up to just once per 8 seconds. The current sampling volume is 256 MB per interval. As tasks mature and converge their working set, so does the sampling rate slow down to just a trickle, 256 MB per 8 seconds of CPU time executed. This, beyond being adaptive, also rate-limits rarely executing systems and does not over-sample on overloaded systems. [ In AutoNUMA speak, this patch deals with the effective sampling rate of the 'hinting page fault'. AutoNUMA's scanning is currently rate-limited, but it is also fundamentally single-threaded, executing in the knuma_scand kernel thread, so the limit in AutoNUMA is global and does not scale up with the number of CPUs, nor does it scan tasks in an execution proportional manner. So the idea of rate-limiting the scanning was first implemented in the AutoNUMA tree via a global rate limit. This patch goes beyond that by implementing an execution rate proportional working set sampling rate that is not implemented via a single global scanning daemon. ] [ Dan Carpenter pointed out a possible NULL pointer dereference in the first version of this patch. ] Based-on-idea-by: Andrea Arcangeli Bug-Found-By: Dan Carpenter Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote changelog and fixed bug. ] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/mm_types.h | 3 +++ include/linux/sched.h | 1 + kernel/sched/fair.c | 65 ++++++++++++++++++++++++++++++++++++++---------- kernel/sysctl.c | 7 ++++++ 4 files changed, 63 insertions(+), 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ed8638c29b3e..d1e246c5e50c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,6 +406,9 @@ struct mm_struct { */ unsigned long numa_next_scan; + /* Restart point for scanning and setting pte_numa */ + unsigned long numa_scan_offset; + /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 844af5b12cb2..37841958d234 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2008,6 +2008,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6831abb5dbef..0a349dd1fa60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -780,10 +780,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms: 5s + * numa task sample period in ms */ -unsigned int sysctl_numa_balancing_scan_period_min = 5000; -unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*16; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; static void task_numa_placement(struct task_struct *p) { @@ -808,6 +811,12 @@ void task_numa_fault(int node, int pages) task_numa_placement(p); } +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -817,6 +826,9 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long offset, end; + long length; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -846,18 +858,45 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; - ACCESS_ONCE(mm->numa_scan_seq)++; - { - struct vm_area_struct *vma; + offset = mm->numa_scan_offset; + length = sysctl_numa_balancing_scan_size; + length <<= 20; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) - continue; - change_prot_numa(vma, vma->vm_start, vma->vm_end); - } - up_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); + vma = find_vma(mm, offset); + if (!vma) { + reset_ptenuma_scan(p); + offset = 0; + vma = mm->mmap; + } + for (; vma && length > 0; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) + continue; + + offset = max(offset, vma->vm_start); + end = min(ALIGN(offset + length, HPAGE_SIZE), vma->vm_end); + length -= end - offset; + + change_prot_numa(vma, offset, end); + + offset = end; } + + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = offset; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 025e1ae50ef1..7d3a2e0475e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3 From 4b96a29ba891dd59734cb7be80a900fe93aa2d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:47 +0200 Subject: mm: sched: numa: Implement slow start for working set sampling Add a 1 second delay before starting to scan the working set of a task and starting to balance it amongst nodes. [ note that before the constant per task WSS sampling rate patch the initial scan would happen much later still, in effect that patch caused this regression. ] The theory is that short-run tasks benefit very little from NUMA placement: they come and go, and they better stick to the node they were started on. As tasks mature and rebalance to other CPUs and nodes, so does their NUMA placement have to change and so does it start to matter more and more. In practice this change fixes an observable kbuild regression: # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ] !NUMA: 45.291088843 seconds time elapsed ( +- 0.40% ) 45.154231752 seconds time elapsed ( +- 0.36% ) +NUMA, no slow start: 46.172308123 seconds time elapsed ( +- 0.30% ) 46.343168745 seconds time elapsed ( +- 0.25% ) +NUMA, 1 sec slow start: 45.224189155 seconds time elapsed ( +- 0.25% ) 45.160866532 seconds time elapsed ( +- 0.17% ) and it also fixes an observable perf bench (hackbench) regression: # perf stat --null --repeat 10 perf bench sched messaging -NUMA: -NUMA: 0.246225691 seconds time elapsed ( +- 1.31% ) +NUMA no slow start: 0.252620063 seconds time elapsed ( +- 1.13% ) +NUMA 1sec delay: 0.248076230 seconds time elapsed ( +- 1.35% ) The implementation is simple and straightforward, most of the patch deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable knob. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote the changelog, ran measurements, tuned the default. ] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 +- kernel/sched/fair.c | 5 +++++ kernel/sysctl.c | 7 +++++++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 37841958d234..7d95a232b5b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2006,6 +2006,7 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cad0d092ce3b..fbfc4843063f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1543,7 +1543,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6e1f25ed2bd..7727b0161579 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -788,6 +788,9 @@ unsigned int sysctl_numa_balancing_scan_period_max = 100*16; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + static void task_numa_placement(struct task_struct *p) { int seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -929,6 +932,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; curr->node_stamp = now; if (!time_before(jiffies, curr->mm->numa_next_scan)) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d3a2e0475e5..48a68cc258c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -352,6 +352,13 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_min_ms", .data = &sysctl_numa_balancing_scan_period_min, -- cgit v1.2.3 From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman --- include/linux/mm_types.h | 3 +++ include/linux/sched.h | 5 +++-- kernel/sched/core.c | 1 + kernel/sched/fair.c | 29 +++++++++++++++++++++-------- kernel/sysctl.c | 7 +++++++ mm/huge_memory.c | 2 +- mm/memory.c | 12 ++++++++---- 7 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c5fffa239861..e850a23dd6ec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,6 +410,9 @@ struct mm_struct { */ unsigned long numa_next_scan; + /* numa_next_reset is when the PTE scanner period will be reset */ + unsigned long numa_next_reset; + /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d95a232b5b9..0f4ff2bd03f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1562,9 +1562,9 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages); +extern void task_numa_fault(int node, int pages, bool migrated); #else -static inline void task_numa_fault(int node, int pages) +static inline void task_numa_fault(int node, int pages, bool migrated) { } #endif @@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbfc4843063f..9d255bc0e278 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd18087fd369..4b577863933f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * numa task sample period in ms */ unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*16; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; /* FIXME: Allocate task-specific structure for placement policy here */ /* - * Assume that as faults occur that pages are getting properly placed - * and fewer NUMA hints are required. Note that this is a big - * assumption, it assumes processes reach a steady steady with no - * further phase changes. + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(2)); + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); } @@ -857,6 +857,19 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + /* * Enforce maximal scan/migration frequency.. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48a68cc258c1..8906f90d6fa2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79b96064f8fc..199b261a257e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1068,7 +1068,7 @@ out_unlock: spin_unlock(&mm->page_table_lock); if (page) { put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); } return 0; } diff --git a/mm/memory.c b/mm/memory.c index 84c6d9eab182..39edb11b63dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int current_nid = -1; int target_nid; + bool migrated = false; /* * The "pte" at this point cannot be used safely without @@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) current_nid = target_nid; out: if (current_nid != -1) - task_numa_fault(current_nid, 1); + task_numa_fault(current_nid, 1, migrated); return 0; } @@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; int curr_nid = local_nid; int target_nid; + bool migrated; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ pte_unmap_unlock(pte, ptl); - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) curr_nid = target_nid; - task_numa_fault(curr_nid, 1); + task_numa_fault(curr_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3 From 1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 11:16:36 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing This patch adds Kconfig options and kernel parameters to allow the enabling and disabling of automatic NUMA balancing. The existance of such a switch was and is very important when debugging problems related to transparent hugepages and we should have the same for automatic NUMA placement. Signed-off-by: Mel Gorman --- Documentation/kernel-parameters.txt | 3 +++ include/linux/sched.h | 4 ++++ init/Kconfig | 8 +++++++ kernel/sched/core.c | 48 +++++++++++++++++++++++++------------ kernel/sched/fair.c | 3 +++ kernel/sched/features.h | 6 +++-- mm/mempolicy.c | 46 +++++++++++++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 17 deletions(-) (limited to 'include/linux/sched.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9776f068306b..2e8d2625b814 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1996,6 +1996,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. + numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. + Allowed values are enable and disable + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f4ff2bd03f6..b1e619f9ff1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,10 +1563,14 @@ struct task_struct { #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int node, int pages, bool migrated) { } +static inline void set_numabalancing_state(bool enabled) +{ +} #endif /* diff --git a/init/Kconfig b/init/Kconfig index 9f00f004796a..18e2a5920a34 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -720,6 +720,14 @@ config ARCH_USES_NUMA_PROT_NONE depends on ARCH_WANTS_PROT_NUMA_PROT_NONE depends on NUMA_BALANCING +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, autonumic NUMA balancing will be enabled if running on a NUMA + machine. + config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" default y diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d255bc0e278..7a45015274ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -192,23 +192,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -228,6 +215,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1549,6 +1557,16 @@ static void __sched_fork(struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +#ifdef CONFIG_NUMA_BALANCING +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b577863933f..7a02a2082e95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -811,6 +811,9 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; + if (!sched_feat_numa(NUMA)) + return; + /* FIXME: Allocate task-specific structure for placement policy here */ /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5fb7aefbec80..d2373a3e3252 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,8 +63,10 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) /* - * Apply the automatic NUMA scheduling policy + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, true) +SCHED_FEAT(NUMA, false) #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fd20e28fd2ad..046308e9b999 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2521,6 +2521,50 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + if (nr_node_ids > 1 && !numabalancing_override) { + printk(KERN_INFO "Enabling automatic NUMA balancing. " + "Configure with numa_balancing= or sysctl"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + numabalancing_override = true; + + if (!strcmp(str, "enable")) { + set_numabalancing_state(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_numabalancing_state(false); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2571,6 +2615,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ -- cgit v1.2.3 From a5ba911ec3792168530d35e16a8ec3b6fc60bcb5 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 17 Dec 2012 16:03:22 -0800 Subject: pidns: remove unused is_container_init() Since commit 1cdcbec1a337 ("CRED: Neuter sys_capset()") is_container_init() has no callers. Signed-off-by: Gao feng Cc: David Howells Acked-by: Serge Hallyn Cc: James Morris Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ------ kernel/pid.c | 15 --------------- 2 files changed, 21 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b089c92c609b..9914c662ed7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1778,12 +1778,6 @@ static inline int is_global_init(struct task_struct *tsk) return tsk->pid == 1; } -/* - * is_container_init: - * check whether in the task is init in its own pid namespace. - */ -extern int is_container_init(struct task_struct *tsk); - extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); diff --git a/kernel/pid.c b/kernel/pid.c index fd996c1ed9f8..a54a1123c7cf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -81,21 +81,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). -- cgit v1.2.3 From 0e9d92f2d02d8c8320f0502307c688d07bdac2b3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:42 -0800 Subject: memcg: skip memcg kmem allocations in specified code regions Create a mechanism that skip memcg allocations during certain pieces of our core code. It basically works in the same way as preempt_disable()/preempt_enable(): By marking a region under which all allocations will be accounted to the root memcg. We need this to prevent races in early cache creation, when we allocate data using caches that are not necessarily created already. Signed-off-by: Glauber Costa yCc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9914c662ed7b..f712465b05c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index efd26620a60b..65302a083d2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3025,6 +3025,37 @@ out: kfree(s->memcg_params); } +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3084,7 +3115,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out; new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) { new_cachep = cachep; goto out; @@ -3125,8 +3155,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) * Enqueue the creation of a per-memcg kmem_cache. * Called with rcu_read_lock. */ -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { struct create_work *cw; @@ -3147,6 +3177,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, schedule_work(&cw->work); } +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3169,6 +3217,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); rcu_read_unlock(); -- cgit v1.2.3 From ae903caae267154de7cf8576b130ff474630596b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 12:44:11 -0500 Subject: Bury the conditionals from kernel_thread/kernel_execve series All architectures have CONFIG_GENERIC_KERNEL_THREAD CONFIG_GENERIC_KERNEL_EXECVE __ARCH_WANT_SYS_EXECVE None of them have __ARCH_WANT_KERNEL_EXECVE and there are only two callers of kernel_execve() (which is a trivial wrapper for do_execve() now) left. Kill the conditionals and make both callers use do_execve(). Signed-off-by: Al Viro --- arch/Kconfig | 6 ------ arch/alpha/Kconfig | 2 -- arch/alpha/include/asm/unistd.h | 1 - arch/arm/Kconfig | 2 -- arch/arm/include/asm/unistd.h | 1 - arch/arm64/Kconfig | 2 -- arch/arm64/include/asm/unistd.h | 1 - arch/avr32/Kconfig | 2 -- arch/avr32/include/asm/unistd.h | 1 - arch/blackfin/Kconfig | 2 -- arch/blackfin/include/asm/unistd.h | 1 - arch/c6x/Kconfig | 2 -- arch/c6x/include/uapi/asm/unistd.h | 1 - arch/cris/Kconfig | 2 -- arch/cris/include/asm/unistd.h | 1 - arch/frv/Kconfig | 2 -- arch/frv/include/asm/unistd.h | 1 - arch/h8300/Kconfig | 2 -- arch/h8300/include/asm/unistd.h | 1 - arch/hexagon/Kconfig | 2 -- arch/hexagon/include/uapi/asm/unistd.h | 1 - arch/ia64/Kconfig | 2 -- arch/ia64/include/asm/unistd.h | 1 - arch/m32r/Kconfig | 2 -- arch/m32r/include/asm/unistd.h | 1 - arch/m68k/Kconfig | 2 -- arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/Kconfig | 2 -- arch/microblaze/include/asm/unistd.h | 1 - arch/mips/Kconfig | 2 -- arch/mips/include/asm/unistd.h | 1 - arch/mn10300/Kconfig | 2 -- arch/mn10300/include/asm/unistd.h | 1 - arch/openrisc/Kconfig | 2 -- arch/openrisc/include/uapi/asm/unistd.h | 1 - arch/parisc/Kconfig | 2 -- arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/Kconfig | 2 -- arch/powerpc/include/asm/unistd.h | 1 - arch/s390/Kconfig | 2 -- arch/s390/include/asm/unistd.h | 1 - arch/score/Kconfig | 2 -- arch/score/include/asm/unistd.h | 1 - arch/sh/Kconfig | 2 -- arch/sh/include/asm/unistd.h | 1 - arch/sparc/Kconfig | 2 -- arch/sparc/include/asm/unistd.h | 1 - arch/tile/Kconfig | 2 -- arch/tile/include/asm/unistd.h | 1 - arch/unicore32/Kconfig | 2 -- arch/unicore32/include/uapi/asm/unistd.h | 1 - arch/x86/Kconfig | 2 -- arch/x86/include/asm/unistd.h | 1 - arch/x86/um/Kconfig | 2 -- arch/xtensa/Kconfig | 2 -- arch/xtensa/include/asm/unistd.h | 1 - fs/exec.c | 21 --------------------- include/linux/binfmts.h | 4 ---- include/linux/sched.h | 2 -- include/linux/syscalls.h | 9 --------- init/main.c | 4 +++- kernel/fork.c | 2 -- kernel/kmod.c | 6 +++--- 63 files changed, 6 insertions(+), 131 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/Kconfig b/arch/Kconfig index 8d698fb5ccc9..0a8dd0585d0d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -271,12 +271,6 @@ config ARCH_WANT_OLD_COMPAT_IPC select ARCH_WANT_COMPAT_IPC_PARSE_VERSION bool -config GENERIC_KERNEL_THREAD - bool - -config GENERIC_KERNEL_EXECVE - bool - config HAVE_ARCH_SECCOMP_FILTER bool help diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5dd7f5db24d4..7e3710c0cce5 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -20,8 +20,6 @@ config ALPHA select GENERIC_CMOS_UPDATE select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA help diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index eb3a4664ced2..68c75f2fadb9 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -481,7 +481,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8918a2dd89b4..b789654e7e2f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -11,8 +11,6 @@ config ARM select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select GENERIC_PCI_IOMAP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 7cd13cc62624..21a2700d2957 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -41,7 +41,6 @@ #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_SOCKETCALL #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4b03c56ec329..a846029bebcc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -7,8 +7,6 @@ config ARM64 select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_KERNEL_EXECVE - select GENERIC_KERNEL_THREAD select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d69aeea6da1e..738322945d1a 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -27,6 +27,5 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 649aeb9acecb..06e73bf665e9 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -17,8 +17,6 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h index f05a9804e8e2..0bdf6371574e 100644 --- a/arch/avr32/include/asm/unistd.h +++ b/arch/avr32/include/asm/unistd.h @@ -39,7 +39,6 @@ #define __ARCH_WANT_SYS_GETPGRP #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index ab9ff4075f4d..b6f3ad5441c5 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -45,8 +45,6 @@ config BLACKFIN select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config GENERIC_CSUM def_bool y diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 460514a1a4e1..38711e28baac 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -446,7 +446,6 @@ #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_VFORK /* diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 66eab3703c75..f6a3648f5ec3 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -17,8 +17,6 @@ config C6X select OF select OF_EARLY_FLATTREE select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config MMU diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h index f3987a8703d9..e7d09a614d10 100644 --- a/arch/c6x/include/uapi/asm/unistd.h +++ b/arch/c6x/include/uapi/asm/unistd.h @@ -14,7 +14,6 @@ * more details. */ -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE /* Use the standard ABI for syscalls. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 0cac6a49f230..c59a01dd9c0c 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -49,8 +49,6 @@ config CRIS select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 select GENERIC_CMOS_UPDATE select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS2 config HZ diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index f27b542e0ebc..5cda75a9cc1e 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -371,7 +371,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index df2eb4bd9fa2..9d262645f667 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -12,8 +12,6 @@ config FRV select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES select ARCH_WANT_IPC_PARSE_VERSION - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config ZONE_DMA bool diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 1807d8ea8cb5..d685da17f5fb 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 04bef4d25b4a..98fabd10e95f 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -8,8 +8,6 @@ config H8300 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SYMBOL_PREFIX string diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index c2c2f5c7d6bf..566f94860c45 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -356,7 +356,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index e418803b6c8e..0744f7d7b1fd 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,8 +31,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index 2af81533bd0f..4a87cc47075c 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -27,7 +27,6 @@ */ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 670600468128..3279646120e3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,8 +42,6 @@ config IA64 select GENERIC_TIME_VSYSCALL_OLD select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 1574bca86138..8b3ff2f5b861 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 5183f43a2cf7..f807721e19a5 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -15,8 +15,6 @@ config M32R select GENERIC_ATOMIC64 select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SBUS bool diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index d9e7351af2a4..cbfa39158fef 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -352,7 +352,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 953a7ba5d050..6710084e072a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -15,8 +15,6 @@ config M68K select FPU if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index a021d67cdd72..847994ce6804 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -31,7 +31,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 4bcf89148f3c..ba3b7c8c04b8 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -26,8 +26,6 @@ config MICROBLAZE select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config SWAP diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 94d978986b75..38cabf4db548 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -422,7 +422,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK #ifdef CONFIG_MMU diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4183e62f178c..dba9390d37cf 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -40,8 +40,6 @@ config MIPS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA if 64BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE menu "Machine selection" diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index b306e2081cad..9e47cc11aa26 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -20,7 +20,6 @@ #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 72471744a912..aa03f2e13385 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -8,8 +8,6 @@ config MN10300 select HAVE_ARCH_KGDB select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config AM33_2 diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index cabf8ba73b27..e6d2ed4ba68f 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -43,7 +43,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e7f1a2993f78..05f2ba41ff1a 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -22,8 +22,6 @@ config OPENRISC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config MMU def_bool y diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 5082b8066325..ce40b71df006 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -20,7 +20,6 @@ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index e688a2be30f6..b77feffbadea 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -22,8 +22,6 @@ config PARISC select GENERIC_STRNCPY_FROM_USER select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS help diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 1efef41659c9..3043194547cd 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -163,7 +163,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 951a517a1a0f..17903f1f356b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,10 +141,8 @@ config PPC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config EARLY_PRINTK diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 76fe846ec40e..784872f93711 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -55,7 +55,6 @@ #define __ARCH_WANT_SYS_NEWFSTATAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3cbb8757704e..5029ebf7110e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -137,8 +137,6 @@ config S390 select GENERIC_CLOCKEVENTS select KTIME_SCALAR if 32BIT select HAVE_ARCH_SECCOMP_FILTER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 086bb8eaf6ab..636530872516 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -53,7 +53,6 @@ # define __ARCH_WANT_COMPAT_SYS_TIME # define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND # endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 45893390c7dd..3b1482e7afac 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -13,8 +13,6 @@ config SCORE select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS choice diff --git a/arch/score/include/asm/unistd.h b/arch/score/include/asm/unistd.h index 56001c93095a..9cb4260a5f3e 100644 --- a/arch/score/include/asm/unistd.h +++ b/arch/score/include/asm/unistd.h @@ -4,7 +4,6 @@ #define __ARCH_WANT_SYSCALL_NO_FLAGS #define __ARCH_WANT_SYSCALL_OFF_T #define __ARCH_WANT_SYSCALL_DEPRECATED -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8451317eed58..babc2b826c5c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -40,8 +40,6 @@ config SUPERH select GENERIC_STRNLEN_USER select HAVE_MOD_ARCH_SPECIFIC if DWARF_UNWINDER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index 43d3f26b2eab..012004ed3330 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -28,7 +28,6 @@ # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK # define __ARCH_WANT_SYS_RT_SIGACTION -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 0c7d365fa402..9f2edb5c5551 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -41,8 +41,6 @@ config SPARC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index c3e5d8b64171..0ecea6ed943e 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -46,7 +46,6 @@ #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE /* * "Conditional" syscalls diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index ea7f61e8bc9e..875d008828b8 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -21,8 +21,6 @@ config TILE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h index b51c6ee3cd6c..940831fe9e94 100644 --- a/arch/tile/include/asm/unistd.h +++ b/arch/tile/include/asm/unistd.h @@ -16,6 +16,5 @@ #define __ARCH_WANT_SYS_LLSEEK #endif #define __ARCH_WANT_SYS_NEWFSTATAT -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index c4fbb21e802b..60651df5f952 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -16,8 +16,6 @@ config UNICORE32 select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h index 00cf5e286fca..d4cc4559d848 100644 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ b/arch/unicore32/include/uapi/asm/unistd.h @@ -12,5 +12,4 @@ /* Use the standard ABI for syscalls. */ #include -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0df6e7d84539..01ca0ebaff0e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,8 +108,6 @@ config X86 select GENERIC_STRNLEN_USER select HAVE_RCU_USER_QS if X86_64 select HAVE_IRQ_TIME_ACCOUNTING - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 0e7dea7d3669..9dcfcc1d6f92 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -50,7 +50,6 @@ # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_SYS_WAITPID -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 8f51c39750d1..0fd20f241e40 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -13,8 +13,6 @@ endmenu config UML_X86 def_bool y select GENERIC_FIND_FIRST_BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config 64BIT bool "64-bit kernel" if SUBARCH = "x86" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2481f267be29..03a8c107e07e 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -13,8 +13,6 @@ config XTENSA select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA select GENERIC_PCI_IOMAP - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select ARCH_WANT_OPTIONAL_GPIOLIB select CLONE_BACKWARDS help diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index e002dbcc88b6..eb63ea87815c 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -1,7 +1,6 @@ #ifndef _XTENSA_UNISTD_H #define _XTENSA_UNISTD_H -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/fs/exec.c b/fs/exec.c index 721a29929511..090ac91da2e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1657,7 +1657,6 @@ int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -#ifdef __ARCH_WANT_SYS_EXECVE SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, @@ -1685,23 +1684,3 @@ asmlinkage long compat_sys_execve(const char __user * filename, return error; } #endif -#endif - -#ifdef __ARCH_WANT_KERNEL_EXECVE -int kernel_execve(const char *filename, - const char *const argv[], - const char *const envp[]) -{ - int ret = do_execve(filename, - (const char __user *const __user *)argv, - (const char __user *const __user *)envp); - if (ret < 0) - return ret; - - /* - * We were successful. We won't be returning to our caller, but - * instead to user space by manipulating the kernel stack. - */ - ret_from_kernel_execve(current_pt_regs()); -} -#endif diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 2630c9b41a86..8c1388c6ae27 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -121,8 +121,4 @@ extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); -#ifdef __ARCH_WANT_KERNEL_EXECVE -extern void ret_from_kernel_execve(struct pt_regs *normal) __noreturn; -#endif - #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1162258bcaf0..9e5a54e3d84f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,7 @@ extern int do_execve(const char *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); -#ifdef CONFIG_GENERIC_KERNEL_THREAD extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -#endif extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 91835e7f364d..9fe5f946526e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,6 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, const char __user *pathname); asmlinkage long sys_syncfs(int fd); -#ifndef CONFIG_GENERIC_KERNEL_EXECVE -int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]); -#else -#define kernel_execve(filename, argv, envp) \ - do_execve(filename, \ - (const char __user *const __user *)argv, \ - (const char __user *const __user *)envp) -#endif - asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS diff --git a/init/main.c b/init/main.c index e33e09df3cbc..155ac208d581 100644 --- a/init/main.c +++ b/init/main.c @@ -797,7 +797,9 @@ static void __init do_pre_smp_initcalls(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - return kernel_execve(init_filename, argv_init, envp_init); + return do_execve(init_filename, + (const char __user *const __user *)argv_init, + (const char __user *const __user *)envp_init); } static void __init kernel_init_freeable(void); diff --git a/kernel/fork.c b/kernel/fork.c index 540730783433..389712ffc0ad 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1623,7 +1623,6 @@ long do_fork(unsigned long clone_flags, return nr; } -#ifdef CONFIG_GENERIC_KERNEL_THREAD /* * Create a kernel thread. */ @@ -1632,7 +1631,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } -#endif #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e386831..0023a87e8de6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); + retval = do_execve(sub_info->path, + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); if (!retval) return 0; -- cgit v1.2.3 From 774a1221e862b343388347bac9b318767336b20b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Jan 2013 18:52:51 -0800 Subject: module, async: async_synchronize_full() on module init iff async is used If the default iosched is built as module, the kernel may deadlock while trying to load the iosched module on device probe if the probing was running off async. This is because async_synchronize_full() at the end of module init ends up waiting for the async job which initiated the module loading. async A modprobe 1. finds a device 2. registers the block device 3. request_module(default iosched) 4. modprobe in userland 5. load and init module 6. async_synchronize_full() Async A waits for modprobe to finish in request_module() and modprobe waits for async A to finish in async_synchronize_full(). Because there's no easy to track dependency once control goes out to userland, implementing properly nested flushing is difficult. For now, make module init perform async_synchronize_full() iff module init has queued async jobs as suggested by Linus. This avoids the described deadlock because iosched module doesn't use async and thus wouldn't invoke async_synchronize_full(). This is hacky and incomplete. It will deadlock if async module loading nests; however, this works around the known problem case and seems to be the best of bad options. For more details, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Tested-by: Ming Lei Tested-by: Alex Riesen Cc: Arjan van de Ven Cc: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/async.c | 3 +++ kernel/module.c | 27 +++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 206bb089c06b..6fc8f45de4e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1810,6 +1810,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..a1d585c351d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -196,6 +196,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..b10b048367e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3013,6 +3013,12 @@ static int do_init_module(struct module *mod) { int ret = 0; + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -3058,8 +3064,25 @@ static int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by perfomring async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); mutex_lock(&module_mutex); /* Drop initial reference. */ -- cgit v1.2.3 From 910ffdb18a6408e14febbb6e4b6840fd2c928c82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:47:41 +0100 Subject: ptrace: introduce signal_wake_up_state() and ptrace_signal_wake_up() Cleanup and preparation for the next change. signal_wake_up(resume => true) is overused. None of ptrace/jctl callers actually want to wakeup a TASK_WAKEKILL task, but they can't specify the necessary mask. Turn signal_wake_up() into signal_wake_up_state(state), reintroduce signal_wake_up() as a trivial helper, and add ptrace_signal_wake_up() which adds __TASK_TRACED. This way ptrace_signal_wake_up() can work "inside" ptrace_request() even if the tracee doesn't have the TASK_WAKEKILL bit set. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 ++++++++++- kernel/ptrace.c | 8 ++++---- kernel/signal.c | 14 ++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fc8f45de4e9..d2112477ff5e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2714,7 +2714,16 @@ static inline void thread_group_cputime_init(struct signal_struct *sig) extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); -extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} /* * Wrappers for p->thread_info->cpu access. No-op on UP. diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 612a56126851..62f7c2774b16 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. */ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } @@ -317,7 +317,7 @@ static int ptrace_attach(struct task_struct *task, long request, */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); spin_unlock(&task->sighand->siglock); @@ -737,7 +737,7 @@ int ptrace_request(struct task_struct *child, long request, * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; @@ -763,7 +763,7 @@ int ptrace_request(struct task_struct *child, long request, * start of this trap and now. Trigger re-trap. */ if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); + ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); diff --git a/kernel/signal.c b/kernel/signal.c index 53cd5c4d1172..6e97aa6fa32c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * No need to set need_resched since signal event passing * goes through ->blocked */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state) { - unsigned int mask; - set_tsk_thread_flag(t, TIF_SIGPENDING); - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable + * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) + if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } @@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t) assert_spin_locked(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* -- cgit v1.2.3 From ace783b9bbfa2182b4a561498db3f09a0c56bc79 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:30:48 +0800 Subject: sched: split out css_online/css_offline from tg creation/destruction This is a preparaton for later patches. - What do we gain from cpu_cgroup_css_online(): After ss->css_alloc() and before ss->css_online(), there's a small window that tg->css.cgroup is NULL. With this change, tg won't be seen before ss->css_online(), where it's added to the global list, so we're guaranteed we'll never see NULL tg->css.cgroup. - What do we gain from cpu_cgroup_css_offline(): tg is freed via RCU, so is cgroup. Without this change, This is how synchronization works: cgroup_rmdir() no ss->css_offline() diput() syncornize_rcu() ss->css_free() <-- unregister tg, and free it via call_rcu() kfree_rcu(cgroup) <-- wait possible refs to cgroup, and free cgroup We can't just kfree(cgroup), because tg might access tg->css.cgroup. With this change: cgroup_rmdir() ss->css_offline() <-- unregister tg diput() synchronize_rcu() <-- wait possible refs to tg and cgroup ss->css_free() <-- free tg kfree_rcu(cgroup) <-- free cgroup As you see, kfree_rcu() is redundant now. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/auto_group.c | 3 +++ kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 45 insertions(+), 10 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 206bb089c06b..577eb973de7a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2750,7 +2750,10 @@ extern void normalize_rt_tasks(void); extern struct task_group root_task_group; extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; + sched_online_group(tg, &root_task_group); + kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..106167243d68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7159,7 +7159,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7171,6 +7170,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7180,12 +7190,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7197,6 +7201,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) { unsigned long flags; int i; @@ -7209,9 +7219,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7563,6 +7570,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7570,6 +7590,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) sched_destroy_group(tg); } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -7925,6 +7952,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, -- cgit v1.2.3 From 57d2aa00dcec67afa52478730f2b524521af14fb Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 17 Jul 2012 15:03:43 +0800 Subject: sched/rt: Avoid updating RT entry timeout twice within one tick period The issue below was found in 2.6.34-rt rather than mainline rt kernel, but the issue still exists upstream as well. So please let me describe how it was noticed on 2.6.34-rt: On this version, each softirq has its own thread, it means there is at least one RT FIFO task per cpu. The priority of these tasks is set to 49 by default. If user launches an RT FIFO task with priority lower than 49 of softirq RT tasks, it's possible there are two RT FIFO tasks enqueued one cpu runqueue at one moment. By current strategy of balancing RT tasks, when it comes to RT tasks, we really need to put them off to a CPU that they can run on as soon as possible. Even if it means a bit of cache line flushing, we want RT tasks to be run with the least latency. When the user RT FIFO task which just launched before is running, the sched timer tick of the current cpu happens. In this tick period, the timeout value of the user RT task will be updated once. Subsequently, we try to wake up one softirq RT task on its local cpu. As the priority of current user RT task is lower than the softirq RT task, the current task will be preempted by the higher priority softirq RT task. Before preemption, we check to see if current can readily move to a different cpu. If so, we will reschedule to allow the RT push logic to try to move current somewhere else. Whenever the woken softirq RT task runs, it first tries to migrate the user FIFO RT task over to a cpu that is running a task of lesser priority. If migration is done, it will send a reschedule request to the found cpu by IPI interrupt. Once the target cpu responds the IPI interrupt, it will pick the migrated user RT task to preempt its current task. When the user RT task is running on the new cpu, the sched timer tick of the cpu fires. So it will tick the user RT task again. This also means the RT task timeout value will be updated again. As the migration may be done in one tick period, it means the user RT task timeout value will be updated twice within one tick. If we set a limit on the amount of cpu time for the user RT task by setrlimit(RLIMIT_RTTIME), the SIGXCPU signal should be posted upon reaching the soft limit. But exactly when the SIGXCPU signal should be sent depends on the RT task timeout value. In fact the timeout mechanism of sending the SIGXCPU signal assumes the RT task timeout is increased once every tick. However, currently the timeout value may be added twice per tick. So it results in the SIGXCPU signal being sent earlier than expected. To solve this issue, we prevent the timeout value from increasing twice within one tick time by remembering the jiffies value of last updating the timeout. As long as the RT task's jiffies is different with the global jiffies value, we allow its timeout to be updated. Signed-off-by: Ying Xue Signed-off-by: Fan Du Reviewed-by: Yong Zhang Acked-by: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/1342508623-2887-1-git-send-email-ying.xue@windriver.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/rt.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2112477ff5e..924e42a8df58 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1208,6 +1208,7 @@ struct sched_entity { struct sched_rt_entity { struct list_head run_list; unsigned long timeout; + unsigned long watchdog_stamp; unsigned int time_slice; struct sched_rt_entity *back; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 29bda5bdf2a5..2f69ca997826 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1988,7 +1988,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (soft != RLIM_INFINITY) { unsigned long next; - p->rt.timeout++; + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; -- cgit v1.2.3 From 6fac4829ce0ef9b7f24369086ce5f0e9f38d37bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Nov 2012 14:20:55 +0100 Subject: cputime: Use accessors to read task cputime stats This is in preparation for the full dynticks feature. While remotely reading the cputime of a task running in a full dynticks CPU, we'll need to do some extra-computation. This way we can account the time it spent tickless in userspace since its last cputime snapshot. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- arch/alpha/kernel/osf_sys.c | 6 ++++-- arch/x86/kernel/apm_32.c | 11 ++++++----- drivers/isdn/mISDN/stack.c | 7 ++++++- fs/binfmt_elf.c | 8 ++++++-- fs/binfmt_elf_fdpic.c | 7 +++++-- fs/proc/array.c | 4 ++-- include/linux/sched.h | 23 +++++++++++++++++++++++ include/linux/tsacct_kern.h | 3 +++ kernel/acct.c | 6 ++++-- kernel/cpu.c | 4 +++- kernel/delayacct.c | 7 +++++-- kernel/exit.c | 10 ++++++---- kernel/posix-cpu-timers.c | 28 ++++++++++++++++++++++------ kernel/sched/cputime.c | 13 +++++++------ kernel/signal.c | 12 ++++++++---- kernel/tsacct.c | 44 ++++++++++++++++++++++++++++++++++---------- 16 files changed, 144 insertions(+), 49 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 14db93e4c8a8..dbc1760f418b 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1139,6 +1139,7 @@ struct rusage32 { SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) { struct rusage32 r; + cputime_t utime, stime; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) return -EINVAL; @@ -1146,8 +1147,9 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) memset(&r, 0, sizeof(r)); switch (who) { case RUSAGE_SELF: - jiffies_to_timeval32(current->utime, &r.ru_utime); - jiffies_to_timeval32(current->stime, &r.ru_stime); + task_cputime(current, &utime, &stime); + jiffies_to_timeval32(utime, &r.ru_utime); + jiffies_to_timeval32(stime, &r.ru_stime); r.ru_minflt = current->min_flt; r.ru_majflt = current->maj_flt; break; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index d65464e43503..8d7012b7f402 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -899,6 +899,7 @@ static void apm_cpu_idle(void) static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ + cputime_t stime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; @@ -906,23 +907,23 @@ static void apm_cpu_idle(void) WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012"); recalc: + task_cputime(current, NULL, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = current->stime - last_stime; + idle_percentage = stime - last_stime; idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); if (apm_info.forbid_idle) use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; } + last_jiffies = jiffies; + last_stime = stime; + bucket = IDLE_LEAKY_MAX; while (!need_resched()) { diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 5f21f629b7ae..deda591f70b9 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "core.h" static u_int *debug; @@ -202,6 +203,9 @@ static int mISDNStackd(void *data) { struct mISDNstack *st = data; +#ifdef MISDN_MSG_STATS + cputime_t utime, stime; +#endif int err = 0; sigfillset(¤t->blocked); @@ -303,9 +307,10 @@ mISDNStackd(void *data) "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); + task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%ld) stime(%ld)\n", - dev_name(&st->dev->dev), st->thread->utime, st->thread->stime); + dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdbabecf..49d0b43458b7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index dc84732e554f..cb240dd3b402 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcbd..f7ed9ee46eb9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime += task_gtime(t); t = next_thread(t); } while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_cputime_adjusted(task, &utime, &stime); - gtime = task->gtime; + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fc8f45de4e9..a9c608b6154e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1792,6 +1792,29 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +static inline cputime_t task_gtime(struct task_struct *t) +{ + return t->gtime; +} + +static inline void task_cputime(struct task_struct *t, + cputime_t *utime, cputime_t *stime) +{ + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; +} + +static inline void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, + cputime_t *stimescaled) +{ + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; +} extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h index 44893e5ec8f7..3251965bf4cc 100644 --- a/include/linux/tsacct_kern.h +++ b/include/linux/tsacct_kern.h @@ -23,12 +23,15 @@ static inline void bacct_add_tsk(struct user_namespace *user_ns, #ifdef CONFIG_TASK_XACCT extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p); extern void acct_update_integrals(struct task_struct *tsk); +extern void acct_account_cputime(struct task_struct *tsk); extern void acct_clear_integrals(struct task_struct *tsk); #else static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) {} static inline void acct_update_integrals(struct task_struct *tsk) {} +static inline void acct_account_cputime(struct task_struct *tsk) +{} static inline void acct_clear_integrals(struct task_struct *tsk) {} #endif /* CONFIG_TASK_XACCT */ diff --git a/kernel/acct.c b/kernel/acct.c index 051e071a06e7..e8b1627ab9c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -566,6 +566,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; + cputime_t utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; + task_cputime(current, &utime, &stime); + pacct->ac_utime += utime; + pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..e5d5e8e1e030 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu) static inline void check_for_tasks(int cpu) { struct task_struct *p; + cputime_t utime, stime; write_lock_irq(&tasklist_lock); for_each_process(p) { + task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) + (utime || stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053aa..d473988c1d0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) unsigned long long t2, t3; unsigned long flags; struct timespec ts; + cputime_t utime, stime, stimescaled, utimescaled; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) goto done; tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); + task_cputime(tsk, &utime, &stime); + cputime_to_timespec(utime + stime, &ts); tmp += timespec_to_ns(&ts); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + cputime_to_timespec(utimescaled + stimescaled, &ts); tmp += timespec_to_ns(&ts); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/exit.c b/kernel/exit.c index b4df21937216..7dd20408707c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); + cputime_t utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) sig = p->signal; psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; + psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d6..165d47698477 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer, static inline cputime_t prof_ticks(struct task_struct *p) { - return p->utime + p->stime; + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return utime + stime; } static inline cputime_t virt_ticks(struct task_struct *p) { - return p->utime; + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return utime; } static int @@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + cputime_t utime, stime; + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + utime, stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime; + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, + utime + sig->utime, stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, + .utime = utime, + .stime = stime, .sum_exec_runtime = tsk->se.sum_exec_runtime }; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e1939d38bf73..c533deaf06d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -164,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for user time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -214,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -296,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; struct task_struct *t; times->utime = sig->utime; @@ -309,8 +310,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime += t->utime; - times->stime += t->stime; + task_cputime(tsk, &utime, &stime); + times->utime += utime; + times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: @@ -588,11 +590,10 @@ static void cputime_adjust(struct task_cputime *curr, void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime = { - .utime = p->utime, - .stime = p->stime, .sum_exec_runtime = p->se.sum_exec_runtime, }; + task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } diff --git a/kernel/signal.c b/kernel/signal.c index 372771e948c2..776a45a3661b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1638,6 +1638,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; bool autoreap = false; + cputime_t utime, stime; BUG_ON(sig == -1); @@ -1675,8 +1676,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); + info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1740,6 +1742,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; + cputime_t utime, stime; if (for_ptracer) { parent = tsk->parent; @@ -1758,8 +1761,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime); - info.si_stime = cputime_to_clock_t(tsk->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime); + info.si_stime = cputime_to_clock_t(stime); info.si_code = why; switch (why) { diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 625df0b44690..a1dd9a1b1327 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; struct timespec uptime, ts; + cputime_t utime, stime, utimescaled, stimescaled; u64 ac_etime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_ppid = pid_alive(tsk) ? task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_usecs(tsk->utime); - stats->ac_stime = cputime_to_usecs(tsk->stime); - stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); - stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); + + task_cputime(tsk, &utime, &stime); + stats->ac_utime = cputime_to_usecs(utime); + stats->ac_stime = cputime_to_usecs(stime); + + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + stats->ac_utimescaled = cputime_to_usecs(utimescaled); + stats->ac_stimescaled = cputime_to_usecs(stimescaled); + stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; @@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) #undef KB #undef MB -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) +static void __acct_update_integrals(struct task_struct *tsk, + cputime_t utime, cputime_t stime) { if (likely(tsk->mm)) { cputime_t time, dtime; @@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk) u64 delta; local_irq_save(flags); - time = tsk->stime + tsk->utime; + time = stime + utime; dtime = time - tsk->acct_timexpd; jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; @@ -144,6 +147,27 @@ void acct_update_integrals(struct task_struct *tsk) } } +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); + __acct_update_integrals(tsk, utime, stime); +} + +/** + * acct_account_cputime - update mm integral after cputime update + * @tsk: task_struct for accounting + */ +void acct_account_cputime(struct task_struct *tsk) +{ + __acct_update_integrals(tsk, tsk->utime, tsk->stime); +} + /** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared -- cgit v1.2.3 From 6a61671bb2f3a1bd12cd17b8fca811a624782632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Dec 2012 20:00:34 +0100 Subject: cputime: Safely read cputime of full dynticks CPUs While remotely reading the cputime of a task running in a full dynticks CPU, the values stored in utime/stime fields of struct task_struct may be stale. Its values may be those of the last kernel <-> user transition time snapshot and we need to add the tickless time spent since this snapshot. To fix this, flush the cputime of the dynticks CPUs on kernel <-> user transition and record the time / context where we did this. Then on top of this snapshot and the current time, perform the fixup on the reader side from task_times() accessors. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner [fixed kvm module related build errors] Signed-off-by: Sedat Dilek --- arch/s390/kernel/vtime.c | 6 +- include/linux/hardirq.h | 4 +- include/linux/init_task.h | 11 +++ include/linux/kvm_host.h | 20 ++++- include/linux/sched.h | 27 +++++-- include/linux/vtime.h | 47 ++++++----- kernel/context_tracking.c | 21 ++++- kernel/fork.c | 6 ++ kernel/sched/core.c | 1 + kernel/sched/cputime.c | 193 +++++++++++++++++++++++++++++++++++++++++++--- kernel/softirq.c | 6 +- 11 files changed, 290 insertions(+), 52 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index e84b8b68444a..ce9cc5aa2033 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -127,7 +127,7 @@ void vtime_account_user(struct task_struct *tsk) * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void vtime_account(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); u64 timer, system; @@ -145,10 +145,10 @@ void vtime_account(struct task_struct *tsk) virt_timer_forward(system); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); void vtime_account_system(struct task_struct *tsk) -__attribute__((alias("vtime_account"))); +__attribute__((alias("vtime_account_irq_enter"))); EXPORT_SYMBOL_GPL(vtime_account_system); void __kprobes vtime_stop_cpu(void) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 624ef3f45c8e..7105d5cbb762 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -153,7 +153,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - vtime_account_irq_enter(current); \ + account_irq_enter_time(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -169,7 +169,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - vtime_account_irq_exit(current); \ + account_irq_exit_time(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6d087c5f57f7..cc898b871cef 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #ifdef CONFIG_SMP @@ -141,6 +142,15 @@ extern struct task_group root_task_group; # define INIT_PERF_EVENTS(tsk) #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +# define INIT_VTIME(tsk) \ + .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \ + .vtime_snap = 0, \ + .vtime_snap_whence = VTIME_SYS, +#else +# define INIT_VTIME(tsk) +#endif + #define INIT_TASK_COMM "swapper" /* @@ -210,6 +220,7 @@ extern struct task_group root_task_group; INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ INIT_CPUSET_SEQ \ + INIT_VTIME(tsk) \ } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4fe2396401da..b7996a768eb2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -741,7 +741,7 @@ static inline int kvm_deassign_device(struct kvm *kvm, } #endif /* CONFIG_IOMMU_API */ -static inline void guest_enter(void) +static inline void __guest_enter(void) { /* * This is running in ioctl context so we can avoid @@ -751,7 +751,7 @@ static inline void guest_enter(void) current->flags |= PF_VCPU; } -static inline void guest_exit(void) +static inline void __guest_exit(void) { /* * This is running in ioctl context so we can avoid @@ -761,6 +761,22 @@ static inline void guest_exit(void) current->flags &= ~PF_VCPU; } +#ifdef CONFIG_CONTEXT_TRACKING +extern void guest_enter(void); +extern void guest_exit(void); + +#else /* !CONFIG_CONTEXT_TRACKING */ +static inline void guest_enter(void) +{ + __guest_enter(); +} + +static inline void guest_exit(void) +{ + __guest_exit(); +} +#endif /* !CONFIG_CONTEXT_TRACKING */ + static inline void kvm_guest_enter(void) { unsigned long flags; diff --git a/include/linux/sched.h b/include/linux/sched.h index a9c608b6154e..a9fa5145e1a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1367,6 +1367,15 @@ struct task_struct { cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING struct cputime prev_cputime; +#endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqlock_t vtime_seqlock; + unsigned long long vtime_snap; + enum { + VTIME_SLEEPING = 0, + VTIME_USER, + VTIME_SYS, + } vtime_snap_whence; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ @@ -1792,11 +1801,13 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -static inline cputime_t task_gtime(struct task_struct *t) -{ - return t->gtime; -} - +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void task_cputime(struct task_struct *t, + cputime_t *utime, cputime_t *stime); +extern void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled); +extern cputime_t task_gtime(struct task_struct *t); +#else static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { @@ -1815,6 +1826,12 @@ static inline void task_cputime_scaled(struct task_struct *t, if (stimescaled) *stimescaled = t->stimescaled; } + +static inline cputime_t task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index bb50c3ca0d79..71a5782d8c59 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -8,35 +8,44 @@ extern void vtime_task_switch(struct task_struct *prev); extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk); -extern void vtime_account(struct task_struct *tsk); +extern void vtime_account_irq_enter(struct task_struct *tsk); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern bool vtime_accounting_enabled(void); -#else +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_enabled(void) { return true; } #endif #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ + static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_user(struct task_struct *tsk) { } -static inline void vtime_account(struct task_struct *tsk) { } +static inline void vtime_account_irq_enter(struct task_struct *tsk) { } static inline bool vtime_accounting_enabled(void) { return false; } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void arch_vtime_task_switch(struct task_struct *tsk) { } -static inline void vtime_user_enter(struct task_struct *tsk) -{ - vtime_account_system(tsk); -} +extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_account_irq_exit(struct task_struct *tsk); +extern bool vtime_accounting_enabled(void); +extern void vtime_user_enter(struct task_struct *tsk); static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); } +extern void vtime_guest_enter(struct task_struct *tsk); +extern void vtime_guest_exit(struct task_struct *tsk); +extern void vtime_init_idle(struct task_struct *tsk); #else +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ + /* On hard|softirq exit we always account to hard|softirq cputime */ + vtime_account_system(tsk); +} static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } +static inline void vtime_guest_enter(struct task_struct *tsk) { } +static inline void vtime_guest_exit(struct task_struct *tsk) { } +static inline void vtime_init_idle(struct task_struct *tsk) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -45,25 +54,15 @@ extern void irqtime_account_irq(struct task_struct *tsk); static inline void irqtime_account_irq(struct task_struct *tsk) { } #endif -static inline void vtime_account_irq_enter(struct task_struct *tsk) +static inline void account_irq_enter_time(struct task_struct *tsk) { - /* - * Hardirq can interrupt idle task anytime. So we need vtime_account() - * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING. - * Softirq can also interrupt idle task directly if it calls - * local_bh_enable(). Such case probably don't exist but we never know. - * Ksoftirqd is not concerned because idle time is flushed on context - * switch. Softirqs in the end of hardirqs are also not a problem because - * the idle time is flushed on hardirq time already. - */ - vtime_account(tsk); + vtime_account_irq_enter(tsk); irqtime_account_irq(tsk); } -static inline void vtime_account_irq_exit(struct task_struct *tsk) +static inline void account_irq_exit_time(struct task_struct *tsk) { - /* On hard|softirq exit we always account to hard|softirq cputime */ - vtime_account_system(tsk); + vtime_account_irq_exit(tsk); irqtime_account_irq(tsk); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9002e92e6372..74f68f4dc6c2 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,8 +1,9 @@ #include +#include #include #include #include - +#include DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_FORCE @@ -61,6 +62,24 @@ void user_exit(void) local_irq_restore(flags); } +void guest_enter(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_enter(current); + else + __guest_enter(); +} +EXPORT_SYMBOL_GPL(guest_enter); + +void guest_exit(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_exit(current); + else + __guest_exit(); +} +EXPORT_SYMBOL_GPL(guest_exit); + void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { diff --git a/kernel/fork.c b/kernel/fork.c index 65ca6d27f24e..e68a95b4cf26 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqlock_init(&p->vtime_seqlock); + p->vtime_snap = 0; + p->vtime_snap_whence = VTIME_SLEEPING; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..261022d7e79d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4666,6 +4666,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a44ecdf809a1..082e05d915b4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -492,7 +492,7 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { if (!vtime_accounting_enabled()) return; @@ -516,7 +516,7 @@ void vtime_account(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ @@ -600,28 +600,55 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static DEFINE_PER_CPU(unsigned long long, cputime_snap); +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = sched_clock(); + if (clock < tsk->vtime_snap) + return 0; -static cputime_t get_vtime_delta(void) + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta; + unsigned long long delta = vtime_delta(tsk); - delta = sched_clock() - __this_cpu_read(cputime_snap); - __this_cpu_add(cputime_snap, delta); + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; /* CHECKME: always safe to convert nsecs to cputime? */ return nsecs_to_cputime(delta); } +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + void vtime_account_system(struct task_struct *tsk) { - cputime_t delta_cpu; + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} +void vtime_account_irq_exit(struct task_struct *tsk) +{ if (!vtime_accounting_enabled()) return; - delta_cpu = get_vtime_delta(); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + write_seqlock(&tsk->vtime_seqlock); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_account_user(struct task_struct *tsk) @@ -631,14 +658,44 @@ void vtime_account_user(struct task_struct *tsk) if (!vtime_accounting_enabled()) return; - delta_cpu = get_vtime_delta(); + delta_cpu = get_vtime_delta(tsk); + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); } void vtime_account_idle(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(); + cputime_t delta_cpu = get_vtime_delta(tsk); account_idle_time(delta_cpu); } @@ -647,4 +704,116 @@ bool vtime_accounting_enabled(void) { return context_tracking_active(); } + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock(); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock(); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned long flags; + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned long flags; + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..f5cc25f147a6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - vtime_account_irq_enter(current); + account_irq_enter_time(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart: lockdep_softirq_exit(); - vtime_account_irq_exit(current); + account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - vtime_account_irq_exit(current); + account_irq_exit_time(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.3 From 5a1b98d3096f1d780045f9be812335ad77aed93d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Nov 2012 13:28:21 -0500 Subject: new helper: sigsp() Normal logics for altstack handling in sigframe allocation Signed-off-by: Al Viro --- include/linux/sched.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fc8f45de4e9..8f983293b403 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2305,6 +2305,17 @@ static inline int sas_ss_flags(unsigned long sp) : on_sig_stack(sp) ? SS_ONSTACK : 0); } +static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) +{ + if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) +#ifdef CONFIG_STACK_GROWSUP + return current->sas_ss_sp; +#else + return current->sas_ss_sp + current->sas_ss_size; +#endif + return sp; +} + /* * Routines for handling mm_structs */ -- cgit v1.2.3 From cf4aebc292fac7f34f8345664320e9d4a42ca76c Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:46:59 -0600 Subject: sched: Move sched.h sysctl bits into separate header Move the sysctl-related bits from include/linux/sched.h into a new file: include/linux/sched/sysctl.h. Then update source files requiring access to those bits by including the new header file. Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094659.06dced96@riff.lan Signed-off-by: Ingo Molnar --- block/blk-exec.c | 1 + include/linux/sched.h | 91 ----------------------------------------- include/linux/sched/sysctl.h | 97 ++++++++++++++++++++++++++++++++++++++++++++ init/init_task.c | 1 + kernel/hrtimer.c | 1 + kernel/sched/sched.h | 1 + kernel/sysctl.c | 1 + kernel/timer.c | 1 + mm/mmap.c | 1 + mm/mremap.c | 1 + mm/nommu.c | 1 + 11 files changed, 106 insertions(+), 91 deletions(-) create mode 100644 include/linux/sched/sysctl.h (limited to 'include/linux/sched.h') diff --git a/block/blk-exec.c b/block/blk-exec.c index 74638ec234c8..c88202f973d9 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "blk.h" diff --git a/include/linux/sched.h b/include/linux/sched.h index 719ee0815e3a..8fc9b2710a80 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -304,19 +304,6 @@ static inline void lockup_detector_init(void) } #endif -#ifdef CONFIG_DETECT_HUNG_TASK -extern unsigned int sysctl_hung_task_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#else -/* Avoid need for ifdefs elsewhere in the code */ -enum { sysctl_hung_task_timeout_secs = 0 }; -#endif - /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -338,23 +325,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - #include #ifdef CONFIG_MMU @@ -1221,12 +1191,6 @@ struct sched_rt_entity { #endif }; -/* - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define RR_TIMESLICE (100 * HZ / 1000) - struct rcu_node; enum perf_event_task_context { @@ -2074,58 +2038,7 @@ extern void wake_up_idle_cpu(int cpu); static inline void wake_up_idle_cpu(int cpu) { } #endif -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; - -enum sched_tunable_scaling { - SCHED_TUNABLESCALING_NONE, - SCHED_TUNABLESCALING_LOG, - SCHED_TUNABLESCALING_LINEAR, - SCHED_TUNABLESCALING_END, -}; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; - -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; -extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; - -#ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; -extern unsigned int sysctl_sched_shares_window; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); -#endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - #ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; - extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); @@ -2141,10 +2054,6 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h new file mode 100644 index 000000000000..bac914e458ca --- /dev/null +++ b/include/linux/sched/sysctl.h @@ -0,0 +1,97 @@ +#ifndef _SCHED_SYSCTL_H +#define _SCHED_SYSCTL_H + +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; +#endif + +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_child_runs_first; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_settle_count; + +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; +extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +#endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; +#endif + +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +#endif /* _SCHED_SYSCTL_H */ diff --git a/init/init_task.c b/init/init_task.c index 8b2f3996b035..a031ad14c950 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..8a9aa59d0d61 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc886441436a..ed8de30a040e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,6 @@ #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..7357e23aaf68 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include diff --git a/kernel/timer.c b/kernel/timer.c index 367d00858482..3e13baf3f0ea 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 35730ee9d515..5dee4a0bb49f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index e1031e1f6a61..f9766f460299 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index 79c3cac87afa..b20db4e22263 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 8bd75c77b7c6a3954140dd2e20346aef3efe4a35 Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:47:07 -0600 Subject: sched/rt: Move rt specific bits into new header file Move rt scheduler definitions out of include/linux/sched.h into new file include/linux/sched/rt.h Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094707.7b9f825f@riff.lan Signed-off-by: Ingo Molnar --- drivers/spi/spi.c | 2 +- drivers/staging/csr/bh.c | 2 +- drivers/staging/csr/unifi_sme.c | 2 +- drivers/tty/sysrq.c | 1 + fs/select.c | 1 + include/linux/sched.h | 55 ++----------------------------------- include/linux/sched/rt.h | 58 +++++++++++++++++++++++++++++++++++++++ init/init_task.c | 1 + kernel/futex.c | 1 + kernel/hrtimer.c | 1 + kernel/irq/manage.c | 1 + kernel/mutex.c | 1 + kernel/rtmutex-debug.c | 1 + kernel/rtmutex-tester.c | 1 + kernel/rtmutex.c | 1 + kernel/sched/cpupri.c | 2 ++ kernel/sched/sched.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_sched_wakeup.c | 2 +- kernel/watchdog.c | 1 + mm/page-writeback.c | 1 + mm/page_alloc.c | 1 + 22 files changed, 81 insertions(+), 57 deletions(-) create mode 100644 include/linux/sched/rt.h (limited to 'include/linux/sched.h') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 19ee901577da..3a6083b386a1 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/csr/bh.c b/drivers/staging/csr/bh.c index 1a1f5c79822a..7b133597e923 100644 --- a/drivers/staging/csr/bh.c +++ b/drivers/staging/csr/bh.c @@ -15,7 +15,7 @@ */ #include "csr_wifi_hip_unifi.h" #include "unifi_priv.h" - +#include /* * --------------------------------------------------------------------------- diff --git a/drivers/staging/csr/unifi_sme.c b/drivers/staging/csr/unifi_sme.c index 7c6c4138fc76..49395da34b7f 100644 --- a/drivers/staging/csr/unifi_sme.c +++ b/drivers/staging/csr/unifi_sme.c @@ -15,7 +15,7 @@ #include "unifi_priv.h" #include "csr_wifi_hip_unifi.h" #include "csr_wifi_hip_conversions.h" - +#include diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index b3c4a250ff86..40e5b3919e27 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index 2ef72d965036..8c1c96c27062 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include #include #include +#include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fc9b2710a80..33cc42130371 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1164,6 +1164,7 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif + /* * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be * removed when useful for applications beyond shares distribution (e.g. @@ -1191,6 +1192,7 @@ struct sched_rt_entity { #endif }; + struct rcu_node; enum perf_event_task_context { @@ -1596,37 +1598,6 @@ static inline void set_numabalancing_state(bool enabled) } #endif -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) - -static inline int rt_prio(int prio) -{ - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; -} - -static inline int rt_task(struct task_struct *p) -{ - return rt_prio(p->prio); -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -2054,26 +2025,6 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -#ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} -#else -static inline int rt_mutex_getprio(struct task_struct *p) -{ - return p->normal_prio; -} -# define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} -#endif - extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); @@ -2703,8 +2654,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); -extern void normalize_rt_tasks(void); - #ifdef CONFIG_CGROUP_SCHED extern struct task_group root_task_group; diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h new file mode 100644 index 000000000000..94e19ea28fc3 --- /dev/null +++ b/include/linux/sched/rt.h @@ -0,0 +1,58 @@ +#ifndef _SCHED_RT_H +#define _SCHED_RT_H + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + +static inline int rt_prio(int prio) +{ + if (unlikely(prio < MAX_RT_PRIO)) + return 1; + return 0; +} + +static inline int rt_task(struct task_struct *p) +{ + return rt_prio(p->prio); +} + +#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_getprio(struct task_struct *p); +extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} +#else +static inline int rt_mutex_getprio(struct task_struct *p) +{ + return p->normal_prio; +} +# define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} +#endif + +extern void normalize_rt_tasks(void); + + +#endif /* _SCHED_RT_H */ diff --git a/init/init_task.c b/init/init_task.c index a031ad14c950..ba0a7f362d9e 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..9618b6e9fb36 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,7 @@ #include #include #include +#include #include diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8a9aa59d0d61..c5dde988c0ce 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa479..02115d9592ec 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "internals.h" diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c9526..52f23011b6e0 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,6 +19,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c8..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -17,6 +17,7 @@ * See rt.c in preempt-rt for proper credits and further information */ #include +#include #include #include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec49475460..7890b10084a7 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c993..1e09308bf2a1 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "rtmutex_common.h" diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53ee..1095e878a46f 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,8 @@ */ #include +#include +#include #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ed8de30a040e..cc03cfdf469f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,6 +1,7 @@ #include #include +#include #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d24..4d2e4afd956e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9fe45fcefca0..75aa97fbe1a1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,8 +15,8 @@ #include #include #include +#include #include - #include "trace.h" static struct trace_array *wakeup_trace; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3d0b02..27689422aa92 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0713bfbf0954..66a0024becd9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -35,6 +35,7 @@ #include /* __set_page_dirty_buffers */ #include #include +#include #include /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df2022ff0c8a..42d18e46f286 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From e9b04b5b67ec628a5e9a312e14b6864f8f73ba12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Nov 2012 11:14:10 -0500 Subject: make do_sigaltstack() static Signed-off-by: Al Viro --- include/linux/sched.h | 1 - kernel/signal.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8f983293b403..6dd06494997a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2259,7 +2259,6 @@ extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); -extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); static inline void restore_saved_sigmask(void) { diff --git a/kernel/signal.c b/kernel/signal.c index 87c09e3061d2..6919050c8156 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3129,7 +3129,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; -- cgit v1.2.3 From 1c3e826482ab698e418c7a894440e62c76aac893 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Wed, 20 Feb 2013 17:14:38 +0800 Subject: sched/core: Remove the obsolete and unused nr_uninterruptible() function Signed-off-by: Sha Zhengju Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1361351678-8065-1-git-send-email-handai.szj@taobao.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched/core.c | 22 ++-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33cc42130371..f9ca237df7e8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -98,7 +98,6 @@ extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); -extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern unsigned long this_cpu_load(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 03d7784b7bd2..b7b03cd2d4cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1969,11 +1969,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -1985,23 +1984,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; -- cgit v1.2.3 From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. The patch trys to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involed page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass stoarage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume need to wait for completion of concurrent runtime suspend. - during error handling of usb mass storage deivce, USB bus reset will be put on the device, so there shouldn't have any memory allocation with GFP_KERNEL during USB bus reset, otherwise the deadlock similar with above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_set() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Also the introduced solution can be used by block subsystem or block drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing actual I/O transfer. It is not a good idea to convert all these GFP_KERNEL in the affected path into GFP_NOIO because these functions doing that may be implemented as library and will be called in many other contexts. In fact, memalloc_noio_flags() can convert some of current static GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts, at least almost all GFP_NOIO in USB subsystem can be converted into GFP_KERNEL after applying the approach and make allocation with GFP_NOIO only happen in runtime resume/bus reset/block I/O transfer contexts generally. [1], several GFP_KERNEL allocation examples in runtime resume path - pci subsystem acpi_os_allocate <-acpi_ut_allocate <-ACPI_ALLOCATE_ZEROED <-acpi_evaluate_object <-__acpi_bus_set_power <-acpi_bus_set_power <-acpi_pci_set_power_state <-platform_pci_set_power_state <-pci_platform_power_transition <-__pci_complete_power_transition <-pci_set_power_state <-pci_restore_standard_config <-pci_pm_runtime_resume - usb subsystem usb_get_status <-finish_port_resume <-usb_port_resume <-generic_resume <-usb_resume_device <-usb_resume_both <-usb_runtime_resume - some individual usb drivers usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, .... That is just what I have found. Unfortunately, this allocation can only be found by human being now, and there should be many not found since any function in the resume path(call tree) may allocate memory with GFP_KERNEL. Signed-off-by: Ming Lei Signed-off-by: Minchan Kim Cc: Alan Stern Cc: Oliver Neukum Cc: Jiri Kosina Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Greg KH Cc: Jens Axboe Cc: "David S. Miller" Cc: Eric Dumazet Cc: David Decotigny Cc: Tom Herbert Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 22 ++++++++++++++++++++++ mm/page_alloc.c | 9 ++++++++- mm/vmscan.c | 4 ++-- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e4112aad2964..c2182b53dace 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include @@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ @@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ +static inline gfp_t memalloc_noio_flags(gfp_t flags) +{ + if (unlikely(current->flags & PF_MEMALLOC_NOIO)) + flags &= ~__GFP_IO; + return flags; +} + +static inline unsigned int memalloc_noio_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOIO; + current->flags |= PF_MEMALLOC_NOIO; + return flags; +} + +static inline void memalloc_noio_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; +} + /* * task->jobctl flags */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3fb290194c0..3ede25e6686e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2624,10 +2624,17 @@ retry_cpuset: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); - if (unlikely(!page)) + if (unlikely(!page)) { + /* + * Runtime PM, block IO and its error handling path + * can deadlock because I/O on the device might not + * complete. + */ + gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + } trace_mm_page_alloc(page, order, gfp_mask, migratetype); diff --git a/mm/vmscan.c b/mm/vmscan.c index b93968b71dc6..a68fa20269d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2352,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = gfp_mask, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .may_writepage = !laptop_mode, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, @@ -3313,7 +3313,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, }; -- cgit v1.2.3 From e579d2c259be42b6f29458327e5153b22414b031 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 27 Feb 2013 17:03:15 -0800 Subject: coredump: remove redundant defines for dumpable states The existing SUID_DUMP_* defines duplicate the newer SUID_DUMPABLE_* defines introduced in 54b501992dd2 ("coredump: warn about unsafe suid_dumpable / core_pattern combo"). Remove the new ones, and use the prior values instead. Signed-off-by: Kees Cook Reported-by: Chen Gang Cc: Alexander Viro Cc: Alan Cox Cc: "Eric W. Biederman" Cc: Doug Ledford Cc: Serge Hallyn Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 2 +- fs/exec.c | 10 +++++----- fs/proc/internal.h | 3 ++- include/linux/sched.h | 5 ----- kernel/sysctl.c | 2 +- 5 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/coredump.c b/fs/coredump.c index 69baf903d3bd..c6479658d487 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -501,7 +501,7 @@ void do_coredump(siginfo_t *siginfo) * so we dump it as root in mode 2, and only into a controlled * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { + if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ diff --git a/fs/exec.c b/fs/exec.c index 864c50df660a..a96a4885bbbf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, SUID_DUMPABLE_ENABLED); + set_dumpable(current->mm, SUID_DUMP_USER); else set_dumpable(current->mm, suid_dumpable); @@ -1639,17 +1639,17 @@ EXPORT_SYMBOL(set_binfmt); void set_dumpable(struct mm_struct *mm, int value) { switch (value) { - case SUID_DUMPABLE_DISABLED: + case SUID_DUMP_DISABLE: clear_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_ENABLED: + case SUID_DUMP_USER: set_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_SAFE: + case SUID_DUMP_ROOT: set_bit(MMF_DUMP_SECURELY, &mm->flags); smp_wmb(); set_bit(MMF_DUMPABLE, &mm->flags); @@ -1662,7 +1662,7 @@ int __get_dumpable(unsigned long mm_flags) int ret; ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret; + return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; } int get_dumpable(struct mm_struct *mm) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 252544c05207..85ff3a4598b3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include #include +#include struct ctl_table_header; struct mempolicy; @@ -108,7 +109,7 @@ static inline int task_dumpable(struct task_struct *task) if (mm) dumpable = get_dumpable(mm); task_unlock(task); - if (dumpable == SUID_DUMPABLE_ENABLED) + if (dumpable == SUID_DUMP_USER) return 1; return 0; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 6853bf947fde..d35d2b6ddbfb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -346,11 +346,6 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); -/* get/set_dumpable() values */ -#define SUID_DUMPABLE_DISABLED 0 -#define SUID_DUMPABLE_ENABLED 1 -#define SUID_DUMPABLE_SAFE 2 - /* mm flags */ /* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d8df00e69c14..d1b4ee67d2df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2095,7 +2095,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMPABLE_SAFE && + if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ -- cgit v1.2.3