From 4de7b17fd05d03fa919e8c47fc66122bd24d7b6c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 14:44:28 +0100 Subject: sched: Assert for_each_thread() is properly locked list_for_each_entry_rcu() takes an optional fourth argument which allows RCU to assert that the correct lock is held. Several callers of for_each_thread() rely on their caller to be holding the appropriate lock, so this is a useful assertion to include. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20230821134428.2504912-1-willy@infradead.org --- include/linux/sched/signal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..9610bad018a3 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void); while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ + lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) -- cgit v1.3 From 3ba78da711940ce07c39c4cdd1f4ad284067a42d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 6 Jun 2021 13:27:15 +0200 Subject: sched/headers: Add header guard to <linux/sched/deadline.h> It's the only non-trivial header in include/linux/sched/ missing a header guard. 
Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 7c83d4d5a971..df3aca89d4f5 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEADLINE_H +#define _LINUX_SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); #endif /* CONFIG_SMP */ + +#endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.3 From 6eddb116dd830436afbd922568292867de6c8b9e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:24:17 +0200 Subject: sched/headers: Standardize the header guard name Use the same _LINUX_SCHED_ prefix nomenclature as the other 29 header guards in include/linux/sched/ do. 
Signed-off-by: Ingo Molnar --- include/linux/sched/vhost_task.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index 837a23624a66..bc60243d43b3 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_VHOST_TASK_H -#define _LINUX_VHOST_TASK_H - +#ifndef _LINUX_SCHED_VHOST_TASK_H +#define _LINUX_SCHED_VHOST_TASK_H struct vhost_task; @@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk); void vhost_task_stop(struct vhost_task *vtsk); void vhost_task_wake(struct vhost_task *vtsk); -#endif +#endif /* _LINUX_SCHED_VHOST_TASK_H */ -- cgit v1.3 From 0f9a1a4d234c064d8dff69cf3f3755554dd479ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:27:37 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h index 3c3e049224ae..969aaf5ef9d6 100644 --- a/include/linux/sched/types.h +++ b/include/linux/sched/types.h @@ -20,4 +20,4 @@ struct task_cputime { unsigned long long sum_exec_runtime; }; -#endif +#endif /* _LINUX_SCHED_TYPES_H */ -- cgit v1.3 From 1632d47fae2f2d229dd432854c4443ebb0bb27a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:28:48 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 59d3736c454c..fb1e295e7e63 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { 
return false; } void arch_smt_update(void); -#endif +#endif /* _LINUX_SCHED_SMT_H */ -- cgit v1.3 From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:40 +0100 Subject: sched/numa: Trace decisions related to skipping VMAs NUMA balancing skips or scans VMAs for a variety of reasons. In preparation for completing scans of VMAs regardless of PID access, trace the reasons why a VMA was skipped. In a later patch, the tracing will be used to track if a VMA was forcibly scanned. Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 8 ++++++ include/trace/events/sched.h | 50 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++--- 3 files changed, 71 insertions(+), 4 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..c127a1509e2f 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,6 +15,14 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +enum numa_vmaskip_reason { + NUMAB_SKIP_UNSUITABLE, + NUMAB_SKIP_SHARED_RO, + NUMAB_SKIP_INACCESSIBLE, + NUMAB_SKIP_SCAN_DELAY, + NUMAB_SKIP_PID_INACTIVE, +}; + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index a13d5d06be9d..d82a04d6a1bc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); +#ifdef CONFIG_NUMA_BALANCING +#define NUMAB_SKIP_REASON \ + EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ + EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ + 
EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ + EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ + EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + +/* Redefine for export. */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +NUMAB_SKIP_REASON + +/* Redefine for symbolic printing. */ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(sched_skip_vma_numa, + + TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, + enum numa_vmaskip_reason reason), + + TP_ARGS(mm, vma, reason), + + TP_STRUCT__entry( + __field(unsigned long, numa_scan_offset) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(enum numa_vmaskip_reason, reason) + ), + + TP_fast_assign( + __entry->numa_scan_offset = mm->numa_scan_offset; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->reason = reason; + ), + + TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", + __entry->numa_scan_offset, + __entry->vm_start, + __entry->vm_end, + __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) +); +#endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b47edcbe834..31cfdb0794fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. 
*/ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { @@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + if (!vma_is_accessed(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; + } /* * RESET access PIDs regularly for old VMAs. Resetting after checking -- cgit v1.3 From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:42 +0100 Subject: sched/numa: Complete scanning of partial VMAs regardless of PID activity NUMA Balancing skips VMAs when the current task has not trapped a NUMA fault within the VMA. If the VMA is skipped then mm->numa_scan_offset advances and a task that is trapping faults within the VMA may never fully update PTEs within the VMA. Force tasks to update PTEs for partially scanned PTEs. The VMA will be tagged for NUMA hints by some task but this removes some of the benefit of tracking PID activity within a VMA. A follow-on patch will mitigate this problem. The test cases and machines evaluated did not trigger the corner case so the performance results are neutral with only small changes within the noise from normal test-to-test variance. 
However, the next patch makes the corner case easier to trigger. Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 ++- kernel/sched/fair.c | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index c127a1509e2f..7dcc0bdfddbb 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -21,6 +21,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_INACCESSIBLE, NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, + NUMAB_SKIP_IGNORE_PID, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d82a04d6a1bc..bfc07c10541a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ - EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ + EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) /* Redefine for export. 
*/ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce36969625bd..ab79013f6e91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long pids; /* @@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work) } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) { + if (!vma_is_accessed(mm, vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } -- cgit v1.3 From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:43 +0100 Subject: sched/numa: Complete scanning of inactive VMAs when there is no alternative VMAs are skipped if there is no recent fault activity but this represents a chicken-and-egg problem as there may be no fault activity if the PTEs are never updated to trap NUMA hints. There is an indirect reliance on scanning to be forced early in the lifetime of a task but this may fail to detect changes in phase behaviour. 
Force inactive VMAs to be scanned when all other eligible VMAs have been updated within the same scan sequence. Test results in general look good with some changes in performance, both negative and positive, depending on whether the additional scanning and faulting was beneficial or not to the workload. The autonuma benchmark workload NUMA01_THREADLOCAL was picked for closer examination. The workload creates two processes with numerous threads and thread-local storage that is zero-filled in a loop. It exercises the corner case where unrelated threads may skip VMAs that are thread-local to another thread and still has some VMAs that are inactive while the workload executes. The VMA skipping activity frequency with and without the patch: 6.6.0-rc2-sched-numabtrace-v1 ============================= 649 reason=scan_delay 9,094 reason=unsuitable 48,915 reason=shared_ro 143,919 reason=inaccessible 193,050 reason=pid_inactive 6.6.0-rc2-sched-numabselective-v1 ============================= 146 reason=seq_completed 622 reason=ignore_pid_inactive 624 reason=scan_delay 6,570 reason=unsuitable 16,101 reason=shared_ro 27,608 reason=inaccessible 41,939 reason=pid_inactive Note that with the patch applied, the PID activity is ignored (ignore_pid_inactive) to ensure a VMA with some activity is completely scanned. In addition, a small number of VMAs are scanned when no other eligible VMA is available during a single scan window (seq_completed). The number of times a VMA is skipped due to no PID activity from the scanning task (pid_inactive) drops dramatically. It is expected that this will increase the number of PTEs updated for NUMA hinting faults as well as hinting faults but these represent PTEs that would otherwise have been missed. The tradeoff is scan+fault overhead versus improving locality due to migration. 
On a 2-socket Cascade Lake test machine, the time to complete the workload is as follows; 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 Min elsp-NUMA01_THREADLOCAL 174.22 ( 0.00%) 117.64 ( 32.48%) Amean elsp-NUMA01_THREADLOCAL 175.68 ( 0.00%) 123.34 * 29.79%* Stddev elsp-NUMA01_THREADLOCAL 1.20 ( 0.00%) 4.06 (-238.20%) CoeffVar elsp-NUMA01_THREADLOCAL 0.68 ( 0.00%) 3.29 (-381.70%) Max elsp-NUMA01_THREADLOCAL 177.18 ( 0.00%) 128.03 ( 27.74%) The time to complete the workload is reduced by almost 30%: 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 / Duration User 91201.80 63506.64 Duration System 2015.53 1819.78 Duration Elapsed 1234.77 868.37 In this specific case, system CPU time was not increased but it's not universally true. From vmstat, the NUMA scanning and fault activity is as follows; 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 Ops NUMA base-page range updates 64272.00 26374386.00 Ops NUMA PTE updates 36624.00 55538.00 Ops NUMA PMD updates 54.00 51404.00 Ops NUMA hint faults 15504.00 75786.00 Ops NUMA hint local faults % 14860.00 56763.00 Ops NUMA hint local percent 95.85 74.90 Ops NUMA pages migrated 1629.00 6469222.00 Both the number of PTE updates and hint faults is dramatically increased. While this is superficially unfortunate, it represents ranges that were simply skipped without the patch. As a result of the scanning and hinting faults, many more pages were also migrated but as the time to completion is reduced, the overhead is offset by the gain. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net --- include/linux/mm_types.h | 6 ++++ include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 +- kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++-- 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e7571eca1131..589f31ef2e84 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -575,6 +575,12 @@ struct vma_numab_state { * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; + + /* + * MM scan sequence ID when the VMA was last completely scanned. + * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq + */ + int prev_scan_seq; }; /* diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 7dcc0bdfddbb..b69afb8630db 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -22,6 +22,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, NUMAB_SKIP_IGNORE_PID, + NUMAB_SKIP_SEQ_COMPLETED, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index bfc07c10541a..6188ad0d9e0d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ - EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) + EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ + EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. 
*/ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab79013f6e91..922905194c0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work) /* Reset happens after 4 times scan delay of scan start */ vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work) vma->numab_state->pids_active[1] = 0; } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(mm, vma)) { + /* Do not rescan VMAs twice within the same sequence. 
*/ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); + continue; + } + + /* + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. + */ + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } @@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few -- cgit v1.3 From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 19 Oct 2023 11:33:21 +0800 Subject: sched: Add cpus_share_resources API Add cpus_share_resources() API. This is the preparation for the optimization of select_idle_cpu() on platforms with cluster scheduler level. On a machine with clusters cpus_share_resources() will test whether two cpus are within the same cluster. On a non-cluster machine it will behave the same as cpus_share_cache(). So we use "resources" here for cache resources. Signed-off-by: Barry Song Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Tim Chen Reviewed-by: Vincent Guittot Tested-and-reviewed-by: Chen Yu Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com --- include/linux/sched/sd_flags.h | 7 +++++++ include/linux/sched/topology.h | 8 +++++++- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index fad77b5172e2..a8b28647aafc 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. 
caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 67b573d5bf28..4c14fe127223 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_resources(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc724f59e495..5e1fb8a63b2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. 
+ */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65cad0e5729e..998f03d02de0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a63729f87c21..dbb8c328e8ad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. 
+ */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1550,6 +1562,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) -- cgit v1.3 From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Oct 2023 12:35:33 +0200 Subject: sched/fair: Remove SIS_PROP SIS_UTIL seems to work well, lets remove the old thing. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 2 -- kernel/sched/core.c | 5 ----- kernel/sched/fair.c | 48 ------------------------------------------ kernel/sched/features.h | 1 - kernel/sched/sched.h | 3 --- 5 files changed, 59 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4c14fe127223..de545ba85218 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -109,8 +109,6 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; - u64 avg_scan_cost; /* select_idle_sibling */ - #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e1fb8a63b2e..7a0c16115b79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -9953,8 +9950,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = 
rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 523b5aee2d6a..8767988242ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. - */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. 
- */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..a3ddf84de430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef4fe7bcf740..2e5a95486a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1059,9 +1059,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -- cgit v1.3