From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates it at each level of the hierarchy. This may cause non-negligible overhead for some workloads when under deep level of the hierarchy. commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") make PSI to skip per-cgroup stall accounting, only account system-wide to avoid this each level overhead. But for our use case, we also want leaf cgroup PSI stats accounted for userspace adjustment on that cgroup, apart from only system-wide adjustment. So this patch introduce a per-cgroup PSI accounting disable/re-enable interface "cgroup.pressure", which is a read-write single value file that allowed values are "0" and "1", the defaults is "1" so per-cgroup PSI stats is enabled by default. Implementation details: It should be relatively straight-forward to disable and re-enable state aggregation, time tracking, averaging on a per-cgroup level, if we can live with losing history from while it was disabled. I.e. the avgs will restart from 0, total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and PSI_ONCPU bit in psi_group_change() even when the cgroup PSI stats is disabled. Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- include/linux/cgroup-defs.h | 3 +++ include/linux/psi.h | 2 ++ include/linux/psi_types.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..7df76b318245 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -428,6 +428,9 @@ struct cgroup { struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ + /* handles for "{cpu,memory,io,irq}.pressure" */ + struct cgroup_file psi_files[NR_PSI_RESOURCES]; + /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through diff --git a/include/linux/psi.h b/include/linux/psi.h index 362a74ca1d3b..b029a847def1 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ @@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } +static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index a0b746258c68..6e4372735068 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -152,6 +152,7 @@ struct psi_trigger { struct psi_group { struct psi_group *parent; + bool enabled; /* Protects data used by the aggregator */ struct mutex avgs_lock; @@ -194,6 +195,8 @@ struct psi_group { #else /* CONFIG_PSI */ +#define NR_PSI_RESOURCES 0 + struct psi_group { }; #endif /* CONFIG_PSI */ -- cgit v1.2.3