summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid Finkel <davidf@vimeo.com>2024-07-29 10:37:42 -0400
committerAndrew Morton <akpm@linux-foundation.org>2024-09-01 20:25:53 -0700
commitc6f53ed8f213a66ae8bc40aa9112c32412c35a21 (patch)
tree4c1809b01e308558cb62c85255cc968c28057d87 /include
parent3290ef3c7f2a8171fc534e02fb95a512eeae689a (diff)
mm, memcg: cg2 memory{.swap,}.peak write handlers
Patch series "mm, memcg: cg2 memory{.swap,}.peak write handlers", v7. This patch (of 2): Other mechanisms for querying the peak memory usage of either a process or v1 memory cgroup allow for resetting the high watermark. Restore parity with those mechanisms, but with a less racy API. For example: - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets the high watermark. - Writing "5" to the clear_refs pseudo-file in a process's proc directory resets the peak RSS. This change is an evolution of a previous patch, which mostly copied the cgroup v1 behavior; however, there were concerns about races/ownership issues with a global reset, so instead this change makes the reset file-descriptor-local. Writing any non-empty string to the memory.peak and memory.swap.peak pseudo-files resets the high watermark to the current usage for subsequent reads through that same FD. Notably, following Johannes's suggestion, this implementation moves the O(FDs that have written) behavior onto the FD write(2) path. Instead, on the page-allocation path, we simply add one additional watermark to conditionally bump per-hierarchy level in the page-counter. Additionally, this takes Longman's suggestion of nesting the page-charging-path checks for the two watermarks to reduce the number of common-case comparisons. This behavior is particularly useful for work scheduling systems that need to track memory usage of worker processes/cgroups per-work-item. Since memory can't be squeezed like CPU can (the OOM-killer has opinions), these systems need to track the peak memory usage to compute system/container fullness when binpacking work items. Most notably, Vimeo's use-case involves a system that's doing global binpacking across many Kubernetes pods/containers, and while we can use PSI for some local decisions about overload, we strive to avoid packing workloads too tightly in the first place. To facilitate this, we track the peak memory usage.
However, since we run with long-lived workers (to amortize startup costs) we need a way to track the high watermark while a work-item is executing. Polling runs the risk of missing short spikes that last for timescales below the polling interval, and peak memory tracking at the cgroup level is otherwise perfect for this use-case. As this data is used to ensure that binpacked work ends up with sufficient headroom, this use-case mostly avoids the inaccuracies surrounding reclaimable memory. Link: https://lkml.kernel.org/r/20240730231304.761942-1-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-1-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-2-davidf@vimeo.com Signed-off-by: David Finkel <davidf@vimeo.com> Suggested-by: Johannes Weiner <hannes@cmpxchg.org> Suggested-by: Waiman Long <longman@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Koutný <mkoutny@suse.com> Acked-by: Tejun Heo <tj@kernel.org> Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Shuah Khan <shuah@kernel.org> Cc: Zefan Li <lizefan.x@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/cgroup-defs.h5
-rw-r--r--include/linux/cgroup.h3
-rw-r--r--include/linux/memcontrol.h5
-rw-r--r--include/linux/page_counter.h11
4 files changed, 23 insertions, 1 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ae04035b6cbe..7fc2d0195f56 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -775,6 +775,11 @@ struct cgroup_subsys {
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+struct cgroup_of_peak {
+ unsigned long value;
+ struct list_head list;
+};
+
/**
* cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
* @tsk: target task
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c60ba0ab1462..3e0563753cc3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/nodemask.h>
+#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
@@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);
+
#endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index af7da7bd00af..1b79760af685 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -193,6 +193,11 @@ struct mem_cgroup {
struct page_counter memsw; /* v1 only */
};
+ /* registered local peak watchers */
+ struct list_head memory_peaks;
+ struct list_head swap_peaks;
+ spinlock_t peaks_lock;
+
/* Range enforcement for interrupt charges */
struct work_struct high_work;
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 66ebf9a73158..79dbd8bc35a7 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -26,6 +26,8 @@ struct page_counter {
atomic_long_t children_low_usage;
unsigned long watermark;
+ /* Latest cg2 reset watermark */
+ unsigned long local_watermark;
unsigned long failcnt;
/* Keep all the read most fields in a separete cacheline. */
@@ -84,7 +86,14 @@ int page_counter_memparse(const char *buf, const char *max,
static inline void page_counter_reset_watermark(struct page_counter *counter)
{
- counter->watermark = page_counter_read(counter);
+ unsigned long usage = page_counter_read(counter);
+
+ /*
+ * Update local_watermark first, so it's always <= watermark
+ * (modulo CPU/compiler re-ordering)
+ */
+ counter->local_watermark = usage;
+ counter->watermark = usage;
}
#ifdef CONFIG_MEMCG