From 6b611388b626eaa59d202bf8f64d095ff80bcde6 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 24 Oct 2024 18:22:59 -0700
Subject: memcg-v1: remove charge move code

The memcg-v1 charge move feature has been deprecated completely and let's
remove the relevant code as well.

Link: https://lkml.kernel.org/r/20241025012304.2473312-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da05f2f1..0b113267b2de 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -299,11 +299,6 @@ struct mem_cgroup {
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
 	/* taken only while moving_account > 0 */
 	spinlock_t move_lock;
 	unsigned long move_lock_flags;
-- 
cgit v1.2.3


From a29c0e4b2e867f4e362a6740c430bfdc2efdd1d9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 24 Oct 2024 18:23:03 -0700
Subject: memcg-v1: remove memcg move locking code

The memcg v1's charge move feature has been deprecated.  All the places
using the memcg move lock, have stopped using it as they don't need the
protection any more.  Let's proceed to remove all the locking code related
to charge moving.

Link: https://lkml.kernel.org/r/20241025012304.2473312-7-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 54 ------------------------------
 mm/filemap.c               |  1 -
 mm/memcontrol-v1.c         | 82 ----------------------------------------------
 mm/memcontrol.c            |  5 ---
 mm/rmap.c                  |  1 -
 5 files changed, 143 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0b113267b2de..bb49e0d4b377 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -299,20 +299,10 @@ struct mem_cgroup {
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
-	/* taken only while moving_account > 0 */
-	spinlock_t move_lock;
-	unsigned long move_lock_flags;
-
 	/* Legacy tcp memory accounting */
 	bool tcpmem_active;
 	int tcpmem_pressure;
 
-	/*
-	 * set > 0 if pages under this cgroup are moving to other cgroup.
-	 */
-	atomic_t moving_account;
-	struct task_struct *move_lock_task;
-
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
@@ -428,9 +418,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
  *
  * - the folio lock
  * - LRU isolation
- * - folio_memcg_lock()
  * - exclusive reference
- * - mem_cgroup_trylock_pages()
  *
  * For a kmem folio a caller should hold an rcu read lock to protect memcg
  * associated with a kmem folio from being released.
@@ -499,9 +487,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
  *
  * - the folio lock
  * - LRU isolation
- * - lock_folio_memcg()
  * - exclusive reference
- * - mem_cgroup_trylock_pages()
  *
  * For a kmem folio a caller should hold an rcu read lock to protect memcg
  * associated with a kmem folio from being released.
@@ -1867,26 +1853,6 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return p->memcg_in_oom;
 }
 
-void folio_memcg_lock(struct folio *folio);
-void folio_memcg_unlock(struct folio *folio);
-
-/* try to stablize folio_memcg() for all the pages in a memcg */
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
-	rcu_read_lock();
-
-	if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
-		return true;
-
-	rcu_read_unlock();
-	return false;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
-	rcu_read_unlock();
-}
-
 static inline void mem_cgroup_enter_user_fault(void)
 {
 	WARN_ON(current->in_user_fault);
@@ -1908,26 +1874,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return 0;
 }
 
-static inline void folio_memcg_lock(struct folio *folio)
-{
-}
-
-static inline void folio_memcg_unlock(struct folio *folio)
-{
-}
-
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
-	/* to match folio_memcg_rcu() */
-	rcu_read_lock();
-	return true;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
-	rcu_read_unlock();
-}
-
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
 	return false;
diff --git a/mm/filemap.c b/mm/filemap.c
index 630a1c431ea1..e582a1545d2a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,7 +119,6 @@
  *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
  *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
  *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
- *    ->memcg->move_lock	(folio_remove_rmap_pte->folio_memcg_lock)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 9c0fba8c8a83..539ceefa9d2d 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -401,87 +401,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return nr_reclaimed;
 }
 
-/**
- * folio_memcg_lock - Bind a folio to its memcg.
- * @folio: The folio.
- *
- * This function prevents unlocked LRU folios from being moved to
- * another cgroup.
- *
- * It ensures lifetime of the bound memcg.  The caller is responsible
- * for the lifetime of the folio.
- */
-void folio_memcg_lock(struct folio *folio)
-{
-	struct mem_cgroup *memcg;
-	unsigned long flags;
-
-	/*
-	 * The RCU lock is held throughout the transaction.  The fast
-	 * path can get away without acquiring the memcg->move_lock
-	 * because page moving starts with an RCU grace period.
-         */
-	rcu_read_lock();
-
-	if (mem_cgroup_disabled())
-		return;
-again:
-	memcg = folio_memcg(folio);
-	if (unlikely(!memcg))
-		return;
-
-#ifdef CONFIG_PROVE_LOCKING
-	local_irq_save(flags);
-	might_lock(&memcg->move_lock);
-	local_irq_restore(flags);
-#endif
-
-	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
-
-	spin_lock_irqsave(&memcg->move_lock, flags);
-	if (memcg != folio_memcg(folio)) {
-		spin_unlock_irqrestore(&memcg->move_lock, flags);
-		goto again;
-	}
-
-	/*
-	 * When charge migration first begins, we can have multiple
-	 * critical sections holding the fast-path RCU lock and one
-	 * holding the slowpath move_lock. Track the task who has the
-	 * move_lock for folio_memcg_unlock().
-	 */
-	memcg->move_lock_task = current;
-	memcg->move_lock_flags = flags;
-}
-
-static void __folio_memcg_unlock(struct mem_cgroup *memcg)
-{
-	if (memcg && memcg->move_lock_task == current) {
-		unsigned long flags = memcg->move_lock_flags;
-
-		memcg->move_lock_task = NULL;
-		memcg->move_lock_flags = 0;
-
-		spin_unlock_irqrestore(&memcg->move_lock, flags);
-	}
-
-	rcu_read_unlock();
-}
-
-/**
- * folio_memcg_unlock - Release the binding between a folio and its memcg.
- * @folio: The folio.
- *
- * This releases the binding created by folio_memcg_lock().  This does
- * not change the accounting of this folio to its memcg, but it does
- * permit others to change it.
- */
-void folio_memcg_unlock(struct folio *folio)
-{
-	__folio_memcg_unlock(folio_memcg(folio));
-}
-
 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 				struct cftype *cft)
 {
@@ -1189,7 +1108,6 @@ void memcg1_memcg_init(struct mem_cgroup *memcg)
 {
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
-	spin_lock_init(&memcg->move_lock);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4ac6eb650b..39e902c1dd9f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1189,7 +1189,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held.
@@ -1211,7 +1210,6 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held and interrupts
@@ -1235,7 +1233,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
  * These functions are safe to use under any of the following conditions:
  * - folio locked
  * - folio_test_lru false
- * - folio_memcg_lock()
  * - folio frozen (refcount of 0)
  *
  * Return: The lruvec this folio is on with its lock held and interrupts
@@ -2375,9 +2372,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 	 *
 	 * - the page lock
 	 * - LRU isolation
-	 * - folio_memcg_lock()
 	 * - exclusive reference
-	 * - mem_cgroup_trylock_pages()
 	 */
 	folio->memcg_data = (unsigned long)memcg;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 73d5998677d4..4d75433330f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,6 @@
  *                   swap_lock (in swap_duplicate, swap_info_get)
  *                     mmlist_lock (in mmput, drain_mmlist and others)
  *                     mapping->private_lock (in block_dirty_folio)
- *                       folio_lock_memcg move_lock (in block_dirty_folio)
  *                         i_pages lock (widely used)
  *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
  *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
-- 
cgit v1.2.3


From 906c38ff52e95575ddf3281bee531eded3dba150 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Sat, 26 Oct 2024 09:37:07 -0700
Subject: memcg: workingset: remove folio_memcg_rcu usage

The function workingset_activation() is called from folio_mark_accessed()
with the guarantee that the given folio can not be freed under us in
workingset_activation().  In addition, the association of the folio and
its memcg can not be broken here because charge migration is no more.
There is no need to use folio_memcg_rcu.  Simply use folio_memcg_charged()
because that is what this function cares about.

[akpm@linux-foundation.org: provide folio_memcg_charged stub for CONFIG_MEMCG=n]
Link: https://lkml.kernel.org/r/20241026163707.2479526-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Yu Zhao <yuzhao@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 34 ++--------------------------------
 mm/workingset.c            | 14 ++------------
 2 files changed, 4 insertions(+), 44 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bb49e0d4b377..8e4608be811d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -443,35 +443,6 @@ static inline bool folio_memcg_charged(struct folio *folio)
 	return __folio_memcg(folio) != NULL;
 }
 
-/**
- * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
- * @folio: Pointer to the folio.
- *
- * This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios.
- *
- * Return: A pointer to the memory cgroup associated with the folio,
- * or NULL.
- */
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
-{
-	unsigned long memcg_data = READ_ONCE(folio->memcg_data);
-
-	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-
-	if (memcg_data & MEMCG_DATA_KMEM) {
-		struct obj_cgroup *objcg;
-
-		objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-		return obj_cgroup_memcg(objcg);
-	}
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
 /*
  * folio_memcg_check - Get the memory cgroup associated with a folio.
  * @folio: Pointer to the folio.
@@ -1084,10 +1055,9 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
 	return NULL;
 }
 
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
+static inline bool folio_memcg_charged(struct folio *folio)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	return NULL;
+	return false;
 }
 
 static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
diff --git a/mm/workingset.c b/mm/workingset.c
index a2b28e356e68..0e38bec261a4 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -591,22 +591,12 @@ void workingset_refault(struct folio *folio, void *shadow)
  */
 void workingset_activation(struct folio *folio)
 {
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
-	 *
-	 * XXX: See workingset_refault() - this should return
-	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	memcg = folio_memcg_rcu(folio);
-	if (!mem_cgroup_disabled() && !memcg)
-		goto out;
-	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
-	rcu_read_unlock();
+	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 }
 
 /*
-- 
cgit v1.2.3


From e7ac4daeed91a25382091e73818ea0cddb1afd5e Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Thu, 7 Nov 2024 14:12:46 +1300
Subject: mm: count zeromap read and set for swapout and swapin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the proportion of folios from the zeromap is small, missing their
accounting may not significantly impact profiling.  However, it's easy to
construct a scenario where this becomes an issue—for example, allocating
1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT,
and then swapping it back in.  In this case, the swap-out and swap-in
counts seem to vanish into a black hole, potentially causing semantic
ambiguity.

On the other hand, Usama reported that zero-filled pages can exceed 10% in
workloads utilizing zswap, while Hailong noted that some app in Android
have more than 6% zero-filled pages.  Before commit 0ca0c24e3211 ("mm:
store zero pages to be swapped out in a bitmap"), both zswap and zRAM
implemented similar optimizations, leading to these optimized-out pages
being counted in either zswap or zRAM counters (with pswpin/pswpout also
increasing for zRAM).  With zeromap functioning prior to both zswap and
zRAM, userspace will no longer detect these swap-out and swap-in actions.

We have three ways to address this:

1. Introduce a dedicated counter specifically for the zeromap.

2. Use pswpin/pswpout accounting, treating the zero map as a standard
   backend.  This approach aligns with zRAM's current handling of
   same-page fills at the device level.  However, it would mean losing the
   optimized-out page counters previously available in zRAM and would not
   align with systems using zswap.  Additionally, as noted by Nhat Pham,
   pswpin/pswpout counters apply only to I/O done directly to the backend
   device.

3. Count zeromap pages under zswap, aligning with system behavior when
   zswap is enabled.  However, this would not be consistent with zRAM, nor
   would it align with systems lacking both zswap and zRAM.

Given the complications with options 2 and 3, this patch selects
option 1.

We can find these counters from /proc/vmstat (counters for the whole
system) and memcg's memory.stat (counters for the interested memcg).

For example:

$ grep -E 'swpin_zero|swpout_zero' /proc/vmstat
swpin_zero 1648
swpout_zero 33536

$ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat
swpin_zero 3905
swpout_zero 3985

This patch does not address any specific zeromap bug, but the missing
swpout and swpin counts for zero-filled pages can be highly confusing and
may mislead user-space agents that rely on changes in these counters as
indicators.  Therefore, we add a Fixes tag to encourage the inclusion of
this counter in any kernel versions with zeromap.

Many thanks to Kanchana for the contribution of changing
count_objcg_event() to count_objcg_events() to support large folios[1],
which has now been incorporated into this patch.

[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com

Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com
Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap")
Co-developed-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Hailong Liu <hailong.liu@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  9 +++++++++
 include/linux/memcontrol.h              | 12 +++++++-----
 include/linux/vm_event_item.h           |  2 ++
 mm/memcontrol.c                         |  4 ++++
 mm/page_io.c                            | 16 ++++++++++++++++
 mm/vmstat.c                             |  2 ++
 mm/zswap.c                              |  6 +++---
 7 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69af2173555f..6d02168d78be 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1599,6 +1599,15 @@ The following nested keys are defined.
 	  pglazyfreed (npn)
 		Amount of reclaimed lazyfree pages
 
+	  swpin_zero
+		Number of pages swapped into memory and filled with zero, where I/O
+		was optimized out because the page content was detected to be zero
+		during swapout.
+
+	  swpout_zero
+		Number of zero-filled pages swapped out with I/O skipped due to the
+		content being detected as zero.
+
 	  zswpin
 		Number of pages moved in to memory from zswap.
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da05f2f1..e1b41554a5fb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 
 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
 
-static inline void count_objcg_event(struct obj_cgroup *objcg,
-				     enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+				      enum vm_event_item idx,
+				      unsigned long count)
 {
 	struct mem_cgroup *memcg;
 
@@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 
 	rcu_read_lock();
 	memcg = obj_cgroup_memcg(objcg);
-	count_memcg_events(memcg, idx, 1);
+	count_memcg_events(memcg, idx, count);
 	rcu_read_unlock();
 }
 
@@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
 	return NULL;
 }
 
-static inline void count_objcg_event(struct obj_cgroup *objcg,
-				     enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+				      enum vm_event_item idx,
+				      unsigned long count)
 {
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aed952d04132..f70d0958095c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
 		SWAP_RA,
 		SWAP_RA_HIT,
+		SWPIN_ZERO,
+		SWPOUT_ZERO,
 #ifdef CONFIG_KSM
 		KSM_SWPIN_COPY,
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06df2af97415..53db98d2c4a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -431,6 +431,10 @@ static const unsigned int memcg_vm_event_stat[] = {
 	PGDEACTIVATE,
 	PGLAZYFREE,
 	PGLAZYFREED,
+#ifdef CONFIG_SWAP
+	SWPIN_ZERO,
+	SWPOUT_ZERO,
+#endif
 #ifdef CONFIG_ZSWAP
 	ZSWPIN,
 	ZSWPOUT,
diff --git a/mm/page_io.c b/mm/page_io.c
index 69536a2b3c13..01749b99fb54 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,9 @@ static bool is_folio_zero_filled(struct folio *folio)
 
 static void swap_zeromap_folio_set(struct folio *folio)
 {
+	struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
 	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	int nr_pages = folio_nr_pages(folio);
 	swp_entry_t entry;
 	unsigned int i;
 
@@ -212,6 +214,12 @@ static void swap_zeromap_folio_set(struct folio *folio)
 		entry = page_swap_entry(folio_page(folio, i));
 		set_bit(swp_offset(entry), sis->zeromap);
 	}
+
+	count_vm_events(SWPOUT_ZERO, nr_pages);
+	if (objcg) {
+		count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
+		obj_cgroup_put(objcg);
+	}
 }
 
 static void swap_zeromap_folio_clear(struct folio *folio)
@@ -503,6 +511,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 static bool swap_read_folio_zeromap(struct folio *folio)
 {
 	int nr_pages = folio_nr_pages(folio);
+	struct obj_cgroup *objcg;
 	bool is_zeromap;
 
 	/*
@@ -517,6 +526,13 @@ static bool swap_read_folio_zeromap(struct folio *folio)
 	if (!is_zeromap)
 		return false;
 
+	objcg = get_obj_cgroup_from_folio(folio);
+	count_vm_events(SWPIN_ZERO, nr_pages);
+	if (objcg) {
+		count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
+		obj_cgroup_put(objcg);
+	}
+
 	folio_zero_range(folio, 0, folio_size(folio));
 	folio_mark_uptodate(folio);
 	return true;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b5a4cea423e1..ac6a5aa34eab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1415,6 +1415,8 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_SWAP
 	"swap_ra",
 	"swap_ra_hit",
+	"swpin_zero",
+	"swpout_zero",
 #ifdef CONFIG_KSM
 	"ksm_swpin_copy",
 #endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 162013952074..0030ce8fecfc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1053,7 +1053,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPWB);
+		count_objcg_events(entry->objcg, ZSWPWB, 1);
 
 	zswap_entry_free(entry);
 
@@ -1483,7 +1483,7 @@ bool zswap_store(struct folio *folio)
 
 	if (objcg) {
 		obj_cgroup_charge_zswap(objcg, entry->length);
-		count_objcg_event(objcg, ZSWPOUT);
+		count_objcg_events(objcg, ZSWPOUT, 1);
 	}
 
 	/*
@@ -1577,7 +1577,7 @@ bool zswap_load(struct folio *folio)
 
 	count_vm_event(ZSWPIN);
 	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPIN);
+		count_objcg_events(entry->objcg, ZSWPIN, 1);
 
 	if (swapcache) {
 		zswap_entry_free(entry);
-- 
cgit v1.2.3


From 69bad21551c9caea8c58800f96da48a704fd311e Mon Sep 17 00:00:00 2001
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Date: Mon, 30 Sep 2024 22:32:16 -0700
Subject: mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined

Patch series "mm: zswap swap-out of large folios", v10.

This patch series enables zswap_store() to accept and store large folios.
The most significant contribution in this series is from the earlier RFC
submitted by Ryan Roberts [1].  Ryan's original RFC has been migrated to
mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based
on code review comments received for the current patch-series.

[1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting
     https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u

The first few patches do the prep work for supporting large folios in
zswap_store.  Patch 6 provides the main functionality to swap-out large
folios in zswap.  Patch 7 adds sysfs per-order hugepages "zswpout"
counters that get incremented upon successful zswap_store of large folios,
and also updates the documentation for this:

/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout

This patch series is a prerequisite for zswap compress batching of large
folio swap-out and decompress batching of swap-ins based on
swapin_readahead(), using Intel IAA hardware acceleration, which we would
like to submit in subsequent patch-series, with performance improvement
data.

Thanks to Ying Huang for pre-posting review feedback and suggestions!

Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and
Matthew for their helpful feedback, code/data reviews and suggestions!

Co-development signoff request:
===============================
I would like to thank Ryan Roberts for his original RFC [1] and request
his co-developer signoff on patch 6 in this series. Thanks Ryan!


System setup for testing:
=========================
Testing of this patch series was done with mm-unstable as of 9-27-2024,
commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series)
and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568
(with this patch-series). Data was gathered on an Intel Sapphire Rapids
server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB
RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz.

The vm-scalability "usemem" test was run in a cgroup whose memory.high
was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem
processes were run, each allocating and writing 10G of memory, and sleeping
for 10 sec before exiting:

usemem --init-time -w -O -s 10 -n 30 10g

Other kernel configuration parameters:

    zswap compressors : zstd, deflate-iaa
    zswap allocator   : zsmalloc
    vm.page-cluster   : 2

In the experiments where "deflate-iaa" is used as the zswap compressor,
IAA "compression verification" is enabled by default
(cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA
compression will be decompressed internally by the "iaa_crypto" driver, the
crc-s returned by the hardware will be compared and errors reported in case
of mismatches. Thus "deflate-iaa" helps ensure better data integrity as
compared to the software compressors, and the experimental data listed
below is with verify_compress set to "1".


Metrics reporting methodology:
==============================
Total and average throughput are derived from the individual 30 processes'
throughputs reported by usemem. elapsed/sys times are measured with perf.

All percentage changes are "new" vs. "old"; hence a positive value
denotes an increase in the metric, whether it is throughput or latency,
and a negative value denotes a reduction in the metric. Positive throughput
change percentages and negative latency change percentages denote improvements.

The vm stats and sysfs hugepages stats included with the performance data
provide details on the swapout activity to zswap/swap device.


Testing labels used in data summaries:
======================================
The data refers to these test configurations and the before/after
comparisons that they do:

 before-case1:
 -------------
 mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K)

 In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split
 into 4K folios that get processed by zswap.

 before-case2:
 -------------
 mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios)

 In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large
 folios, which will then be stored by the SSD swap device.

 after:
 ------
 v10 of this patch-series, CONFIG_THP_SWAP=Y

 The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results
 in 64K/2M folios to not be split, and to be processed by zswap_store.


Regression Testing:
===================
I ran vm-scalability usemem without large folios, i.e., only 4K folios with
mm-unstable and this patch-series. The main goal was to make sure that
there is no functional or performance regression wrt the earlier zswap
behavior for 4K folios, now that 4K folios will be processed by the new
zswap_store() code.

The data indicates there is no significant regression.

 -------------------------------------------------------------------------------
 4K folios:
 ==========

 zswap compressor                zstd          zstd        zstd       zstd v10
                         before-case1  before-case2       after      vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    4,793,363     4,880,978   4,853,074       1%     -1%
 Average throughput (KB/s)    159,778       162,699     161,769       1%     -1%
 elapsed time (sec)            130.14        123.17      126.29      -3%      3%
 sys time (sec)              3,135.53      2,985.64    3,083.18      -2%      3%
 memcg_high                   446,826       444,626     452,930
 memcg_swap_fail                    0             0           0
 zswpout                   48,932,107    48,931,971  48,931,820
 zswpin                           383           386         397
 pswpout                            0             0           0
 pswpin                             0             0           0
 thp_swpout                         0             0           0
 thp_swpout_fallback                0             0           0
 64kB-mthp_swpout_fallback          0             0           0
 pgmajfault                     3,063         3,077       3,479
 swap_ra                           93            94          96
 swap_ra_hit                       47            47          50
 ZSWPOUT-64kB                     n/a           n/a           0
 SWPOUT-64kB                        0             0           0
 -------------------------------------------------------------------------------


Performance Testing:
====================

We list the data for 64K folios with before/after data per-compressor,
followed by the same for 2M pmd-mappable folios.


 -------------------------------------------------------------------------------
 64K folios: zstd:
 =================

 zswap compressor                zstd          zstd         zstd      zstd v10
                         before-case1  before-case2        after     vs.    vs.
                                                                    case1  case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,222,213     1,076,611    6,159,776      18%   472%
 Average throughput (KB/s)    174,073        35,887      205,325      18%   472%
 elapsed time (sec)            120.50        347.16       108.33     -10%   -69%
 sys time (sec)              2,930.33        248.16     2,549.65     -13%   927%
 memcg_high                   416,773       552,200      465,874
 memcg_swap_fail            3,192,906         1,293        1,012
 zswpout                   48,931,583        20,903   48,931,218
 zswpin                           384           363          410
 pswpout                            0    40,778,448            0
 pswpin                             0            16            0
 thp_swpout                         0             0            0
 thp_swpout_fallback                0             0            0
 64kB-mthp_swpout_fallback  3,192,906         1,293        1,012
 pgmajfault                     3,452         3,072        3,061
 swap_ra                           90            87          107
 swap_ra_hit                       42            43           57
 ZSWPOUT-64kB                     n/a           n/a    3,057,173
 SWPOUT-64kB                        0     2,548,653            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 64K folios: deflate-iaa:
 ========================

 zswap compressor         deflate-iaa   deflate-iaa  deflate-iaa deflate-iaa v10
                         before-case1  before-case2        after     vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,652,608     1,089,180    7,189,778     27%    560%
 Average throughput (KB/s)    188,420        36,306      239,659     27%    560%
 elapsed time (sec)            102.90        343.35        87.05    -15%    -75%
 sys time (sec)              2,246.86        213.53     1,864.16    -17%    773%
 memcg_high                   576,104       502,907      642,083
 memcg_swap_fail            4,016,117         1,407        1,478
 zswpout                   61,163,423        22,444   57,798,716
 zswpin                           401           368          454
 pswpout                            0    40,862,080            0
 pswpin                             0            20            0
 thp_swpout                         0             0            0
 thp_swpout_fallback                0             0            0
 64kB-mthp_swpout_fallback  4,016,117         1,407        1,478
 pgmajfault                     3,063         3,153        3,122
 swap_ra                           96            93          156
 swap_ra_hit                       46            45           83
 ZSWPOUT-64kB                     n/a           n/a    3,611,032
 SWPOUT-64kB                        0     2,553,880            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 2M folios: zstd:
 ================

 zswap compressor                zstd          zstd         zstd      zstd v10
                         before-case1  before-case2        after     vs.    vs.
                                                                   case1  case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)    5,895,500     1,109,694    6,484,224     10%    484%
 Average throughput (KB/s)    196,516        36,989      216,140     10%    484%
 elapsed time (sec)            108.77        334.28       106.33     -2%    -68%
 sys time (sec)              2,657.14         94.88     2,376.13    -11%   2404%
 memcg_high                    64,200        66,316       56,898
 memcg_swap_fail              101,182            70           27
 zswpout                   48,931,499        36,507   48,890,640
 zswpin                           380           379          377
 pswpout                            0    40,166,400            0
 pswpin                             0             0            0
 thp_swpout                         0        78,450            0
 thp_swpout_fallback          101,182            70           27
 2MB-mthp_swpout_fallback           0             0           27
 pgmajfault                     3,067         3,417        3,311
 swap_ra                           91            90          854
 swap_ra_hit                       45            45          810
 ZSWPOUT-2MB                      n/a           n/a       95,459
 SWPOUT-2MB                         0        78,450            0
 -------------------------------------------------------------------------------


 -------------------------------------------------------------------------------
 2M folios: deflate-iaa:
 =======================

 zswap compressor         deflate-iaa   deflate-iaa  deflate-iaa deflate-iaa v10
                         before-case1  before-case2        after     vs.     vs.
                                                                   case1   case2
 -------------------------------------------------------------------------------
 Total throughput (KB/s)   6,286,587      1,126,785    7,073,464     13%    528%
 Average throughput (KB/s)   209,552         37,559      235,782     13%    528%
 elapsed time (sec)            96.19         333.03        85.79    -11%    -74%
 sys time (sec)             2,141.44          99.96     1,826.67    -15%   1727%
 memcg_high                   99,253         64,666       79,718
 memcg_swap_fail             129,074             53          165
 zswpout                  61,312,794         28,321   56,045,120
 zswpin                          383            406          403
 pswpout                           0     40,048,128            0
 pswpin                            0              0            0
 thp_swpout                        0         78,219            0
 thp_swpout_fallback         129,074             53          165
 2MB-mthp_swpout_fallback          0              0          165
 pgmajfault                    3,430          3,077       31,468
 swap_ra                          91            103       84,373
 swap_ra_hit                      47             46       84,317
 ZSWPOUT-2MB                     n/a            n/a      109,229
 SWPOUT-2MB                        0         78,219            0
 -------------------------------------------------------------------------------


And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this
patch-series:

 ---------------------------------------------
                  zswap_store large folios v10
                  Impr w/ deflate-iaa vs. zstd

                       64K folios    2M folios
 ---------------------------------------------
 Throughput (KB/s)            17%           9%
 elapsed time (sec)          -20%         -19%
 sys time (sec)              -27%         -23%
 ---------------------------------------------


Conclusions based on the performance results:
=============================================

 v10 wrt before-case1:
 ---------------------
 We see significant improvements in throughput, elapsed and sys time for
 zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after
 (THP_SWAP=Y) with zswap_store large folios.

 v10 wrt before-case2:
 ---------------------
 We see even more significant improvements in throughput and elapsed time
 for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD)
 vs. after (large-folio-zswap). The sys time increases with
 large-folio-zswap as expected, due to the CPU compression time
 vs. asynchronous disk write times, as pointed out by Ying and Yosry.

 In before-case2, when zswap does not store large folios, only allocations
 and cgroup charging due to 4K folio zswap stores count towards the cgroup
 memory limit. However, in the after scenario, with the introduction of
 zswap_store() of large folios, there is an added component of the zswap
 compressed pool usage from large folio stores from potentially all 30
 processes, that gets counted towards the memory limit. As a result, we see
 higher swapout activity in the "after" data.


Summary:
========
The v10 data presented above shows that zswap_store of large folios
demonstrates good throughput/performance improvements compared to
conventional SSD swap of large folios with a sufficiently large 525G SSD
swap device. Hence, it seems reasonable for zswap_store to support large
folios, so that further performance improvements can be implemented.

In the experimental setup used in this patchset, we have enabled IAA
compress verification to ensure additional hardware data integrity CRC
checks not currently done by the software compressors. We see good
throughput/latency improvements with deflate-iaa vs. zstd with zswap_store
of large folios.

Some of the ideas for further reducing latency that have shown promise in
our experiments, are:

1) IAA compress/decompress batching.
2) Distributing compress jobs across all IAA devices on the socket.

The tests run for this patchset are using only 1 IAA device per core, that
avails of 2 compress engines on the device. In our experiments with IAA
batching, we distribute compress jobs from all cores to the 8 compress
engines available per socket. We further compress the pages in each folio
in parallel in the accelerator. As a result, we improve compress latency
and reclaim throughput.

In decompress batching, we use swapin_readahead to generate a prefetch
batch of 4K folios that we decompress in parallel in IAA.

 ------------------------------------------------------------------------------
                          IAA compress/decompress batching
              Further improvements wrt v10 zswap_store Sequential
                          subpage store using "deflate-iaa":

                      "deflate-iaa" Batching  "deflate-iaa-canned" [2] Batching
                          Additional Impr               Additional Impr
                     64K folios    2M folios     64K folios    2M folios
 ------------------------------------------------------------------------------
 Throughput (KB/s)          19%          43%           26%           55%
 elapsed time (sec)         -5%         -14%          -10%          -21%
 sys time (sec)              4%          -7%           -4%          -18%
 ------------------------------------------------------------------------------


With zswap IAA compress/decompress batching, we are able to demonstrate
significant performance improvements and memory savings in server
scalability experiments in highly contended system scenarios under
significant memory pressure; as compared to software compressors. We hope
to submit this work in subsequent patch series. The current patch-series is
a prequisite for these future submissions.

[1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u
[2] https://patchwork.kernel.org/project/linux-crypto/cover/cover.1710969449.git.andre.glover@linux.intel.com/


This patch (of 6):

This resolves an issue with obj_cgroup_get() not being defined if
CONFIG_MEMCG is not defined.

Before this patch, we would see build errors if obj_cgroup_get() is called
from code that is agnostic of CONFIG_MEMCG.

The zswap_store() changes for large folios in subsequent commits will
require the use of obj_cgroup_get() in zswap code that falls into this
category.

Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241001053222.6944-2-kanchana.p.sridhar@intel.com
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Wajdi Feghali <wajdi.k.feghali@intel.com>
Cc: "Zou, Nanhai" <nanhai.zou@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0bd8f61a5597..5502aa8e138e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1233,6 +1233,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 	return NULL;
 }
 
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+}
+
 static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 {
 }
-- 
cgit v1.2.3