From bb4a340e075b7897ece109686bfa177f8518d2db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:37 -0700 Subject: mm: rename anon_vma_lock to vma_lock_anon_vma Rename anon_vma_lock to vma_lock_anon_vma. This matches the naming style used in page_lock_anon_vma and will come in really handy further down in this patch series. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77216742c178..80cd162a8aa6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -99,14 +99,14 @@ static inline struct anon_vma *page_anon_vma(struct page *page) return page_rmapping(page); } -static inline void anon_vma_lock(struct vm_area_struct *vma) +static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) spin_lock(&anon_vma->lock); } -static inline void anon_vma_unlock(struct vm_area_struct *vma) +static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) -- cgit v1.2.3 From cba48b98f2348c814316c4b4f411a07a0e4a2bf9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:38 -0700 Subject: mm: change direct call of spin_lock(anon_vma->lock) to inline function Substitute a direct call of spin_lock(anon_vma->lock) with an inline function doing exactly the same. This makes it easier to do the substitution to the root anon_vma lock in a following patch. We will deal with the handful of special locks (nested, dec_and_lock, etc) separately. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 10 ++++++++++ mm/ksm.c | 18 +++++++++--------- mm/migrate.c | 2 +- mm/mmap.c | 2 +- mm/rmap.c | 20 ++++++++++---------- 5 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 80cd162a8aa6..5f981be61416 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -113,6 +113,16 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) spin_unlock(&anon_vma->lock); } +static inline void anon_vma_lock(struct anon_vma *anon_vma) +{ + spin_lock(&anon_vma->lock); +} + +static inline void anon_vma_unlock(struct anon_vma *anon_vma) +{ + spin_unlock(&anon_vma->lock); +} + /* * anon_vma helper functions.
*/ diff --git a/mm/ksm.c b/mm/ksm.c index 6c3e99b4ae7c..eb9f6806ed51 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -327,7 +327,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item) if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); } @@ -1566,7 +1566,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1589,7 +1589,7 @@ again: if (!search_new_forks || !mapcount) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (!mapcount) goto out; } @@ -1619,7 +1619,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1637,11 +1637,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1671,7 +1671,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1688,11 +1688,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; diff --git a/mm/migrate.c b/mm/migrate.c index 4205b1d6049e..1855f869917d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); } diff --git a/mm/mmap.c b/mm/mmap.c index e26f1ea7c904..f5db18decc2e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2593,7 +2593,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->head.next)) BUG(); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } } diff --git a/mm/rmap.c b/mm/rmap.c index 38a336e2eea1..b65f00d1707f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -134,7 +134,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -147,7 +147,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc = NULL; } spin_unlock(&mm->page_table_lock); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (unlikely(allocated)) anon_vma_free(allocated); @@ -170,9 +170,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_add_tail(&avc->same_anon_vma, 
&anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } /* @@ -246,12 +246,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) if (!anon_vma) return; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); @@ -302,7 +302,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); return anon_vma; out: rcu_read_unlock(); @@ -311,7 +311,7 @@ out: void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); rcu_read_unlock(); } @@ -1389,7 +1389,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1399,7 +1399,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); return ret; } -- cgit v1.2.3 From 5c341ee1dfc8fe69d66b1c8b19e463c6d7201ae1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:39 -0700 Subject: mm: track the root (oldest) anon_vma Track the root (oldest) anon_vma in each anon_vma tree. Because we only take the lock on the root anon_vma, we cannot use the lock on higher-up anon_vmas to lock anything. This makes it impossible to do an indirect lookup of the root anon_vma, since the data structures could go away from under us. However, a direct pointer is safe because the root anon_vma is always the last one that gets freed on munmap or exit, by virtue of the same_vma list order and unlink_anon_vmas walking the list forward. [akpm@linux-foundation.org: fix typo] Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + mm/rmap.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5f981be61416..41fa6ddc6214 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -26,6 +26,7 @@ */ struct anon_vma { spinlock_t lock; /* Serialize access to vma list */ + struct anon_vma *root; /* Root of this anon_vma tree */ #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) /* diff --git a/mm/rmap.c b/mm/rmap.c index b65f00d1707f..caa48b27371b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -132,6 +132,11 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; + /* + * This VMA had no anon_vma yet. This anon_vma is + * the root of any anon_vma tree that might form. 
+ */ + anon_vma->root = anon_vma; } anon_vma_lock(anon_vma); @@ -224,9 +229,15 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) avc = anon_vma_chain_alloc(); if (!avc) goto out_error_free_anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + + /* + * The root anon_vma's spinlock is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); return 0; @@ -261,7 +272,10 @@ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - /* Unlink each anon_vma chained to the VMA. */ + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { anon_vma_unlink(avc); list_del(&avc->same_vma); -- cgit v1.2.3 From 012f18004da33ba672e3c60838cc4898126174d3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:40 -0700 Subject: mm: always lock the root (oldest) anon_vma Always (and only) lock the root (oldest) anon_vma whenever we do something in an anon_vma. The recently introduced anon_vma scalability is due to the rmap code scanning only the VMAs that need to be scanned. Many common operations still took the anon_vma lock on the root anon_vma, so always taking that lock is not expected to introduce any scalability issues. However, always taking the same lock does mean we only need to take one lock, which means rmap_walk on pages from any anon_vma in the vma is excluded from occurring during an munmap, expand_stack or other operation that needs to exclude rmap_walk and similar functions. Also add the proper locking to vma_adjust. 
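As an illustration of the new rule (this sketch is not part of the patch, and the two caller names below are invented), any path that modifies a VMA and any rmap walk that starts from another anon_vma in the same tree now serialize on the root anon_vma's spinlock:

/*
 * Illustration only: both paths below end up taking the same
 * root->lock, so the rmap walk cannot run concurrently with the
 * VMA modification.
 */
void vma_writer_side(struct vm_area_struct *vma)
{
    vma_lock_anon_vma(vma);     /* spin_lock(&vma->anon_vma->root->lock) */
    /* ... munmap, expand_stack or vma_adjust work ... */
    vma_unlock_anon_vma(vma);
}

void rmap_reader_side(struct anon_vma *anon_vma)
{
    anon_vma_lock(anon_vma);    /* the same root->lock as above */
    /* ... walk anon_vma->head, as rmap_walk_anon() does ... */
    anon_vma_unlock(anon_vma);
}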
Signed-off-by: Rik van Riel Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 8 ++++---- mm/ksm.c | 2 +- mm/migrate.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++-------- 4 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 41fa6ddc6214..af43cb9a0506 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -104,24 +104,24 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_lock(&anon_vma->lock); + spin_lock(&anon_vma->root->lock); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_unlock(&anon_vma->lock); + spin_unlock(&anon_vma->root->lock); } static inline void anon_vma_lock(struct anon_vma *anon_vma) { - spin_lock(&anon_vma->lock); + spin_lock(&anon_vma->root->lock); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); + spin_unlock(&anon_vma->root->lock); } /* diff --git a/mm/ksm.c b/mm/ksm.c index eb9f6806ed51..da6037c261f1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -325,7 +325,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { int empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); if (empty) diff --git a/mm/migrate.c b/mm/migrate.c index 1855f869917d..5208fa1d9712 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -682,7 +682,7 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { int empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); if (empty) diff --git a/mm/mmap.c b/mm/mmap.c index f5db18decc2e..fb89360a2120 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -506,6 +506,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct prio_tree_root *root = NULL; + struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; int remove_next = 0; @@ -578,6 +579,17 @@ again: remove_next = 1 + (end > next->vm_end); } } + /* + * When changing only vma->vm_end, we don't really need anon_vma + * lock. This is a fairly rare case by itself, but the anon_vma + * lock may be shared between many sibling processes. Skipping + * the lock for brk adjustments makes a difference sometimes. 
+ */ + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } + if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -617,6 +629,8 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -2470,23 +2484,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->lock. If some other vma in this mm shares + * anon_vma->root->lock. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->lock. + * anon_vma->root->lock. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); } } @@ -2577,7 +2591,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -2588,10 +2602,10 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->lock. + * anon_vma->root->lock. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); anon_vma_unlock(anon_vma); } -- cgit v1.2.3 From 76545066c8521f3e32c849744744842b4df25b79 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:41 -0700 Subject: mm: extend KSM refcounts to the anon_vma root KSM reference counts can cause an anon_vma to exist after the processe it belongs to have already exited. Because the anon_vma lock now lives in the root anon_vma, we need to ensure that the root anon_vma stays around until after all the "child" anon_vmas have been freed. The obvious way to do this is to have a "child" anon_vma take a reference to the root in anon_vma_fork. When the anon_vma is freed at munmap or process exit, we drop the refcount in anon_vma_unlink and possibly free the root anon_vma. The KSM anon_vma reference count function also needs to be modified to deal with the possibility of freeing 2 levels of anon_vma. The easiest way to do this is to break out the KSM magic and make it generic. When compiling without CONFIG_KSM, this code is compiled out. 
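A minimal sketch of the resulting lifetime rule (the two helper names below are invented for illustration; get_anon_vma(), drop_anon_vma() and anon_vma_free() are the primitives used by the patch): a child anon_vma pins its root for as long as it lives, because the lock it relies on is stored in the root.

/*
 * Sketch of the reference pairing done in anon_vma_fork() and
 * anon_vma_unlink(): the root must outlive every child anon_vma.
 */
static void link_child_to_root(struct anon_vma *child, struct anon_vma *parent)
{
    child->root = parent->root;
    get_anon_vma(child->root);      /* pin the root; its lock is ours */
}

static void free_child_anon_vma(struct anon_vma *child)
{
    if (child->root != child)
        drop_anon_vma(child->root); /* may free the root as well */
    anon_vma_free(child);
}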
Signed-off-by: Rik van Riel Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Cc: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Linus Torvalds Tested-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 15 +++++++++++++++ mm/ksm.c | 17 ++++++----------- mm/migrate.c | 10 +++------- mm/rmap.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 69 insertions(+), 19 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index af43cb9a0506..dc9b3c0bf5d4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -81,6 +81,13 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { return atomic_read(&anon_vma->external_refcount); } + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->external_refcount); +} + +void drop_anon_vma(struct anon_vma *); #else static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma) { @@ -90,6 +97,14 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { return 0; } + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ +} + +static inline void drop_anon_vma(struct anon_vma *anon_vma) +{ +} #endif /* CONFIG_KSM */ static inline struct anon_vma *page_anon_vma(struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index da6037c261f1..9f2acc998a37 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -318,19 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } -static void drop_anon_vma(struct rmap_item *rmap_item) +static void ksm_drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - int empty = list_empty(&anon_vma->head); - anon_vma_unlock(anon_vma); - if (empty) - anon_vma_free(anon_vma); - } + drop_anon_vma(anon_vma); } /* @@ -415,7 +410,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. 
*/ - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -470,7 +465,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -558,7 +553,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { diff --git a/mm/migrate.c b/mm/migrate.c index 5208fa1d9712..38e7cad782f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * exist when the page is remapped later */ anon_vma = page_anon_vma(page); - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } } @@ -682,12 +682,8 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - int empty = list_empty(&anon_vma->head); - anon_vma_unlock(anon_vma); - if (empty) - anon_vma_free(anon_vma); - } + if (anon_vma) + drop_anon_vma(anon_vma); if (rcu_locked) rcu_read_unlock(); diff --git a/mm/rmap.c b/mm/rmap.c index caa48b27371b..07e9814c7a41 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -235,6 +235,12 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; + /* + * With KSM refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned + * until this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); @@ -264,8 +270,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); anon_vma_unlock(anon_vma); - if (empty) + if (empty) { + /* We no longer need the root anon_vma */ + if (anon_vma->root != anon_vma) + drop_anon_vma(anon_vma->root); anon_vma_free(anon_vma); + } } void unlink_anon_vmas(struct vm_area_struct *vma) @@ -1382,6 +1392,40 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +/* + * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root + * if necessary. Be careful to do all the tests under the lock. Once + * we know we are the last user, nobody else can get a reference and we + * can do the freeing without the lock. + */ +void drop_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + struct anon_vma *root = anon_vma->root; + int empty = list_empty(&anon_vma->head); + int last_root_user = 0; + int root_empty = 0; + + /* + * The refcount on a non-root anon_vma got dropped. Drop + * the refcount on the root and check if we need to free it. 
+ */ + if (empty && anon_vma != root) { + last_root_user = atomic_dec_and_test(&root->external_refcount); + root_empty = list_empty(&root->head); + } + anon_vma_unlock(anon_vma); + + if (empty) { + anon_vma_free(anon_vma); + if (root_empty && last_root_user) + anon_vma_free(root); + } + } +} +#endif + #ifdef CONFIG_MIGRATION /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): -- cgit v1.2.3 From ad8c2ee801ad7a52d919b478d9b2c7b39a72d295 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:19:48 -0700 Subject: rmap: add exclusive page to private anon_vma on swapin On swapin it is fairly common for a page to be owned exclusively by one process. In that case we want to add the page to the anon_vma of that process's VMA, instead of to the root anon_vma. This will reduce the amount of rmap searching that the swapout code needs to do. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ mm/memory.c | 4 +++- mm/rmap.c | 13 ++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc9b3c0bf5d4..d6661de56f30 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -162,6 +162,8 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); diff --git a/mm/memory.c b/mm/memory.c index 6b0c37dcfd16..6bc039486e9f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2628,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; struct mem_cgroup *ptr = NULL; + int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2722,10 +2723,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; + exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); diff --git a/mm/rmap.c b/mm/rmap.c index 4d152a6d3a89..a7d0f5482634 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -829,6 +829,17 @@ static void __page_check_anon_rmap(struct page *page, */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) +{ + do_page_add_anon_rmap(page, vma, address, 0); +} + +/* + * Special version of the above for do_swap_page, which often runs + * into pages that are exclusively owned by the current process. + * Everybody else should continue to use page_add_anon_rmap above. 
+ */ +void do_page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); if (first) @@ -839,7 +850,7 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (first) - __page_set_anon_rmap(page, vma, address, 0); + __page_set_anon_rmap(page, vma, address, exclusive); else __page_check_anon_rmap(page, vma, address); } -- cgit v1.2.3 From 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:16 +0900 Subject: hugetlb, rmap: add reverse mapping for hugepage This patch adds a reverse mapping feature for hugepages by introducing a mapcount for shared/private-mapped hugepages and an anon_vma for private-mapped hugepages. While hugepages are not currently swappable, reverse mapping can be useful for the memory error handler. Without this patch, the memory error handler cannot identify the processes using a bad hugepage nor unmap it from them. That is: - for a shared hugepage: we can collect the processes using it through the pagecache, but cannot unmap it because of the lack of a mapcount. - for a privately mapped hugepage: we can neither collect the processes nor unmap the hugepage. This patch solves these problems. This patch includes the bug fix given by commit 23be7468e8, so it reverts that commit. Dependency: "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h" ChangeLog since May 24. - create hugetlb_inline.h and move is_vm_hugetlb_page() into it. - move the functions setting up anon_vma for hugepages into mm/rmap.c. ChangeLog since May 13. - rebased to 2.6.34 - fix logic error (in case that private mapping and shared mapping coexist) - move is_vm_hugetlb_page() into include/linux/mm.h to use this function from linear_page_index() - define and use linear_hugepage_index() instead of compound_order() - use page_move_anon_rmap() in hugetlb_cow() - copy the exclusive switch of __set_page_anon_rmap() into the hugepage counterpart.
- revert commit 24be7468 completely Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Andrew Morton Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Larry Woodman Cc: Lee Schermerhorn Acked-by: Fengguang Wu Acked-by: Mel Gorman Signed-off-by: Andi Kleen --- include/linux/hugetlb.h | 1 + include/linux/pagemap.h | 8 ++++++- include/linux/poison.h | 9 -------- include/linux/rmap.h | 5 +++++ mm/hugetlb.c | 44 ++++++++++++++++++++++++++++++++++-- mm/rmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 114 insertions(+), 12 deletions(-) (limited to 'include/linux/rmap.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d47a7c41745d..e688fd89354d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -99,6 +99,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) +#define huge_pte_offset(mm, address) 0 #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b2bd2bae9775..a547d9689170 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -282,10 +282,16 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_CACHE_SHIFT; } +extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address); + static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { - pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT; + pgoff_t pgoff; + if (unlikely(is_vm_hugetlb_page(vma))) + return linear_hugepage_index(vma, address); + pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT); } diff --git a/include/linux/poison.h b/include/linux/poison.h index 34066ffd893d..2110a81c5e2a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -48,15 +48,6 @@ #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/********** mm/hugetlb.c **********/ -/* - * Private mappings of hugetlb pages use this poisoned value for - * page->mapping. The core VM should not be doing anything with this mapping - * but futex requires the existence of some page->mapping value even though it - * is unused if PAGE_MAPPING_ANON is set. 
- */ -#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON)) - /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77216742c178..9d50e7ef5f5a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -140,6 +140,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); +void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long); +void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long); + static inline void page_dup_rmap(struct page *page) { atomic_inc(&page->_mapcount); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 54d42b009dbe..aa3c51739378 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -552,6 +559,7 @@ static void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (!trylock_page(old_page)) + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2338,6 +2354,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. 
+ */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2352,6 +2375,8 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; } @@ -2452,10 +2477,17 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); + page_dup_rmap(page); } else { lock_page(page); - page->mapping = HUGETLB_POISON; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); } + } else { + page_dup_rmap(page); } /* @@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + if (!pagecache_page) { + page = pte_page(entry); + lock_page(page); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2573,6 +2611,8 @@ out_page_table_lock: if (pagecache_page) { unlock_page(pagecache_page); put_page(pagecache_page); + } else { + unlock_page(page); } out_mutex: diff --git a/mm/rmap.c b/mm/rmap.c index 38a336e2eea1..0ad53572eaf2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); unsigned long address; + if (unlikely(is_vm_hugetlb_page(vma))) + pgoff = page->index << huge_page_order(page_hstate(page)); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { /* page should be within @vma mapping range */ @@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte_t *pte; spinlock_t *ptl; + if (unlikely(PageHuge(page))) { + pte = huge_pte_offset(mm, address); + ptl = &mm->page_table_lock; + goto check; + } + pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return NULL; @@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, } ptl = pte_lockptr(mm, pmd); +check: spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page) page_clear_dirty(page); set_page_dirty(page); } + /* + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED + * and not charged by memcg for now. + */ + if (unlikely(PageHuge(page))) + return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, NR_ANON_PAGES); @@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, return rmap_walk_file(page, rmap_one, arg); } #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_HUGETLBFS +/* + * The following three functions are for anonymous (private mapped) hugepages. + * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. 
+ */ +static void __hugepage_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + BUG_ON(!anon_vma); + if (!exclusive) { + struct anon_vma_chain *avc; + avc = list_entry(vma->anon_vma_chain.prev, + struct anon_vma_chain, same_vma); + anon_vma = avc->anon_vma; + } + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); +} + +void hugepage_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + BUG_ON(!anon_vma); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + first = atomic_inc_and_test(&page->_mapcount); + if (first) + __hugepage_set_anon_rmap(page, vma, address, 0); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); + __hugepage_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLBFS */ -- cgit v1.2.3
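With a mapcount and an anon_vma wired up for hugepages, a memory error handler can find every VMA mapping a bad hugepage the same way rmap_walk_anon() does for ordinary anonymous pages. A rough sketch of such a walk (illustrative only: the function name is invented, and it assumes the mm/rmap.c context because vma_address() is local to that file):

static void walk_hugepage_mappers(struct page *hpage,
        void (*handle)(struct vm_area_struct *, unsigned long))
{
    struct anon_vma *anon_vma;
    struct anon_vma_chain *avc;

    anon_vma = page_lock_anon_vma(hpage);   /* takes the root lock under RCU */
    if (!anon_vma)
        return;     /* not anonymous, or already fully unmapped */
    list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
        struct vm_area_struct *vma = avc->vma;
        unsigned long address = vma_address(hpage, vma);

        if (address == -EFAULT)
            continue;   /* hugepage not mapped in this VMA */
        handle(vma, address);
    }
    page_unlock_anon_vma(anon_vma);
}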