diff options
Diffstat (limited to 'mm/swap_state.c')
| -rw-r--r-- | mm/swap_state.c | 488 | 
1 files changed, 246 insertions, 242 deletions
| diff --git a/mm/swap_state.c b/mm/swap_state.c index c354435a0923..b13e9c4baa90 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@  #include <linux/huge_mm.h>  #include <linux/shmem_fs.h>  #include "internal.h" +#include "swap_table.h"  #include "swap.h"  /* @@ -36,8 +37,11 @@ static const struct address_space_operations swap_aops = {  #endif  }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +/* Set swap_space as read only as swap cache is handled by swap table */ +struct address_space swap_space __ro_after_init = { +	.a_ops = &swap_aops, +}; +  static bool enable_vma_readahead __read_mostly = true;  #define SWAP_RA_ORDER_CEILING	5 @@ -69,150 +73,237 @@ void show_swap_cache_info(void)  	printk("Total swap = %lukB\n", K(total_swap_pages));  } -void *get_shadow_from_swap_cache(swp_entry_t entry) +/** + * swap_cache_get_folio - Looks up a folio in the swap cache. + * @entry: swap entry used for the lookup. + * + * A found folio will be returned unlocked and with its refcount increased. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns the found folio on success, NULL otherwise. The caller + * must lock and check if the folio still matches the swap entry before + * use (e.g., folio_matches_swap_entry). 
+ */ +struct folio *swap_cache_get_folio(swp_entry_t entry)  { -	struct address_space *address_space = swap_address_space(entry); -	pgoff_t idx = swap_cache_index(entry); -	void *shadow; +	unsigned long swp_tb; +	struct folio *folio; + +	for (;;) { +		swp_tb = swap_table_get(__swap_entry_to_cluster(entry), +					swp_cluster_offset(entry)); +		if (!swp_tb_is_folio(swp_tb)) +			return NULL; +		folio = swp_tb_to_folio(swp_tb); +		if (likely(folio_try_get(folio))) +			return folio; +	} -	shadow = xa_load(&address_space->i_pages, idx); -	if (xa_is_value(shadow)) -		return shadow;  	return NULL;  } -/* - * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and 'swap' instead of mapping and index. +/** + * swap_cache_get_shadow - Looks up a shadow in the swap cache. + * @entry: swap entry used for the lookup. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns either NULL or an XA_VALUE (shadow).   */ -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, -			gfp_t gfp, void **shadowp) +void *swap_cache_get_shadow(swp_entry_t entry)  { -	struct address_space *address_space = swap_address_space(entry); -	pgoff_t idx = swap_cache_index(entry); -	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); -	unsigned long i, nr = folio_nr_pages(folio); -	void *old; +	unsigned long swp_tb; -	xas_set_update(&xas, workingset_update_node); +	swp_tb = swap_table_get(__swap_entry_to_cluster(entry), +				swp_cluster_offset(entry)); +	if (swp_tb_is_shadow(swp_tb)) +		return swp_tb_to_shadow(swp_tb); +	return NULL; +} -	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); -	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); -	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); +/** + * swap_cache_add_folio - Add a folio into the swap cache. + * @folio: The folio to be added. + * @entry: The swap entry corresponding to the folio. 
+ * @gfp: gfp_mask for XArray node allocation. + * @shadowp: If a shadow is found, return the shadow. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. + */ +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +{ +	void *shadow = NULL; +	unsigned long old_tb, new_tb; +	struct swap_cluster_info *ci; +	unsigned int ci_start, ci_off, ci_end; +	unsigned long nr_pages = folio_nr_pages(folio); + +	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); +	VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); +	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + +	new_tb = folio_to_swp_tb(folio); +	ci_start = swp_cluster_offset(entry); +	ci_end = ci_start + nr_pages; +	ci_off = ci_start; +	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); +	do { +		old_tb = __swap_table_xchg(ci, ci_off, new_tb); +		WARN_ON_ONCE(swp_tb_is_folio(old_tb)); +		if (swp_tb_is_shadow(old_tb)) +			shadow = swp_tb_to_shadow(old_tb); +	} while (++ci_off < ci_end); -	folio_ref_add(folio, nr); +	folio_ref_add(folio, nr_pages);  	folio_set_swapcache(folio);  	folio->swap = entry; +	swap_cluster_unlock(ci); -	do { -		xas_lock_irq(&xas); -		xas_create_range(&xas); -		if (xas_error(&xas)) -			goto unlock; -		for (i = 0; i < nr; i++) { -			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); -			if (shadowp) { -				old = xas_load(&xas); -				if (xa_is_value(old)) -					*shadowp = old; -			} -			xas_store(&xas, folio); -			xas_next(&xas); -		} -		address_space->nrpages += nr; -		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr); -		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: -		xas_unlock_irq(&xas); -	} while (xas_nomem(&xas, gfp)); - -	if (!xas_error(&xas)) -		return 0; +	node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); +	
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); -	folio_clear_swapcache(folio); -	folio_ref_sub(folio, nr); -	return xas_error(&xas); +	if (shadowp) +		*shadowp = shadow;  } -/* - * This must be called only on folios that have - * been verified to be in the swap cache. +/** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. + * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. The caller has to do that. + * + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries.   */ -void __delete_from_swap_cache(struct folio *folio, -			swp_entry_t entry, void *shadow) +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, +			    swp_entry_t entry, void *shadow)  { -	struct address_space *address_space = swap_address_space(entry); -	int i; -	long nr = folio_nr_pages(folio); -	pgoff_t idx = swap_cache_index(entry); -	XA_STATE(xas, &address_space->i_pages, idx); - -	xas_set_update(&xas, workingset_update_node); - -	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); -	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); -	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - -	for (i = 0; i < nr; i++) { -		void *entry = xas_store(&xas, shadow); -		VM_BUG_ON_PAGE(entry != folio, entry); -		xas_next(&xas); -	} +	unsigned long old_tb, new_tb; +	unsigned int ci_start, ci_off, ci_end; +	unsigned long nr_pages = folio_nr_pages(folio); + +	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); +	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); +	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); +	VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + +	new_tb = shadow_swp_to_tb(shadow); +	
ci_start = swp_cluster_offset(entry); +	ci_end = ci_start + nr_pages; +	ci_off = ci_start; +	do { +		/* If shadow is NULL, we set an empty shadow */ +		old_tb = __swap_table_xchg(ci, ci_off, new_tb); +		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || +			     swp_tb_to_folio(old_tb) != folio); +	} while (++ci_off < ci_end); +  	folio->swap.val = 0;  	folio_clear_swapcache(folio); -	address_space->nrpages -= nr; -	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); -	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); +	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); +	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);  } -/* - * This must be called only on folios that have - * been verified to be in the swap cache and locked. - * It will never put the folio into the free list, - * the caller has a reference on the folio. +/** + * swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * + * Same as __swap_cache_del_folio, but handles lock and refcount. The + * caller must ensure the folio is either clean or has a swap count + * equal to zero, or it may cause data loss. + * + * Context: Caller must ensure the folio is locked and in the swap cache.   */ -void delete_from_swap_cache(struct folio *folio) +void swap_cache_del_folio(struct folio *folio)  { +	struct swap_cluster_info *ci;  	swp_entry_t entry = folio->swap; -	struct address_space *address_space = swap_address_space(entry); -	xa_lock_irq(&address_space->i_pages); -	__delete_from_swap_cache(folio, entry, NULL); -	xa_unlock_irq(&address_space->i_pages); +	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); +	__swap_cache_del_folio(ci, folio, entry, NULL); +	swap_cluster_unlock(ci);  	put_swap_folio(folio, entry);  	folio_ref_sub(folio, folio_nr_pages(folio));  } -void clear_shadow_from_swap_cache(int type, unsigned long begin, -				unsigned long end) +/** + * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. 
+ * @old: The old folio to be replaced. + * @new: The new folio. + * + * Replace an existing folio in the swap cache with a new folio. The + * caller is responsible for setting up the new folio's flag and swap + * entries. Replacement will take the new folio's swap entry value as + * the starting offset to override all slots covered by the new folio. + * + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. + */ +void __swap_cache_replace_folio(struct swap_cluster_info *ci, +				struct folio *old, struct folio *new)  { -	unsigned long curr = begin; -	void *old; - -	for (;;) { -		swp_entry_t entry = swp_entry(type, curr); -		unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; -		struct address_space *address_space = swap_address_space(entry); -		XA_STATE(xas, &address_space->i_pages, index); - -		xas_set_update(&xas, workingset_update_node); - -		xa_lock_irq(&address_space->i_pages); -		xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { -			if (!xa_is_value(old)) -				continue; -			xas_store(&xas, NULL); -		} -		xa_unlock_irq(&address_space->i_pages); +	swp_entry_t entry = new->swap; +	unsigned long nr_pages = folio_nr_pages(new); +	unsigned int ci_off = swp_cluster_offset(entry); +	unsigned int ci_end = ci_off + nr_pages; +	unsigned long old_tb, new_tb; + +	VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); +	VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); +	VM_WARN_ON_ONCE(!entry.val); + +	/* Swap cache still stores N entries instead of a high-order entry */ +	new_tb = folio_to_swp_tb(new); +	do { +		old_tb = __swap_table_xchg(ci, ci_off, new_tb); +		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); +	} while (++ci_off < ci_end); -		/* search the next swapcache until we meet end */ -		curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); -		if (curr > end) -			break; +	/* +	 * If the old folio is 
partially replaced (e.g., splitting a large +	 * folio, the old folio is shrunk, and new split sub folios replace +	 * the shrunk part), ensure the new folio doesn't overlap it. +	 */ +	if (IS_ENABLED(CONFIG_DEBUG_VM) && +	    folio_order(old) != folio_order(new)) { +		ci_off = swp_cluster_offset(old->swap); +		ci_end = ci_off + folio_nr_pages(old); +		while (ci_off++ < ci_end) +			WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);  	}  } +/** + * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. + * + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. + */ +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) +{ +	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); +	unsigned int ci_off = swp_cluster_offset(entry), ci_end; +	unsigned long old; + +	ci_end = ci_off + nr_ents; +	do { +		old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); +		WARN_ON_ONCE(swp_tb_is_folio(old)); +	} while (++ci_off < ci_end); +} +  /*   * If we are the only user, then try to free up the swap cache.   * @@ -272,100 +363,50 @@ static inline bool swap_use_vma_readahead(void)  	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);  } -/* - * Lookup a swap entry in the swap cache. A found folio will be returned - * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the folio - * lock before returning. - * - * Caller must lock the swap device or hold a reference to keep it valid. +/** + * swap_update_readahead - Update the readahead statistics of VMA or globally. + * @folio: the swap cache folio that just got hit. + * @vma: the VMA that should be updated, could be NULL for global update. + * @addr: the addr that triggered the swapin, ignored if @vma is NULL.   
*/ -struct folio *swap_cache_get_folio(swp_entry_t entry, -		struct vm_area_struct *vma, unsigned long addr) +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, +			   unsigned long addr)  { -	struct folio *folio; - -	folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); -	if (!IS_ERR(folio)) { -		bool vma_ra = swap_use_vma_readahead(); -		bool readahead; +	bool readahead, vma_ra = swap_use_vma_readahead(); -		/* -		 * At the moment, we don't support PG_readahead for anon THP -		 * so let's bail out rather than confusing the readahead stat. -		 */ -		if (unlikely(folio_test_large(folio))) -			return folio; - -		readahead = folio_test_clear_readahead(folio); -		if (vma && vma_ra) { -			unsigned long ra_val; -			int win, hits; - -			ra_val = GET_SWAP_RA_VAL(vma); -			win = SWAP_RA_WIN(ra_val); -			hits = SWAP_RA_HITS(ra_val); -			if (readahead) -				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); -			atomic_long_set(&vma->swap_readahead_info, -					SWAP_RA_VAL(addr, win, hits)); -		} - -		if (readahead) { -			count_vm_event(SWAP_RA_HIT); -			if (!vma || !vma_ra) -				atomic_inc(&swapin_readahead_hits); -		} -	} else { -		folio = NULL; +	/* +	 * At the moment, we don't support PG_readahead for anon THP +	 * so let's bail out rather than confusing the readahead stat. +	 */ +	if (unlikely(folio_test_large(folio))) +		return; + +	readahead = folio_test_clear_readahead(folio); +	if (vma && vma_ra) { +		unsigned long ra_val; +		int win, hits; + +		ra_val = GET_SWAP_RA_VAL(vma); +		win = SWAP_RA_WIN(ra_val); +		hits = SWAP_RA_HITS(ra_val); +		if (readahead) +			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); +		atomic_long_set(&vma->swap_readahead_info, +				SWAP_RA_VAL(addr, win, hits));  	} -	return folio; -} - -/** - * filemap_get_incore_folio - Find and get a folio from the page or swap caches. - * @mapping: The address_space to search. - * @index: The page cache index. 
- * - * This differs from filemap_get_folio() in that it will also look for the - * folio in the swap cache. - * - * Return: The found folio or %NULL. - */ -struct folio *filemap_get_incore_folio(struct address_space *mapping, -		pgoff_t index) -{ -	swp_entry_t swp; -	struct swap_info_struct *si; -	struct folio *folio = filemap_get_entry(mapping, index); - -	if (!folio) -		return ERR_PTR(-ENOENT); -	if (!xa_is_value(folio)) -		return folio; -	if (!shmem_mapping(mapping)) -		return ERR_PTR(-ENOENT); - -	swp = radix_to_swp_entry(folio); -	/* There might be swapin error entries in shmem mapping. */ -	if (non_swap_entry(swp)) -		return ERR_PTR(-ENOENT); -	/* Prevent swapoff from happening to us */ -	si = get_swap_device(swp); -	if (!si) -		return ERR_PTR(-ENOENT); -	index = swap_cache_index(swp); -	folio = filemap_get_folio(swap_address_space(swp), index); -	put_swap_device(si); -	return folio; +	if (readahead) { +		count_vm_event(SWAP_RA_HIT); +		if (!vma || !vma_ra) +			atomic_inc(&swapin_readahead_hits); +	}  }  struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,  		bool skip_if_exists)  { -	struct swap_info_struct *si = swp_swap_info(entry); +	struct swap_info_struct *si = __swap_entry_to_info(entry);  	struct folio *folio;  	struct folio *new_folio = NULL;  	struct folio *result = NULL; @@ -374,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  	*new_page_allocated = false;  	for (;;) {  		int err; +  		/* -		 * First check the swap cache.  Since this is normally -		 * called after swap_cache_get_folio() failed, re-calling -		 * that would confuse statistics. +		 * Check the swap cache first, if a cached folio is found, +		 * return it unlocked. The caller will lock and check it.  		 
*/ -		folio = filemap_get_folio(swap_address_space(entry), -					  swap_cache_index(entry)); -		if (!IS_ERR(folio)) +		folio = swap_cache_get_folio(entry); +		if (folio)  			goto got_folio;  		/* @@ -423,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  			goto put_and_return;  		/* -		 * We might race against __delete_from_swap_cache(), and +		 * We might race against __swap_cache_del_folio(), and  		 * stumble across a swap_map entry whose SWAP_HAS_CACHE  		 * has not yet been cleared.  Or race against another  		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE @@ -441,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  	if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))  		goto fail_unlock; -	/* May fail (-ENOMEM) if XArray node allocation failed. */ -	if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) -		goto fail_unlock; - +	swap_cache_add_folio(new_folio, entry, &shadow);  	memcg1_swapin(entry, 1);  	if (shadow) @@ -590,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,  	unsigned long offset = entry_offset;  	unsigned long start_offset, end_offset;  	unsigned long mask; -	struct swap_info_struct *si = swp_swap_info(entry); +	struct swap_info_struct *si = __swap_entry_to_info(entry);  	struct blk_plug plug;  	struct swap_iocb *splug = NULL;  	bool page_allocated; @@ -636,41 +673,6 @@ skip:  	return folio;  } -int init_swap_address_space(unsigned int type, unsigned long nr_pages) -{ -	struct address_space *spaces, *space; -	unsigned int i, nr; - -	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); -	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); -	if (!spaces) -		return -ENOMEM; -	for (i = 0; i < nr; i++) { -		space = spaces + i; -		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); -		atomic_set(&space->i_mmap_writable, 0); -		space->a_ops = &swap_aops; -		/* swap cache doesn't 
use writeback related tags */ -		mapping_set_no_writeback_tags(space); -	} -	nr_swapper_spaces[type] = nr; -	swapper_spaces[type] = spaces; - -	return 0; -} - -void exit_swap_address_space(unsigned int type) -{ -	int i; -	struct address_space *spaces = swapper_spaces[type]; - -	for (i = 0; i < nr_swapper_spaces[type]; i++) -		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); -	kvfree(spaces); -	nr_swapper_spaces[type] = 0; -	swapper_spaces[type] = NULL; -} -  static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,  			   unsigned long *end)  { @@ -843,7 +845,7 @@ static const struct attribute_group swap_attr_group = {  	.attrs = swap_attrs,  }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void)  {  	int err;  	struct kobject *swap_kobj; @@ -858,11 +860,13 @@ static int __init swap_init_sysfs(void)  		pr_err("failed to register swap group\n");  		goto delete_obj;  	} +	/* Swap cache writeback is LRU based, no tags for it */ +	mapping_set_no_writeback_tags(&swap_space);  	return 0;  delete_obj:  	kobject_put(swap_kobj);  	return err;  } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init);  #endif | 
