From f7f497cb702462e8505ff3d8d4e7722ad95626a1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Feb 2022 12:30:35 +0530 Subject: jbd2: kill t_handle_lock transaction spinlock This patch kills t_handle_lock transaction spinlock completely from jbd2. To explain the reasoning, currently there were three sites at which this spinlock was used. 1. jbd2_journal_wait_updates() a. Based on careful code review it can be seen that, we don't need this lock here. This is since we wait for any currently ongoing updates based on a atomic variable t_updates. And we anyway don't take any t_handle_lock while in stop_this_handle(). i.e. write_lock(&journal->j_state_lock() jbd2_journal_wait_updates() stop_this_handle() while (atomic_read(txn->t_updates) { | DEFINE_WAIT(wait); | prepare_to_wait(); | if (atomic_read(txn->t_updates) if (atomic_dec_and_test(txn->t_updates)) write_unlock(&journal->j_state_lock); schedule(); wake_up() write_lock(&journal->j_state_lock); finish_wait(); } txn->t_state = T_COMMIT write_unlock(&journal->j_state_lock); b. Also note that between atomic_inc(&txn->t_updates) in start_this_handle() and jbd2_journal_wait_updates(), the synchronization happens via read_lock(journal->j_state_lock) in start_this_handle(); 2. jbd2_journal_extend() a. jbd2_journal_extend() is called with the handle of each process from task_struct. So no lock required in updating member fields of handle_t b. For member fields of h_transaction, all updates happens only via atomic APIs (which is also within read_lock()). So, no need of this transaction spinlock. 3. update_t_max_wait() Based on Jan suggestion, this can be carefully removed using atomic cmpxchg API. Note that there can be several processes which are waiting for a new transaction to be allocated and started. For doing this only one process will succeed in taking write_lock() and allocating a new txn. After that all of the process will be updating the t_max_wait (max transaction wait time). This can be done via below method w/o taking any locks using atomic cmpxchg. For more details refer [1] new = get_new_val(); old = READ_ONCE(ptr->max_val); while (old < new) old = cmpxchg(&ptr->max_val, old, new); [1]: https://lwn.net/Articles/849237/ Suggested-by: Jan Kara Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/d89e599658b4a1f3893a48c6feded200073037fc.1644992076.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/jbd2.h') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9c3ada74ffb1..a787872e1e86 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -554,9 +554,6 @@ struct transaction_chp_stats_s { * ->j_list_lock * * j_state_lock - * ->t_handle_lock - * - * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ -- cgit v1.2.3 From ccd16945dba091fdf1036d7711b9f6cbd287ae28 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:43 +0000 Subject: ext4: Convert invalidatepage to invalidate_folio Extensive changes, but fairly mechanical. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- fs/ext4/inode.c | 56 ++++++++++++++++++++++----------------------- fs/jbd2/journal.c | 2 +- fs/jbd2/transaction.c | 31 ++++++++++++------------- include/linux/jbd2.h | 4 ++-- include/linux/pagemap.h | 18 +++++++++++++++ include/trace/events/ext4.h | 30 ++++++++++++------------ 6 files changed, 78 insertions(+), 63 deletions(-) (limited to 'include/linux/jbd2.h') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7086209572a..678ba122f8b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -184,7 +184,7 @@ void ext4_evict_inode(struct inode *inode) * journal. So although mm thinks everything is clean and * ready for reaping the inode might still have some pages to * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidatepage() + * checkpointed. Thus calling jbd2_journal_invalidate_folio() * (via truncate_inode_pages()) to discard these buffers can * cause data loss. Also even if we did not discard these * buffers, we would have no way to find them after the inode @@ -3186,7 +3186,7 @@ static void ext4_readahead(struct readahead_control *rac) static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - trace_ext4_invalidatepage(&folio->page, offset, length); + trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); @@ -3194,29 +3194,28 @@ static void ext4_invalidate_folio(struct folio *folio, size_t offset, block_invalidate_folio(folio, offset, length); } -static int __ext4_journalled_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +static int __ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, size_t length) { - journal_t *journal = EXT4_JOURNAL(page->mapping->host); + journal_t *journal = EXT4_JOURNAL(folio->mapping->host); - trace_ext4_journalled_invalidatepage(page, offset, length); + trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0 && length == PAGE_SIZE) - ClearPageChecked(page); + if (offset == 0 && length == folio_size(folio)) + folio_clear_checked(folio); - return jbd2_journal_invalidatepage(journal, page, offset, length); + return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ -static void ext4_journalled_invalidatepage(struct page *page, - unsigned int offset, - unsigned int length) +static void ext4_journalled_invalidate_folio(struct folio *folio, + size_t offset, + size_t length) { - WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); + WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3601,7 +3600,7 @@ static const struct address_space_operations ext4_journalled_aops = { .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_journalled_invalidatepage, + .invalidate_folio = ext4_journalled_invalidate_folio, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, @@ -5204,13 +5203,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } /* - * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate - * buffers that are attached to a page stradding i_size and are undergoing + * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate + * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { - struct page *page; unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = 0; @@ -5218,25 +5216,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * If the page is fully truncated, we don't need to wait for any commit - * (and we even should not as __ext4_journalled_invalidatepage() may - * strip all buffers from the page but keep the page dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * If the folio is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidate_folio() may + * strip all buffers from the folio but keep the folio dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in - * the page remain valid. This is most beneficial for the common case of + * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { - page = find_lock_page(inode->i_mapping, + struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!page) + if (!folio) return; - ret = __ext4_journalled_invalidatepage(page, offset, - PAGE_SIZE - offset); - unlock_page(page); - put_page(page); + ret = __ext4_journalled_invalidate_folio(folio, offset, + folio_size(folio) - offset); + folio_unlock(folio); + folio_put(folio); if (ret != -EBUSY) return; commit_tid = 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c2cf74b01ddb..fcacafa4510d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); EXPORT_SYMBOL(jbd2_journal_blocks_per_page); -EXPORT_SYMBOL(jbd2_journal_invalidatepage); +EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8e2f8275a253..d988016034e2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2219,14 +2219,14 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } /* - * jbd2_journal_invalidatepage + * jbd2_journal_invalidate_folio * * This code is tricky. It has a number of cases to deal with. * * There are two invariants which this code relies on: * - * i_size must be updated on disk before we start calling invalidatepage on the - * data. + * i_size must be updated on disk before we start calling invalidate_folio + * on the data. * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this @@ -2428,9 +2428,9 @@ zap_buffer_unlocked: } /** - * jbd2_journal_invalidatepage() + * jbd2_journal_invalidate_folio() * @journal: journal to use for flush... - * @page: page to flush + * @folio: folio to flush * @offset: start of the range to invalidate * @length: length of the range to invalidate * @@ -2439,30 +2439,29 @@ zap_buffer_unlocked: * the page is straddling i_size. Caller then has to wait for current commit * and try again. */ -int jbd2_journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned int offset, - unsigned int length) +int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, + size_t offset, size_t length) { struct buffer_head *head, *bh, *next; unsigned int stop = offset + length; unsigned int curr_off = 0; - int partial_page = (offset || length < PAGE_SIZE); + int partial_page = (offset || length < folio_size(folio)); int may_free = 1; int ret = 0; - if (!PageLocked(page)) + if (!folio_test_locked(folio)) BUG(); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) return 0; - BUG_ON(stop > PAGE_SIZE || stop < length); + BUG_ON(stop > folio_size(folio) || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ - head = bh = page_buffers(page); + bh = head; do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; @@ -2485,8 +2484,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, } while (bh != head); if (!partial_page) { - if (may_free && try_to_free_buffers(page)) - J_ASSERT(!page_has_buffers(page)); + if (may_free && try_to_free_buffers(&folio->page)) + J_ASSERT(!folio_buffers(folio)); } return 0; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9c3ada74ffb1..1b9d1e205a2f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1530,8 +1530,8 @@ void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); -extern int jbd2_journal_invalidatepage(journal_t *, - struct page *, unsigned int, unsigned int); +int jbd2_journal_invalidate_folio(journal_t *, struct folio *, + size_t offset, size_t length); extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4503d5baa252..6a9617e9c6bc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -422,6 +422,24 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping, return __filemap_get_folio(mapping, index, 0, 0); } +/** + * filemap_lock_folio - Find and lock a folio. + * @mapping: The address_space to search. + * @index: The page index. + * + * Looks up the page cache entry at @mapping & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * Context: May sleep. + * Return: A folio or %NULL if there is no folio in the cache for this + * index. Will not return a shadow, swap or DAX entry. + */ +static inline struct folio *filemap_lock_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, FGP_LOCK, 0); +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 19e957b7f941..40cca0e5a811 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -597,44 +597,44 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage, TP_ARGS(page) ); -DECLARE_EVENT_CLASS(ext4_invalidatepage_op, - TP_PROTO(struct page *page, unsigned int offset, unsigned int length), +DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, + TP_PROTO(struct folio *folio, size_t offset, size_t length), - TP_ARGS(page, offset, length), + TP_ARGS(folio, offset, length), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) - __field( unsigned int, offset ) - __field( unsigned int, length ) + __field( size_t, offset ) + __field( size_t, length ) ), TP_fast_assign( - __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->ino = page->mapping->host->i_ino; - __entry->index = page->index; + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->ino = folio->mapping->host->i_ino; + __entry->index = folio->index; __entry->offset = offset; __entry->length = length; ), - TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", + TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); -DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, - TP_PROTO(struct page *page, unsigned int offset, unsigned int length), +DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio, + TP_PROTO(struct folio *folio, size_t offset, size_t length), - TP_ARGS(page, offset, length) + TP_ARGS(folio, offset, length) ); -DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, - TP_PROTO(struct page *page, unsigned int offset, unsigned int length), +DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio, + TP_PROTO(struct folio *folio, size_t offset, size_t length), - TP_ARGS(page, offset, length) + TP_ARGS(folio, offset, length) ); TRACE_EVENT(ext4_discard_blocks, -- cgit v1.2.3 From c56a6eb03deb187c989a966fda5a254249b56c2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 00:46:03 -0400 Subject: jbd2: Convert jbd2_journal_try_to_free_buffers to take a folio Also convert it to return a bool since it's called from release_folio(). Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Theodore Ts'o Reviewed-by: Jeff Layton --- fs/ext4/inode.c | 2 +- fs/jbd2/transaction.c | 12 ++++++------ include/linux/jbd2.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux/jbd2.h') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 52c46ac5bc8a..943937cb5302 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3253,7 +3253,7 @@ static bool ext4_release_folio(struct folio *folio, gfp_t wait) if (folio_test_checked(folio)) return false; if (journal) - return jbd2_journal_try_to_free_buffers(journal, &folio->page); + return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(&folio->page); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index fcb9175016a5..ee33d277d51e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2143,17 +2143,17 @@ out: * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? * - * Return 0 on failure, 1 on success + * Return false on failure, true on success */ -int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) +bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) { struct buffer_head *head; struct buffer_head *bh; - int ret = 0; + bool ret = false; - J_ASSERT(PageLocked(page)); + J_ASSERT(folio_test_locked(folio)); - head = page_buffers(page); + head = folio_buffers(folio); bh = head; do { struct journal_head *jh; @@ -2175,7 +2175,7 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) goto busy; } while ((bh = bh->b_this_page) != head); - ret = try_to_free_buffers(page); + ret = try_to_free_buffers(&folio->page); busy: return ret; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index de9536680b2b..e79d6e0b14e8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1529,7 +1529,7 @@ extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); -extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); +bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); -- cgit v1.2.3