From 5a80bd075f3bce24793ae1aeb06066895ec5aef0 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sat, 20 May 2023 08:40:57 +0000 Subject: block: introduce block_io_start/block_io_done tracepoints Currently, several BCC ([0]) tools (biosnoop/biostacks/biotop) use kprobes to blk_account_io_start/blk_account_io_done to implement their functionalities. This is fragile because the target kernel functions may be renamed ([1]) or inlined ([2]). So introduce two new tracepoints for such use cases. [0]: https://github.com/iovisor/bcc [1]: https://github.com/iovisor/bcc/issues/3954 [2]: https://github.com/iovisor/bcc/issues/4261 Tested-by: Francis Laniel Signed-off-by: Hengqi Chen Tested-by: Yonghong Song Link: https://lore.kernel.org/r/20230520084057.1467003-1-hengqi.chen@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/block.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/trace/events') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 7f4dfbdf12a6..40e60c33cc6f 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -245,6 +245,32 @@ DEFINE_EVENT(block_rq, block_rq_merge, TP_ARGS(rq) ); +/** + * block_io_start - insert a request for execution + * @rq: block IO operation request + * + * Called when block operation request @rq is queued for execution + */ +DEFINE_EVENT(block_rq, block_io_start, + + TP_PROTO(struct request *rq), + + TP_ARGS(rq) +); + +/** + * block_io_done - block IO operation request completed + * @rq: block IO operation request + * + * Called when block operation request @rq is completed + */ +DEFINE_EVENT(block_rq, block_io_done, + + TP_PROTO(struct request *rq), + + TP_ARGS(rq) +); + /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation -- cgit v1.2.3 From c42bebca967da88c054ccb5ab152c9822c054662 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 May 2023 09:33:00 -0400 Subject: SUNRPC: Trace struct svc_sock lifetime events Capture a timestamp and pointer address during the creation and destruction of struct svc_sock to record its lifetime. This helps to diagnose transport reference counting issues. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 39 +++++++++++++++++++++++++++------------ net/sunrpc/svcsock.c | 4 +++- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include/trace/events') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 31bc7025cb44..69e42ef30979 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2104,31 +2104,46 @@ DEFINE_SVC_DEFERRED_EVENT(drop); DEFINE_SVC_DEFERRED_EVENT(queue); DEFINE_SVC_DEFERRED_EVENT(recv); -TRACE_EVENT(svcsock_new_socket, +DECLARE_EVENT_CLASS(svcsock_lifetime_class, TP_PROTO( + const void *svsk, const struct socket *socket ), - - TP_ARGS(socket), - + TP_ARGS(svsk, socket), TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(const void *, svsk) + __field(const void *, sk) __field(unsigned long, type) __field(unsigned long, family) - __field(bool, listener) + __field(unsigned long, state) ), - TP_fast_assign( + struct sock *sk = socket->sk; + + __entry->netns_ino = sock_net(sk)->ns.inum; + __entry->svsk = svsk; + __entry->sk = sk; __entry->type = socket->type; - __entry->family = socket->sk->sk_family; - __entry->listener = (socket->sk->sk_state == TCP_LISTEN); + __entry->family = sk->sk_family; + __entry->state = sk->sk_state; ), - - TP_printk("type=%s family=%s%s", - show_socket_type(__entry->type), + TP_printk("svsk=%p type=%s family=%s%s", + __entry->svsk, show_socket_type(__entry->type), rpc_show_address_family(__entry->family), - __entry->listener ? " (listener)" : "" + __entry->state == TCP_LISTEN ? " (listener)" : "" ) ); +#define DEFINE_SVCSOCK_LIFETIME_EVENT(name) \ + DEFINE_EVENT(svcsock_lifetime_class, name, \ + TP_PROTO( \ + const void *svsk, \ + const struct socket *socket \ + ), \ + TP_ARGS(svsk, socket)) + +DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_new); +DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_free); TRACE_EVENT(svcsock_marker, TP_PROTO( diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index fbe33c922882..5f519fc0541b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1470,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - trace_svcsock_new_socket(sock); + trace_svcsock_new(svsk, sock); return svsk; } @@ -1651,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; + trace_svcsock_free(svsk, sock); + tls_handshake_cancel(sock->sk); if (sock->file) sockfd_put(sock); -- cgit v1.2.3 From ecd8b2928f2efc7b678b361d51920c15b5ef3885 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:55 +0200 Subject: mm: compaction: remove compaction result helpers Patch series "mm: compaction: cleanups & simplifications". These compaction cleanups are split out from the huge page allocator series[1], as requested by reviewer feedback. [1] https://lore.kernel.org/linux-mm/20230418191313.268131-1-hannes@cmpxchg.org/ This patch (of 5): The compaction result helpers encode quirks that are specific to the allocator's retry logic. E.g. COMPACT_SUCCESS and COMPACT_COMPLETE actually represent failures that should be retried upon, and so on. I frequently found myself pulling up the helper implementation in order to understand and work on the retry logic. They're not quite clean abstractions; rather they split the retry logic into two locations. Remove the helpers and inline the checks. Then comment on the result interpretations directly where the decision making happens. Link: https://lkml.kernel.org/r/20230519123959.77335-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20230519123959.77335-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 92 ------------------------------------------ include/trace/events/mmflags.h | 4 +- mm/page_alloc.c | 30 ++++++++------ 3 files changed, 19 insertions(+), 107 deletions(-) (limited to 'include/trace/events') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a6e512cfb670..1f0328a2ba48 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -95,78 +95,6 @@ extern enum compact_result compaction_suitable(struct zone *zone, int order, extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); -/* Compaction has made some progress and retrying makes sense */ -static inline bool compaction_made_progress(enum compact_result result) -{ - /* - * Even though this might sound confusing this in fact tells us - * that the compaction successfully isolated and migrated some - * pageblocks. - */ - if (result == COMPACT_SUCCESS) - return true; - - return false; -} - -/* Compaction has failed and it doesn't make much sense to keep retrying. */ -static inline bool compaction_failed(enum compact_result result) -{ - /* All zones were scanned completely and still not result. */ - if (result == COMPACT_COMPLETE) - return true; - - return false; -} - -/* Compaction needs reclaim to be performed first, so it can continue. */ -static inline bool compaction_needs_reclaim(enum compact_result result) -{ - /* - * Compaction backed off due to watermark checks for order-0 - * so the regular reclaim has to try harder and reclaim something. - */ - if (result == COMPACT_SKIPPED) - return true; - - return false; -} - -/* - * Compaction has backed off for some reason after doing some work or none - * at all. It might be throttling or lock contention. Retrying might be still - * worthwhile, but with a higher priority if allowed. - */ -static inline bool compaction_withdrawn(enum compact_result result) -{ - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (result == COMPACT_DEFERRED) - return true; - - /* - * If compaction in async mode encounters contention or blocks higher - * priority task we back off early rather than cause stalls. - */ - if (result == COMPACT_CONTENDED) - return true; - - /* - * Page scanners have met but we haven't scanned full zones so this - * is a back off in fact. - */ - if (result == COMPACT_PARTIAL_SKIPPED) - return true; - - return false; -} - - bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); @@ -185,26 +113,6 @@ static inline enum compact_result compaction_suitable(struct zone *zone, int ord return COMPACT_SKIPPED; } -static inline bool compaction_made_progress(enum compact_result result) -{ - return false; -} - -static inline bool compaction_failed(enum compact_result result) -{ - return false; -} - -static inline bool compaction_needs_reclaim(enum compact_result result) -{ - return false; -} - -static inline bool compaction_withdrawn(enum compact_result result) -{ - return true; -} - static inline void kcompactd_run(int nid) { } diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index b63e7c0fbbe5..1478b9dd05fa 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -223,8 +223,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ #define compact_result_to_feedback(result) \ ({ \ enum compact_result __result = result; \ - (compaction_failed(__result)) ? COMPACTION_FAILED : \ - (compaction_withdrawn(__result)) ? COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ + (__result == COMPACT_COMPLETE) ? COMPACTION_FAILED : \ + (__result == COMPACT_SUCCESS) ? COMPACTION_PROGRESS : COMPACTION_WITHDRAWN; \ }) #define COMPACTION_FEEDBACK \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9a9ba2db9e9..e3a3ebc2dfce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3469,35 +3469,39 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (fatal_signal_pending(current)) return false; - if (compaction_made_progress(compact_result)) + /* + * Compaction managed to coalesce some page blocks, but the + * allocation failed presumably due to a race. Retry some. + */ + if (compact_result == COMPACT_SUCCESS) (*compaction_retries)++; /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the + * All zones were scanned completely and still no result. It + * doesn't really make much sense to retry except when the * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) + if (compact_result == COMPACT_COMPLETE) goto check_priority; /* - * compaction was skipped because there are not enough order-0 pages - * to work with, so we retry only if it looks like reclaim can help. + * Compaction was skipped due to a lack of free order-0 + * migration targets. Continue if reclaim can help. */ - if (compaction_needs_reclaim(compact_result)) { + if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But the next retry should use a higher priority if allowed, so - * we don't just keep bailing out endlessly. + * If compaction backed due to being deferred, due to + * contended locks in async mode, or due to scanners meeting + * after a partial scan, retry with increased priority. */ - if (compaction_withdrawn(compact_result)) { + if (compact_result == COMPACT_DEFERRED || + compact_result == COMPACT_CONTENDED || + compact_result == COMPACT_PARTIAL_SKIPPED) goto check_priority; - } /* * !costly requests are much more important than __GFP_RETRY_MAYFAIL -- cgit v1.2.3 From 447ba88658faa8dbfd29d557daa38b7d88f460ec Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:54:00 +0800 Subject: mm: compaction: add trace event for fast freepages isolation The fast_isolate_freepages() can also isolate freepages, but we can not know the fast isolation efficiency to understand the fast isolation pressure. So add a trace event to show some numbers to help to understand the efficiency for fast freepages isolation. Link: https://lkml.kernel.org/r/78d2932d0160d122c15372aceb3f2c45460a17fc.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 11 +++++++++++ mm/compaction.c | 6 +++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/trace/events') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 3313eb83c117..2b2a975efd20 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -64,6 +64,17 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, + + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) +); + #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, diff --git a/mm/compaction.c b/mm/compaction.c index 3b09d8d02581..ce6293bf9c4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1416,7 +1416,7 @@ static int next_search_order(struct compact_control *cc, int order) static void fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); - unsigned int nr_scanned = 0; + unsigned int nr_scanned = 0, total_isolated = 0; unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; @@ -1515,6 +1515,7 @@ static void fast_isolate_freepages(struct compact_control *cc) set_page_private(page, order); nr_isolated = 1 << order; nr_scanned += nr_isolated - 1; + total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1535,6 +1536,9 @@ static void fast_isolate_freepages(struct compact_control *cc) limit = max(1U, limit >> 1); } + trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn, + nr_scanned, total_isolated); + if (!page) { cc->fast_search_fail++; if (scan_start) { -- cgit v1.2.3 From 36c9b4504088089185f7743433c914935b518ab2 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:42 +0530 Subject: ext4: Change remaining tracepoints to use folio ext4_readpage() is converted to ext4_read_folio() hence change the related tracepoint from trace_ext4_readpage(page) to trace_ext4_read_folio(folio). Do the same for trace_ext4_releasepage(page) to trace_ext4_release_folio(folio) As a minor bit of optimization to avoid an extra dereferencing, since both of the above functions already were dereferencing folio->mapping->host, hence change the tracepoint argument to take (inode, folio). Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/caba2b3c0147bed4ea7706767dc1d19cd0e29ab0.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 ++++--- include/trace/events/ext4.h | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02de439bf1f0..e76a95267178 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3105,7 +3105,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) int ret = -EAGAIN; struct inode *inode = folio->mapping->host; - trace_ext4_readpage(&folio->page); + trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); @@ -3164,9 +3164,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio, static bool ext4_release_folio(struct folio *folio, gfp_t wait) { - journal_t *journal = EXT4_JOURNAL(folio->mapping->host); + struct inode *inode = folio->mapping->host; + journal_t *journal = EXT4_JOURNAL(inode); - trace_ext4_releasepage(&folio->page); + trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index ebccf6a6aa1b..54cd509ced0f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -560,10 +560,10 @@ TRACE_EVENT(ext4_writepages_result, (unsigned long) __entry->writeback_index) ); -DECLARE_EVENT_CLASS(ext4__page_op, - TP_PROTO(struct page *page), +DECLARE_EVENT_CLASS(ext4__folio_op, + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page), + TP_ARGS(inode, folio), TP_STRUCT__entry( __field( dev_t, dev ) @@ -573,29 +573,29 @@ DECLARE_EVENT_CLASS(ext4__page_op, ), TP_fast_assign( - __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->ino = page->mapping->host->i_ino; - __entry->index = page->index; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = folio->index; ), - TP_printk("dev %d,%d ino %lu page_index %lu", + TP_printk("dev %d,%d ino %lu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); -DEFINE_EVENT(ext4__page_op, ext4_readpage, +DEFINE_EVENT(ext4__folio_op, ext4_read_folio, - TP_PROTO(struct page *page), + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page) + TP_ARGS(inode, folio) ); -DEFINE_EVENT(ext4__page_op, ext4_releasepage, +DEFINE_EVENT(ext4__folio_op, ext4_release_folio, - TP_PROTO(struct page *page), + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page) + TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, -- cgit v1.2.3 From 949fa3f11ced2a5c8e3737e73b09676adf4b322b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 15 Jun 2023 03:59:45 -0300 Subject: trace,smp: Add tracepoints around remotelly called functions The recently added ipi_send_{cpu,cpumask} tracepoints allow finding sources of IPIs targeting CPUs running latency-sensitive applications. For NOHZ_FULL CPUs, all IPIs are interference, and those tracepoints are sufficient to find them and work on getting rid of them. In some setups however, not *all* IPIs are to be suppressed, but long-running IPI callbacks can still be problematic. Add a pair of tracepoints to mark the start and end of processing a CSD IPI callback, similar to what exists for softirq, workqueue or timer callbacks. Signed-off-by: Leonardo Bras Tested-and-reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230615065944.188876-5-leobras@redhat.com --- include/trace/events/csd.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ kernel/smp.c | 25 +++++++++++++++++++------ 2 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 include/trace/events/csd.h (limited to 'include/trace/events') diff --git a/include/trace/events/csd.h b/include/trace/events/csd.h new file mode 100644 index 000000000000..af1df5200ae6 --- /dev/null +++ b/include/trace/events/csd.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM csd + +#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CSD_H + +#include + +/* + * Tracepoints for a function which is called as an effect of smp_call_function.* + */ +DECLARE_EVENT_CLASS(csd_function, + + TP_PROTO(smp_call_func_t func, struct __call_single_data *csd), + + TP_ARGS(func, csd), + + TP_STRUCT__entry( + __field(void *, func) + __field(void *, csd) + ), + + TP_fast_assign( + __entry->func = func; + __entry->csd = csd; + ), + + TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) +); + +DEFINE_EVENT(csd_function, csd_function_entry, + TP_PROTO(smp_call_func_t func, struct __call_single_data *csd), + TP_ARGS(func, csd) +); + +DEFINE_EVENT(csd_function, csd_function_exit, + TP_PROTO(smp_call_func_t func, struct __call_single_data *csd), + TP_ARGS(func, csd) +); + +#endif /* _TRACE_CSD_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/smp.c b/kernel/smp.c index 71dce748dbf0..1fa01a83fd83 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,6 +27,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" @@ -121,6 +124,14 @@ send_call_function_ipi_mask(struct cpumask *mask) arch_send_call_function_ipi_mask(mask); } +static __always_inline void +csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd) +{ + trace_csd_function_entry(func, csd); + func(info); + trace_csd_function_exit(func, csd); +} + #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); @@ -375,7 +386,7 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd) csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; @@ -477,7 +488,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) } csd_lock_record(csd); - func(info); + csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { @@ -508,7 +519,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) csd_lock_record(csd); csd_unlock(csd); - func(info); + csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); @@ -522,8 +533,10 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) /* * Third; only CSD_TYPE_TTWU is left, issue those. */ - if (entry) - sched_ttwu_pending(entry); + if (entry) { + csd = llist_entry(entry, typeof(*csd), node.llist); + csd_do_func(sched_ttwu_pending, entry, csd); + } } @@ -816,7 +829,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, unsigned long flags; local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); local_irq_restore(flags); } -- cgit v1.2.3 From bf5a8c26ad7caf0772a1cd48c8a0924e48bdbaf0 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 15 Jun 2023 03:59:47 -0300 Subject: trace,smp: Add tracepoints for scheduling remotelly called functions Add a tracepoint for when a CSD is queued to a remote CPU's call_single_queue. This allows finding exactly which CPU queued a given CSD when looking at a csd_function_{entry,exit} event, and also enables us to accurately measure IPI delivery time with e.g. a synthetic event: $ echo 'hist:keys=cpu,csd.hex:ts=common_timestamp.usecs' >\ /sys/kernel/tracing/events/smp/csd_queue_cpu/trigger $ echo 'csd_latency unsigned int dst_cpu; unsigned long csd; u64 time' >\ /sys/kernel/tracing/synthetic_events $ echo \ 'hist:keys=common_cpu,csd.hex:'\ 'time=common_timestamp.usecs-$ts:'\ 'onmatch(smp.csd_queue_cpu).trace(csd_latency,common_cpu,csd,$time)' >\ /sys/kernel/tracing/events/smp/csd_function_entry/trigger $ trace-cmd record -e 'synthetic:csd_latency' hackbench $ trace-cmd report <...>-467 [001] 21.824263: csd_queue_cpu: cpu=0 callsite=try_to_wake_up+0x2ea func=sched_ttwu_pending csd=0xffff8880076148b8 <...>-467 [001] 21.824280: ipi_send_cpu: cpu=0 callsite=try_to_wake_up+0x2ea callback=generic_smp_call_function_single_interrupt+0x0 <...>-489 [000] 21.824299: csd_function_entry: func=sched_ttwu_pending csd=0xffff8880076148b8 <...>-489 [000] 21.824320: csd_latency: dst_cpu=0, csd=18446612682193848504, time=36 Suggested-by: Valentin Schneider Signed-off-by: Leonardo Bras Tested-and-reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230615065944.188876-7-leobras@redhat.com --- include/trace/events/csd.h | 27 +++++++++++++++++++++++++++ kernel/smp.c | 16 +++++----------- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'include/trace/events') diff --git a/include/trace/events/csd.h b/include/trace/events/csd.h index af1df5200ae6..67e9d01f80c2 100644 --- a/include/trace/events/csd.h +++ b/include/trace/events/csd.h @@ -7,6 +7,33 @@ #include +TRACE_EVENT(csd_queue_cpu, + + TP_PROTO(const unsigned int cpu, + unsigned long callsite, + smp_call_func_t func, + struct __call_single_data *csd), + + TP_ARGS(cpu, callsite, func, csd), + + TP_STRUCT__entry( + __field(unsigned int, cpu) + __field(void *, callsite) + __field(void *, func) + __field(void *, csd) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->callsite = (void *)callsite; + __entry->func = func; + __entry->csd = csd; + ), + + TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", + __entry->cpu, __entry->callsite, __entry->func, __entry->csd) + ); + /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ diff --git a/kernel/smp.c b/kernel/smp.c index 1fa01a83fd83..385179dae360 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -340,7 +340,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ - if (trace_ipi_send_cpu_enabled()) { + if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; @@ -348,7 +348,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; - trace_ipi_send_cpu(cpu, _RET_IP_, func); + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* @@ -741,7 +741,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; - int nr_cpus = 0, nr_queued = 0; + int nr_cpus = 0; bool run_remote = false; bool run_local = false; @@ -799,21 +799,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } - nr_queued++; } - /* - * Trace each smp_function_call_*() as an IPI, actual IPIs - * will be traced with func==generic_smp_call_function_single_ipi(). - */ - if (nr_queued) - trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func); - /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the -- cgit v1.2.3 From a23c76e92d8215456ca2fbd5cf6c1ff71b744c1d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Jun 2023 10:13:51 -0400 Subject: svcrdma: trace cc_release calls This event brackets the svcrdma_post_* trace points. If this trace event is enabled but does not appear as expected, that indicates a chunk_ctxt leak. Reviewed-by: Jeff Layton Acked-by: Tom Talpey Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 8 ++++++++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/trace/events') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 8f461e04e5f0..f8069ef2ee0f 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2112,6 +2112,14 @@ DEFINE_POST_CHUNK_EVENT(read); DEFINE_POST_CHUNK_EVENT(write); DEFINE_POST_CHUNK_EVENT(reply); +DEFINE_EVENT(svcrdma_post_chunk_class, svcrdma_cc_release, + TP_PROTO( + const struct rpc_rdma_cid *cid, + int sqecount + ), + TP_ARGS(cid, sqecount) +); + TRACE_EVENT(svcrdma_wc_read, TP_PROTO( const struct ib_wc *wc, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 59ea87b5f458..9836406cc41e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -191,6 +191,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); + trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); -- cgit v1.2.3 From 2951580ba6adb082bb6b7154a5ecb24e7c1f7569 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 18 Apr 2023 16:38:54 +0200 Subject: tracing/timer: Add missing hrtimer modes to decode_hrtimer_mode(). The trace output for the HRTIMER_MODE_.*_HARD modes is seen as a number since these modes are not decoded. The author was not aware of the fancy decoding function which makes the life easier. Extend decode_hrtimer_mode() with the additional HRTIMER_MODE_.*_HARD modes. Fixes: ae6683d815895 ("hrtimer: Introduce HARD expiry mode") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Mukesh Ojha Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20230418143854.8vHWQKLM@linutronix.de --- include/trace/events/timer.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/trace/events') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 3e8619c72f77..b4bc2828fa09 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -158,7 +158,11 @@ DEFINE_EVENT(timer_class, timer_cancel, { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ - { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }) + { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ + { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ + { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ + { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ + { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_init - called when the hrtimer is initialized -- cgit v1.2.3 From 122e9ede5355071359c10a8ebca138b162ef176b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:06 +0200 Subject: btrfs: add a btrfs_finish_ordered_extent helper Add a helper to complete an ordered_extent without first doing a lookup. The tracepoint cannot use the ordered_extent class as we also want to print the range. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 19 +++++++++++++++++++ fs/btrfs/ordered-data.h | 3 +++ include/trace/events/btrfs.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'include/trace/events') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index aa4c203a1695..a629532283bc 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -368,6 +368,25 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) btrfs_queue_work(wq, &ordered->work); } +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + unsigned long flags; + bool ret; + + trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); + + spin_lock_irqsave(&inode->ordered_tree.lock, flags); + ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + + if (ret) + btrfs_queue_ordered_fn(ordered); + return ret; +} + /* * Mark all ordered extents io inside the specified range finished. * diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bd0f0368132..173bd5c5df26 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -167,6 +167,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ea9cea9bfeb..c6eee9b414cf 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -661,6 +661,35 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished, TP_ARGS(inode, ordered) ); +TRACE_EVENT(btrfs_finish_ordered_extent, + + TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 len, + bool uptodate), + + TP_ARGS(inode, start, len, uptodate), + + TP_STRUCT__entry_btrfs( + __field( u64, ino ) + __field( u64, start ) + __field( u64, len ) + __field( bool, uptodate ) + __field( u64, root_objectid ) + ), + + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); + __entry->start = start; + __entry->len = len; + __entry->uptodate = uptodate; + __entry->root_objectid = inode->root->root_key.objectid; + ), + + TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d", + show_root_type(__entry->root_objectid), + __entry->ino, __entry->start, + __entry->len, !!__entry->uptodate) +); + DECLARE_EVENT_CLASS(btrfs__writepage, TP_PROTO(const struct page *page, const struct inode *inode, -- cgit v1.2.3 From 6442550027f77a76efa7873b946c34c4a733febd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 19 Jun 2023 11:15:31 +0900 Subject: btrfs: tracepoints: also show actual number of the outstanding extents The btrfs_inode_mod_outstanding_extents trace event only shows the modified number to the number of outstanding extents. It would be helpful if we can see the resulting extent number as well. Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/trace/events') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8abf96cfea8f..d47a927b3504 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -332,7 +332,7 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), - mod); + mod, inode->outstanding_extents); } /* diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c6eee9b414cf..a8206f5332e9 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2011,25 +2011,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, ); TRACE_EVENT(btrfs_inode_mod_outstanding_extents, - TP_PROTO(const struct btrfs_root *root, u64 ino, int mod), + TP_PROTO(const struct btrfs_root *root, u64 ino, int mod, unsigned outstanding), - TP_ARGS(root, ino, mod), + TP_ARGS(root, ino, mod, outstanding), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, ino ) __field( int, mod ) + __field( unsigned, outstanding ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->ino = ino; __entry->mod = mod; + __entry->outstanding = outstanding; ), - TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d", + TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d outstanding=%u", show_root_type(__entry->root_objectid), - __entry->ino, __entry->mod) + __entry->ino, __entry->mod, __entry->outstanding) ); DECLARE_EVENT_CLASS(btrfs__block_group, -- cgit v1.2.3 From 4eb7a4a1a33bdaf259fca8528f2546c90ad18f0d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:42 +0530 Subject: ext4: Convert mballoc cr (criteria) to enum Convert criteria to be an enum so it easier to maintain and update the tracefiles to use enum names. This change also makes it easier to insert new criterias in the future. There is no functional change in this patch. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/5d82fd467bdf70ea45bdaef810af3b146013946c.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 23 +++++++++-- fs/ext4/mballoc.c | 96 ++++++++++++++++++++++----------------------- include/trace/events/ext4.h | 16 +++++++- 3 files changed, 82 insertions(+), 53 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2a7e254cbae3..914475cbb060 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -127,6 +127,23 @@ enum SHIFT_DIRECTION { SHIFT_RIGHT, }; +/* + * Number of criterias defined. For each criteria, mballoc has slightly + * different way of finding the required blocks nad usually, higher the + * criteria the slower the allocation. We start at lower criterias and keep + * falling back to higher ones if we are not able to find any blocks. + */ +#define EXT4_MB_NUM_CRS 4 +/* + * All possible allocation criterias for mballoc + */ +enum criteria { + CR0, + CR1, + CR2, + CR3, +}; + /* * Flags used in mballoc's allocation_context flags field. * @@ -1544,9 +1561,9 @@ struct ext4_sb_info { atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_cr1_bad_suggestions; - atomic64_t s_bal_cX_groups_considered[4]; - atomic64_t s_bal_cX_hits[4]; - atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8a3946f1bdd9..4359280d2fa7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,19 +154,19 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR = 0, we look for groups which have the largest_free_order >= the order + * At CR0 , we look for groups which have the largest_free_order >= the order * of the request. We directly look at the largest free order list in the data * structure (1) above where largest_free_order = order of the request. If that * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * largest_free_order. This allows us to perform CR0 lookup in O(1) time. * - * At CR = 1, we only consider groups where average fragment size > request + * At CR1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or * equal to request size using our average fragment size group lists (data * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * linear order which requires O(N) search time for each CR0 and CR1 phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -409,7 +409,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr); + ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -859,7 +859,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter, *grp; @@ -884,8 +884,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { grp = iter; break; } @@ -897,7 +897,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = 1; + *new_cr = CR1; } else { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; @@ -909,7 +909,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL, *iter; @@ -932,8 +932,8 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], bb_avg_fragment_size_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR1]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR1))) { grp = iter; break; } @@ -947,7 +947,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { - *new_cr = 2; + *new_cr = CR2; } } @@ -955,7 +955,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= 2) + if (ac->ac_criteria >= CR2) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1000,7 +1000,7 @@ inc_and_return: * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; @@ -1009,9 +1009,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == 0) { + if (*new_cr == CR0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == 1) { + } else if (*new_cr == CR1) { ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); } else { /* @@ -2408,13 +2408,13 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < 0 || cr >= 4); + BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; @@ -2428,7 +2428,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case 0: + case CR0: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2447,15 +2447,15 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case 1: + case CR1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case 2: + case CR2: if (free >= ac->ac_g_ex.fe_len) return true; break; - case 3: + case CR3: return true; default: BUG(); @@ -2476,7 +2476,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; @@ -2496,7 +2496,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR2 && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2511,7 +2511,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=0/1 is a very optimistic search to find large + /* cr=CR0/CR1 is a very optimistic search to find large * good chunks almost for free. If buddy data is not * ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, @@ -2519,7 +2519,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * and we want to make sure we locate metadata blocks * in the first block group in the flex_bg if possible. */ - if (cr < 2 && + if (cr < CR2 && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2625,7 +2625,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1, new_cr; + enum criteria cr, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2683,13 +2683,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? 0 : 1; + cr = ac->ac_2order ? CR0 : CR1; /* - * cr == 0 try to get exact allocation, - * cr == 3 try to get anything + * cr == CR0 try to get exact allocation, + * cr == CR3 try to get anything */ repeat: - for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start @@ -2716,7 +2716,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > 1 || + (cr > CR1 || prefetch_ios < sbi->s_mb_prefetch_limit)) { unsigned int curr_ios = prefetch_ios; @@ -2758,9 +2758,9 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == CR0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && sbi->s_stripe && + else if (cr == CR1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2801,7 +2801,7 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = 3; + cr = CR3; goto repeat; } } @@ -2926,36 +2926,36 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[0])); + atomic64_read(&sbi->s_bal_cX_failed[CR0])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr0_bad_suggestions)); seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[1])); + atomic64_read(&sbi->s_bal_cX_failed[CR1])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr1_bad_suggestions)); seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[2])); + atomic64_read(&sbi->s_bal_cX_failed[CR2])); seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[3])); + atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 54cd509ced0f..51eab82e897c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -120,6 +120,18 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) +TRACE_DEFINE_ENUM(CR0); +TRACE_DEFINE_ENUM(CR1); +TRACE_DEFINE_ENUM(CR2); +TRACE_DEFINE_ENUM(CR3); + +#define show_criteria(cr) \ + __print_symbolic(cr, \ + { CR0, "CR0" }, \ + { CR1, "CR1" }, \ + { CR2, "CR2" }, \ + { CR3, "CR3" }) + TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), @@ -1063,7 +1075,7 @@ TRACE_EVENT(ext4_mballoc_alloc, ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " - "result %u/%d/%u@%u blks %u grps %u cr %u flags %s " + "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, @@ -1073,7 +1085,7 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, - __entry->found, __entry->groups, __entry->cr, + __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); -- cgit v1.2.3 From 7e170922f06bf46effa7c57f6035fc463d6edc7e Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:49 +0530 Subject: ext4: Add allocation criteria 1.5 (CR1_5) CR1_5 aims to optimize allocations which can't be satisfied in CR1. The fact that we couldn't find a group in CR1 suggests that it would be difficult to find a continuous extent to compleltely satisfy our allocations. So before falling to the slower CR2, in CR1.5 we proactively trim the the preallocations so we can find a group with (free / fragments) big enough. This speeds up our allocation at the cost of slightly reduced preallocation. The patch also adds a new sysfs tunable: * /sys/fs/ext4//mb_cr1_5_max_trim_order This controls how much CR1.5 can trim a request before falling to CR2. For example, for a request of order 7 and max trim order 2, CR1.5 can trim this upto order 5. Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/150fdf65c8e4cc4dba71e020ce0859bcf636a5ff.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 ++- fs/ext4/mballoc.c | 135 +++++++++++++++++++++++++++++++++++++++++--- fs/ext4/mballoc.h | 13 +++++ fs/ext4/sysfs.c | 2 + include/trace/events/ext4.h | 2 + 5 files changed, 150 insertions(+), 10 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e4f53947f51f..fb9d2e2a9e42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -133,13 +133,14 @@ enum SHIFT_DIRECTION { * criteria the slower the allocation. We start at lower criterias and keep * falling back to higher ones if we are not able to find any blocks. */ -#define EXT4_MB_NUM_CRS 4 +#define EXT4_MB_NUM_CRS 5 /* * All possible allocation criterias for mballoc */ enum criteria { CR0, CR1, + CR1_5, CR2, CR3, }; @@ -185,6 +186,9 @@ enum criteria { #define EXT4_MB_CR0_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +#define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 + struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1549,6 +1553,7 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; + unsigned int s_mb_cr1_5_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1563,6 +1568,7 @@ struct ext4_sb_info { atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_cr1_bad_suggestions; + atomic_t s_bal_cr1_5_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7fc99b209546..cc9ad7469fbb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -165,6 +165,14 @@ * equal to request size using our average fragment size group lists (data * structure 2) in O(1) time. * + * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied + * in CR1. The fact that we couldn't find a group in CR1 suggests that there is + * no BG that has average fragment size > goal length. So before falling to the + * slower CR2, in CR1.5 we proactively trim goal length and then use the same + * fragment lists as CR1 to find a BG with a big enough average fragment size. + * This increases the chances of finding a suitable block group in O(1) time and + * results * in faster allocation at the cost of reduced size of allocation. + * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR0 and CR1 phase. * @@ -962,6 +970,91 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { + *new_cr = CR1_5; + } +} + +/* + * We couldn't find a group in CR1 so try to find the highest free fragment + * order we have and proactively trim the goal request length to that order to + * find a suitable group faster. + * + * This optimizes allocation speed at the cost of slightly reduced + * preallocations. However, we make sure that we don't trim the request too + * much and fall to CR2 in that case. + */ +static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = NULL; + int i, order, min_order; + unsigned long num_stripe_clusters = 0; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); + } + + /* + * mb_avg_fragment_size_order() returns order in a way that makes + * retrieving back the length using (1 << order) inaccurate. Hence, use + * fls() instead since we need to know the actual length while modifying + * goal length. + */ + order = fls(ac->ac_g_ex.fe_len); + min_order = order - sbi->s_mb_cr1_5_max_trim_order; + if (min_order < 0) + min_order = 0; + + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len) + 1; + + if (sbi->s_stripe > 0) { + /* + * We are assuming that stripe size is always a multiple of + * cluster ratio otherwise __ext4_fill_super exists early. + */ + num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); + if (1 << min_order < num_stripe_clusters) + min_order = fls(num_stripe_clusters); + } + + for (i = order; i >= min_order; i--) { + int frag_order; + /* + * Scale down goal len to make sure we find something + * in the free fragments list. Basically, reduce + * preallocations. + */ + ac->ac_g_ex.fe_len = 1 << i; + + if (num_stripe_clusters > 0) { + /* + * Try to round up the adjusted goal to stripe size + * (in cluster units) multiple for efficiency. + * + * XXX: Is s->stripe always a power of 2? In that case + * we can use the faster round_up() variant. + */ + ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, + num_stripe_clusters); + } + + frag_order = mb_avg_fragment_size_order(ac->ac_sb, + ac->ac_g_ex.fe_len); + + grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); + if (grp) + break; + } + + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; + } else { + /* Reset goal length to original goal length before falling into CR2 */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; *new_cr = CR2; } } @@ -1028,6 +1121,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); } else if (*new_cr == CR1) { ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else if (*new_cr == CR1_5) { + ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -2351,7 +2446,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (ac->ac_criteria < CR2) { /* - * In CR1, we are sure that this group will + * In CR1 and CR1_5, we are sure that this group will * have a large enough continuous free extent, so skip * over the smaller free extents */ @@ -2483,6 +2578,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return true; case CR1: + case CR1_5: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; @@ -2747,7 +2843,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > CR1 || + (cr > CR1_5 || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -2787,7 +2883,7 @@ repeat: ac->ac_groups_scanned++; if (cr == CR0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == CR1 && sbi->s_stripe && + else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2803,6 +2899,11 @@ repeat: /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + if (i == ngroups && ac->ac_criteria == CR1_5) + /* Reset goal length to original goal length before + * falling into CR2 */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2972,6 +3073,16 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + seq_puts(seq, "\tcr1.5_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); + seq_puts(seq, "\tcr2_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", @@ -3489,6 +3600,8 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; + /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -4392,6 +4505,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && @@ -4435,8 +4549,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); - if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len) + /* did we allocate as much as normalizer originally wanted? */ + if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } @@ -4921,7 +5037,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { int new_bex_start; int new_bex_end; @@ -4936,14 +5052,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * - * 1. Check if best ex can be kept at end of goal and still - * cover original start + * 1. Check if best ex can be kept at end of goal (before + * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original start * 3. Else, keep the best ex at start of original request. */ new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + EXT4_C2B(sbi, ac->ac_orig_goal_len); new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (ac->ac_o_ex.fe_logical >= new_bex_start) goto adjust_bex; @@ -4964,7 +5080,7 @@ adjust_bex: BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); + EXT4_C2B(sbi, ac->ac_orig_goal_len))); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5584,6 +5700,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index acfdc204e15d..bddc0335c261 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -85,6 +85,13 @@ */ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 +/* + * The maximum order upto which CR1.5 can trim a particular allocation request. + * Example, if we have an order 7 request and max trim order of 3, CR1.5 can + * trim this upto order 4. + */ +#define MB_DEFAULT_CR1_5_TRIM_ORDER 3 + /* * Number of valid buddy orders */ @@ -179,6 +186,12 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + /* + * goal len can change in CR1.5, so save the original len. This is + * used while adjusting the PA window and for accounting. + */ + ext4_grpblk_t ac_orig_goal_len; + __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 3042bc605bbf..4a5c08c8dddb 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -273,6 +274,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(mb_cr1_5_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 51eab82e897c..2a3ffa081d2c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -122,6 +122,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); TRACE_DEFINE_ENUM(CR0); TRACE_DEFINE_ENUM(CR1); +TRACE_DEFINE_ENUM(CR1_5); TRACE_DEFINE_ENUM(CR2); TRACE_DEFINE_ENUM(CR3); @@ -129,6 +130,7 @@ TRACE_DEFINE_ENUM(CR3); __print_symbolic(cr, \ { CR0, "CR0" }, \ { CR1, "CR1" }, \ + { CR1_5, "CR1.5" } \ { CR2, "CR2" }, \ { CR3, "CR3" }) -- cgit v1.2.3 From f52f3d2b9fbab73c776f4d3386393e9bbc83b87d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:50 +0530 Subject: ext4: Give symbolic names to mballoc criterias mballoc criterias have historically been called by numbers like CR0, CR1... however this makes it confusing to understand what each criteria is about. Change these criterias from numbers to symbolic names and add relevant comments. While we are at it, also reformat and add some comments to ext4_seq_mb_stats_show() for better readability. Additionally, define CR_FAST which signifies the criteria below which we can make quicker decisions like: * quitting early if (free block < requested len) * avoiding to scan free extents smaller than required len. * avoiding to initialize buddy cache and work with existing cache * limiting prefetches Suggested-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/a2dc6ec5aea5e5e68cf8e788c2a964ffead9c8b0.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 55 ++++++--- fs/ext4/mballoc.c | 271 +++++++++++++++++++++++++------------------- fs/ext4/mballoc.h | 8 +- fs/ext4/sysfs.c | 4 +- include/trace/events/ext4.h | 26 ++--- 5 files changed, 214 insertions(+), 150 deletions(-) (limited to 'include/trace/events') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb9d2e2a9e42..6a1f013d23f7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -135,16 +135,45 @@ enum SHIFT_DIRECTION { */ #define EXT4_MB_NUM_CRS 5 /* - * All possible allocation criterias for mballoc + * All possible allocation criterias for mballoc. Lower are faster. */ enum criteria { - CR0, - CR1, - CR1_5, - CR2, - CR3, + /* + * Used when number of blocks needed is a power of 2. This doesn't + * trigger any disk IO except prefetch and is the fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most suitable + * group that satisfies goal request. No disk IO except block prefetch. + */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal length to + * the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if necessary, to + * find find_suitable block group. Tries to allocate goal length but might trim + * the request if nothing is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates those. This is only + * used in rare cases when CR_GOAL_LEN_SLOW also fails to allocate + * anything. + */ + CR_ANY_FREE, }; +/* criteria below which we use fast block scanning and avoid unnecessary IO */ +#define CR_FAST CR_GOAL_LEN_SLOW + /* * Flags used in mballoc's allocation_context flags field. * @@ -183,11 +212,11 @@ enum criteria { /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 /* Large fragment size list lookup succeeded at least once for cr = 0 */ -#define EXT4_MB_CR0_OPTIMIZED 0x8000 +#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ -#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ -#define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 +#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1553,7 +1582,7 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; - unsigned int s_mb_cr1_5_max_trim_order; + unsigned int s_mb_best_avail_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1566,9 +1595,9 @@ struct ext4_sb_info { atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - atomic_t s_bal_cr0_bad_suggestions; - atomic_t s_bal_cr1_bad_suggestions; - atomic_t s_bal_cr1_5_bad_suggestions; + atomic_t s_bal_p2_aligned_bad_suggestions; + atomic_t s_bal_goal_fast_bad_suggestions; + atomic_t s_bal_best_avail_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cc9ad7469fbb..74ebe31f8d0f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,27 +154,31 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR0 , we look for groups which have the largest_free_order >= the order - * of the request. We directly look at the largest free order list in the data - * structure (1) above where largest_free_order = order of the request. If that - * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR0 lookup in O(1) time. + * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order + * >= the order of the request. We directly look at the largest free order list + * in the data structure (1) above where largest_free_order = order of the + * request. If that list is empty, we look at remaining list in the increasing + * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED + * lookup in O(1) time. * - * At CR1, we only consider groups where average fragment size > request - * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our average fragment size group lists (data - * structure 2) in O(1) time. + * At CR_GOAL_LEN_FAST, we only consider groups where + * average fragment size > request size. So, we lookup a group which has average + * fragment size just above or equal to request size using our average fragment + * size group lists (data structure 2) in O(1) time. * - * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied - * in CR1. The fact that we couldn't find a group in CR1 suggests that there is - * no BG that has average fragment size > goal length. So before falling to the - * slower CR2, in CR1.5 we proactively trim goal length and then use the same - * fragment lists as CR1 to find a BG with a big enough average fragment size. - * This increases the chances of finding a suitable block group in O(1) time and - * results * in faster allocation at the cost of reduced size of allocation. + * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied + * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in + * CR_GOAL_LEN_FAST suggests that there is no BG that has avg + * fragment size > goal length. So before falling to the slower + * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and + * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big + * enough average fragment size. This increases the chances of finding a + * suitable block group in O(1) time and results in faster allocation at the + * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR0 and CR1 phase. + * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and + * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -359,8 +363,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) - * - cr0 lists lock (cr0) - * - cr1 tree lock (cr1) + * - cr_power2_aligned lists lock (cr_power2_aligned) + * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa @@ -392,7 +396,7 @@ * * - allocation path (ext4_mb_regular_allocator) * group - * cr0/cr1 + * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -866,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ -static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -876,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (ac->ac_status == AC_STATUS_FOUND) return; - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) - atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) + atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { @@ -892,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { grp = iter; break; } @@ -905,10 +909,10 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = CR1; + *new_cr = CR_GOAL_LEN_FAST; } else { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; } } @@ -947,16 +951,16 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ -static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i; - if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); } for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); @@ -968,22 +972,22 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, if (grp) { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; } else { - *new_cr = CR1_5; + *new_cr = CR_BEST_AVAIL_LEN; } } /* - * We couldn't find a group in CR1 so try to find the highest free fragment + * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too - * much and fall to CR2 in that case. + * much and fall to CR_GOAL_LEN_SLOW in that case. */ -static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -991,9 +995,9 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, int i, order, min_order; unsigned long num_stripe_clusters = 0; - if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { + if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); + atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); } /* @@ -1003,7 +1007,7 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, * goal length. */ order = fls(ac->ac_g_ex.fe_len); - min_order = order - sbi->s_mb_cr1_5_max_trim_order; + min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1051,11 +1055,11 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, if (grp) { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; } else { - /* Reset goal length to original goal length before falling into CR2 */ + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR2; + *new_cr = CR_GOAL_LEN_SLOW; } } @@ -1063,7 +1067,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= CR2) + if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1117,12 +1121,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == CR0) { - ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == CR1) { - ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); - } else if (*new_cr == CR1_5) { - ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); + if (*new_cr == CR_POWER2_ALIGNED) { + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_GOAL_LEN_FAST) { + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_BEST_AVAIL_LEN) { + ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -2444,11 +2448,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR2) { + if (ac->ac_criteria < CR_FAST) { /* - * In CR1 and CR1_5, we are sure that this group will - * have a large enough continuous free extent, so skip - * over the smaller free extents + * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are + * sure that this group will have a large enough + * continuous free extent, so skip over the smaller free + * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); @@ -2544,7 +2549,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); + BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; @@ -2558,7 +2563,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case CR0: + case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2577,16 +2582,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case CR1: - case CR1_5: + case CR_GOAL_LEN_FAST: + case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case CR2: + case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; - case CR3: + case CR_ANY_FREE: return true; default: BUG(); @@ -2627,7 +2632,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2642,15 +2647,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=CR0/CR1 is a very optimistic search to find large - * good chunks almost for free. If buddy data is not - * ready, then this optimization makes no sense. But - * we never skip the first block group in a flex_bg, - * since this gets used for metadata block allocation, - * and we want to make sure we locate metadata blocks - * in the first block group in the flex_bg if possible. + /* + * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * search to find large good chunks almost for free. If buddy + * data is not ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, since this + * gets used for metadata block allocation, and we want to make + * sure we locate metadata blocks in the first block group in + * the flex_bg if possible. */ - if (cr < CR2 && + if (cr < CR_FAST && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2810,10 +2816,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? CR0 : CR1; + cr = ac->ac_2order ? CR_POWER2_ALIGNED : CR_GOAL_LEN_FAST; /* - * cr == CR0 try to get exact allocation, - * cr == CR3 try to get anything + * cr == CR_POWER2_ALIGNED try to get exact allocation, + * cr == CR_ANY_FREE try to get anything */ repeat: for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { @@ -2843,7 +2849,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > CR1_5 || + (cr >= CR_FAST || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -2881,9 +2887,11 @@ repeat: } ac->ac_groups_scanned++; - if (cr == CR0) + if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && + else if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2900,9 +2908,9 @@ repeat: if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); - if (i == ngroups && ac->ac_criteria == CR1_5) + if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) /* Reset goal length to original goal length before - * falling into CR2 */ + * falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } @@ -2929,7 +2937,7 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = CR3; + cr = CR_ANY_FREE; goto repeat; } } @@ -3045,66 +3053,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); - seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + seq_puts( + seq, + "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); - seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); - - seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR0])); + seq_printf(seq, "\tgroups_scanned: %u\n", + atomic_read(&sbi->s_bal_groups_scanned)); + + /* CR_POWER2_ALIGNED stats */ + seq_puts(seq, "\tcr_p2_aligned_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR0])); + atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); - seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); + /* CR_GOAL_LEN_FAST stats */ + seq_puts(seq, "\tcr_goal_fast_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR1])); + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr1_bad_suggestions)); - - seq_puts(seq, "\tcr1.5_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); + atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); + + /* CR_BEST_AVAIL_LEN stats */ + seq_puts(seq, "\tcr_best_avail_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); + atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); + atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); - seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); + /* CR_GOAL_LEN_SLOW stats */ + seq_puts(seq, "\tcr_goal_slow_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR2])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR2])); - - seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR3])); + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); + + /* CR_ANY_FREE stats */ + seq_puts(seq, "\tcr_any_free_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR3])); - seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); + + /* Aggregates */ + seq_printf(seq, "\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); - seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", + atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); - seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); @@ -3112,8 +3148,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); - seq_printf(seq, "\tdiscarded: %u\n", - atomic_read(&sbi->s_mb_discarded)); + seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } @@ -3600,7 +3635,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index bddc0335c261..df6b5e7c2274 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -86,11 +86,11 @@ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 /* - * The maximum order upto which CR1.5 can trim a particular allocation request. - * Example, if we have an order 7 request and max trim order of 3, CR1.5 can - * trim this upto order 4. + * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular + * allocation request. Example, if we have an order 7 request and max trim order + * of 3, we can trim this request upto order 4. */ -#define MB_DEFAULT_CR1_5_TRIM_ORDER 3 +#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 /* * Number of valid buddy orders diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4a5c08c8dddb..6d332dff79dd 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,7 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); +EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -274,7 +274,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), - ATTR_LIST(mb_cr1_5_max_trim_order), + ATTR_LIST(mb_best_avail_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 2a3ffa081d2c..65029dfb92fb 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -120,19 +120,19 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) -TRACE_DEFINE_ENUM(CR0); -TRACE_DEFINE_ENUM(CR1); -TRACE_DEFINE_ENUM(CR1_5); -TRACE_DEFINE_ENUM(CR2); -TRACE_DEFINE_ENUM(CR3); - -#define show_criteria(cr) \ - __print_symbolic(cr, \ - { CR0, "CR0" }, \ - { CR1, "CR1" }, \ - { CR1_5, "CR1.5" } \ - { CR2, "CR2" }, \ - { CR3, "CR3" }) +TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); +TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); +TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN); +TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW); +TRACE_DEFINE_ENUM(CR_ANY_FREE); + +#define show_criteria(cr) \ + __print_symbolic(cr, \ + { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \ + { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \ + { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \ + { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \ + { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), -- cgit v1.2.3