Diffstat (limited to 'fs')
80 files changed, 4649 insertions, 2552 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ea95c90c8474..4438637c8900 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS
 config BTRFS_DEBUG
 	bool "Btrfs debugging support"
 	depends on BTRFS_FS
+	select REF_TRACKER if STACKTRACE_SUPPORT
 	help
 	  Enable run-time debugging support for the btrfs filesystem.
@@ -117,14 +118,3 @@ config BTRFS_EXPERIMENTAL
 	    - large folio support

 	  If unsure, say N.
-
-config BTRFS_FS_REF_VERIFY
-	bool "Btrfs with the ref verify tool compiled in"
-	depends on BTRFS_FS
-	default n
-	help
-	  Enable run-time extent reference verification instrumentation. This
-	  is meant to be used by btrfs developers for tracking down extent
-	  reference problems or verifying they didn't break something.
-
-	  If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 2d5f0482678b..743d7677b175 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o
 btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
 btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 861c7d92c437..1248aa2535d3 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1,
  * gives us all the type checking.
  *
  * The extent buffer pages stored in the array folios may not form a contiguous
- * phyusical range, but the API functions assume the linear offset to the range
+ * physical range, but the API functions assume the linear offset to the range
  * from 0 to metadata node size.
  */
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6a450be293b1..2ab550a1e715 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			free_pref(ref);
 			return PTR_ERR(eb);
 		}
-		if (!extent_buffer_uptodate(eb)) {
+		if (unlikely(!extent_buffer_uptodate(eb))) {
 			free_pref(ref);
 			free_extent_buffer(eb);
 			return -EIO;
@@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
 		iref = (struct btrfs_extent_inline_ref *)ptr;
 		type = btrfs_get_extent_inline_ref_type(leaf, iref,
 							BTRFS_REF_TYPE_ANY);
-		if (type == BTRFS_REF_TYPE_INVALID)
+		if (unlikely(type == BTRFS_REF_TYPE_INVALID))
 			return -EUCLEAN;

 		offset = btrfs_extent_inline_ref_offset(leaf, iref);
@@ -1422,7 +1422,7 @@ again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
-	if (ret == 0) {
+	if (unlikely(ret == 0)) {
 		/*
 		 * Key with offset -1 found, there would have to exist an extent
 		 * item with such offset, but this is out of the valid range.
@@ -1614,7 +1614,7 @@ again:
 			ret = PTR_ERR(eb);
 			goto out;
 		}
-		if (!extent_buffer_uptodate(eb)) {
+		if (unlikely(!extent_buffer_uptodate(eb))) {
 			free_extent_buffer(eb);
 			ret = -EIO;
 			goto out;
@@ -1652,7 +1652,7 @@ again:
 		 * case.
 		 */
 		ASSERT(eie);
-		if (!eie) {
+		if (unlikely(!eie)) {
 			ret = -EUCLEAN;
 			goto out;
 		}
@@ -1690,7 +1690,7 @@ out:
 * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
 * added to the ulist at @ctx->refs, and that ulist is allocated by this
 * function. The caller should free the ulist with free_leaf_list() if
- * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
 * enough.
 *
 * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
@@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
-	if (ret == 0) {
+	if (unlikely(ret == 0)) {
 		/*
 		 * Key with offset -1 found, there would have to exist an extent
 		 * item with such offset, but this is out of the valid range.
@@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr,
 	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
 	*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
 						     BTRFS_REF_TYPE_ANY);
-	if (*out_type == BTRFS_REF_TYPE_INVALID)
+	if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID))
 		return -EUCLEAN;

 	*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
-	if (ret == 0) {
+	if (unlikely(ret == 0)) {
 		/*
 		 * Key with offset -1 found, there would have to exist an extent
 		 * item with such offset, but this is out of the valid range.
@@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 		ret = -EUCLEAN;
 		goto release;
 	}
-	if (path->slots[0] == 0) {
+	if (unlikely(path->slots[0] == 0)) {
 		DEBUG_WARN();
 		ret = -EUCLEAN;
 		goto release;
@@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
 		if (ret < 0)
 			goto out;
 		/* No extra backref? This means the tree block is corrupted */
-		if (ret > 0) {
+		if (unlikely(ret > 0)) {
 			ret = -EUCLEAN;
 			goto out;
 		}
@@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
 			((unsigned long)iter->cur_ptr);
 		type = btrfs_get_extent_inline_ref_type(eb, iref,
 							BTRFS_REF_TYPE_BLOCK);
-		if (type == BTRFS_REF_TYPE_INVALID) {
+		if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
 			ret = -EUCLEAN;
 			goto out;
 		}
@@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
 	}

 	/* Sanity check, we shouldn't have any unchecked nodes */
-	if (!upper->checked) {
+	if (unlikely(!upper->checked)) {
 		DEBUG_WARN("we should not have any unchecked nodes");
 		return -EUCLEAN;
 	}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 34b0193a181c..25d51c246070 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx {
 	 * It's very common to have several file extent items that point to the
 	 * same extent (bytenr) but with different offsets and lengths. This
 	 * typically happens for COW writes, partial writes into prealloc
-	 * extents, NOCOW writes after snapshoting a root, hole punching or
+	 * extents, NOCOW writes after snapshotting a root, hole punching or
 	 * reflinking within the same file (less common perhaps).
 	 * So keep a small cache with the lookup results for the extent pointed
 	 * by the last few file extent items. This cache is checked, with a
@@ -414,7 +414,7 @@ struct btrfs_backref_cache {
 	/*
 	 * Whether this cache is for relocation
 	 *
-	 * Reloction backref cache require more info for reloc root compared
+	 * Relocation backref cache require more info for reloc root compared
 	 * to generic backref cache.
 	 */
 	bool is_reloc;
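The Kconfig hunk above folds the old BTRFS_FS_REF_VERIFY option into BTRFS_DEBUG and pulls in the kernel's generic REF_TRACKER facility, which the btrfs_delayed_node_ref_tracker_*() wrappers later in this diff build on. Below is a minimal sketch of the ref_tracker pattern those wrappers follow; demo_object and its functions are hypothetical names, and the ref_tracker signatures are abbreviated from include/linux/ref_tracker.h and may differ between kernel versions.

#include <linux/ref_tracker.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_object {
	refcount_t refs;
	struct ref_tracker_dir dir;	/* one tracking dir per object */
};

static struct demo_object *demo_new(void)
{
	struct demo_object *obj = kzalloc(sizeof(*obj), GFP_NOFS);

	if (!obj)
		return NULL;
	refcount_set(&obj->refs, 1);
	/* Quarantine depth and name are illustrative values. */
	ref_tracker_dir_init(&obj->dir, 16, "demo");
	return obj;
}

static void demo_get(struct demo_object *obj, struct ref_tracker **trackerp)
{
	refcount_inc(&obj->refs);
	/* Records a stack trace for this particular reference. */
	ref_tracker_alloc(&obj->dir, trackerp, GFP_NOFS);
}

static void demo_put(struct demo_object *obj, struct ref_tracker **trackerp)
{
	ref_tracker_free(&obj->dir, trackerp);
	if (refcount_dec_and_test(&obj->refs)) {
		/* Reports any references that were never freed. */
		ref_tracker_dir_exit(&obj->dir);
		kfree(obj);
	}
}

The payoff over a bare refcount_t is that a leaked or double-freed reference is reported with the stack trace of the exact get/put pair, rather than as an anonymous refcount imbalance.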
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 50b5fc1c06d7..21df48e6c4fa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 		refcount_inc(&orig_bbio->ordered->refs);
 		bbio->ordered = orig_bbio->ordered;
 	}
+	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
 	atomic_inc(&orig_bbio->pending_ios);
 	return bbio;
 }
@@ -166,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 	int mirror = repair_bbio->mirror_num;

 	if (repair_bbio->bio.bi_status ||
-	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) {
 		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
 		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
@@ -203,18 +204,21 @@ done:
  */
 static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 						  u32 bio_offset,
-						  struct bio_vec *bv,
+						  phys_addr_t paddr,
 						  struct btrfs_failed_bio *fbio)
 {
 	struct btrfs_inode *inode = failed_bbio->inode;
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct folio *folio = page_folio(phys_to_page(paddr));
 	const u32 sectorsize = fs_info->sectorsize;
+	const u32 foff = offset_in_folio(folio, paddr);
 	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
 	struct btrfs_bio *repair_bbio;
 	struct bio *repair_bio;
 	int num_copies;
 	int mirror;

+	ASSERT(foff + sectorsize <= folio_size(folio));
 	btrfs_debug(fs_info, "repair read error: read error at %llu",
 		    failed_bbio->file_offset + bio_offset);
@@ -237,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
 				      &btrfs_repair_bioset);
 	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
-	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+	bio_add_folio_nofail(repair_bio, folio, sectorsize, foff);

 	repair_bbio = btrfs_bio(repair_bio);
 	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
@@ -258,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
 	struct bvec_iter *iter = &bbio->saved_iter;
 	blk_status_t status = bbio->bio.bi_status;
 	struct btrfs_failed_bio *fbio = NULL;
+	phys_addr_t paddr;
 	u32 offset = 0;

 	/* Read-repair requires the inode field to be set by the submitter. */
@@ -275,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
 	/* Clear the I/O error. A failed repair will reset it. */
 	bbio->bio.bi_status = BLK_STS_OK;

-	while (iter->bi_size) {
-		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
-
-		bv.bv_len = min(bv.bv_len, sectorsize);
-		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
-			fbio = repair_one_sector(bbio, offset, &bv, fbio);
-
-		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
+	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) {
+		if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr))
+			fbio = repair_one_sector(bbio, offset, paddr, fbio);
 		offset += sectorsize;
 	}
-
 	if (bbio->csum != bbio->csum_inline)
 		kfree(bbio->csum);
@@ -780,11 +779,38 @@ end_bbio:
 	return true;
 }

+static void assert_bbio_alignment(struct btrfs_bio *bbio)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	const u32 blocksize = fs_info->sectorsize;
+
+	/* Metadata has no extra bs > ps alignment requirement. */
+	if (!is_data_bbio(bbio))
+		return;
+
+	bio_for_each_bvec(bvec, &bbio->bio, iter)
+		ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) &&
+		       IS_ALIGNED(bvec.bv_len, blocksize),
+		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
+		       btrfs_root_id(bbio->inode->root),
+		       btrfs_ino(bbio->inode),
+		       bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT,
+		       bbio->bio.bi_iter.bi_size, iter.bi_idx,
+		       bvec.bv_offset,
+		       bvec.bv_len);
+#endif
+}
+
 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
 {
 	/* If bbio->inode is not populated, its file_offset must be 0. */
 	ASSERT(bbio->inode || bbio->file_offset == 0);

+	assert_bbio_alignment(bbio);
+
 	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
@@ -823,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	if (ret < 0)
 		goto out_counter_dec;

-	if (!smap.dev->bdev ||
-	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
+	if (unlikely(!smap.dev->bdev ||
+		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
 		ret = -EIO;
 		goto out_counter_dec;
 	}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index dc2eb43b7097..00883aea55d7 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -82,6 +82,8 @@ struct btrfs_bio {
 	/* Save the first error status of split bio. */
 	blk_status_t status;

+	/* Use the commit root to look up csums (data read bio only). */
+	bool csum_search_commit_root;
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
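The bio.c/bio.h changes above drop bio_vec pointers from the read-repair and csum paths in favor of raw physical addresses, so a block no longer has to be describable as a single (page, offset) pair; that is what makes block size > page size bvecs tractable. A sketch of what the new btrfs_bio_for_each_block() iteration boils down to, using only long-standing block-layer helpers; verify_block() is a hypothetical stand-in for the csum check:

#include <linux/bio.h>
#include <linux/bvec.h>

/* Hypothetical consumer: check one filesystem block at @paddr. */
static void verify_block(phys_addr_t paddr, u32 blocksize)
{
}

static void demo_for_each_block(struct bio *bio, struct bvec_iter *iter,
				u32 blocksize)
{
	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(bio, *iter);
		/* The physical address identifies the block without any
		 * page/folio arithmetic at the call site. */
		phys_addr_t paddr = bvec_phys(&bv);

		verify_block(paddr, blocksize);
		bio_advance_iter_single(bio, iter, blocksize);
	}
}

The consumer can recover a folio from the address when it needs one, as repair_one_sector() above does with page_folio(phys_to_page(paddr)).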
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index fcd274d83fd7..5322ef2ae015 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
  * data in this block group. That check should be done by relocation routine,
  * not this function.
  */
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	u64 num_bytes;
@@ -1971,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		 * called, which is where we will transfer a reserved extent's
 		 * size from the "reserved" counter to the "used" counter - this
 		 * happens when running delayed references. When we relocate the
-		 * chunk below, relocation first flushes dellaloc, waits for
+		 * chunk below, relocation first flushes delalloc, waits for
 		 * ordered extent completion (which is where we create delayed
 		 * references for data extents) and commits the current
 		 * transaction (which runs delayed references), and only after
@@ -2071,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
 		return -ENOENT;
 	}

-	if (map->start != key->objectid || map->chunk_len != key->offset) {
+	if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
 		btrfs_err(fs_info,
 			"block group %llu len %llu mismatch with chunk %llu len %llu",
 			  key->objectid, key->offset, map->start, map->chunk_len);
@@ -2084,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
 	flags = btrfs_stack_block_group_flags(&bg) &
 		BTRFS_BLOCK_GROUP_TYPE_MASK;

-	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+	if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
 		btrfs_err(fs_info,
 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 			  key->objectid, key->offset, flags,
@@ -2245,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
 			return ret;

 		/* Shouldn't have super stripes in sequential zones */
-		if (zoned && nr) {
+		if (unlikely(zoned && nr)) {
 			kfree(logical);
 			btrfs_err(fs_info,
 			"zoned: block group %llu must not contain super block",
@@ -2336,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 			break;

 		bg = btrfs_lookup_block_group(fs_info, map->start);
-		if (!bg) {
+		if (unlikely(!bg)) {
 			btrfs_err(fs_info,
 	"chunk start=%llu len=%llu doesn't have corresponding block group",
 				     map->start, map->chunk_len);
@@ -2344,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 			btrfs_free_chunk_map(map);
 			break;
 		}
-		if (bg->start != map->start || bg->length != map->chunk_len ||
-		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
-		    (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+		if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+			     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+			     (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
 			btrfs_err(fs_info,
 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
 				  map->start, map->chunk_len,
@@ -2839,7 +2839,7 @@ next:
 	 * space or none at all (due to no need to COW, extent buffers
 	 * were already COWed in the current transaction and still
 	 * unwritten, tree heights lower than the maximum possible
-	 * height, etc). For data we generally reserve the axact amount
+	 * height, etc). For data we generally reserve the exact amount
 	 * of space we are going to allocate later, the exception is
 	 * when using compression, as we must reserve space based on the
 	 * uncompressed data size, because the compression is only done
@@ -3248,7 +3248,7 @@ again:
 	 */
 	BTRFS_I(inode)->generation = 0;
 	ret = btrfs_update_inode(trans, BTRFS_I(inode));
-	if (ret) {
+	if (unlikely(ret)) {
 		/*
 		 * So theoretically we could recover from this, simply set the
 		 * super cache generation to 0 so we know to invalidate the
@@ -3995,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
 		struct btrfs_space_info *sys_space_info;

 		sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
-		if (!sys_space_info) {
+		if (unlikely(!sys_space_info)) {
 			ret = -EINVAL;
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -4009,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
 		}

 		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}

 		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
-	} else if (ret) {
+	} else if (unlikely(ret)) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index a8bb8429c966..9172104a5889 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -63,7 +63,7 @@ enum btrfs_discard_state {
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
- * find_free_extent() that also activaes the zone
+ * find_free_extent() that also activates the zone
 */
enum btrfs_chunk_alloc_enum {
	CHUNK_ALLOC_NO_FORCE,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3bb504c1e32a..af373d50a901 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -537,9 +537,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
 	/* We only allow BITS_PER_LONGS blocks for each bitmap. */
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
-	mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0,
-			ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits)
-			       >> PAGE_SHIFT)));
+	mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
+				      inode->root->fs_info->block_min_order,
+				      inode->root->fs_info->block_max_order);
 #endif
 }

@@ -547,10 +547,12 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
 #define CSUM_FMT				"0x%*phN"
 #define CSUM_FMT_VALUE(size, bytes)		size, bytes

-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
-			    const u8 * const csum_expected);
+void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
+				u8 *dest);
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+			   const u8 * const csum_expected);
 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
-			u32 bio_offset, struct bio_vec *bv);
+			u32 bio_offset, phys_addr_t paddr);
 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
 			      struct btrfs_file_extent *file_extent,
 			      bool nowait);
@@ -563,7 +565,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		       const struct fscrypt_str *name);
 int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
-		   const struct fscrypt_str *name, int add_backref, u64 index);
+		   const struct fscrypt_str *name, bool add_backref, u64 index);
 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
 int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
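The btrfs_inode.h hunk above stops deriving the folio order bound from BITS_PER_LONG and instead uses the block_min_order/block_max_order fields this series adds to fs_info. The minimum order is the bs > ps constraint in one number: every page-cache folio must cover at least one filesystem block. A self-contained sketch of that arithmetic, with demo_set_order() as a hypothetical helper:

#include <linux/log2.h>
#include <linux/pagemap.h>

static void demo_set_order(struct address_space *mapping, u32 blocksize)
{
	/* 0 when blocksize <= PAGE_SIZE, else log2 of the pages per block. */
	unsigned int min_order = 0;

	if (blocksize > PAGE_SIZE)
		min_order = ilog2(blocksize >> PAGE_SHIFT);

	/* The page cache then never hands back a folio smaller than a block. */
	mapping_set_folio_order_range(mapping, min_order, MAX_PAGECACHE_ORDER);
}

For example, a 16K block size on a 4K-page machine yields min_order = 2, i.e. folios of at least four pages.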
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 35e3071cec06..bacad18357b3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
 }

 static int compression_compress_pages(int type, struct list_head *ws,
-				      struct address_space *mapping, u64 start,
+				      struct btrfs_inode *inode, u64 start,
 				      struct folio **folios, unsigned long *out_folios,
 				      unsigned long *total_in, unsigned long *total_out)
 {
 	switch (type) {
 	case BTRFS_COMPRESS_ZLIB:
-		return zlib_compress_folios(ws, mapping, start, folios,
+		return zlib_compress_folios(ws, inode, start, folios,
 					    out_folios, total_in, total_out);
 	case BTRFS_COMPRESS_LZO:
-		return lzo_compress_folios(ws, mapping, start, folios,
+		return lzo_compress_folios(ws, inode, start, folios,
 					   out_folios, total_in, total_out);
 	case BTRFS_COMPRESS_ZSTD:
-		return zstd_compress_folios(ws, mapping, start, folios,
+		return zstd_compress_folios(ws, inode, start, folios,
 					    out_folios, total_in, total_out);
 	case BTRFS_COMPRESS_NONE:
 	default:
@@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
 /*
  * Common wrappers for page allocation from compression wrappers
  */
-struct folio *btrfs_alloc_compr_folio(void)
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
 {
 	struct folio *folio = NULL;

+	/* For bs > ps cases, no cached folio pool for now. */
+	if (fs_info->block_min_order)
+		goto alloc;
+
 	spin_lock(&compr_pool.lock);
 	if (compr_pool.count > 0) {
 		folio = list_first_entry(&compr_pool.list, struct folio, lru);
@@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void)
 	if (folio)
 		return folio;

-	return folio_alloc(GFP_NOFS, 0);
+alloc:
+	return folio_alloc(GFP_NOFS, fs_info->block_min_order);
 }

 void btrfs_free_compr_folio(struct folio *folio)
 {
 	bool do_free = false;

+	/* The folio is from bs > ps fs, no cached pool for now. */
+	if (folio_order(folio))
+		goto free;
+
 	spin_lock(&compr_pool.lock);
 	if (compr_pool.count > compr_pool.thresh) {
 		do_free = true;
@@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio)
 	if (!do_free)
 		return;

+free:
 	ASSERT(folio_ref_count(folio) == 1);
 	folio_put(folio);
 }
@@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio)

 static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
 {
+	struct btrfs_fs_info *fs_info = cb->bbio.fs_info;
 	struct bio *bio = &cb->bbio.bio;
 	u32 offset = 0;

 	while (offset < cb->compressed_len) {
+		struct folio *folio;
 		int ret;
-		u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+		u32 len = min_t(u32, cb->compressed_len - offset,
+				btrfs_min_folio_size(fs_info));

+		folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)];
 		/* Maximum compressed extent is smaller than bio size limit. */
-		ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
-				    len, 0);
+		ret = bio_add_folio(bio, folio, len, 0);
 		ASSERT(ret);
 		offset += len;
 	}
@@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	if (fs_info->sectorsize < PAGE_SIZE)
 		return 0;

+	/* For bs > ps cases, we don't support readahead for compressed folios for now. */
+	if (fs_info->block_min_order)
+		return 0;
+
 	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

 	while (cur < compressed_end) {
@@ -602,17 +619,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 	cb->compressed_len = compressed_len;
 	cb->compress_type = btrfs_extent_map_compression(em);
 	cb->orig_bbio = bbio;
+	cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;

 	btrfs_free_extent_map(em);

-	cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+	cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
 	cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
 	if (!cb->compressed_folios) {
 		status = BLK_STS_RESOURCE;
 		goto out_free_bio;
 	}

-	ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
+	ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
+				      cb->compressed_folios);
 	if (ret) {
 		status = BLK_STS_RESOURCE;
 		goto out_free_compressed_pages;
@@ -687,8 +706,6 @@ struct heuristic_ws {
 	struct list_head list;
 };

-static struct workspace_manager heuristic_wsm;
-
 static void free_heuristic_ws(struct list_head *ws)
 {
 	struct heuristic_ws *workspace;
@@ -701,7 +718,7 @@ static void free_heuristic_ws(struct list_head *ws)
 	kfree(workspace);
 }

-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
 {
 	struct heuristic_ws *ws;

@@ -728,11 +745,9 @@ fail:
 	return ERR_PTR(-ENOMEM);
 }

-const struct btrfs_compress_op btrfs_heuristic_compress = {
-	.workspace_manager = &heuristic_wsm,
-};
+const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };

-static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
 	/* The heuristic is represented as compression type 0 */
 	&btrfs_heuristic_compress,
 	&btrfs_zlib_compress,
@@ -740,13 +755,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zstd_compress,
 };

-static struct list_head *alloc_workspace(int type, int level)
+static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
 {
 	switch (type) {
-	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
-	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
-	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace();
-	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
+	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
+	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(fs_info);
+	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
 	default:
 		/*
 		 * This can't happen, the type is validated several times
@@ -772,44 +787,58 @@ static void free_workspace(int type, struct list_head *ws)
 	}
 }

-static void btrfs_init_workspace_manager(int type)
+static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
+				   enum btrfs_compression_type type)
 {
-	struct workspace_manager *wsm;
+	struct workspace_manager *gwsm;
 	struct list_head *workspace;

-	wsm = btrfs_compress_op[type]->workspace_manager;
-	INIT_LIST_HEAD(&wsm->idle_ws);
-	spin_lock_init(&wsm->ws_lock);
-	atomic_set(&wsm->total_ws, 0);
-	init_waitqueue_head(&wsm->ws_wait);
+	ASSERT(fs_info->compr_wsm[type] == NULL);
+	gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
+	if (!gwsm)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&gwsm->idle_ws);
+	spin_lock_init(&gwsm->ws_lock);
+	atomic_set(&gwsm->total_ws, 0);
+	init_waitqueue_head(&gwsm->ws_wait);
+	fs_info->compr_wsm[type] = gwsm;

 	/*
 	 * Preallocate one workspace for each compression type so we can
 	 * guarantee forward progress in the worst case
 	 */
-	workspace = alloc_workspace(type, 0);
+	workspace = alloc_workspace(fs_info, type, 0);
 	if (IS_ERR(workspace)) {
-		btrfs_warn(NULL,
-			   "cannot preallocate compression workspace, will try later");
+		btrfs_warn(fs_info,
+			   "cannot preallocate compression workspace for %s, will try later",
+			   btrfs_compress_type2str(type));
 	} else {
-		atomic_set(&wsm->total_ws, 1);
-		wsm->free_ws = 1;
-		list_add(workspace, &wsm->idle_ws);
+		atomic_set(&gwsm->total_ws, 1);
+		gwsm->free_ws = 1;
+		list_add(workspace, &gwsm->idle_ws);
 	}
+	return 0;
 }

-static void btrfs_cleanup_workspace_manager(int type)
+static void free_workspace_manager(struct btrfs_fs_info *fs_info,
+				   enum btrfs_compression_type type)
 {
-	struct workspace_manager *wsman;
 	struct list_head *ws;
+	struct workspace_manager *gwsm = fs_info->compr_wsm[type];

-	wsman = btrfs_compress_op[type]->workspace_manager;
-	while (!list_empty(&wsman->idle_ws)) {
-		ws = wsman->idle_ws.next;
+	/* ZSTD uses its own workspace manager, should enter here. */
+	ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
+	if (!gwsm)
+		return;
+	fs_info->compr_wsm[type] = NULL;
+	while (!list_empty(&gwsm->idle_ws)) {
+		ws = gwsm->idle_ws.next;
 		list_del(ws);
 		free_workspace(type, ws);
-		atomic_dec(&wsman->total_ws);
+		atomic_dec(&gwsm->total_ws);
 	}
+	kfree(gwsm);
 }

 /*
@@ -818,9 +847,9 @@ static void btrfs_cleanup_workspace_manager(int type)
 * Preallocation makes a forward progress guarantees and we do not return
 * errors.
 */
-struct list_head *btrfs_get_workspace(int type, int level)
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
 {
-	struct workspace_manager *wsm;
+	struct workspace_manager *wsm = fs_info->compr_wsm[type];
 	struct list_head *workspace;
 	int cpus = num_online_cpus();
 	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
@@ -830,7 +859,7 @@ struct list_head *btrfs_get_workspace(int type, int level)
 	wait_queue_head_t *ws_wait;
 	int *free_ws;

-	wsm = btrfs_compress_op[type]->workspace_manager;
+	ASSERT(wsm);
 	idle_ws	 = &wsm->idle_ws;
 	ws_lock	 = &wsm->ws_lock;
 	total_ws = &wsm->total_ws;
@@ -866,7 +895,7 @@ again:
 	 * context of btrfs_compress_bio/btrfs_compress_pages
 	 */
 	nofs_flag = memalloc_nofs_save();
-	workspace = alloc_workspace(type, level);
+	workspace = alloc_workspace(fs_info, type, level);
 	memalloc_nofs_restore(nofs_flag);

 	if (IS_ERR(workspace)) {
@@ -889,7 +918,7 @@ again:
 					/* no burst */ 1);

 			if (__ratelimit(&_rs))
-				btrfs_warn(NULL,
+				btrfs_warn(fs_info,
 			"no compression workspaces, low memory, retrying");
 		}
 		goto again;
@@ -897,13 +926,13 @@ again:
 	return workspace;
 }

-static struct list_head *get_workspace(int type, int level)
+static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
 {
 	switch (type) {
-	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
-	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
-	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
-	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
+	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
+	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(fs_info, type, level);
+	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
 	default:
 		/*
 		 * This can't happen, the type is validated several times
@@ -917,21 +946,21 @@ static struct list_head *get_workspace(int type, int level)
 * put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around
 */
-void btrfs_put_workspace(int type, struct list_head *ws)
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
 {
-	struct workspace_manager *wsm;
+	struct workspace_manager *gwsm = fs_info->compr_wsm[type];
 	struct list_head *idle_ws;
 	spinlock_t *ws_lock;
 	atomic_t *total_ws;
 	wait_queue_head_t *ws_wait;
 	int *free_ws;

-	wsm = btrfs_compress_op[type]->workspace_manager;
-	idle_ws	 = &wsm->idle_ws;
-	ws_lock	 = &wsm->ws_lock;
-	total_ws = &wsm->total_ws;
-	ws_wait	 = &wsm->ws_wait;
-	free_ws	 = &wsm->free_ws;
+	ASSERT(gwsm);
+	idle_ws	 = &gwsm->idle_ws;
+	ws_lock	 = &gwsm->ws_lock;
+	total_ws = &gwsm->total_ws;
+	ws_wait	 = &gwsm->ws_wait;
+	free_ws	 = &gwsm->free_ws;

 	spin_lock(ws_lock);
 	if (*free_ws <= num_online_cpus()) {
@@ -948,13 +977,13 @@ wake:
 	cond_wake_up(ws_wait);
 }

-static void put_workspace(int type, struct list_head *ws)
+static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
 {
 	switch (type) {
-	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
-	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
-	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
-	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
+	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
+	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(fs_info, type, ws);
+	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
 	default:
 		/*
 		 * This can't happen, the type is validated several times
@@ -970,12 +999,12 @@ static void put_workspace(int type, struct list_head *ws)
 */
static int btrfs_compress_set_level(unsigned int type, int level)
{
-	const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+	const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];

 	if (level == 0)
-		level = ops->default_level;
+		level = levels->default_level;
 	else
-		level = clamp(level, ops->min_level, ops->max_level);
+		level = clamp(level, levels->min_level, levels->max_level);

 	return level;
 }
@@ -985,9 +1014,9 @@ static int btrfs_compress_set_level(unsigned int type, int level)
 */
bool btrfs_compress_level_valid(unsigned int type, int level)
{
-	const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+	const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];

-	return ops->min_level <= level && level <= ops->max_level;
+	return levels->min_level <= level && level <= levels->max_level;
 }

 /* Wrapper around find_get_page(), with extra error message. */
@@ -1022,44 +1051,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
 *  - compression algo are 0-3
 *  - the level are bits 4-7
 *
- * @out_pages is an in/out parameter, holds maximum number of pages to allocate
- * and returns number of actually allocated pages
+ * @out_folios is an in/out parameter, holds maximum number of folios to allocate
+ * and returns number of actually allocated folios
 *
 * @total_in is used to return the number of bytes actually read.  It
 * may be smaller than the input length if we had to exit early because we
- * ran out of room in the pages array or because we cross the
+ * ran out of room in the folios array or because we cross the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
 */
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
 			  u64 start, struct folio **folios, unsigned long *out_folios,
 			  unsigned long *total_in, unsigned long *total_out)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	const unsigned long orig_len = *total_out;
 	struct list_head *workspace;
 	int ret;

 	level = btrfs_compress_set_level(type, level);
-	workspace = get_workspace(type, level);
-	ret = compression_compress_pages(type, workspace, mapping, start, folios,
+	workspace = get_workspace(fs_info, type, level);
+	ret = compression_compress_pages(type, workspace, inode, start, folios,
 					 out_folios, total_in, total_out);
 	/* The total read-in bytes should be no larger than the input. */
 	ASSERT(*total_in <= orig_len);
-	put_workspace(type, workspace);
+	put_workspace(fs_info, type, workspace);
 	return ret;
 }

 static int btrfs_decompress_bio(struct compressed_bio *cb)
 {
+	struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
 	struct list_head *workspace;
 	int ret;
 	int type = cb->compress_type;

-	workspace = get_workspace(type, 0);
+	workspace = get_workspace(fs_info, type, 0);
 	ret = compression_decompress_bio(workspace, cb);
-	put_workspace(type, workspace);
+	put_workspace(fs_info, type, workspace);

 	if (!ret)
 		zero_fill_bio(&cb->orig_bbio->bio);
@@ -1080,20 +1111,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
 	int ret;

 	/*
-	 * The full destination page range should not exceed the page size.
+	 * The full destination folio range should not exceed the folio size.
 	 * And the @destlen should not exceed sectorsize, as this is only called for
 	 * inline file extents, which should not exceed sectorsize.
 	 */
-	ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+	ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);

-	workspace = get_workspace(type, 0);
+	workspace = get_workspace(fs_info, type, 0);
 	ret = compression_decompress(type, workspace, data_in, dest_folio,
 				     dest_pgoff, srclen, destlen);
-	put_workspace(type, workspace);
+	put_workspace(fs_info, type, workspace);
+
+	return ret;
+}
+
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+
+	ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+	if (ret < 0)
+		goto error;
+	ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+	if (ret < 0)
+		goto error;
+	ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+	if (ret < 0)
+		goto error;
+	ret = zstd_alloc_workspace_manager(fs_info);
+	if (ret < 0)
+		goto error;
+	return 0;
+error:
+	btrfs_free_compress_wsm(fs_info);
 	return ret;
 }

+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+	free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+	free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+	free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+	zstd_free_workspace_manager(fs_info);
+}
+
 int __init btrfs_init_compress(void)
 {
 	if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
@@ -1105,11 +1166,6 @@ int __init btrfs_init_compress(void)
 	if (!compr_pool.shrinker)
 		return -ENOMEM;

-	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
-	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
-	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
-	zstd_init_workspace_manager();
-
 	spin_lock_init(&compr_pool.lock);
 	INIT_LIST_HEAD(&compr_pool.list);
 	compr_pool.count = 0;
@@ -1130,10 +1186,6 @@ void __cold btrfs_exit_compress(void)
 	btrfs_compr_pool_scan(NULL, NULL);
 	shrinker_free(compr_pool.shrinker);

-	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
-	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
-	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
-	zstd_cleanup_workspace_manager();
 	bioset_exit(&btrfs_compressed_bioset);
 }

@@ -1256,7 +1308,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 #define ENTROPY_LVL_HIGH		(80)

 /*
- * For increasead precision in shannon_entropy calculation,
+ * For increased precision in shannon_entropy calculation,
 * let's do pow(n, M) to save more digits after comma:
 *
 * - maximum int bit length is 64
@@ -1542,7 +1594,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
 */
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
-	struct list_head *ws_list = get_workspace(0, 0);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct list_head *ws_list = get_workspace(fs_info, 0, 0);
 	struct heuristic_ws *ws;
 	u32 i;
 	u8 byte;
@@ -1611,7 +1664,7 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
 	}

out:
-	put_workspace(0, ws_list);
+	put_workspace(fs_info, 0, ws_list);
 	return ret;
 }
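With the compression.c conversion above, workspaces stop being module-global state: each mount owns one workspace_manager per compression type in fs_info->compr_wsm[], created and torn down through the two entry points this patch adds. Roughly how the mount and unmount paths are expected to call them; the hook names here are hypothetical, only btrfs_alloc_compress_wsm()/btrfs_free_compress_wsm() come from the diff:

static int demo_mount(struct btrfs_fs_info *fs_info)
{
	int ret;

	/* One manager per type, each preallocating a single workspace
	 * so compression can always make forward progress. */
	ret = btrfs_alloc_compress_wsm(fs_info);
	if (ret < 0)
		return ret;

	/* ... rest of the mount sequence ... */
	return 0;
}

static void demo_unmount(struct btrfs_fs_info *fs_info)
{
	/* Drains the idle lists and frees the managers, zstd included. */
	btrfs_free_compress_wsm(fs_info);
}

Scoping the managers to the mount is what lets the allocation size depend on per-filesystem parameters such as block_min_order, which a global manager initialized at module load could not know.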
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 7b41b2b5ff44..eba188a9e3bb 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -75,6 +75,11 @@ struct compressed_bio {
 	struct btrfs_bio bbio;
 };

+static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb)
+{
+	return cb->bbio.fs_info;
+}
+
 /* @range_end must be exclusive. */
 static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
 {
@@ -84,11 +89,14 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
 	return min(range_end, folio_end(folio)) - cur;
 }

+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info);
+
 int __init btrfs_init_compress(void);
 void __cold btrfs_exit_compress(void);

 bool btrfs_compress_level_valid(unsigned int type, int level);
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
 			  u64 start, struct folio **folios, unsigned long *out_folios,
 			  unsigned long *total_in, unsigned long *total_out);
 int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -104,19 +112,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);

 int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);

-struct folio *btrfs_alloc_compr_folio(void);
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info);
 void btrfs_free_compr_folio(struct folio *folio);

-enum btrfs_compression_type {
-	BTRFS_COMPRESS_NONE  = 0,
-	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LZO   = 2,
-	BTRFS_COMPRESS_ZSTD  = 3,
-	BTRFS_NR_COMPRESS_TYPES = 4,
-
-	BTRFS_DEFRAG_DONT_COMPRESS,
-};
-
 struct workspace_manager {
 	struct list_head idle_ws;
 	spinlock_t ws_lock;
@@ -128,11 +126,10 @@ struct workspace_manager {
 	wait_queue_head_t ws_wait;
 };

-struct list_head *btrfs_get_workspace(int type, int level);
-void btrfs_put_workspace(int type, struct list_head *ws);
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level);
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws);

-struct btrfs_compress_op {
-	struct workspace_manager *workspace_manager;
+struct btrfs_compress_levels {
 	/* Maximum level supported by the compression algorithm */
 	int min_level;
 	int max_level;
@@ -142,10 +139,10 @@ struct btrfs_compress_op {
 /* The heuristic workspaces are managed via the 0th workspace manager */
 #define BTRFS_NR_WORKSPACE_MANAGERS	BTRFS_NR_COMPRESS_TYPES

-extern const struct btrfs_compress_op btrfs_heuristic_compress;
-extern const struct btrfs_compress_op btrfs_zlib_compress;
-extern const struct btrfs_compress_op btrfs_lzo_compress;
-extern const struct btrfs_compress_op btrfs_zstd_compress;
+extern const struct btrfs_compress_levels btrfs_heuristic_compress;
+extern const struct btrfs_compress_levels btrfs_zlib_compress;
+extern const struct btrfs_compress_levels btrfs_lzo_compress;
+extern const struct btrfs_compress_levels btrfs_zstd_compress;

 const char* btrfs_compress_type2str(enum btrfs_compression_type type);
 bool btrfs_compress_is_valid_type(const char *str, size_t len);
@@ -155,39 +152,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
 int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
 				     struct folio **in_folio_ret);

-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
 			 u64 start, struct folio **folios, unsigned long *out_folios,
 			 unsigned long *total_in, unsigned long *total_out);
 int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int zlib_decompress(struct list_head *ws, const u8 *data_in,
 		    struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
 		    size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
 void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);

-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
 			u64 start, struct folio **folios, unsigned long *out_folios,
 			unsigned long *total_in, unsigned long *total_out);
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
 		   struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
 		   size_t destlen);
-struct list_head *lzo_alloc_workspace(void);
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
 void lzo_free_workspace(struct list_head *ws);

-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
 			 u64 start, struct folio **folios, unsigned long *out_folios,
 			 unsigned long *total_in, unsigned long *total_out);
 int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int zstd_decompress(struct list_head *ws, const u8 *data_in,
 		    struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
 		    size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(int level);
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info);
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info);
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level);
 void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(int level);
-void zstd_put_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level);
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws);

 #endif
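The btrfs_compress_op to btrfs_compress_levels rename above reflects that, with the workspace_manager pointer gone, the structure now only carries level bounds; the clamping rule in btrfs_compress_set_level() is otherwise unchanged. Restated as a self-contained example (the zlib bounds used here are illustrative, not taken from this diff):

#include <stdio.h>

struct levels {
	int min_level, max_level, default_level;
};

static int set_level(const struct levels *l, int level)
{
	if (level == 0)
		return l->default_level;	/* 0 means "use the default" */
	if (level < l->min_level)
		return l->min_level;
	if (level > l->max_level)
		return l->max_level;
	return level;
}

int main(void)
{
	const struct levels zlib = { 1, 9, 3 };	/* illustrative bounds */

	printf("%d\n", set_level(&zlib, 0));	/* 3: the default */
	printf("%d\n", set_level(&zlib, 42));	/* 9: clamped to max */
	printf("%d\n", set_level(&zlib, 5));	/* 5: already in range */
	return 0;
}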
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 74e6d7f3d266..561658aca018 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
-		      int data_size, int extend);
+		      int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
-			  struct extent_buffer *src, int empty);
+			  struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
@@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,

 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 		ret = btrfs_inc_ref(trans, root, cow, 1);
-		if (ret)
+		if (unlikely(ret))
 			btrfs_abort_transaction(trans, ret);
 	} else {
 		ret = btrfs_inc_ref(trans, root, cow, 0);
-		if (ret)
+		if (unlikely(ret))
 			btrfs_abort_transaction(trans, ret);
 	}
 	if (ret) {
@@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
-	if (ret) {
+	if (unlikely(ret)) {
 		btrfs_abort_transaction(trans, ret);
 		goto error_unlock_cow;
 	}

 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
@@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 			parent_start = buf->start;

 		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
@@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 					    parent_start, last_ref);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
@@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(trans->transid != btrfs_header_generation(parent));
 		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
 						    BTRFS_MOD_LOG_KEY_REPLACE);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
@@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_mark_buffer_dirty(trans, parent);
 		if (last_ref) {
 			ret = btrfs_tree_mod_log_free_eb(buf);
-			if (ret) {
+			if (unlikely(ret)) {
 				btrfs_abort_transaction(trans, ret);
 				goto error_unlock_cow;
 			}
 		}
 		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
 					    parent_start, last_ref);
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
@@ -613,15 +613,12 @@ error_unlock_cow:
 	return ret;
 }

-static inline int should_cow_block(const struct btrfs_trans_handle *trans,
-				   const struct btrfs_root *root,
-				   const struct extent_buffer *buf)
+static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
+				    const struct btrfs_root *root,
+				    const struct extent_buffer *buf)
 {
 	if (btrfs_is_testing(root->fs_info))
-		return 0;
-
-	/* Ensure we can see the FORCE_COW bit */
-	smp_mb__before_atomic();
+		return false;

 	/*
 	 * We do not need to cow a block if
@@ -634,13 +631,25 @@ static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
 	 * after we've finished copying src root, we must COW the shared
 	 * block to ensure the metadata consistency.
 	 */
-	if (btrfs_header_generation(buf) == trans->transid &&
-	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
-	    !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
-	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
-		return 0;
-	return 1;
+
+	if (btrfs_header_generation(buf) != trans->transid)
+		return true;
+
+	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+		return true;
+
+	/* Ensure we can see the FORCE_COW bit. */
+	smp_mb__before_atomic();
+	if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+		return true;
+
+	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+		return false;
+
+	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+		return true;
+
+	return false;
 }

 /*
@@ -844,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
 			     &check);
 	if (IS_ERR(eb))
 		return eb;
-	if (!extent_buffer_uptodate(eb)) {
+	if (unlikely(!extent_buffer_uptodate(eb))) {
 		free_extent_buffer(eb);
 		return ERR_PTR(-EIO);
 	}
@@ -913,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}

 		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_tree_unlock(child);
 			free_extent_buffer(child);
 			btrfs_abort_transaction(trans, ret);
@@ -935,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer_stale(mid);
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
@@ -1010,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 						      right, 0, 1);
 			free_extent_buffer_stale(right);
 			right = NULL;
-			if (ret < 0) {
+			if (unlikely(ret < 0)) {
 				btrfs_abort_transaction(trans, ret);
 				goto out;
 			}
@@ -1019,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			btrfs_node_key(right, &right_key, 0);
 			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
 					BTRFS_MOD_LOG_KEY_REPLACE);
-			if (ret < 0) {
+			if (unlikely(ret < 0)) {
 				btrfs_abort_transaction(trans, ret);
 				goto out;
 			}
@@ -1071,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
 		free_extent_buffer_stale(mid);
 		mid = NULL;
-		if (ret < 0) {
+		if (unlikely(ret < 0)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
@@ -1081,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			btrfs_node_key(mid, &mid_key, 0);
 			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
 					BTRFS_MOD_LOG_KEY_REPLACE);
-			if (ret < 0) {
+			if (unlikely(ret < 0)) {
 				btrfs_abort_transaction(trans, ret);
 				goto out;
 			}
@@ -1186,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			btrfs_node_key(mid, &disk_key, 0);
 			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
 					BTRFS_MOD_LOG_KEY_REPLACE);
-			if (ret < 0) {
+			if (unlikely(ret < 0)) {
 				btrfs_tree_unlock(left);
 				free_extent_buffer(left);
 				btrfs_abort_transaction(trans, ret);
@@ -1246,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			btrfs_node_key(right, &disk_key, 0);
 			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
 					BTRFS_MOD_LOG_KEY_REPLACE);
-			if (ret < 0) {
+			if (unlikely(ret < 0)) {
 				btrfs_tree_unlock(right);
 				free_extent_buffer(right);
 				btrfs_abort_transaction(trans, ret);
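The should_cow_block() rewrite above replaces one compound boolean with ordered early returns; the order matters, because a block owned by the reloc tree may legitimately skip COW even though its RELOC header flag is set. The same decision sequence, reduced to stub predicates purely for illustration:

#include <stdbool.h>

struct blk {
	unsigned long long gen;	/* transaction that created the block */
	bool written;		/* BTRFS_HEADER_FLAG_WRITTEN */
	bool reloc_flag;	/* BTRFS_HEADER_FLAG_RELOC */
};

static bool should_cow(const struct blk *b, unsigned long long transid,
		       bool root_force_cow, bool is_reloc_root)
{
	if (b->gen != transid)	/* created in an older transaction: COW */
		return true;
	if (b->written)		/* already written out this transaction: COW */
		return true;
	if (root_force_cow)	/* pending snapshot forces COW */
		return true;
	if (is_reloc_root)	/* the reloc tree may reuse its own blocks */
		return false;
	if (b->reloc_flag)	/* shared with a reloc root: COW */
		return true;
	return false;
}

Splitting the expression also lets the smp_mb__before_atomic() barrier sit directly next to the FORCE_COW test_bit() it orders, instead of at the top of the function.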
*/ - if (btrfs_verify_level_key(tmp, &check)) { + if (unlikely(btrfs_verify_level_key(tmp, &check))) { ret = -EUCLEAN; goto out; } @@ -1571,7 +1580,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) { + if (unlikely(!extent_buffer_uptodate(tmp))) { ret = -EIO; goto out; } @@ -1752,7 +1761,7 @@ out: * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case. */ - if (!extent_buffer_uptodate(b)) { + if (unlikely(!extent_buffer_uptodate(b))) { if (root_lock) btrfs_tree_unlock_rw(b, root_lock); free_extent_buffer(b); @@ -2260,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = btrfs_get_old_root(root, time_seq); - if (!b) { + if (unlikely(!b)) { ret = -EIO; goto done; } @@ -2686,7 +2695,7 @@ static bool check_sibling_keys(const struct extent_buffer *left, */ static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty) + struct extent_buffer *src, bool empty) { struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; @@ -2722,13 +2731,13 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); /* dst is the left eb, src is the middle eb */ - if (check_sibling_keys(dst, src)) { + if (unlikely(check_sibling_keys(dst, src))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2796,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, push_items = max_push; /* dst is the right eb, src is the middle eb */ - if (check_sibling_keys(src, dst)) { + if (unlikely(check_sibling_keys(src, dst))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; @@ -2813,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2883,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); btrfs_tree_unlock(c); free_extent_buffer(c); @@ -2928,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2941,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3017,7 +3026,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(split); free_extent_buffer(split); btrfs_abort_transaction(trans, ret); @@ -3086,7 +3095,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) int 
ret; ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_crit(fs_info, "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, @@ -3102,7 +3111,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_path *path, - int data_size, int empty, + int data_size, bool empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) @@ -3239,7 +3248,7 @@ out_unlock: static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, - int empty, u32 min_slot) + bool empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -3278,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); @@ -3316,7 +3325,7 @@ out_unlock: */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, + bool empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { @@ -3494,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; @@ -3642,7 +3651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, - int extend) + bool extend) { struct btrfs_disk_key disk_key; struct extent_buffer *l; @@ -4075,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4108,7 +4117,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); - if (slot >= nritems) { + if (unlikely(slot >= nritems)) { btrfs_print_leaf(leaf); btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); @@ -4135,7 +4144,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4183,7 +4192,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, data_end = leaf_data_end(leaf); total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); - if (btrfs_leaf_free_space(leaf) < total_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(leaf)); @@ -4193,7 +4202,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); - if (old_data < data_end) { + if (unlikely(old_data < data_end)) { 
btrfs_print_leaf(leaf); btrfs_crit(fs_info, "item at slot %d with data offset %u beyond data end of leaf %u", @@ -4232,7 +4241,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4374,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4387,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 738179a5e170..7b277934f66f 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) } /* - * Pick the defragable inode that we want, if it doesn't exist, we will get the + * Pick the defraggable inode that we want, if it doesn't exist, we will get the * next one. */ static struct inode_defrag *btrfs_pick_defrag_inode( @@ -924,7 +924,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EIO); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c0c1ddd46b67..41e37f7f67cc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); + btrfs_delayed_node_ref_tracker_dir_init(delayed_node); delayed_node->ins_root = RB_ROOT_CACHED; delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); @@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node( } static struct btrfs_delayed_node *btrfs_get_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); @@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); return node; } @@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); BUG_ON(btrfs_inode->delayed_node != node); xa_unlock(&root->delayed_nodes); return node; @@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( */ if (refcount_inc_not_zero(&node->refs)) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, + GFP_ATOMIC); btrfs_inode->delayed_node = node; } else { node = NULL; @@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * Return the delayed node, or error pointer on failure. 
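*
* Editorial sketch, not part of this patch: each lookup that hands out a
* delayed node reference now also records it in a caller-provided tracker,
* and the matching release drops the same tracker entry:
*
*	struct btrfs_ref_tracker tracker;
*	struct btrfs_delayed_node *node;
*
*	node = btrfs_get_or_create_delayed_node(dir, &tracker);
*	if (IS_ERR(node))
*		return PTR_ERR(node);
*	... queue delayed items, taking node->mutex as needed ...
*	btrfs_release_delayed_node(node, &tracker);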
*/ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; struct btrfs_root *root = btrfs_inode->root; @@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( void *ptr; again: - node = btrfs_get_delayed_node(btrfs_inode); + node = btrfs_get_delayed_node(btrfs_inode, tracker); if (node) return node; @@ -144,12 +152,10 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed. */ - refcount_set(&node->refs, 2); - /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); if (ret == -ENOMEM) { + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); return ERR_PTR(-ENOMEM); } @@ -158,6 +164,7 @@ again: if (ptr) { /* Somebody inserted it, go back and read it. */ xa_unlock(&root->delayed_nodes); + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); node = NULL; goto again; @@ -166,6 +173,12 @@ again: ASSERT(xa_err(ptr) != -EINVAL); ASSERT(xa_err(ptr) != -ENOMEM); ASSERT(ptr == NULL); + + /* Cached in the inode and can be accessed. */ + refcount_set(&node->refs, 2); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC); + btrfs_inode->delayed_node = node; xa_unlock(&root->delayed_nodes); @@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); refcount_inc(&node->refs); /* inserted into list */ + btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker, + GFP_ATOMIC); root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; + btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker); refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) @@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); node = list_first_entry_or_null(&delayed_root->node_list, struct btrfs_delayed_node, n_list); - if (node) + if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + } spin_unlock(&delayed_root->lock); return node; } static struct btrfs_delayed_node *btrfs_next_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; struct list_head *p; @@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( next = list_entry(p, struct btrfs_delayed_node, n_list); refcount_inc(&next->refs); + btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC); out: spin_unlock(&delayed_root->lock); @@ -257,7 +278,7 @@ out: static void __btrfs_release_delayed_node( struct 
btrfs_delayed_node *delayed_node, - int mod) + int mod, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; @@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); + btrfs_delayed_node_ref_tracker_free(delayed_node, tracker); if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; @@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); + btrfs_delayed_node_ref_tracker_dir_exit(delayed_node); kmem_cache_free(delayed_node_cache, delayed_node); } } -static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 0); + __btrfs_release_delayed_node(node, 0, tracker); } static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; @@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( if (node) { list_del_init(&node->p_list); refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } spin_unlock(&delayed_root->lock); @@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( } static inline void btrfs_release_prepared_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 1); + __btrfs_release_delayed_node(node, 1, tracker); } static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, @@ -711,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, u32 *ins_sizes; int i = 0; - ins_data = kmalloc(batch.nr * sizeof(u32) + - batch.nr * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(batch.nr, + sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; @@ -1011,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * transaction, because we could leave the inode with the * improper counts behind. 
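*
* Editorial note, not part of the patch: -ENOENT (the inode item to update
* was not found) is the only error passed back without an abort here; any
* other failure could leave the on-disk inode and the delayed state
* disagreeing, so the transaction is aborted instead of limping on.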
*/ - if (ret != -ENOENT) + if (unlikely(ret != -ENOENT)) btrfs_abort_transaction(trans, ret); goto out; } @@ -1039,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto err_out; } @@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; @@ -1143,17 +1171,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; - curr_node = btrfs_first_delayed_node(delayed_root); + curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); /* * See the comment below about releasing path before releasing * node. If the commit of delayed items was successful the path @@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) * point to locked extent buffers (a leaf at the very least). */ ASSERT(path->nodes[0] == NULL); - btrfs_release_delayed_node(prev_node); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } /* @@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) btrfs_free_path(path); if (curr_node) - btrfs_release_delayed_node(curr_node); + btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node = + btrfs_get_delayed_node(inode, &delayed_node_tracker); BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (!delayed_node->count) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); if (!path) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOMEM; } @@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1230,18 +1261,20 @@ int 
btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_trans_handle *trans; - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); @@ -1275,7 +1308,7 @@ trans_out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode) return; inode->delayed_node = NULL; - btrfs_release_delayed_node(delayed_node); + + btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker); } struct btrfs_async_delayed_work { @@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int total_done = 0; @@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) BTRFS_DELAYED_BACKGROUND / 2) break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + delayed_node = btrfs_first_prepared_delayed_node(delayed_root, + &delayed_node_tracker); if (!delayed_node) break; @@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; continue; } @@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) btrfs_btree_balance_dirty_nodelay(root->fs_info); btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) @@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *node; - if (WARN_ON(node)) + node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); + if (WARN_ON(node)) { + btrfs_delayed_node_ref_tracker_free(node, + &delayed_node_tracker); refcount_dec(&node->refs); + } } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; + struct 
btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; bool reserve_leaf_space; u32 data_len; int ret; - delayed_node = btrfs_get_or_create_delayed_node(dir); + delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); release_node: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *item; int ret; - node = btrfs_get_or_create_delayed_node(dir); + node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(node)) return PTR_ERR(node); @@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_unlock(&node->mutex); end: - btrfs_release_delayed_node(node); + btrfs_release_delayed_node(node, &delayed_node_tracker); return ret; } int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; @@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) * is updated now. So we needn't lock the delayed node. */ if (!delayed_node->index_cnt) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -EINVAL; } inode->index_cnt = delayed_node->index_cnt; - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return false; @@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. 
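*
* Editorial note, not part of the patch: this is also why the reference is
* dropped by hand below rather than through btrfs_release_delayed_node():
* the node is guaranteed to stay referenced here, so only the refcount and
* the tracker entry recorded at lookup time need to be returned.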
*/ + btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker); refcount_dec(&delayed_node->refs); return true; @@ -1844,17 +1893,18 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOENT; } @@ -1892,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1901,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1923,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, atomic_inc(&root->fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1931,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; /* * we don't do delayed inode updates during log recovery because it @@ -1940,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return -EAGAIN; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1967,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) atomic_inc(&fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -2011,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return; __btrfs_kill_delayed_node(delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); } void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; + struct btrfs_ref_tracker 
delayed_node_trackers[8]; while (1) { struct btrfs_delayed_node *node; @@ -2042,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) * about to be removed from the tree in the loop below */ if (refcount_inc_not_zero(&node->refs)) { + btrfs_delayed_node_ref_tracker_alloc(node, + &delayed_node_trackers[count], + GFP_ATOMIC); delayed_nodes[count] = node; count++; } @@ -2053,7 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); - btrfs_release_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i], + &delayed_node_trackers[i]); + btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); } } } @@ -2061,14 +2120,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root); + curr_node = btrfs_first_delayed_node(fs_info->delayed_root, + &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); - btrfs_release_delayed_node(prev_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } } @@ -2078,8 +2140,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2137,6 +2200,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, * delete delayed items. */ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } @@ -2147,8 +2211,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; struct btrfs_delayed_item *next; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2180,5 +2245,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, * delete delayed items. 
*/ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index e6e763ad2d42..0d949edc0caf 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/ref_tracker.h> #include "ctree.h" struct btrfs_disk_key; @@ -44,6 +45,22 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +struct btrfs_ref_tracker_dir { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker_dir dir; +#else + struct {} tracker; +#endif +}; + +struct btrfs_ref_tracker { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker *tracker; +#else + struct {} tracker; +#endif +}; + #define BTRFS_DELAYED_NODE_IN_LIST 0 #define BTRFS_DELAYED_NODE_INODE_DIRTY 1 #define BTRFS_DELAYED_NODE_DEL_IREF 2 @@ -78,6 +95,12 @@ struct btrfs_delayed_node { * actual number of leaves we end up using. Protected by @mutex. */ u32 index_item_leaves; + /* Track all references to this delayed node. */ + struct btrfs_ref_tracker_dir ref_dir; + /* Track delayed node reference stored in node list. */ + struct btrfs_ref_tracker node_list_tracker; + /* Track delayed node reference stored in inode cache. */ + struct btrfs_ref_tracker inode_cache_tracker; }; struct btrfs_delayed_item { @@ -169,4 +192,74 @@ void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); +#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16 +#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16 + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_init(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT, + "delayed_node"); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_exit(&node->ref_dir.dir); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_print(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); +} + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp); +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker); +} +#else +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { } + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + return 0; +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct 
btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca382c5b186f..481802efaa14 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * Initialize the structure which represents a modification to a an extent. + * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * @@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (!testing && pin_bytes) { + if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 552ec4fa645d..5ce940532144 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -276,10 +276,6 @@ struct btrfs_ref { */ bool skip_qgroup; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* Through which root is this modification. */ - u64 real_root; -#endif u64 bytenr; u64 num_bytes; u64 owning_root; @@ -296,6 +292,11 @@ struct btrfs_ref { struct btrfs_data_ref data_ref; struct btrfs_tree_ref tree_ref; }; + +#ifdef CONFIG_BTRFS_DEBUG + /* Through which root is this modification. */ + u64 real_root; +#endif }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4675bcd5f92e..a4eaef60549e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -98,7 +98,7 @@ no_valid_dev_replace_entry_found: * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. 
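*
* Editorial note, not part of the patch: a device can only legitimately be
* marked as a replace target while a replace item describes the operation
* in progress; finding such a device without the item means the metadata
* is inconsistent, so the mount is refused with -EUCLEAN.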
*/ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "found replace target device without a valid replace item"); return -EUCLEAN; @@ -158,7 +158,7 @@ no_valid_dev_replace_entry_found: * We don't have an active replace item but if there is a * replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; @@ -177,8 +177,7 @@ no_valid_dev_replace_entry_found: * allow 'btrfs dev replace_cancel' if src/tgt device is * missing */ - if (!dev_replace->srcdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -186,8 +185,7 @@ no_valid_dev_replace_entry_found: "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", src_devid); } - if (!dev_replace->tgtdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index fe9a4bd7e6e6..802d4dbe5b38 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; + /* + * For bs > ps support, we heavily rely on large folios to make sure no + * block will cross large folio boundaries. + * + * But memory provided by direct IO is only virtually contiguous, not + * physically contiguous, and will break the btrfs' large folio requirement. + * + * So for bs > ps support, all direct IOs should fallback to buffered ones. + */ + if (fs_info->sectorsize > PAGE_SIZE) + return -EINVAL; + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa4393eba997..9247a58894de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. 
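*
* Editorial sketch, not part of the patch: with the int -> bool conversion
* of the atomic parameter the call sites read naturally, as in the
* read_tree_root_path() hunk below:
*
*	if (!btrfs_buffer_uptodate(root->node, generation, false))
*		return -EIO;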
*/ -int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) { if (!extent_buffer_uptodate(eb)) return 0; @@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, ASSERT(check); found_start = btrfs_header_bytenr(eb); - if (found_start != eb->start) { + if (unlikely(found_start != eb->start)) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } - if (check_tree_block_fsid(eb)) { + if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); - if (found_level >= BTRFS_MAX_LEVEL) { + if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); @@ -404,13 +404,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); - if (!ignore_csum) { + if (unlikely(!ignore_csum)) { ret = -EUCLEAN; goto out; } } - if (found_level != check->level) { + if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); @@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root; - bool dummy = btrfs_is_testing(fs_info); root = kzalloc(sizeof(*root), flags); if (!root) @@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; - if (!dummy) { + if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, @@ -1047,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { ret = -EIO; goto fail; } @@ -1056,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!btrfs_is_testing(fs_info) && - btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && - btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_root_id(root) != btrfs_header_owner(root->node)) { + if (unlikely(!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node))) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, @@ -1248,6 +1247,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) if (fs_info->fs_devices) btrfs_close_devices(fs_info->fs_devices); + btrfs_free_compress_wsm(fs_info); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct 
btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); - if (fs_devices->rw_devices == 0) { + if (unlikely(fs_devices->rw_devices == 0)) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_put_root(log_tree_root); return ret; } - if (!extent_buffer_uptodate(log_tree_root->node)) { + if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; @@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); + btrfs_put_root(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - btrfs_put_root(log_tree_root); return ret; } @@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); - if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; @@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); - if (cur + len > sys_array_size) + if (unlikely(cur + len > sys_array_size)) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); - if (key.type != BTRFS_CHUNK_ITEM_KEY) { + if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); @@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); - if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)) goto short_read; type = btrfs_stack_chunk_type(chunk); - if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); @@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } - /* - * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. - * - * For 4K page sized systems with non-debug builds, all 3 matches (4K). - * For 4K page sized systems with debug builds, there are two block sizes - * supported. (4K and 2K) - * - * We can support 16K sectorsize with 64K page size without problem, - * but such sectorsize/pagesize combination doesn't make much sense. - * 4K will be our future standard, PAGE_SIZE is supported from the very - * beginning. 
- */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && - sectorsize != PAGE_SIZE && - sectorsize != BTRFS_MIN_BLOCKSIZE)) { + if (!btrfs_supported_blocksize(sectorsize)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2619,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { + if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } - if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", @@ -2655,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev root->node = NULL; return ret; } - if (!extent_buffer_uptodate(root->node)) { + if (unlikely(!extent_buffer_uptodate(root->node))) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -3256,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) } /* - * Subpage runtime limitation on v1 cache. + * Subpage/bs > ps runtime limitation on v1 cache. * - * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } + if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_err(fs_info, + "RAID56 is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -3396,10 +3388,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT); + fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; + if (fs_info->sectorsize > PAGE_SIZE) + btrfs_warn(fs_info, + "support for block size %u with page size %zu is experimental, some features may be missing", + fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the * super block loaded and validated.
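/*
 * Editorial example, not part of the patch: on a 64-bit machine with 4K
 * pages (PAGE_SHIFT == 12, BITS_PER_LONG == 64), a 16K block size mount
 * gets, from the two new fields above:
 *
 *	block_min_order = ilog2(round_up(16K, 4K) >> 12)  = ilog2(4)   = 2
 *	block_max_order = ilog2((64 << 14) >> 12)         = ilog2(256) = 8
 *
 * i.e. every block needs at least an order-2 folio, while a conventional
 * 4K-on-4K mount keeps block_min_order = 0.
 */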
@@ -3421,6 +3419,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + ret = btrfs_alloc_compress_wsm(fs_info); + if (ret) + goto fail_sb_buffer; ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3468,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_dev->bdev) { + if (unlikely(!fs_devices->latest_dev->bdev)) { btrfs_err(fs_info, "failed to read devices"); ret = -EIO; goto fail_tree_roots; @@ -3962,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * Checks last_flush_error of disks in order to determine the device * state. */ - if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) return -EIO; return 0; @@ -4064,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); - if (ret < 0) { + if (unlikely(ret < 0)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); @@ -4075,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (ret) total_errors++; } - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4100,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); @@ -4880,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 864a55a96226..57920f2c6fe4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 7fc8a3200b40..d062ac521051 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset of -1 found, there would have to exist an * inode with such number or a root with such id. 
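/*
 * Editorial note, not part of the patch: the unlikely() conversions around
 * "ret == 0" in this series all guard the same invariant. A search for a
 * key with offset (u64)-1 may return < 0 (error) or > 0 (insert position),
 * but never an exact match, because no item may legally carry that offset:
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		return ret;
 *	if (unlikely(ret == 0))
 *		return -EUCLEAN;  (corruption: offset -1 exists on disk)
 */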
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 66361325f6dc..bb2ca1c9c7b0 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1237,7 +1237,7 @@ hit_next: state = next_search_state(inserted_state, end); /* * If there's a next state, whether contiguous or not, we don't - * need to unlock and start search agian. If it's not contiguous + * need to unlock and start search again. If it's not contiguous * we will end up here and try to allocate a prealloc state and insert. */ if (state) @@ -1664,7 +1664,7 @@ out: */ u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, + u32 bits, bool contig, struct extent_state **cached_state) { struct extent_state *state = NULL; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 36facca37973..6f07b965e8da 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -163,7 +163,7 @@ void __cold btrfs_extent_state_free_cachep(void); u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, + u64 max_bytes, u32 bits, bool contig, struct extent_state **cached_state); void btrfs_free_extent_state(struct extent_state *state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97d517cdf2df..dc4ca98c3780 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -325,7 +325,7 @@ search_again: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -879,7 +879,7 @@ again: ptr += btrfs_extent_inline_ref_size(type); continue; } - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * We're adding refs to a tree block we already own, this * should not happen at all. */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", @@ -2157,7 +2157,7 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. 
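/*
 * Editorial note, not part of the patch: the many "if (ret)" ->
 * "if (unlikely(ret))" conversions in the hunks below annotate
 * transaction-abort paths. An abort is the exceptional case by definition,
 * so marking the branch unlikely() keeps the hot path straight-line:
 *
 *	ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
 *	if (unlikely(ret)) {
 *		btrfs_abort_transaction(trans, ret);
 *		return ret;
 *	}
 */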
@@ -2457,7 +2457,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + bool full_backref, bool inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; @@ -2543,15 +2543,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, true); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, false); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) @@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - if (cache == NULL) { + if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. */ ret = -EUCLEAN; goto out; @@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - if (!is_data && refs_to_drop != 1) { + if (unlikely(!is_data && refs_to_drop != 1)) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); @@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (!found_extent) { - if (iref) { + if (unlikely(iref)) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); @@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - if (item_size < sizeof(*ei) + sizeof(*bi)) { + if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) { abort_and_dump(trans, path, "invalid extent item 
size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, @@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { + if (unlikely(refs < refs_to_drop)) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); @@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - if (!found_extent) { + if (unlikely(!found_extent)) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); @@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* In this branch refs == 1 */ if (found_extent) { - if (is_data && refs_to_drop != - extent_data_ref_count(path, iref)) { + if (unlikely(is_data && refs_to_drop != + extent_data_ref_count(path, iref))) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), @@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } if (iref) { - if (path->slots[0] != extent_slot) { + if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, @@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ - if (path->slots[0] != extent_slot + 1) { + if (unlikely(path->slots[0] != extent_slot + 1)) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); @@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); @@ -4315,12 +4316,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* - * No lock is OK here because avail is monotinically + * No lock is OK here because avail is monotonically * decreasing, and this is just a hint. 
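*
* Editorial note, not part of the patch: a lockless read of avail can only
* see a stale, too-large value, so the worst case is a hint pointing at a
* group that no longer fits; the real allocation still checks space under
* the proper locks. The added space_info comparison keeps the hint within
* the space_info the caller is actually allocating from.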
*/ u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; @@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - return prepare_allocation_zoned(fs_info, ffe_ctl); + return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); default: BUG(); } @@ -5061,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - if (check_eb_lock_owner(buf)) { + if (unlikely(check_eb_lock_owner(buf))) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -5470,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5582,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - if (btrfs_buffer_uptodate(next, generation, 0)) + if (btrfs_buffer_uptodate(next, generation, false)) return 0; check.level = level - 1; @@ -5611,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current - * reference, skipping it if we dropped it from a previous incompleted drop, or + * reference, skipping it if we dropped it from a previous uncompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5636,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); - if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } @@ -5758,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* * We have to walk down into this node, and if we're currently at the - * DROP_REFERNCE stage and this block is shared then we need to switch + * DROP_REFERENCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. 
*/ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { @@ -5772,7 +5774,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, level--; ASSERT(level == btrfs_header_level(next)); - if (level != btrfs_header_level(next)) { + if (unlikely(level != btrfs_header_level(next))) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; @@ -5883,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5908,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (btrfs_root_id(root) != btrfs_header_owner(eb)) + else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb))) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (btrfs_root_id(root) != - btrfs_header_owner(path->nodes[level + 1])) + else if (unlikely(btrfs_root_id(root) != + btrfs_header_owner(path->nodes[level + 1]))) goto owner_mismatch; } @@ -6049,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * - * If called with for_reloc == 0, may exit early with -EAGAIN + * If called with for_reloc not set, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; @@ -6178,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) while (1) { ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } @@ -6211,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6247,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6255,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 82d3a82dc712..e970ac42a871 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct
extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); @@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b21cb72835cc..c123a3ef154a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,26 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; + /* + * For data read bios, we attempt to optimize csum lookups if the extent + * generation is older than the current one. To make this possible, we + * need to track the maximum generation of an extent in a bio_ctrl to + * make the decision when submitting the bio. + * + * The pattern between do_readpage(), submit_one_bio() and + * submit_extent_folio() is quite subtle, so tracking this is tricky. + * + * As we process extent E, we might submit a bio with existing built up + * extents before adding E to a new bio, or we might just add E to the + * bio. As a result, E's generation could apply to the current bio or + * to the next one, so we need to be careful to update the bio_ctrl's + * generation with E's only when we are sure E is added to bio_ctrl->bbio + * in submit_extent_folio(). + * + * See the comment in btrfs_lookup_bio_sums() for more detail on the + * need for this optimization. + */ + u64 generation; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; @@ -131,6 +151,26 @@ struct btrfs_bio_ctrl { u64 last_em_start; }; +/* + * Helper to set the csum search commit root option for a bio_ctrl's bbio + * before submitting the bio. + * + * Only for use by submit_one_bio(). 
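[Editorial sketch] With the prototypes above now taking bools, a btrfs_drop_snapshot() call site reads as below; the caller shape is a hypothetical illustration, only the prototype and the -EAGAIN behavior come from this patch:

        /* Drop a dead subvolume; not on behalf of relocation. */
        ret = btrfs_drop_snapshot(root, true /* update_ref */, false /* for_reloc */);
        if (ret == -EAGAIN) {
                /*
                 * Only possible when for_reloc is false: the cleaner was
                 * asked to sleep, so the drop resumes on a later pass.
                 */
        }
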
+ */ +static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + ASSERT(bbio); + + if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) + return; + + bio_ctrl->bbio->csum_search_commit_root = + (bio_ctrl->generation && + bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); +} + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; @@ -141,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); + bio_set_csum_search_commit_root(bio_ctrl); + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -149,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; + /* + * We used the generation to decide whether to lookup csums in the + * commit_root or not when we called bio_set_csum_search_commit_root() + * above. Now, reset the generation for the next bio. + */ + bio_ctrl->generation = 0; } /* @@ -345,6 +393,13 @@ again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; + + /* + * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can + * return early without handling any dirty ranges. + */ + ASSERT(max_bytes >= fs_info->sectorsize); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); if (!found || delalloc_end <= *start || delalloc_start > orig_end) { @@ -370,18 +425,19 @@ again: if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the folioss after the folios that has start */ + /* step two, lock all the folios after the folios that has start */ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the folios are gone, lets avoid looping by - * shortening the size of the delalloc range we're searching + /* + * Some of the folios are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching. */ btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_SIZE; + max_bytes = fs_info->sectorsize; loops = 1; goto again; } else { @@ -570,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Populate every free slot in a provided array with folios using GFP_NOFS. 
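[Editorial sketch] Two related fixes land in find_lock_delalloc_range() above: the new assertion documents that the search budget must cover at least one block, and the -EAGAIN retry now shrinks to fs_info->sectorsize rather than PAGE_SIZE, which stops being the same thing once block size and page size diverge. The contract, condensed (names from the hunk):

        /* btrfs_find_delalloc_range() walks whole blocks, so a budget below
         * one block can return without inspecting any dirty range. */
        ASSERT(max_bytes >= fs_info->sectorsize);
        found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
                                          max_bytes, &cached_state);
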
* * @nr_folios: number of folios to allocate + * @order: the order of the folios to be allocated * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * @@ -577,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed */ -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array) { for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue; - folio_array[i] = folio_alloc(GFP_NOFS, 0); + folio_array[i] = folio_alloc(GFP_NOFS, order); if (!folio_array[i]) goto error; } @@ -591,6 +649,7 @@ error: for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) folio_put(folio_array[i]); + folio_array[i] = NULL; } return -ENOMEM; } @@ -719,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode, * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one + * @read_em_generation: generation of the extent_map we are submitting + * (only used for read) * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. - * The mirror number for this IO should already be initizlied in + * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. */ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct folio *folio, - size_t size, unsigned long pg_offset) + size_t size, unsigned long pg_offset, + u64 read_em_generation) { struct btrfs_inode *inode = folio_to_inode(folio); loff_t file_offset = folio_pos(folio) + pg_offset; @@ -758,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + /* + * Now that the folio is definitely added to the bio, include its + * generation in the max generation calculation. 
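[Editorial sketch] A minimal usage sketch for the extended allocator; the array length and order are arbitrary here, while the signature and error-path contract are from this patch:

        struct folio *folios[4] = { NULL };
        int ret;

        /* Four order-2 folios, i.e. 16K each with 4K pages; GFP_NOFS inside. */
        ret = btrfs_alloc_folio_array(ARRAY_SIZE(folios), 2, folios);
        if (ret)
                return ret;     /* all slots were freed and reset to NULL */

        /* ... use the folios ... */
        for (int i = 0; i < ARRAY_SIZE(folios); i++)
                folio_put(folios[i]);

The added "folio_array[i] = NULL" in the error loop is what makes retrying safe: the function skips pre-existing non-NULL slots, so dangling pointers left behind by a failed call would otherwise survive into the next attempt.
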
+ */ + bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) @@ -960,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; + u64 em_gen; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { @@ -1043,6 +1111,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bio_ctrl->last_em_start = em->start; + em_gen = em->generation; btrfs_free_extent_map(em); em = NULL; @@ -1066,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, - pg_offset); + pg_offset, em_gen); } return 0; } @@ -1600,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(folio_test_writeback(folio)); submit_extent_folio(bio_ctrl, disk_bytenr, folio, - sectorsize, filepos - folio_pos(folio)); + sectorsize, filepos - folio_pos(folio), 0); return 0; } @@ -1621,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; - bool error = false; + int found_error = 0; const u64 folio_start = folio_pos(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; @@ -1685,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, */ btrfs_mark_ordered_io_finished(inode, folio, cur, fs_info->sectorsize, false); - error = true; + if (!found_error) + found_error = ret; continue; } submitted_io = true; @@ -1702,11 +1772,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case. */ - if (!submitted_io && !error) { + if (!submitted_io && !found_error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } - return ret; + return found_error; } /* @@ -2167,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * @fs_info: The fs_info for this file system. * @start: The offset of the range to start waiting on writeback. * @end: The end of the range, inclusive. This is meant to be used in - * conjuction with wait_marked_extents, so this will usually be + * conjunction with wait_marked_extents, so this will usually be * the_next_eb->start - 1. */ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, @@ -2437,7 +2507,7 @@ retry: * In above case, [32K, 96K) is asynchronously submitted * for compression, and [124K, 128K) needs to be written back. * - * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * If we didn't wait writeback for page 64K, [128K, 128K) * won't be submitted as the page still has writeback flag * and will be skipped in the next check. * @@ -2921,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb) { const int num_folios = num_extent_folios(eb); - /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + /* We cannot use num_extent_folios() as loop bound as eb->folios changes. 
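[Editorial sketch] Condensing the generation plumbing above: btrfs_do_readpage() saves em->generation before freeing the map, submit_extent_folio() folds it into bio_ctrl->generation only once the block is definitely in the bio, and the submit-time decision reduces to the predicate below (names from the patch; the wrapper function is illustrative):

        static bool bio_wants_commit_root_csums(const struct btrfs_bio_ctrl *bio_ctrl,
                                                const struct btrfs_fs_info *fs_info)
        {
                /* Every extent in this bio predates the running transaction. */
                return bio_ctrl->generation &&
                       bio_ctrl->generation < btrfs_get_fs_generation(fs_info);
        }
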
*/ for (int i = 0; i < num_folios; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); @@ -3168,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, */ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { - if (!IS_ALIGNED(start, fs_info->sectorsize)) { + const u32 nodesize = fs_info->nodesize; + + if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { btrfs_err(fs_info, "bad tree block start %llu", start); return true; } - if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { + if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (fs_info->nodesize >= PAGE_SIZE && - !PAGE_ALIGNED(start)) { + if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (!IS_ALIGNED(start, fs_info->nodesize) && - !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + if (unlikely(!IS_ALIGNED(start, nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", - start, fs_info->nodesize); + start, nodesize); } return false; } @@ -3809,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, return ret; wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) return -EIO; return 0; } @@ -4485,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) return; - if (btrfs_buffer_uptodate(eb, gen, 1)) { + if (btrfs_buffer_uptodate(eb, gen, true)) { free_extent_buffer(eb); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 61130786b9a3..5fcbfe44218c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -366,7 +366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail); -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9a5a497edc97..7e38c23a0c1c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, - int modified) + bool modified) { refcount_inc(&em->refs); @@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode, * taken, or a reference dropped if the merge attempt was successful. 
*/ static int add_extent_mapping(struct btrfs_inode *inode, - struct extent_map *em, int modified) + struct extent_map *em, bool modified) { struct extent_map_tree *tree = &inode->extent_tree; struct btrfs_root *root = inode->root; @@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, } static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) + u64 start, u64 len, bool strict) { struct extent_map *em; struct rb_node *rb_node; @@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, true); } /* @@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, false); } /* @@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, - int modified) + bool modified) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, em->len = end - start; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; - return add_extent_mapping(inode, em, 0); + return add_extent_mapping(inode, em, false); } /* @@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(inode, em, 0); + ret = add_extent_mapping(inode, em, false); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. 
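[Editorial sketch] The two public lookup flavors differ only in the strict flag they now pass as bool; roughly (the behavior summary paraphrases the helpers' own comments):

        /* Strict: return a map only if it intersects [start, start + len). */
        struct extent_map *em = btrfs_lookup_extent_mapping(tree, start, len);

        /* Non-strict: if nothing intersects, a nearby map may be returned too. */
        struct extent_map *near = btrfs_search_extent_mapping(tree, start, len);
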
It is also possible that * an overlapping map exists in the tree @@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, len); - if (!em) { + if (unlikely(!em)) { ret = -EIO; goto out_unlock; } @@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(inode, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, true); /* * Now we only have an extent_map at: @@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(inode, split_mid, 1); + add_extent_mapping(inode, split_mid, true); /* Once for us */ btrfs_free_extent_map(em); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 7935586a9dbd..f2eaaef8422b 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, if (cache_end > offset) { if (offset == cache->offset) { /* - * We cached a dealloc range (found in the io tree) for + * We cached a delalloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c09fbc257634..a42e6d54e7cd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } + /* + * If we are searching for a csum of an extent from a past + * transaction, we can search in the commit root and reduce + * lock contention on the csum tree extent buffers. + * + * This is important because that lock is an rwsem which gets + * pretty heavy write load under memory pressure and sustained + * csum overwrites, unlike the commit_root_sem. (Memory pressure + * makes us writeback the nodes multiple times per transaction, + * which makes us cow them each time, taking the write lock.) + * + * Due to how rwsem is implemented, there is a possible + * priority inversion where the readers holding the lock don't + * get scheduled (say they're in a cgroup stuck in heavy reclaim) + * which then blocks writers, including transaction commit. By + * using a semaphore with fewer writers (only a commit switching + * the roots), we make this issue less likely. + * + * Note that we don't rely on btrfs_search_slot to lock the + * commit root csum. We call search_slot multiple times, which would + * create a potential race where a commit comes in between searches + * while we are not holding the commit_root_sem, and we get csums + * from across transactions. 
+ */ + if (bbio->csum_search_commit_root) { + path->search_commit_root = 1; + path->skip_locking = 1; + down_read(&fs_info->commit_root_sem); + } + while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; @@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } + if (bbio->csum_search_commit_root) + up_read(&fs_info->commit_root_sem); return ret; } @@ -743,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - char *data; - struct bvec_iter iter; - struct bio_vec bvec; + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; int index; - unsigned int blockcount; - int i; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -767,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) shash->tfm = fs_info->csum_shash; - bio_for_each_segment(bvec, bio, iter) { - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, - bvec.bv_len + fs_info->sectorsize - - 1); - - for (i = 0; i < blockcount; i++) { - data = bvec_kmap_local(&bvec); - crypto_shash_digest(shash, - data + (i * fs_info->sectorsize), - fs_info->sectorsize, - sums->sums + index); - kunmap_local(data); - index += fs_info->csum_size; - } - + btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { + btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); + index += fs_info->csum_size; } bbio->sums = sums; @@ -993,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - if (ret && ret != -EAGAIN) { + if (unlikely(ret && ret != -EAGAIN)) { btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 204674934795..7efd1f8a1912 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -327,7 +327,7 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -426,7 +426,7 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -443,7 +443,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -587,21 +587,20 @@ again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || - key.type != BTRFS_EXTENT_DATA_KEY) { + if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { + if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (key.offset > start || extent_end < end) { + if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -676,7 +675,7 @@ again: btrfs_release_path(path); goto again; } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -704,7 +703,7 @@ again: 
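[Editorial sketch] btrfs_csum_one_bio() now walks physical blocks with btrfs_bio_for_each_block() instead of splitting bvecs by hand. The patch does not show btrfs_calculate_block_csum(); a plausible reconstruction from the deleted loop (map the block, one shash digest per block) is sketched below. This is an assumption, not the helper's actual body, and it ignores blocks larger than a page:

        /* Hypothetical reconstruction of btrfs_calculate_block_csum(). */
        static void block_csum_sketch(struct btrfs_fs_info *fs_info,
                                      phys_addr_t paddr, u8 *dest)
        {
                SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
                void *kaddr = kmap_local_page(phys_to_page(paddr));

                shash->tfm = fs_info->csum_shash;
                /* One digest per filesystem block, as the removed loop did. */
                crypto_shash_digest(shash, kaddr + offset_in_page(paddr),
                                    fs_info->sectorsize, dest);
                kunmap_local(kaddr);
        }
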
ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -712,7 +711,7 @@ again: if (split == start) { key.offset = start; } else { - if (start != key.offset) { + if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -744,7 +743,7 @@ again: del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -762,7 +761,7 @@ again: del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -783,7 +782,7 @@ again: extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -815,7 +814,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 if (ret) return ret; folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); return -EIO; } @@ -970,7 +969,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. - * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * -EAGAIN If we can't do a nocow write because snapshotting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. @@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * got EOPNOTSUPP via prealloc then we messed up and * need to abort. */ - if (ret && - (ret != -EOPNOTSUPP || - (extent_info && extent_info->is_new_extent))) + if (unlikely(ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent)))) btrfs_abort_transaction(trans, ret); break; } @@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the @@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so @@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, /* See the comment in the loop above for the reasoning here. 
*/ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -3345,7 +3344,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from - * the tree once they complete. With the extent maps, we mau have them + * the tree once they complete. With the extent maps, we may have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d8d1570a5c9..ab873bd67192 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty - * of cache left then go ahead an dadd them, no sense in adding + * of cache left then go ahead and add them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { @@ -3829,7 +3829,7 @@ out_unlock: /* * If we break out of trimming a bitmap prematurely, we should reset the - * trimming bit. In a rather contrieved case, it's possible to race here so + * trimming bit. In a rather contrived case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 
* * start = start of bitmap @@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index eba7f22ae49c..dad0b492a663 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { DEBUG_WARN(); return -EIO; } - if (p->slots[0] == 0) { + if (unlikely(p->slots[0] == 0)) { DEBUG_WARN("no previous slot found"); return -EIO; } @@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -320,7 +320,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -465,7 +465,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } 
- if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto out_clear; } ret = btrfs_global_root_insert(free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_put_root(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; @@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); btrfs_put_root(free_space_root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) goto next; ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, if (!path) { path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } @@ -1430,7 +1430,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, } ret = add_new_free_space_info(trans, block_group, path); - 
if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index b2bb86f8d7cf..feb0a2faa837 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -55,6 +55,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void) } /* + * We support the following block sizes for all systems: + * + * - 4K + * This is the most common block size. For PAGE SIZE > 4K cases the subpage + * mode is used. + * + * - PAGE_SIZE + * The straightforward block size to support. + * + * And extra support for the following block sizes based on the kernel config: + * + * - MIN_BLOCKSIZE + * This is either 4K (regular builds) or 2K (debug builds) + * This allows testing subpage routines on x86_64. + */ +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) +{ + /* @blocksize should be validated first. */ + ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE && + blocksize <= BTRFS_MAX_BLOCKSIZE); + + if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) + return true; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * For bs > ps support it's done by specifying a minimal folio order + * for filemap, thus implying large data folios. + * For HIGHMEM systems, we can not always access the content of a (large) + * folio in one go, but go through them page by page. + * + * A lot of features don't implement a proper PAGE sized loop for large + * folios, this includes: + * + * - compression + * - verity + * - encoded write + * + * Considering HIGHMEM is such a pain to deal with and it's going + * to be deprecated eventually, just reject HIGHMEM && bs > ps cases. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) + return false; + return true; +#endif + return false; +} + +/* * Start exclusive operation @type, return true on success. 
*/ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8cc07cc70b12..814bbc9417d2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -59,6 +59,8 @@ struct btrfs_space_info; #define BTRFS_MIN_BLOCKSIZE (SZ_4K) #endif +#define BTRFS_MAX_BLOCKSIZE (SZ_64K) + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -102,6 +104,8 @@ enum { BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, + /* Track if log replay has failed. */ + BTRFS_FS_STATE_LOG_REPLAY_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace @@ -243,6 +247,7 @@ enum { BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), + BTRFS_MOUNT_REF_TRACKER = (1ULL << 33), }; /* @@ -280,7 +285,7 @@ enum { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* - * Features under developmen like Extent tree v2 support is enabled + * Features under development like Extent tree v2 support is enabled * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -303,6 +308,16 @@ enum { #define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, +}; + struct btrfs_dev_replace { /* See #define above */ u64 replace_state; @@ -505,6 +520,9 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; + /* Compress related structures. */ + void *compr_wsm[BTRFS_NR_COMPRESS_TYPES]; + int compress_type; int compress_level; u32 commit_interval; @@ -809,6 +827,8 @@ struct btrfs_fs_info { u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; + u32 block_min_order; + u32 block_max_order; u32 csum_size; u32 csums_per_leaf; u32 stripesize; @@ -878,12 +898,10 @@ struct btrfs_fs_info { struct lockdep_map btrfs_trans_pending_ordered_map; struct lockdep_map btrfs_ordered_extent_map; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG spinlock_t ref_verify_lock; struct rb_root block_tree; -#endif -#ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct list_head allocated_roots; @@ -905,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } +/* Return the minimal folio size of the fs. 
*/ +static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) +{ + return 1U << (PAGE_SHIFT + fs_info->block_min_order); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -997,6 +1021,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs return folio_size(folio) >> fs_info->sectorsize_bits; } +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, @@ -1107,9 +1132,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)); } void btrfs_test_destroy_inode(struct inode *inode); @@ -1118,9 +1143,9 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return 0; + return false; } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f06cf701ae5a..1bd73b80f9fa 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); - if (!extref) { + if (unlikely(!extref)) { btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; } @@ -627,7 +627,7 @@ delete: if (control->clear_extent_range) { ret = btrfs_inode_clear_file_extent_range(control->inode, clear_start, clear_len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -666,7 +666,7 @@ delete: btrfs_init_data_ref(&ref, control->ino, extent_offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -684,7 +684,7 @@ delete: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -720,7 +720,7 @@ out: int ret2; ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret2) { + if (unlikely(ret2)) { btrfs_abort_transaction(trans, ret2); ret = ret2; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27942d0ad9de..ced87c9e4682 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -72,6 +72,9 @@ #include "raid-stripe-tree.h" #include "fiemap.h" +#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) +#define COW_FILE_RANGE_NO_INLINE (1UL << 1) + struct btrfs_iget_args { u64 ino; struct btrfs_root *root; @@ -367,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * Unock inode i_rwsem. + * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
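[Editorial sketch] The new block_min_order/block_max_order fields tie block size to folio order: when block size exceeds page size, data folios must be at least one block. A worked example of the helper above, assuming 4K pages (values illustrative):

        /* btrfs_min_folio_size() == 1U << (PAGE_SHIFT + block_min_order)  */
        /* 4K blocks on 4K pages:  block_min_order = 0 -> min folio =  4K  */
        /* 16K blocks on 4K pages: block_min_order = 2 -> min folio = 16K  */
        unsigned int min_folio_size = btrfs_min_folio_size(fs_info);
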
@@ -631,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -639,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -649,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -851,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -861,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned int poff; + unsigned int loff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -899,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work) actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; - nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); + nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); /* * we don't want to send crud past the end of i_size through @@ -956,18 +961,18 @@ again: /* Compression level is applied here. */ ret = btrfs_compress_folios(compress_type, compress_level, - mapping, start, folios, &nr_folios, &total_in, + inode, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* - * Zero the tail end of the last page, as we might be sending it down + * Zero the tail end of the last folio, as we might be sending it down * to disk. */ - poff = offset_in_page(total_compressed); - if (poff) - folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); + loff = (total_compressed & (min_folio_size - 1)); + if (loff) + folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); /* * Try to create an inline extent. @@ -1245,18 +1250,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_folio. + * When this function fails, it unlocks all folios except @locked_folio. 
* * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_folio and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_folio is - * the only page handled anyway). + * unlocks all folios including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single block, so locked_folio is + * the only folio handled anyway). * - * When this function succeed and creates a normal extent, the page locking + * When this function succeed and creates a normal extent, the folio locking * status depends on the passed in flags: * - * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_folio are unlocked. + * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. + * - Else all folios except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are cleaned up. @@ -1264,7 +1269,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, u64 *done_offset, - bool keep_locked, bool no_inline) + unsigned long flags) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1292,7 +1297,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (!no_inline) { + if (!(flags & COW_FILE_RANGE_NO_INLINE)) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); @@ -1320,7 +1325,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage. */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; /* @@ -1531,10 +1536,11 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, + start, cur_alloc_size, ret); return ret; } @@ -1687,7 +1693,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_folio, start, end, - &done_offset, true, false); + &done_offset, COW_FILE_RANGE_KEEP_LOCKED); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_folio, @@ -1768,9 +1774,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. + * + * And here we do not unlock the folio after a successful run. + * The folios will be unlocked after everything is finished, or by error handling. + * + * This is to ensure error handling won't need to clear dirty/ordered flags without + * a locked folio, which can race with writeback. 
*/ - ret = cow_file_range(inode, locked_folio, start, end, NULL, false, - true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } @@ -1913,61 +1925,14 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } -/* - * Cleanup the dirty folios which will never be submitted due to error. - * - * When running a delalloc range, we may need to split the ranges (due to - * fragmentation or NOCOW). If we hit an error in the later part, we will error - * out and previously successfully executed range will never be submitted, thus - * we have to cleanup those folios by clearing their dirty flag, starting and - * finishing the writeback. - */ -static void cleanup_dirty_folios(struct btrfs_inode *inode, - struct folio *locked_folio, - u64 start, u64 end, int error) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct address_space *mapping = inode->vfs_inode.i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; - pgoff_t end_index = end >> PAGE_SHIFT; - u32 len; - - ASSERT(end + 1 - start < U32_MAX); - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(end + 1, fs_info->sectorsize)); - len = end + 1 - start; - - /* - * Handle the locked folio first. - * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. - */ - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - - for (pgoff_t index = start_index; index <= end_index; index++) { - struct folio *folio; - - /* Already handled at the beginning. */ - if (index == locked_folio->index) - continue; - folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); - /* Cache already dropped, no need to do any cleanup. 
*/ - if (IS_ERR(folio)) - continue; - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - folio_unlock(folio); - folio_put(folio); - } - mapping_set_error(mapping, error); -} - static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args, u64 file_pos, bool is_prealloc) { struct btrfs_ordered_extent *ordered; - u64 len = nocow_args->file_extent.num_bytes; - u64 end = file_pos + len - 1; + const u64 len = nocow_args->file_extent.num_bytes; + const u64 end = file_pos + len - 1; int ret = 0; btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); @@ -1978,8 +1943,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(em); + ret = PTR_ERR(em); + goto error; } btrfs_free_extent_map(em); } @@ -1991,8 +1956,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(ordered); + ret = PTR_ERR(ordered); + goto error; } if (btrfs_is_data_reloc_root(inode->root)) @@ -2004,23 +1969,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); + if (ret < 0) + goto error; extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - /* - * On error, we need to cleanup the ordered extents we created. - * - * We do not clear the folio Dirty flags because they are set and - * cleaered by the caller. - */ - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, file_pos, len); + PAGE_SET_ORDERED); + return ret; + +error: + btrfs_cleanup_ordered_extents(inode, file_pos, len); + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_err(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + file_pos, len, ret); return ret; } /* - * when nowcow writeback call back. This checks for snapshots or COW copies + * Called during nocow writeback. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk @@ -2037,13 +2009,23 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling. + * + * The same applies to nocow_end, which exists to avoid cleaning up twice + * a range already cleaned by nocow_one_range(). */ u64 cow_end = 0; + u64 nocow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; + /* The range that has ordered extent(s). */ + u64 oe_cleanup_start; + u64 oe_cleanup_len = 0; + /* The range that is untouched. 
*/ + u64 untouched_start; + u64 untouched_len = 0; /* * Normally on a zoned device we're only doing COW writes, but in case @@ -2207,8 +2189,10 @@ must_cow: &nocow_args, cur_offset, extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (ret < 0) + if (ret < 0) { + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; goto error; + } cur_offset = extent_end; } btrfs_release_path(path); @@ -2225,86 +2209,105 @@ must_cow: cow_start = (u64)-1; } - btrfs_free_path(path); - return 0; - -error: /* - * There are several error cases: - * - * 1) Failed without falling back to COW - * start cur_offset end - * |/////////////| | - * - * In this case, cow_start should be (u64)-1. - * - * For range [start, cur_offset) the folios are already unlocked (except - * @locked_folio), EXTENT_DELALLOC already removed. - * Need to clear the dirty flags and finish the ordered extents. - * - * 2) Failed with error before calling fallback_to_cow() - * - * start cow_start end - * |/////////////| | - * - * In this case, only @cow_start is set, @cur_offset is between - * [cow_start, end) + * Everything is finished without an error, so we can unlock the folios now. * - * It's mostly the same as case 1), just replace @cur_offset with - * @cow_start. - * - * 3) Failed with error from fallback_to_cow() - * - * start cow_start cow_end end - * |/////////////|-----------| | - * - * In this case, both @cow_start and @cow_end is set. - * - * For range [start, cow_start) it's the same as case 1). - * But for range [cow_start, cow_end), all the cleanup is handled by - * cow_file_range(), we should not touch anything in that range. - * - * So for all above cases, if @cow_start is set, cleanup ordered extents - * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). + * No need to touch the io tree range nor set folio ordered flag, as + * fallback_to_cow() and nocow_one_range() have already handled them. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; + extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); - if (cur_offset > start) { - btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); - cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); - } + btrfs_free_path(path); + return 0; - /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to @cow_end + 1 to skip the COW range, as - * cow_file_range() will do the proper cleanup at error. - */ - if (cow_end) - cur_offset = cow_end + 1; +error: + if (cow_start == (u64)-1) { + /* + * case a) + * start cur_offset end + * | OE cleanup | Untouched | + * + * We finished a fallback_to_cow() or nocow_one_range() call, + * but failed to check the next range. + * + * or + * start cur_offset nocow_end end + * | OE cleanup | Skip | Untouched | + * + * nocow_one_range() failed, and the range [cur_offset, nocow_end] is + * already cleaned up. + */ + oe_cleanup_start = start; + oe_cleanup_len = cur_offset - start; + if (nocow_end) + untouched_start = nocow_end + 1; + else + untouched_start = cur_offset; + untouched_len = end + 1 - untouched_start; + } else if (cow_start != (u64)-1 && cow_end == 0) { + /* + * case b) + * start cow_start cur_offset end + * | OE cleanup | Untouched | + * + * We got a range that needs COW, but failed before we hit the next + * NOCOW range, so [cow_start, cur_offset) does not yet have any OE. 
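 *
 * A worked example of case b), with made-up numbers not taken from the
 * patch: start = 0, cow_start = 64K, cur_offset = 128K, end = 256K - 1.
 * Nothing from cow_start onwards has an ordered extent yet, so only
 * [0, 64K) needs ordered extent cleanup while the rest is merely
 * unlocked and its reservations released:
 *
 *	oe_cleanup_start = 0;	oe_cleanup_len = 64K;
 *	untouched_start = 64K;	untouched_len = 192K;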
+ */ + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_start; + untouched_len = end + 1 - untouched_start; + } else { + /* + * case c) + * start cow_start cow_end end + * | OE cleanup | Skip | Untouched | + * + * fallback_to_cow() failed, and since fallback_to_cow() does the + * cleanup for its own range, we should not touch the range + * [cow_start, cow_end]. + */ + ASSERT(cow_start != (u64)-1 && cow_end != 0); + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_end + 1; + untouched_len = end + 1 - untouched_start; + } + + if (oe_cleanup_len) { + const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; + btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); + extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, + locked_folio, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + } - /* - * We need to lock the extent here because we're clearing DELALLOC and - * we're not locked at this point. - */ - if (cur_offset < end) { + if (untouched_len) { struct extent_state *cached = NULL; + const u64 untouched_end = untouched_start + untouched_len - 1; - btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); - extent_clear_unlock_delalloc(inode, cur_offset, end, + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); + extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), start, end + 1 - start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup_start=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, + untouched_start, untouched_len, ret); return ret; } @@ -2349,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, - false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); return ret; } @@ -2986,7 +2988,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. - * The remaining of the range will be processed when clearning the + * The remainder of the range will be processed when clearing the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
*/ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { @@ -3102,14 +3104,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); - if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { ret = -EIO; goto out; } - if (btrfs_is_zoned(fs_info)) - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + if (ret) + goto out; if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3147,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3155,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ ASSERT(list_empty(&ordered_extent->list)); - if (!list_empty(&ordered_extent->list)) { + if (unlikely(!list_empty(&ordered_extent->list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3163,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); } @@ -3190,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { /* -ENOMEM or corruption */ + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } @@ -3327,21 +3330,47 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest) +{ + struct folio *folio = page_folio(phys_to_page(paddr)); + const u32 blocksize = fs_info->sectorsize; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + /* The full block must be inside the folio. 
*/ + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + + if (folio_test_partial_kmap(folio)) { + size_t cur = paddr; + + crypto_shash_init(shash); + while (cur < paddr + blocksize) { + void *kaddr; + size_t len = min(paddr + blocksize - cur, + PAGE_SIZE - offset_in_page(cur)); + + kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); + crypto_shash_update(shash, kaddr, len); + kunmap_local(kaddr); + cur += len; + } + crypto_shash_final(shash, dest); + } else { + crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); + } +} /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. * * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected) +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected) { - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - - shash->tfm = fs_info->csum_shash; - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - - if (memcmp(csum, csum_expected, fs_info->csum_size)) + btrfs_calculate_block_csum(fs_info, paddr, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } @@ -3360,17 +3389,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv) + u32 bio_offset, phys_addr_t paddr) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 blocksize = fs_info->sectorsize; + struct folio *folio; u64 file_offset = bbio->file_offset + bio_offset; - u64 end = file_offset + bv->bv_len - 1; + u64 end = file_offset + blocksize - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - void *kaddr; - - ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; @@ -3386,12 +3414,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - kaddr = bvec_kmap_local(bv); - if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { - kunmap_local(kaddr); + if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) goto zeroit; - } - kunmap_local(kaddr); return true; zeroit: @@ -3399,7 +3423,9 @@ zeroit: bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_bvec(bv); + folio = page_folio(phys_to_page(paddr)); + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); return false; } @@ -3513,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4262,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); - if (ret) { + if (unlikely(ret)) { btrfs_crit(fs_info, "failed to delete reference to %.*s, root %llu inode %llu parent %llu", name->len, name->name, btrfs_root_id(root), ino, dir_ino); @@ -4274,7 +4300,7 @@ skip_backref: 
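/*
 * A hypothetical caller of the physical-address based checksum helper added
 * above (this wrapper is illustrative, not part of the patch): verify a
 * single file system block given only its physical address.
 * btrfs_calculate_block_csum() internally performs the per-page kmap walk
 * for folios that need it, so highmem and large folios both work.
 */
static bool example_block_csum_ok(struct btrfs_fs_info *fs_info,
				  phys_addr_t paddr, const u8 *csum_expected)
{
	u8 csum[BTRFS_CSUM_SIZE];

	/* Returns 0 on match, -EIO on checksum mismatch. */
	return btrfs_check_block_csum(fs_info, paddr, csum, csum_expected) == 0;
}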
rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4429,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4460,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_del_root_ref(trans, objectid, btrfs_root_id(root), dir_ino, &index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4525,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. @@ -4639,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4659,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, btrfs_root_id(dest)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4667,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4676,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4816,7 +4842,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -4904,7 +4930,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e goto out; /* - * Skip the truncatioin if the range in the target block is already aligned. + * Skip the truncation if the range in the target block is already aligned. * The seemingly complex check will also handle the same block case. 
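 *
 * A worked example with made-up numbers and a 4K block size: a range
 * starting at 6K partially covers its head block [4K, 8K) and therefore
 * needs zeroing, while a range starting at 8K is block aligned and the
 * head block can be skipped entirely.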
*/ if (in_head_block && !IS_ALIGNED(start, blocksize)) @@ -4960,7 +4986,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -5080,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -5600,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); - if (location->type != BTRFS_INODE_ITEM_KEY && - location->type != BTRFS_ROOT_ITEM_KEY) { + if (unlikely(location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", @@ -5892,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ - if (btrfs_inode_type(inode) != di_type) { + if (unlikely(btrfs_inode_type(inode) != di_type)) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->vfs_inode.i_mode, btrfs_inode_type(inode), @@ -6479,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; @@ -6486,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); - btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6533,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 
1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6610,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6630,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6668,7 +6694,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index) + const struct fscrypt_str *name, bool add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6701,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; - else if (ret) { + else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -6857,7 +6883,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* Link added now we update the inode item with the new link count. */ inc_nlink(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -6868,7 +6894,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * open(2) O_TMPFILE flag. */ ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -7076,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ - if (!S_ISREG(inode->vfs_inode.i_mode)) { + if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", @@ -7153,7 +7179,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || btrfs_extent_map_end(em) <= start) { + if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) { btrfs_err(fs_info, "bad extent! 
em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -8185,7 +8211,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { - if (need_abort) + if (unlikely(need_abort)) btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8233,7 +8259,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8241,12 +8267,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8255,7 +8281,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8263,12 +8289,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8276,14 +8302,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8524,7 +8550,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8532,12 +8558,12 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8548,7 +8574,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8557,7 +8583,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8565,7 +8591,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode->i_nlink 
== 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8574,7 +8600,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8588,7 +8614,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { @@ -8882,7 +8908,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); discard_new_inode(inode); @@ -8894,7 +8920,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); @@ -9107,7 +9133,7 @@ next: ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); @@ -9275,7 +9301,7 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { - if (ret > 0) { + if (unlikely(ret > 0)) { /* The extent item disappeared? */ return -EIO; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbf..a454b5ba2097 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) { + if (unlikely(ret && ret != -ENODATA)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; @@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* ... and new_root is owned by new_inode_args.inode now. */ ret = btrfs_record_root_in_trans(trans, new_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_record_new_subvolume(trans, BTRFS_I(dir)); ret = btrfs_create_new_inode(trans, &new_inode_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent, /* * Force new buffered writes to reserve space even when NOCOW is - * possible. 
This is to avoid later writeback (running dealloc) to + * possible. This is to avoid later writeback (running delalloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); @@ -1251,7 +1251,7 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_item(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret < 0 && ret != -EEXIST) { + if (unlikely(ret < 0 && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4509,6 +4513,7 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4780,14 +4790,14 @@ out_fail: static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; - struct file *file; - struct btrfs_inode *inode; - struct btrfs_fs_info *fs_info; - struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; @@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - file = cmd->file; - inode = 
BTRFS_I(file->f_inode); - fs_info = inode->root->fs_info; - io_tree = &inode->io_tree; + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4933,9 +4944,10 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; - struct file *file; ssize_t ret; void __user *sqe_addr; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); @@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } - file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { @@ -5223,13 +5238,13 @@ long btrfs_ioctl(struct file *file, unsigned int case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, false); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 0); + return btrfs_ioctl_snap_create_v2(file, argp, false); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, true); case BTRFS_IOC_SUBVOL_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 1); + return btrfs_ioctl_snap_create_v2(file, argp, true); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a3e6d9616e60..0035851d72b0 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) atomic_inc(&lock->readers); /* - * Ensure the pending reader count is perceieved BEFORE this reader + * Ensure the pending reader count is perceived BEFORE this reader * goes to sleep in case of active writers. This guarantees new writers * won't be allowed and that the current reader will be woken up when * the last active writer finishes its jobs. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index af29df98ac14..a4673e7d95d7 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -74,7 +74,7 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NEW_ROOT, /* - * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over * the limit. As of this writing we're limited to 8, and we're * definitely using 8, hence this check to keep us from messing up in diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d403641889ca..4758f66da449 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -58,9 +58,6 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... 
| */ -#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) -#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) - struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -68,7 +65,14 @@ struct workspace { struct list_head list; }; -static struct workspace_manager wsm; +static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} +static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} void lzo_free_workspace(struct list_head *ws) { @@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(void) +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) { struct workspace *workspace; @@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf) * * Will allocate new pages when needed. */ -static int copy_compressed_data_to_page(char *compressed_data, +static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, + char *compressed_data, size_t compressed_size, struct folio **out_folios, unsigned long max_nr_folio, - u32 *cur_out, - const u32 sectorsize) + u32 *cur_out) { + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 sector_bytes_left; u32 orig_out; struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; /* @@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, 0); - write_compress_length(kaddr + offset_in_page(*cur_out), - compressed_size); + kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); + write_compress_length(kaddr, compressed_size); *cur_out += LZO_LEN; orig_out = *cur_out; @@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = 
cur_folio; } kaddr = kmap_local_folio(cur_folio, 0); - memcpy(kaddr + offset_in_page(*cur_out), + memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), compressed_data + *cur_out - orig_out, copy_len); *cur_out += copy_len; @@ -209,12 +214,15 @@ out: return 0; } -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; const unsigned long max_nr_folio = *out_folios; @@ -263,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, folios, max_nr_folio, - &cur_out, sectorsize); + &cur_out); if (ret < 0) goto out; @@ -280,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - /* Check if we have reached page boundary */ - if (PAGE_ALIGNED(cur_in)) { + /* Check if we have reached folio boundary. */ + if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } @@ -298,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, out: if (folio_in) folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); + *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); return ret; } @@ -310,15 +318,16 @@ out: static void copy_compressed_segment(struct compressed_bio *cb, char *dest, u32 len, u32 *cur_in) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio; - u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), - orig_in + len - *cur_in); + struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; + u32 copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); - cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, offset_in_folio(cur_folio, *cur_in), copy_len); @@ -332,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -378,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); - seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + seg_len = read_compress_length(kaddr + 
offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; - if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + if (unlikely(seg_len > workspace_cbuf_length(fs_info))) { struct btrfs_inode *inode = cb->bbio.inode; /* @@ -445,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; - size_t max_segment_len = WORKSPACE_BUF_LENGTH; + size_t max_segment_len = workspace_buf_length(fs_info); int ret = 0; - if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; in_len = read_compress_length(data_in); - if (in_len != srclen) + if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (in_len != srclen - LZO_LEN * 2) { + if (unlikely(in_len != srclen - LZO_LEN * 2)) { ret = -EUCLEAN; goto out; } @@ -487,8 +497,7 @@ out: return ret; } -const struct btrfs_compress_op btrfs_lzo_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_lzo_compress = { .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 363fd28c0268..a0cf8effe008 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -18,6 +18,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 022ebc89af85..4416c165644f 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,7 +4,6 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> -#include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index ff5eac84d819..60f9b000d644 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -11,6 +11,7 @@ #include <linux/pagemap.h> #include <linux/math64.h> #include <linux/rbtree.h> +#include <linux/bio.h> /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. @@ -20,6 +21,54 @@ name = (1U << __ ## name ## _BIT), \ __ ## name ## _SEQ = __ ## name ## _BIT +static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) +{ + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + return bvec_phys(&bv); +} + +/* + * Iterate a bio block by block, using the btrfs block size. + * + * This handles large folios and highmem. + * + * @paddr: Physical address of the current block at each iteration + * @bio: The bio to iterate + * @iter: The bvec_iter (pointer) to use. + * @blocksize: The block size to advance by at each iteration. + * + * This requires every folio in the bio to cover at least one block. + */ +#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \ + for (; (iter)->bi_size && \ + (paddr = bio_iter_phys((bio), (iter)), 1); \ + bio_advance_iter_single((bio), (iter), (blocksize)))
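/*
 * A hypothetical user of the iterator above (illustrative, not part of the
 * patch): walk a read bio block by block by physical address and verify each
 * data block with the checksum helpers changed earlier in this patch.
 */
static void example_check_read_bio(struct btrfs_bio *bbio,
				   struct btrfs_fs_info *fs_info)
{
	struct bvec_iter iter = bbio->bio.bi_iter;
	u32 bio_offset = 0;
	phys_addr_t paddr;

	btrfs_bio_for_each_block(paddr, &bbio->bio, &iter, fs_info->sectorsize) {
		/* On mismatch this zeroes the block and bumps device stats. */
		btrfs_data_csum_ok(bbio, NULL, bio_offset, paddr);
		bio_offset += fs_info->sectorsize;
	}
}
+
+/* Initialize a bvec_iter to the size of the specified bio. 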
*/ +static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +{ + struct bio_vec *bvec; + u32 bio_size = 0; + int i; + + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + return (struct bvec_iter) { + .bi_sector = 0, + .bi_size = bio_size, + .bi_idx = 0, + .bi_bvec_done = 0, + }; +} + +#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \ + for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \ + (iter).bi_size && \ + (paddr = bio_iter_phys((bio), &(iter)), 1); \ + bio_advance_iter_single((bio), &(iter), (blocksize))) + static inline void cond_wake_up(struct wait_queue_head *wq) { /* diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 74e38da9bd39..62b993fae54f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -6,12 +6,19 @@ #include "messages.h" #include "ctree.h" #include "disk-io.h" +#include "file-item.h" #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" #include "volumes.h" #include "raid-stripe-tree.h" +/* + * Large enough buffer size for the stringification of any key type yet short + * enough to use the stack and avoid allocations. + */ +#define KEY_TYPE_BUF_SIZE 32 + struct root_name_map { u64 id; const char *name; @@ -227,21 +234,209 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) #endif } +static void print_timespec(const struct extent_buffer *eb, + struct btrfs_timespec *timespec, + const char *prefix, const char *suffix) +{ + const u64 secs = btrfs_timespec_sec(eb, timespec); + const u32 nsecs = btrfs_timespec_nsec(eb, timespec); + + pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix); +} + +static void print_inode_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + + pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n", + btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii), + btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii)); + pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n", + btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii), + btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii), + btrfs_inode_gid(eb, ii)); + pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n", + btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii), + btrfs_inode_flags(eb, ii)); + print_timespec(eb, &ii->atime, "\t\tatime ", "\n"); + print_timespec(eb, &ii->ctime, "\t\tctime ", "\n"); + print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n"); + print_timespec(eb, &ii->otime, "\t\totime ", "\n"); +} + +static void print_dir_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item); + u32 cur = 0; + + while (cur < size) { + const u32 name_len = btrfs_dir_name_len(eb, di); + const u32 data_len = btrfs_dir_data_len(eb, di); + const u32 len = sizeof(*di) + name_len + data_len; + struct btrfs_key location; + + btrfs_dir_item_key_to_cpu(eb, di, &location); + pr_info("\t\tlocation key (%llu %u %llu) type %d\n", + location.objectid, location.type, location.offset, + btrfs_dir_ftype(eb, di)); + pr_info("\t\ttransid %llu data_len %u name_len %u\n", + btrfs_dir_transid(eb, di), data_len, name_len); + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + } +} + +static void print_inode_ref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref); + u32 
cur = 0; + + while (cur < size) { + const u64 index = btrfs_inode_ref_index(eb, ref); + const u32 name_len = btrfs_inode_ref_name_len(eb, ref); + const u32 len = sizeof(*ref) + name_len; + + pr_info("\t\tindex %llu name_len %u\n", index, name_len); + ref = (struct btrfs_inode_ref *)((char *)ref + len); + cur += len; + } +} + +static void print_inode_extref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_extref *extref; + u32 cur = 0; + + extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref); + while (cur < size) { + const u64 index = btrfs_inode_extref_index(eb, extref); + const u32 name_len = btrfs_inode_extref_name_len(eb, extref); + const u64 parent = btrfs_inode_extref_parent(eb, extref); + const u32 len = sizeof(*extref) + name_len; + + pr_info("\t\tindex %llu parent %llu name_len %u\n", + index, parent, name_len); + extref = (struct btrfs_inode_extref *)((char *)extref + len); + cur += len; + } +} + +static void print_dir_log_index_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_dir_log_item *dlog; + + dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item); + pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog)); +} + +static void print_extent_csum(const struct extent_buffer *eb, int i) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 size = btrfs_item_size(eb, i); + const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize; + struct btrfs_key key; + + btrfs_item_key_to_cpu(eb, &key, i); + pr_info("\t\trange start %llu end %llu length %u\n", + key.offset, key.offset + csum_bytes, csum_bytes); +} + +static void print_file_extent_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(eb, fi), + btrfs_file_extent_type(eb, fi)); + + if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) { + pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", + btrfs_file_extent_inline_item_len(eb, i), + btrfs_file_extent_ram_bytes(eb, fi), + btrfs_file_extent_compression(eb, fi)); + return; + } + + pr_info("\t\textent data disk bytenr %llu nr %llu\n", + btrfs_file_extent_disk_bytenr(eb, fi), + btrfs_file_extent_disk_num_bytes(eb, fi)); + pr_info("\t\textent data offset %llu nr %llu ram %llu\n", + btrfs_file_extent_offset(eb, fi), + btrfs_file_extent_num_bytes(eb, fi), + btrfs_file_extent_ram_bytes(eb, fi)); + pr_info("\t\textent compression %hhu\n", + btrfs_file_extent_compression(eb, fi)); +} + +static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size) +{ + static const char *key_to_str[256] = { + [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM", + [BTRFS_INODE_REF_KEY] = "INODE_REF", + [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF", + [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM", + [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX", + [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM", + [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX", + [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM", + [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM", + [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM", + [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM", + [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM", + [BTRFS_ROOT_REF_KEY] = "ROOT_REF", + [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF", + [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM", + [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM", + [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF", + [BTRFS_SHARED_BLOCK_REF_KEY] = 
"SHARED_BLOCK_REF", + [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF", + [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF", + [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF", + [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM", + [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA", + [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM", + [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO", + [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT", + [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP", + [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM", + [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM", + [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT", + [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM", + [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE", + [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM", + [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS", + [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION", + [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO", + [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT", + [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM", + [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL", + [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL", + [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE", + }; + + if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) + scnprintf(buf, buf_size, "UNTYPED"); + else if (key_to_str[key->type]) + scnprintf(buf, buf_size, key_to_str[key->type]); + else + scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; u32 type, nr; struct btrfs_root_item *ri; - struct btrfs_dir_item *di; - struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; - struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; struct btrfs_key key; - struct btrfs_key found_key; if (!l) return; @@ -255,25 +450,35 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { + char key_buf[KEY_TYPE_BUF_SIZE]; + btrfs_item_key_to_cpu(l, &key, i); type = key.type; - pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", - i, key.objectid, type, key.offset, + key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE); + + pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n", + i, key.objectid, key_buf, key.offset, btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: - ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - pr_info("\t\tinode generation %llu size %llu mode %o\n", - btrfs_inode_generation(l, ii), - btrfs_inode_size(l, ii), - btrfs_inode_mode(l, ii)); + print_inode_item(l, i); + break; + case BTRFS_INODE_REF_KEY: + print_inode_ref_item(l, i); + break; + case BTRFS_INODE_EXTREF_KEY: + print_inode_extref_item(l, i); break; case BTRFS_DIR_ITEM_KEY: - di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu flags %u\n", - found_key.objectid, - btrfs_dir_flags(l, di)); + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + print_dir_item(l, i); + break; + case BTRFS_DIR_LOG_INDEX_KEY: + print_dir_log_index_item(l, i); + break; + case BTRFS_EXTENT_CSUM_KEY: + print_extent_csum(l, i); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -303,24 +508,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_shared_data_ref_count(l, sref)); break; case BTRFS_EXTENT_DATA_KEY: - fi = btrfs_item_ptr(l, i, - struct 
btrfs_file_extent_item); - pr_info("\t\tgeneration %llu type %hhu\n", - btrfs_file_extent_generation(l, fi), - btrfs_file_extent_type(l, fi)); - if (btrfs_file_extent_type(l, fi) == - BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %llu\n", - btrfs_file_extent_ram_bytes(l, fi)); - break; - } - pr_info("\t\textent data disk bytenr %llu nr %llu\n", - btrfs_file_extent_disk_bytenr(l, fi), - btrfs_file_extent_disk_num_bytes(l, fi)); - pr_info("\t\textent data offset %llu nr %llu ram %llu\n", - btrfs_file_extent_offset(l, fi), - btrfs_file_extent_num_bytes(l, fi), - btrfs_file_extent_ram_bytes(l, fi)); + print_file_extent_item(l, i); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index da102da169fd..1175b8192cd7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; @@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, /* We should not have a stray @prealloc pointer. */ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); - if (!prealloc) { + if (unlikely(!prealloc)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = add_qgroup_item(trans, quota_root, found_key.offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } } ret = btrfs_next_item(tree_root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1190,7 +1190,7 @@ out_add_root: qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_qgroup_config(fs_info); ret = 
btrfs_clean_quota_tree(trans, quota_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, "a_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -2426,9 +2426,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || - root_level < cur_level) { + if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || + root_level < cur_level)) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); @@ -2444,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, * dst_path->nodes[root_level] must be initialized before * calling this function. */ - if (cur_level == root_level) { + if (unlikely(cur_level == root_level)) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); @@ -2530,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return 0; /* Wrong parameter order */ - if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { + if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), @@ -2538,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return -EUCLEAN; } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) { ret = -EIO; goto out; } @@ -2729,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head) */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, - u64 seq, int update_old) + u64 seq, bool update_old) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -4710,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > - btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot))) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, @@ -4843,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, reloc_eb = NULL; goto free_out; } - if (!extent_buffer_uptodate(reloc_eb)) { + if (unlikely(!extent_buffer_uptodate(reloc_eb))) { ret = -EIO; goto free_out; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cab0b291088c..cc6f6095cc9f 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *stripe_root = fs_info->stripe_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; u64 found_start; @@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, 
u64 le btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, struct btrfs_stripe_extent *stripe_extent, const size_t item_size) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; int slot; @@ -288,7 +287,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_free_path(path); return ret; } @@ -306,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); - if (!stripe_extent) { + if (unlikely(!stripe_extent)) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; @@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, struct btrfs_stripe_extent *stripe_extent; struct btrfs_key stripe_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; @@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); if (ret < 0) - goto free_path; + return ret; if (ret) { if (path->slots[0] != 0) path->slots[0]--; @@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, stripe->physical, devid); - ret = 0; - goto free_path; + return 0; } /* If we're here, we haven't found the requested devid in the stripe. */ @@ -474,8 +471,6 @@ out: logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); } -free_path: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3ff2bedfb3a4..0135dceb7baa 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* Check if we have reached tolerance early. 
*/ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; } @@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - while (iter.bi_size) { + btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { unsigned int index = (offset >> sectorsize_bits); struct sector_ptr *sector = &rbio->bio_sectors[index]; - struct bio_vec bv = bio_iter_iovec(bio, iter); sector->has_paddr = true; - sector->paddr = bvec_phys(&bv); - bio_advance_iter_single(bio, &iter, sectorsize); + sector->paddr = paddr; offset += sectorsize; } } @@ -1511,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + const u32 blocksize = rbio->bioc->fs_info->sectorsize; + phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct sector_ptr *sector; - phys_addr_t paddr = bvec_phys(bvec); + btrfs_bio_for_each_block_all(paddr, bio, blocksize) { + struct sector_ptr *sector = find_stripe_sector(rbio, paddr); - for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { - sector = find_stripe_sector(rbio, paddr + off); - ASSERT(sector); - if (sector) - sector->uptodate = 1; - } + ASSERT(sector); + if (sector) + sector->uptodate = 1; } } @@ -1573,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1584,27 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - bio_for_each_segment_all(bvec, bio, iter_all) { - void *kaddr; - - kaddr = bvec_kmap_local(bvec); - for (u32 off = 0; off < bvec->bv_len; - off += fs_info->sectorsize, total_sector_nr++) { - u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + - total_sector_nr * fs_info->csum_size; - int ret; + btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + int ret; - /* No csum for this sector, skip to the next sector. */ - if (!test_bit(total_sector_nr, rbio->csum_bitmap)) - continue; + /* No csum for this sector, skip to the next sector. 
*/ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - ret = btrfs_check_sector_csum(fs_info, kaddr + off, - csum_buf, expected_csum); - if (ret < 0) - set_bit(total_sector_nr, rbio->error_bitmap); - } - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, paddr, + csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; } } @@ -1802,7 +1788,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1824,9 +1809,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - kaddr = kmap_local_sector(sector); - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); return ret; } @@ -1864,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, if (!found_errors) return 0; - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; /* @@ -2416,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2705,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; goto out; } @@ -2729,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) { + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { ret = -EIO; goto out; } @@ -2746,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) { + if (unlikely(failp != rbio->scrubp)) { ret = -EIO; goto out; } @@ -2837,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2861,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * - * Unfortunately here we have to do page copy, other than reusing the pages. + * Unfortunately here we have to do folio copy, rather than reusing the pages. * This is due to the fact rbio has its own page management for its cache. 
*/ -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical) +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical) { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; - const int page_index = offset_in_full_stripe >> PAGE_SHIFT; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + unsigned int findex = 0; + unsigned int foffset = 0; int ret; + /* We shouldn't hit RAID56 for bs > ps cases for now. */ + ASSERT(fs_info->sectorsize <= PAGE_SIZE); + /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do @@ -2890,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); - for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { - struct page *dst = rbio->stripe_pages[page_nr + page_index]; - struct page *src = data_pages[page_nr]; + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); - memcpy_page(dst, 0, src, 0, PAGE_SIZE); - for (int sector_nr = sectors_per_page * page_index; - sector_nr < sectors_per_page * (page_index + 1); - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } } + for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; + sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0d7b4c2fb6ae..84c4d1d29c7a 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical); +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9f1858b42c0e..de4cb0f3fbd0 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; @@ -1021,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } - 
btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 559bd25a2b7a..1ce544d53cc5 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -12,7 +12,7 @@ struct btrfs_fs_info; struct btrfs_ref; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG #include <linux/spinlock.h> @@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) { } -#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* CONFIG_BTRFS_DEBUG */ #endif diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ce25ab7f0e99..5465a5eae9b2 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, u64 endoff, const u64 destoff, const u64 olen, - int no_time_update) + bool no_time_update) { int ret; @@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, } ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -268,12 +268,12 @@ copy_inline_extent: drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -285,7 +285,7 @@ copy_inline_extent: btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { @@ -337,10 +337,10 @@ copy_to_page: */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) + const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; @@ -611,7 +611,6 @@ process_slot: } out: - btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7256f6748c8f..8dd8de6b9fb8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -821,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; @@ -834,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -849,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = -EINVAL; - goto out; - } + if (num_bytes != 
btrfs_file_extent_disk_num_bytes(leaf, fi)) + return -EINVAL; *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -974,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -988,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1199,7 +1192,7 @@ again: ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1212,7 +1205,7 @@ again: ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1226,7 +1219,7 @@ again: ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1240,7 +1233,7 @@ again: ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1490,7 +1483,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) * ->reloc_root. If it fails however we must * drop the ref ourselves. */ - ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, false, true); if (ret2 < 0) { btrfs_put_root(reloc_root); if (!ret) @@ -1500,7 +1493,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, 0, 1); + ret2 = btrfs_drop_snapshot(root, false, true); if (ret2 < 0) { btrfs_put_root(root); if (!ret) @@ -1791,7 +1784,7 @@ again: list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; @@ -1960,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } - if (root->reloc_root != reloc_root) { + if (unlikely(root->reloc_root != reloc_root)) { DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", @@ -2031,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOENT); - if (next->new_bytenr) { + if (unlikely(next->new_bytenr)) { /* * We just created the reloc root, so we shouldn't have * ->new_bytenr set yet. If it is then we have multiple roots @@ -2090,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) * This can occur if we have incomplete extent refs leading all * the way up a particular path, in this case return -EUCLEAN. 
*/ - if (!root) + if (unlikely(!root)) return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ @@ -2277,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { - if (bytenr != node->bytenr) { + if (unlikely(bytenr != node->bytenr)) { btrfs_err(root->fs_info, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, @@ -2332,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } next: @@ -2454,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return -EIO; } @@ -2519,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - if (node->new_bytenr) { + if (unlikely(node->new_bytenr)) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2839,7 +2832,7 @@ again: if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto release_folio; } @@ -3158,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -3186,7 +3179,7 @@ again: path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && skinny) { if (path->slots[0]) { @@ -3213,14 +3206,10 @@ again: "tree block extent item (%llu) is not found in extent tree", bytenr); WARN_ON(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; + return add_tree_block(rc, &key, path, blocks); } static int delete_block_group_cache(struct btrfs_block_group *block_group, @@ -3510,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; u64 flags; int ret; @@ -3679,14 +3668,13 @@ out_free: if (ret < 0 && !err) err = ret; btrfs_free_block_rsv(fs_info, rc->block_rsv); - btrfs_free_path(path); return err; } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *item; struct extent_buffer *leaf; int ret; @@ -3697,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3707,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, 
BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); -out: - btrfs_free_path(path); - return ret; + return 0; } static void delete_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -3738,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans, out: if (ret) btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e22e6b06927a..d07eab70f759 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, * Key with offset -1 found, there would have to exist a root * with such id, but this is out of the valid range. */ - if (ret == 0) { + if (unlikely(ret == 0)) { ret = -EUCLEAN; goto out; } @@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *item) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *l; int ret; int slot; @@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; @@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; slot = path->slots[0]; @@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); -out: - btrfs_free_path(path); return ret; } @@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; int err = 0; @@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - btrfs_free_path(path); return err; } @@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key) { struct btrfs_root *root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) - goto out; - if (ret != 0) { + return ret; + if (unlikely(ret > 0)) /* The root must exist but we did not find it by the key. 
*/ - ret = -EUCLEAN; - goto out; - } + return -EUCLEAN; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; @@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - goto out; + return ret; } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -369,18 +361,16 @@ again: ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name->len) || - memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { - ret = -ENOENT; - goto out; - } + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) + return -ENOENT; + *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out; + return ret; } else { - ret = -ENOENT; - goto out; + return -ENOENT; } if (key.type == BTRFS_ROOT_BACKREF_KEY) { @@ -391,8 +381,6 @@ again: goto again; } -out: - btrfs_free_path(path); return ret; } @@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; unsigned long ptr; @@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name->len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); return ret; } @@ -455,7 +442,6 @@ again: goto again; } - btrfs_free_path(path); return 0; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6776e6ab8d10..4691d0bdb2e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -113,7 +113,7 @@ enum { /* Which blocks are covered by extent items. */ scrub_bitmap_nr_has_extent = 0, - /* Which blocks are meteadata. */ + /* Which blocks are metadata. */ scrub_bitmap_nr_is_metadata, /* @@ -130,7 +130,7 @@ enum { scrub_bitmap_nr_last, }; -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) +#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. 
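The scrub rework in the following hunks replaces the fixed page array with minimum-order folios: when the block size exceeds the page size, each backing folio covers PAGE_SIZE << block_min_order bytes, and the accessors index it via offset >> (PAGE_SHIFT + block_min_order). A minimal user-space sketch of that index math, reusing the diff's min_folio_shift and block_min_order names (illustrative only, not kernel code; PAGE_SHIFT of 12 is assumed):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12 /* assumed 4K pages */

/*
 * Map a byte offset within a scrub stripe to a folio index plus an
 * offset inside that folio, mirroring the arithmetic used by
 * scrub_stripe_get_kaddr()/scrub_stripe_get_paddr() below.
 * block_min_order is 0 whenever block size <= page size.
 */
static void locate_block(uint32_t offset, unsigned int block_min_order,
                         unsigned int *folio_index, uint32_t *folio_offset)
{
        const unsigned int min_folio_shift = PAGE_SHIFT + block_min_order;

        *folio_index = offset >> min_folio_shift;
        *folio_offset = offset & ((1U << min_folio_shift) - 1);
}

int main(void)
{
        unsigned int idx;
        uint32_t off;

        /* 16K blocks on 4K pages: order-2 folios, 16K each. */
        locate_block(20480, 2, &idx, &off);
        assert(idx == 1 && off == 4096);
        printf("offset 20480 -> folio %u offset %u\n", idx, off);

        /* Order 0 degenerates to the old pages[] indexing. */
        locate_block(20480, 0, &idx, &off);
        assert(idx == 5 && off == 0);
        printf("offset 20480 -> page %u offset %u\n", idx, off);
        return 0;
}

The same shift also sizes the backing array: BTRFS_STRIPE_LEN >> min_folio_shift folios always fit within SCRUB_STRIPE_MAX_FOLIOS, which the ASSERT() added to init_scrub_stripe() below checks.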
@@ -139,7 +139,7 @@ struct scrub_stripe { struct scrub_ctx *sctx; struct btrfs_block_group *bg; - struct page *pages[SCRUB_STRIPE_PAGES]; + struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; struct scrub_sector_verification *sectors; struct btrfs_device *dev; @@ -206,7 +206,7 @@ struct scrub_ctx { ktime_t throttle_deadline; u64 throttle_sent; - int is_dev_replace; + bool is_dev_replace; u64 write_pointer; struct mutex wr_lock; @@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) if (!stripe) return; - for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { - if (stripe->pages[i]) - __free_page(stripe->pages[i]); - stripe->pages[i] = NULL; + for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { + if (stripe->folios[i]) + folio_put(stripe->folios[i]); + stripe->folios[i] = NULL; } kfree(stripe->sectors); kfree(stripe->csums); @@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) static int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe) { + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; int ret; memset(stripe, 0, sizeof(*stripe)); @@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); + ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); + ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, + fs_info->block_min_order, stripe->folios); if (ret < 0) goto error; @@ -446,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) } static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - struct btrfs_fs_info *fs_info, int is_dev_replace) + struct btrfs_fs_info *fs_info, bool is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -585,7 +588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * bool is_super, u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key found_key; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -612,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); if (ret < 0) - goto out; + return; swarn.extent_item_size = found_key.offset; @@ -658,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } - -out: - btrfs_free_path(path); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -687,13 +687,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); - const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; - /* stripe->pages[] is allocated by us and no highmem is allowed. */ - ASSERT(page); - ASSERT(!PageHighMem(page)); - return page_address(page) + offset_in_page(offset); + /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + return folio_address(folio) + offset_in_folio(folio, offset); +} + +static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; + + /* stripe->folios[] is allocated by us and no highmem is allowed. */ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + /* And the range must be contained inside the folio. */ + ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); + return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -788,7 +805,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; @@ -833,7 +850,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) return; } - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); + ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); if (ret < 0) { scrub_bitmap_set_bit_csum_error(stripe, sector_nr); scrub_bitmap_set_bit_error(stripe, sector_nr); @@ -1369,8 +1386,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * Slice is divided into intervals when the IO is submitted, adjust by * bwlimit and maximum of 64 intervals. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); + div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); /* Start new epoch, set deadline */ now = ktime_get(); @@ -1513,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1859,6 +1875,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; @@ -1871,7 +1888,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; @@ -1970,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) * metadata, we should immediately abort. 
*/ for (int i = 0; i < nr_stripes; i++) { - if (stripe_has_metadata_error(&sctx->stripes[i])) { + if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { ret = -EIO; goto out; } @@ -2164,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, * As we may hit an empty data stripe while it's missing. */ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); - if (!bitmap_empty(&error, stripe->nr_sectors)) { + if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { btrfs_err(fs_info, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, @@ -2202,7 +2219,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - raid56_parity_cache_data_pages(rbio, stripe->pages, + raid56_parity_cache_data_folios(rbio, stripe->folios, full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); } raid56_parity_submit_scrub_rbio(rbio); @@ -2586,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; @@ -2858,8 +2875,8 @@ skip_unfreeze: btrfs_put_block_group(cache); if (ret) break; - if (sctx->is_dev_replace && - atomic64_read(&dev_replace->num_write_errors) > 0) { + if (unlikely(sctx->is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0)) { ret = -EIO; break; } @@ -2872,8 +2889,6 @@ skip: btrfs_release_path(path); } - btrfs_free_path(path); - return ret; } @@ -2889,13 +2904,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } - if (btrfs_super_generation(sb) != generation) { + if (unlikely(btrfs_super_generation(sb) != generation)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, @@ -3013,7 +3028,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace) + bool readonly, bool is_dev_replace) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; @@ -3065,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { + if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index f0df597b75c7..aa68b6ebaf55 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -11,7 +11,7 @@ struct btrfs_scrub_progress; int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); + bool readonly, bool 
is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7664025a5af4..9230e5066fc6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) ret = kernel_write(filp, buf + pos, len - pos, off); if (ret < 0) return ret; - if (ret == 0) + if (unlikely(ret == 0)) return -EIO; pos += ret; } @@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, struct btrfs_inode_info *info) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *ii; struct btrfs_key key; @@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, if (ret) { if (ret > 0) ret = -ENOENT; - goto out; + return ret; } if (!info) - goto out; + return 0; ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, */ info->fileattr = btrfs_inode_flags(path->nodes[0], ii); -out: - btrfs_free_path(path); - return ret; + return 0; } static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) @@ -973,13 +971,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, int resolve, + struct btrfs_key *found_key, bool resolve, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; - struct btrfs_path *tmp_path; + BTRFS_PATH_AUTO_FREE(tmp_path); struct fs_path *p; u32 cur = 0; u32 total; @@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } out: - btrfs_free_path(tmp_path); fs_path_free(p); return ret; } @@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root, { int ret; struct btrfs_key key, found_key; - struct btrfs_path *p; + BTRFS_PATH_AUTO_FREE(p); p = alloc_path_for_send(); if (!p) @@ -1238,28 +1235,20 @@ static int get_inode_path(struct btrfs_root *root, ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = 1; - goto out; - } + return ret; + if (ret) + return 1; + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; - ret = iterate_inode_ref(root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) - goto out; - ret = 0; - -out: - btrfs_free_path(p); - return ret; + return ret; + return 0; } struct backref_ctx { @@ -1389,7 +1378,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + const u64 key = leaf_bytenr >> fs_info->nodesize_bits; struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; @@ -1444,7 +1433,7 @@ static void store_backref_cache(u64 leaf_bytenr, const 
struct ulist *root_ids, if (!new_entry) return; - new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits; new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); @@ -1716,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root, struct fs_path *dest) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_file_extent_item *ei; u8 type; @@ -1733,21 +1722,20 @@ static int read_symlink(struct btrfs_root *root, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { + return ret; + if (unlikely(ret)) { /* * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash - * happened in between or the subvol was snapshoted in between). + * happened in between or the subvol was snapshotted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink. */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", ino, btrfs_root_id(root)); - ret = -EIO; - goto out; + return -EIO; } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1758,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", ino, btrfs_root_id(root), type); - goto out; + return ret; } compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) { @@ -1766,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d", ino, btrfs_root_id(root), compression); - goto out; + return ret; } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); } /* @@ -1787,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { - int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; char tmp[64]; int len; @@ -1811,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1823,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx, if (!sctx->parent_root) { /* unique */ - ret = 0; break; } @@ -1831,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1844,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add(dest, tmp, len); } enum inode_state { @@ -1960,7 +1936,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, int ret = 0; struct btrfs_dir_item *di; struct btrfs_key key; - struct btrfs_path 
*path; + BTRFS_PATH_AUTO_FREE(path); struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); @@ -1968,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); - if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto out; - } + if (IS_ERR_OR_NULL(di)) + return di ? PTR_ERR(di) : -ENOENT; + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.type == BTRFS_ROOT_ITEM_KEY) { - ret = -ENOENT; - goto out; - } + if (key.type == BTRFS_ROOT_ITEM_KEY) + return -ENOENT; + *found_inode = key.objectid; -out: - btrfs_free_path(path); return ret; } @@ -1994,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int len; u64 parent_dir; @@ -2008,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (ret || found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -2038,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); } if (ret < 0) - goto out; + return ret; btrfs_release_path(path); if (dir_gen) { ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) - goto out; + return ret; } *dir = parent_dir; -out: - btrfs_free_path(path); return ret; } @@ -2486,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx) int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_root *parent_root = sctx->parent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -2498,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); - if (!name) { - btrfs_free_path(path); + if (!name) return -ENOMEM; - } key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; @@ -2564,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx) tlv_put_failure: out: - btrfs_free_path(path); kfree(name); return ret; } @@ -2715,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; struct btrfs_key key; int slot; @@ -2759,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: free_path_for_command(sctx, p); - btrfs_free_path(path); return ret; } @@ -2769,7 +2733,7 @@ out: * processing an inode that is a directory and it just got renamed, and existing * entries in the cache may refer to inodes that have the directory in their * full path - in which case we would generate outdated paths (pre-rename) - * for the inodes that the cache entries point to. Instead of prunning the + * for the inodes that the cache entries point to. 
Instead of pruning the * cache when inserting, do it after we finish processing each inode at * finish_inode_if_needed(). */ @@ -2930,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; @@ -2970,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -3750,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key di_key; struct btrfs_dir_item *di; @@ -3771,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); - if (!di) { - ret = 0; - goto out; - } + if (!di) + return 0; /* * di_key.objectid has the number of the inode that has a dentry in the * parent directory with the same name that sctx->cur_ino is being @@ -3793,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, * that it happens after that other inode is renamed. */ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); - if (di_key.type != BTRFS_INODE_ITEM_KEY) { - ret = 0; - goto out; - } + if (di_key.type != BTRFS_INODE_ITEM_KEY) + return 0; ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) - goto out; + return ret; ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + return ret; } /* Different inode, no need to delay the rename of sctx->cur_ino */ - if (right_gen != left_gen) { - ret = 0; - goto out; - } + if (right_gen != left_gen) + return 0; wdm = get_waiting_dir_move(sctx, di_key.objectid); if (wdm && !wdm->orphanized) { @@ -3826,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, if (!ret) ret = 1; } -out: - btrfs_free_path(path); return ret; } @@ -3877,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root, bool free_fs_path = false; int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; if (!fs_path) { @@ -3945,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root, ret = iter_ret; out: - btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); return ret; @@ -4756,8 +4708,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4768,9 +4720,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, - sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4781,12 +4732,12 @@ static int record_changed_ref(struct 
send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4803,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx, int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; iterate_inode_ref_t cb; @@ -4822,8 +4773,7 @@ static int process_all_refs(struct send_ctx *sctx, } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); - ret = -EINVAL; - goto out; + return -EINVAL; } key.objectid = sctx->cmp_key->objectid; @@ -4835,15 +4785,14 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) - goto out; + return ret; } /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } + if (iter_ret < 0) + return iter_ret; + btrfs_release_path(path); /* @@ -4851,10 +4800,7 @@ static int process_all_refs(struct send_ctx *sctx, * re-creating this inode and will be rename'ing it into place once we * rename the parent directory. */ - ret = process_recorded_refs(sctx, &pending_move); -out: - btrfs_free_path(path); - return ret; + return process_recorded_refs(sctx, &pending_move); } static int send_set_xattr(struct send_ctx *sctx, @@ -5080,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -5108,7 +5054,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -5254,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", @@ -5656,7 +5601,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + /* + * Do not go through encoded read for bs > ps cases. + * + * Encoded send uses vmallocated pages as its buffer, so we cannot + * ensure that every folio is large enough to contain a block.
+ */ + if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && + (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); @@ -5766,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; @@ -5804,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx) strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - btrfs_free_path(path); return ret; } @@ -5812,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct clone_root *clone_root, const u64 disk_byte, u64 data_offset, u64 offset, u64 len) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; struct btrfs_inode_info info; @@ -5848,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) - goto out; + return ret; clone_src_i_size = info.size; /* @@ -5878,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, key.offset = clone_root->offset; ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == clone_root->ino && @@ -5899,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); if (ret < 0) - goto out; + return ret; else if (ret > 0) break; continue; @@ -5936,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, hole_len); if (ret < 0) - goto out; + return ret; len -= hole_len; if (len == 0) @@ -6007,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, slen, clone_root); if (ret < 0) - goto out; + return ret; } ret = send_extent_data(sctx, dst_path, offset + slen, @@ -6041,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, } if (ret < 0) - goto out; + return ret; len -= clone_len; if (len == 0) @@ -6072,8 +6023,6 @@ next: ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; -out: - btrfs_free_path(path); return ret; } @@ -6162,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, { int ret = 0; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; struct btrfs_key found_key; @@ -6188,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - if (left_type != BTRFS_FILE_EXTENT_REG) { - ret = 0; - goto out; - } + if (left_type != BTRFS_FILE_EXTENT_REG) + return 0; + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); left_len = btrfs_file_extent_num_bytes(eb, ei); left_offset = btrfs_file_extent_offset(eb, ei); @@ -6223,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset = ekey->offset; ret = 
btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + if (ret) + return 0; /* * Handle special case where the right side has no extents at all. @@ -6236,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { + found_key.type != key.type) /* If we're a hole then just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We're now on 2a, 2b or 7. @@ -6250,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG && - right_type != BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + right_type != BTRFS_FILE_EXTENT_INLINE) + return 0; if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_ram_bytes(eb, ei); @@ -6266,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len <= ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) /* If we're a hole just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We just wanted to see if when we have an inline extent, what @@ -6280,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * compressed extent representing data with a size matching * the page size (currently the same as sector size). */ - if (right_type == BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + if (right_type == BTRFS_FILE_EXTENT_INLINE) + return 0; right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); @@ -6303,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx, */ if (left_disknr != right_disknr || left_offset_fixed != right_offset || - left_gen != right_gen) { - ret = 0; - goto out; - } + left_gen != right_gen) + return 0; /* * Go to the next extent. 
*/ ret = btrfs_next_item(sctx->parent_root, path); if (ret < 0) - goto out; + return ret; if (!ret) { eb = path->nodes[0]; slot = path->slots[0]; @@ -6324,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset += right_len; break; } - if (found_key.offset != key.offset + right_len) { - ret = 0; - goto out; - } + if (found_key.offset != key.offset + right_len) + return 0; + key = found_key; } @@ -6340,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx, else ret = 0; - -out: - btrfs_free_path(path); return ret; } static int get_last_extent(struct send_ctx *sctx, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = sctx->send_root; struct btrfs_key key; int ret; @@ -6364,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) key.offset = offset; ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) - goto out; + return ret; sctx->cur_inode_last_extent = btrfs_file_extent_end(path); -out: - btrfs_free_path(path); return ret; } @@ -6380,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, const u64 start, const u64 end) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = sctx->parent_root; u64 search_start = start; @@ -6395,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -6408,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6431,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, search_start = extent_end; goto next; } - ret = 0; - goto out; + return 0; next: path->slots[0]++; } - ret = 1; -out: - btrfs_free_path(path); - return ret; + return 1; } static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, @@ -6547,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -6574,11 +6500,10 @@ static int process_all_extents(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, +static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { @@ -6601,7 +6526,7 @@ out: return ret; } -static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) { int ret = 0; struct btrfs_inode_info info; @@ -7036,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "reference"); return -EIO; } @@ -7064,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if 
(unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "xattr"); return -EIO; } @@ -7304,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx, */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); ASSERT(ret <= 0); - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", @@ -7324,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx) struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_fs_info *fs_info = send_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = alloc_path_for_send(); if (!path) @@ -7341,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (ret) goto out_finish; @@ -7351,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -7370,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx) btrfs_release_path(path); ret = search_key_again(sctx, send_root, path, &key); if (ret < 0) - goto out; + return ret; } else { up_read(&fs_info->commit_root_sem); } ret = btrfs_next_item(send_root, path); if (ret < 0) - goto out; + return ret; if (ret) { ret = 0; break; @@ -7385,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx) } out_finish: - ret = finish_inode_if_needed(sctx, 1); - -out: - btrfs_free_path(path); - return ret; + return finish_inode_if_needed(sctx, 1); } static int replace_node_with_clone(struct btrfs_path *path, int level) @@ -7644,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; + BTRFS_PATH_AUTO_FREE(left_path); + BTRFS_PATH_AUTO_FREE(right_path); struct btrfs_key left_key; struct btrfs_key right_key; char *tmp_buf = NULL; @@ -7918,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, out_unlock: up_read(&fs_info->commit_root_sem); out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); kvfree(tmp_buf); return ret; } @@ -7986,7 +7905,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) } /* - * Make sure any existing dellaloc is flushed for any root used by a send + * Make sure any existing delalloc is flushed for any root used by a send * operation so that we do not miss any data and we do not race with writeback * finishing and changing a tree while send is using the tree. This could * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c573d80550ad..97452fb5d29b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, /* * On the zoned mode, we always allocate one zone as one chunk. - * Returning non-zone size alingned bytes here will result in + * Returning non-zone size aligned bytes here will result in * less pressure for the async metadata reclaim process, and it * will over-commit too much leading to ENOSPC. Align down to the * zone size to avoid that. 
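The recurring conversion in the send.c hunks above, replacing a bare struct btrfs_path pointer plus btrfs_free_path() calls on every exit path with BTRFS_PATH_AUTO_FREE(), relies on the compiler's cleanup attribute (wrapped in the kernel by the __free()/DEFINE_FREE() helpers from linux/cleanup.h): the path is released automatically when it goes out of scope, which is why the goto out labels collapse into plain returns. A minimal standalone sketch of the pattern, with hypothetical names (demo_buf, demo_buf_free, DEMO_AUTO_FREE) in place of the btrfs ones:

    #include <stdlib.h>

    struct demo_buf { char *data; };

    /* Cleanup handler: receives a pointer to the variable going out of scope. */
    static void demo_buf_free(struct demo_buf **p)
    {
            if (*p) {
                    free((*p)->data);
                    free(*p);
            }
    }

    /* Same shape as BTRFS_PATH_AUTO_FREE(): declare and auto-release on scope exit. */
    #define DEMO_AUTO_FREE(name) \
            struct demo_buf *name __attribute__((cleanup(demo_buf_free))) = NULL

    static int use_buf(void)
    {
            DEMO_AUTO_FREE(buf);

            buf = calloc(1, sizeof(*buf));
            if (!buf)
                    return -1;      /* cleanup runs here too, no "goto out" needed */
            /* ... use buf ... */
            return 0;               /* demo_buf_free(&buf) runs automatically */
    }

Under this scheme an error path only needs to return; the cleanup handler runs on every scope exit, success or failure, which is exactly what lets the hunks above delete their out: labels.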
@@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * turned into error mode due to a transaction abort when flushing space * above, in that case fail with the abort error instead of returning * success to the caller if we can steal from the global rsv - this is - * just to have caller fail immeditelly instead of later when trying to + * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb4f97833dc3..5ca8d4db6722 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -690,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ - "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ blocks_per_folio, &bitmap); \ } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ee0710eb13fd..ad0552db7c7d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -13,7 +13,7 @@ struct address_space; struct folio; /* - * Extra info for subpapge bitmap. + * Extra info for subpage bitmap. * * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b06b8f325537..d6e496436539 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -133,9 +133,8 @@ enum { Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, + Opt_ref_tracker, #endif Opt_err, }; @@ -257,8 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY + fsparam_flag("ref_tracker", Opt_ref_tracker), fsparam_flag("ref_verify", Opt_ref_verify), #endif {} @@ -646,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } break; -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; + case Opt_ref_tracker: + btrfs_set_opt(ctx->mount_opt, REF_TRACKER); + break; #endif default: btrfs_err(NULL, "unrecognized mount option '%s'", param->key); @@ -926,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec { struct btrfs_root *root = fs_info->tree_root; struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; @@ -943,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { - btrfs_free_path(path); return PTR_ERR(di); } if (!di) { @@ -952,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * it's always been there, but don't freak out, just try and * mount the top-level subvolume. 
*/ - btrfs_free_path(path); *objectid = BTRFS_FS_TREE_OBJECTID; return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); *objectid = location.objectid; return 0; } @@ -1156,6 +1152,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); + if (btrfs_test_opt(info, REF_TRACKER)) + seq_puts(seq, ",ref_tracker"); seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); @@ -1282,7 +1280,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* - * We need to cleanup all defragable inodes if the autodefragment is + * We need to cleanup all defraggable inodes if the autodefragment is * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && @@ -2274,10 +2272,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - if (IS_ERR(device)) - ret = PTR_ERR(device); - else - ret = 0; + ret = PTR_ERR_OR_ZERO(device); break; } ret = !(device->fs_devices->num_devices == @@ -2330,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev) /* Verify the checksum. */ csum_type = btrfs_super_csum_type(sb); - if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { btrfs_err(fs_info, "csum type changed, has %u expect %u", csum_type, btrfs_super_csum_type(fs_info->super_copy)); ret = -EUCLEAN; goto out; } - if (btrfs_check_super_csum(fs_info, sb)) { + if (unlikely(btrfs_check_super_csum(fs_info, sb))) { btrfs_err(fs_info, "csum for on-disk super block no longer matches"); ret = -EUCLEAN; goto out; @@ -2349,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev) goto out; last_trans = btrfs_get_last_trans_committed(fs_info); - if (btrfs_super_generation(sb) != last_trans) { + if (unlikely(btrfs_super_generation(sb) != last_trans)) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; @@ -2486,9 +2481,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - ", ref-verify=on" -#endif #ifdef CONFIG_BLK_DEV_ZONED ", zoned=yes" #else diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d398f7a36ad..81f52c1f55ce 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, char *buf) { ssize_t ret = 0; + bool has_output = false; - if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) - ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); - if (PAGE_SIZE > SZ_4K) - ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); - + for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) { + if (!btrfs_supported_blocksize(cur)) + continue; + if (has_output) + ret += sysfs_emit_at(buf, ret, " "); + ret += sysfs_emit_at(buf, ret, "%u", cur); + has_output = true; + } + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, diff --git a/fs/btrfs/tests/delayed-refs-tests.c 
b/fs/btrfs/tests/delayed-refs-tests.c index 265370e79a54..e2248acb906b 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) ret = simple_tests(&trans); if (!ret) { - test_msg("running delayed refs merg tests on metadata refs"); + test_msg("running delayed refs merge tests on metadata refs"); ret = merge_tests(&trans, BTRFS_REF_METADATA); } if (!ret) { - test_msg("running delayed refs merg tests on data refs"); + test_msg("running delayed refs merge tests on data refs"); ret = merge_tests(&trans, BTRFS_REF_DATA); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 3a86534c116f..42af6c737c6e 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void) /* * Test a chunk with 2 data stripes one of which * intersects the physical address of the super block - * is correctly recognised. + * is correctly recognized. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, .physical_start = SZ_64M - SZ_4M, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5c0d9cf1a80..89ae0c7a610a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | attached to transid N+1. | * | | * | To next stage: | - * | Until all tree blocks are super blocks are | + * | Until all tree blocks and super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V @@ -404,7 +404,7 @@ loop: */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int force) + bool force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; @@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * qgroup counters could end up wrong. 
*/ ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct btrfs_inode *parent_inode = pending->dir; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto clear_skip_qgroup; } - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; @@ -1714,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1735,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1748,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans); - if (ret) { /* Transaction aborted */ + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1789,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); @@ -1800,21 +1796,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_wmb(); + smp_mb__after_atomic(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1826,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_root_id(parent_root), btrfs_ino(parent_inode), index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1841,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1864,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = 
btrfs_insert_dir_item(trans, &fname.disk_name, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1874,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode_fallback(trans, parent_inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1889,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1907,7 +1905,6 @@ free_fname: free_pending: kfree(new_root_item); pending->root_item = NULL; - btrfs_free_path(path); pending->path = NULL; return ret; @@ -2423,7 +2420,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * them. * * We needn't worry that this operation will corrupt the snapshots, - * because all the tree which are snapshoted will be forced to COW + * because all the tree which are snapshotted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); @@ -2657,9 +2654,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, 0, 0); + ret = btrfs_drop_snapshot(root, false, false); else - ret = btrfs_drop_snapshot(root, 1, 0); + ret = btrfs_drop_snapshot(root, true, false); btrfs_put_root(root); return (ret < 0) ? 0 : 1; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a997c7cc35a2..ca30b15ea452 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, /* Only these key->types needs to be checked */ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_EXTENT_DATA_KEY); @@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, /* * For legacy root item, the members starting at generation_v2 will be * all filled with 0. - * And since we allow geneartion_v2 as 0, it will still pass the check. + * And since we allow generation_v2 as 0, it will still pass the check. 
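Many error checks in the transaction.c hunks above gain unlikely() annotations on the branches that end in btrfs_abort_transaction(). The annotation is a branch-prediction hint: the compiler lays the abort path out of line so the fall-through stays on the hot path. A self-contained sketch of the primitive, with my_unlikely() as a hypothetical stand-in for the kernel macro:

    /* unlikely()/likely() expand to __builtin_expect() under GCC and Clang. */
    #define my_unlikely(x)  __builtin_expect(!!(x), 0)
    #define my_likely(x)    __builtin_expect(!!(x), 1)

    static int commit_step(int err)
    {
            if (my_unlikely(err)) {
                    /* Cold path: placed off the straight-line code. */
                    return err;
            }
            /* Hot path continues without a taken branch. */
            return 0;
    }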
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), btrfs_item_size(leaf, slot)); @@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_inode_extref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + unsigned long end = ptr + btrfs_item_size(leaf, slot); + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + while (ptr < end) { + struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; + u16 namelen; + + if (unlikely(ptr + sizeof(*extref) > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu inode_extref size %zu", + ptr, end, sizeof(*extref)); + return -EUCLEAN; + } + + namelen = btrfs_inode_extref_name_len(leaf, extref); + if (unlikely(ptr + sizeof(*extref) + namelen > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + ptr += sizeof(*extref) + namelen; + } + return 0; +} + static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) { @@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_REF_KEY: ret = check_inode_ref(leaf, key, prev_key, slot); break; + case BTRFS_INODE_EXTREF_KEY: + ret = check_inode_extref(leaf, key, prev_key, slot); + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7a63afedd01e..6aad6b65522b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "file-item.h" #include "file.h" #include "orphan.h" +#include "print-tree.h" #include "tree-checker.h" #define MAX_CONFLICT_INODES 10 @@ -101,17 +102,134 @@ enum { LOG_WALK_REPLAY_ALL, }; +/* + * The walk control struct is used to pass state down the chain when processing + * the log tree. The stage field tells us which part of the log tree processing + * we are currently doing. + */ +struct walk_control { + /* + * Signal that we are freeing the metadata extents of a log tree. + * This is used at transaction commit time while freeing a log tree. + */ + bool free; + + /* + * Signal that we are pinning the metadata extents of a log tree and the + * data extents its leaves point to (if using mixed block groups). + * This happens in the first stage of log replay to ensure that during + * replay, while we are modifying subvolume trees, we don't overwrite + * the metadata extents of log trees. + */ + bool pin; + + /* What stage of the replay code we're currently in. */ + int stage; + + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY. + */ + bool ignore_cur_inode; + + /* + * The root we are currently replaying to. This is NULL for the replay + * stage LOG_WALK_PIN_ONLY. + */ + struct btrfs_root *root; + + /* The log tree we are currently processing (not NULL for any stage). */ + struct btrfs_root *log; + + /* The transaction handle used for replaying all log trees. */ + struct btrfs_trans_handle *trans; + + /* + * The function that gets used to process blocks we find in the tree. + * Note the extent_buffer might not be up to date when it is passed in, + * and it must be checked or read if you need the data inside it.
+ */ + int (*process_func)(struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level); + + /* + * The following are used only when stage is >= LOG_WALK_REPLAY_INODES + * and by the replay_one_buffer() callback. + */ + + /* The current log leaf being processed. */ + struct extent_buffer *log_leaf; + /* The key being processed of the current log leaf. */ + struct btrfs_key log_key; + /* The slot being processed of the current log leaf. */ + int log_slot; + + /* A path used for searches and modifications to subvolume trees. */ + struct btrfs_path *subvol_path; +}; + +static void do_abort_log_replay(struct walk_control *wc, const char *function, + unsigned int line, int error, const char *fmt, ...) +{ + struct btrfs_fs_info *fs_info = wc->trans->fs_info; + struct va_format vaf; + va_list args; + + /* + * Do nothing if we already aborted, to avoid dumping leaves again which + * can be verbose. Furthermore, only the first call is useful since it + * is where we have a problem. Note that we do not use the flag + * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that + * are outside of tree-log.c that can abort transactions (such as + * btrfs_add_link() for example), so if that happens we still want to + * dump all log replay specific information below. + */ + if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state)) + return; + + btrfs_abort_transaction(wc->trans, error); + + if (wc->subvol_path->nodes[0]) { + btrfs_crit(fs_info, + "subvolume (root %llu) leaf currently being processed:", + btrfs_root_id(wc->root)); + btrfs_print_leaf(wc->subvol_path->nodes[0]); + } + + if (wc->log_leaf) { + btrfs_crit(fs_info, + "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", + btrfs_root_id(wc->root), wc->log_slot, + wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + btrfs_print_leaf(wc->log_leaf); + } + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV", + function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf); + + va_end(args); +} + +/* + * Use this for aborting a transaction during log replay while we are down the + * call chain of replay_one_buffer(), so that we get a lot more useful + * information for debugging issues when compared to a plain call to + * btrfs_abort_transaction(). + */ +#define btrfs_abort_log_replay(wc, error, fmt, args...) \ + do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args) + static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); -static int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static int link_to_fixup_dir(struct walk_control *wc, u64 objectid); +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); @@ -300,53 +418,13 @@ void btrfs_end_log_trans(struct btrfs_root *root) } /* - * the walk control struct is used to pass state down the chain when - * processing the log tree. The stage field tells us which part - * of the log tree processing we are currently doing.
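The btrfs_abort_log_replay() macro introduced above captures the call site by injecting __func__ and __LINE__ before delegating to a varargs helper, and the test_and_set_bit() guard makes only the first failure produce the verbose leaf dumps. A reduced userspace sketch of the same technique, with hypothetical names (abort_replay, do_abort_replay) and plain stderr output in place of btrfs_crit()/%pV:

    #include <stdarg.h>
    #include <stdio.h>

    static void do_abort_replay(const char *function, unsigned int line,
                                int error, const char *fmt, ...)
    {
            va_list args;

            fprintf(stderr, "log replay failed in %s:%u with error %d: ",
                    function, line, error);
            va_start(args, fmt);
            vfprintf(stderr, fmt, args);
            va_end(args);
            fputc('\n', stderr);
    }

    /* The macro injects the caller's identity, like btrfs_abort_log_replay(). */
    #define abort_replay(error, fmt, ...) \
            do_abort_replay(__func__, __LINE__, (error), fmt, ##__VA_ARGS__)

Because the macro, not the helper, expands __func__ and __LINE__, every call site reports its own location with no extra typing, while the format string checking and dump logic live in one place.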
The others - * are state fields used for that specific part - */ -struct walk_control { - /* should we free the extent on disk when done? This is used - * at transaction commit time while freeing a log tree - */ - int free; - - /* pin only walk, we record which extents on disk belong to the - * log trees - */ - int pin; - - /* what stage of the replay code we're currently in */ - int stage; - - /* - * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY. - */ - bool ignore_cur_inode; - - /* the root we are currently replaying */ - struct btrfs_root *replay_dest; - - /* the trans handle for the current replay */ - struct btrfs_trans_handle *trans; - - /* the function that gets used to process blocks we find in the - * tree. Note the extent_buffer might not be up to date when it is - * passed in, and it must be checked or read if you need the data - * inside it - */ - int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen, int level); -}; - -/* * process_func used to pin down extents, write them or wait on them */ -static int process_one_buffer(struct btrfs_root *log, - struct extent_buffer *eb, +static int process_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { + struct btrfs_root *log = wc->log; + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -361,25 +439,36 @@ static int process_one_buffer(struct btrfs_root *log, }; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) + if (unlikely(ret)) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; + } } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); - if (ret) + ASSERT(trans != NULL); + ret = btrfs_pin_extent_for_log_replay(trans, eb); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); return ret; + } - if (btrfs_buffer_uptodate(eb, gen, 0) && - btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { ret = btrfs_exclude_logged_extents(eb); + if (ret) + btrfs_abort_transaction(trans, ret); + } } return ret; } /* - * Item overwrite used by log replay. The given eb, slot and key all refer to - * the source data we are copying out. + * Item overwrite used by log replay. The given log tree leaf, slot and key + * from the walk_control structure all refer to the source data we are copying + * out. * * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be @@ -391,12 +480,10 @@ static int process_one_buffer(struct btrfs_root *log, * * If the key isn't in the destination yet, a new item is inserted. 
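The conversion of process_one_buffer() above, and of overwrite_item() just below, shows the shape of the whole tree-log refactor: state that used to be threaded through (root, log, eb, slot, key, path) parameter lists now lives in one walk_control passed down the call chain, so every helper sees the same current leaf, slot and key. A minimal sketch of the context-struct pattern, with hypothetical demo_* types standing in for the btrfs ones:

    struct demo_key { unsigned long long objectid; unsigned int type; unsigned long long offset; };
    struct demo_trans;      /* opaque stand-ins for the real btrfs types */
    struct demo_root;
    struct demo_leaf;

    /* One context replaces the long parameter lists threaded through replay. */
    struct demo_walk {
            struct demo_trans *trans;
            struct demo_root *root;         /* subvolume tree replayed into */
            struct demo_leaf *log_leaf;     /* current log tree leaf */
            int log_slot;                   /* current slot in that leaf */
            struct demo_key log_key;        /* key at that slot */
    };

    /* Helpers shrink to a single argument and cannot disagree about state. */
    static int demo_overwrite_item(struct demo_walk *wc)
    {
            return (wc->log_slot >= 0) ? 0 : -1;    /* placeholder body */
    }

A side benefit visible in do_abort_log_replay() above: because the context records the current log leaf, slot and key, a failure anywhere down the chain can dump exactly what was being processed without plumbing extra arguments through every function.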
*/ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int overwrite_item(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; int ret; u32 item_size; u64 saved_i_size = 0; @@ -405,7 +492,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, unsigned long dst_ptr; struct extent_buffer *dst_eb; int dst_slot; - bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY); /* * This is only used during log replay, so the root is always from a @@ -416,16 +503,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, */ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); - item_size = btrfs_item_size(eb, slot); - src_ptr = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); + src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); return ret; + } - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; if (ret == 0) { char *src_copy; @@ -435,16 +527,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans, goto insert; if (item_size == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { - btrfs_release_path(path); + btrfs_abort_log_replay(wc, -ENOMEM, + "failed to allocate memory for log leaf item"); return -ENOMEM; } - read_extent_buffer(eb, src_copy, src_ptr, item_size); + read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size); dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); @@ -456,7 +549,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * sync */ if (ret == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -464,7 +557,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes. */ - if (inode_item) { + if (is_inode_item) { struct btrfs_inode_item *item; u64 nbytes; u32 mode; @@ -472,20 +565,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); nbytes = btrfs_inode_nbytes(dst_eb, item); - item = btrfs_item_ptr(eb, slot, + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, nbytes); + btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes); /* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log. 
*/ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } - } else if (inode_item) { + } else if (is_inode_item) { struct btrfs_inode_item *item; u32 mode; @@ -493,38 +586,41 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents. */ - item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, 0); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(wc->log_leaf, item, 0); /* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } insert: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - path->skip_release_on_error = 1; - ret = btrfs_insert_empty_item(trans, root, path, - key, item_size); - path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); + wc->subvol_path->skip_release_on_error = 0; - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot); if (found_size > item_size) - btrfs_truncate_item(trans, path, item_size, 1); + btrfs_truncate_item(trans, wc->subvol_path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, path, item_size - found_size); + btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item for key (%llu %u %llu)", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -538,15 +634,15 @@ insert: * state of the tree found in the subvolume, and i_size is modified * as it goes */ - if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + if (is_inode_item && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item; src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) { - const u64 ino_size = btrfs_inode_size(eb, src_item); + if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) { + const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item); /* * For regular files an ino_size == 0 is used only when @@ -555,21 +651,21 @@ insert: * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away. 
*/ - if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && ino_size != 0) btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } - if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); + copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; @@ -579,7 +675,7 @@ insert: } /* make sure the generation is filled in */ - if (key->type == BTRFS_INODE_ITEM_KEY) { + if (is_inode_item) { struct btrfs_inode_item *dst_item; dst_item = (struct btrfs_inode_item *)dst_ptr; @@ -587,7 +683,7 @@ insert: btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -618,292 +714,354 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_extent(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; - u64 start = key->offset; + const u64 start = wc->log_key.offset; u64 nbytes = 0; + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; - unsigned long size; int ret = 0; - item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(wc->log_leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - nbytes = btrfs_file_extent_num_bytes(eb, item); - extent_end = start + nbytes; - - /* - * We don't add to the inodes nbytes if we are prealloc or a - * hole. - */ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) - nbytes = 0; + extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + /* Holes don't take up space. 
*/ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0) + nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_ram_bytes(eb, item); - nbytes = btrfs_file_extent_ram_bytes(eb, item); - extent_end = ALIGN(start + size, - fs_info->sectorsize); + nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); + extent_end = ALIGN(start + nbytes, fs_info->sectorsize); } else { - btrfs_err(fs_info, - "unexpected extent type=%d root=%llu inode=%llu offset=%llu", - found_type, btrfs_root_id(root), key->objectid, key->offset); + btrfs_abort_log_replay(wc, -EUCLEAN, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), + wc->log_key.objectid, wc->log_key.offset); return -EUCLEAN; } - inode = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + inode = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to get inode %llu for root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); + ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path, + btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_file_extent_item existing; unsigned long ptr; - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + read_extent_buffer(leaf, &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item, sizeof(existing)) == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); goto out; } } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* drop any overlapping extents */ drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; + drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu range [%llu, %llu) root %llu", + wc->log_key.objectid, start, extent_end, + btrfs_root_id(root)); goto out; + } - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; - unsigned long dest_offset; - struct btrfs_key ins; - - if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) - goto update_inode; - - ret = btrfs_insert_empty_item(trans, root, path, key, - sizeof(*item)); + if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(wc); if (ret) goto out; - dest_offset = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, - (unsigned long)item, sizeof(*item)); + goto update_inode; + } - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - 
ins.type = BTRFS_EXTENT_ITEM_KEY; - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * If not an inline extent, it can only be a regular or prealloc one. + * We have checked that above and returned -EUCLEAN if not. + */ - /* - * Manually record dirty extent, as here we did a shallow - * file extent item copy and skip normal backref update, - * but modifying extent tree all by ourselves. - * So need to manually record dirty extent for qgroup, - * as the owner of the file extent changed from log tree - * (doesn't affect qgroup) to fs/file tree(affects qgroup) - */ - ret = btrfs_qgroup_trace_extent(trans, - btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item)); - if (ret < 0) - goto out; + /* A hole and NO_HOLES feature enabled, nothing else to do. */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; - if (ins.objectid > 0) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, + &wc->log_key, sizeof(*item)); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item with key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); + goto out; + } + dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], + wc->subvol_path->slots[0]); + copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset, + (unsigned long)item, sizeof(*item)); - /* - * is this extent already allocated in the extent - * allocation tree? If so, just add a reference - */ - ret = btrfs_lookup_data_extent(fs_info, ins.objectid, - ins.offset); - if (ret < 0) { - goto out; - } else if (ret == 0) { - struct btrfs_ref ref = { - .action = BTRFS_ADD_DELAYED_REF, - .bytenr = ins.objectid, - .num_bytes = ins.offset, - .owning_root = btrfs_root_id(root), - .ref_root = btrfs_root_id(root), - }; - btrfs_init_data_ref(&ref, key->objectid, offset, - 0, false); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) - goto out; - } else { - /* - * insert the extent pointer in the extent - * allocation tree - */ - ret = btrfs_alloc_logged_file_extent(trans, - btrfs_root_id(root), - key->objectid, offset, &ins); - if (ret) - goto out; - } - btrfs_release_path(path); + /* + * We have an explicit hole and NO_HOLES is not enabled. We have added + * the hole file extent item to the subvolume tree, so we don't have + * anything else to do other than update the file extent item range and + * update the inode item. + */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) { + btrfs_release_path(wc->subvol_path); + goto update_inode; + } - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + - btrfs_file_extent_offset(eb, item); - csum_end = csum_start + - btrfs_file_extent_num_bytes(eb, item); - } + ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item); + offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item); - ret = btrfs_lookup_csums_list(root->log_root, - csum_start, csum_end - 1, - &ordered_sums, false); - if (ret < 0) - goto out; - ret = 0; - /* - * Now delete all existing cums in the csum root that - * cover our range. 
We do this because we can have an - * extent that is completely referenced by one file - * extent item and partially referenced by another - * file extent item (like after using the clone or - * extent_same ioctls). In this case if we end up doing - * the replay of the one that partially references the - * extent first, and we do not do the csum deletion - * below, we can get 2 csum items in the csum tree that - * overlap each other. For example, imagine our log has - * the two following file extent items: - * - * key (257 EXTENT_DATA 409600) - * extent data disk byte 12845056 nr 102400 - * extent data offset 20480 nr 20480 ram 102400 - * - * key (257 EXTENT_DATA 819200) - * extent data disk byte 12845056 nr 102400 - * extent data offset 0 nr 102400 ram 102400 - * - * Where the second one fully references the 100K extent - * that starts at disk byte 12845056, and the log tree - * has a single csum item that covers the entire range - * of the extent: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * - * After the first file extent item is replayed, the - * csum tree gets the following csum item: - * - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which covers the 20K sub-range starting at offset 20K - * of our extent. Now when we replay the second file - * extent item, if we do not delete existing csum items - * that cover any of its blocks, we end up getting two - * csum items in our csum tree that overlap each other: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which is a problem, because after this anyone trying - * to lookup up for the checksum of any block of our - * extent starting at an offset of 40K or higher, will - * end up looking at the second csum item only, which - * does not contain the checksum for any block starting - * at offset 40K or higher of our extent. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - struct btrfs_root *csum_root; - - sums = list_first_entry(&ordered_sums, - struct btrfs_ordered_sum, - list); - csum_root = btrfs_csum_root(fs_info, - sums->logical); - if (!ret) - ret = btrfs_del_csums(trans, csum_root, - sums->logical, - sums->len); - if (!ret) - ret = btrfs_csum_file_blocks(trans, - csum_root, - sums); - list_del(&sums->list); - kfree(sums); - } - if (ret) - goto out; - } else { - btrfs_release_path(path); + /* + * Manually record dirty extent, as here we did a shallow file extent + * item copy and skip normal backref update, but modifying extent tree + * all by ourselves. So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree (doesn't affect + * qgroup) to fs/file tree (affects qgroup). + */ + ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } + + /* + * Is this extent already allocated in the extent tree? + * If so, just add a reference. 
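+ * Otherwise the data reached disk but the extent tree update never did + * before the crash, so the extent item has to be re-inserted from the + * log (the btrfs_alloc_logged_file_extent() call further below).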
+ */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } else if (ret == 0) { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, + btrfs_root_id(root)); + goto out; } - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(trans, root, path, eb, slot, key); - if (ret) + } else { + /* Insert the extent pointer in the extent tree. */ + ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), + wc->log_key.objectid, offset, &ins); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu", + ins.objectid, ins.offset, offset, + wc->log_key.objectid, btrfs_root_id(root)); goto out; + } } - ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); + btrfs_release_path(wc->subvol_path); + + if (btrfs_file_extent_compression(wc->log_leaf, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + } + + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, + &ordered_sums, false); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup csums for range [%llu, %llu) inode %llu root %llu", + csum_start, csum_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + ret = 0; + /* + * Now delete all existing csums in the csum root that cover our range. + * We do this because we can have an extent that is completely + * referenced by one file extent item and partially referenced by + * another file extent item (like after using the clone or extent_same + * ioctls). In this case if we end up doing the replay of the one that + * partially references the extent first, and we do not do the csum + * deletion below, we can get 2 csum items in the csum tree that overlap + * each other. For example, imagine our log has the two following file + * extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent that starts at + * disk byte 12845056, and the log tree has a single csum item that + * covers the entire range of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the csum tree gets the + * following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete + * existing csum items that cover any of its blocks, we end up getting + * two csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying to look up + * the checksum of any block of our extent starting at an offset of 40K + * or higher, will end up looking at the second csum item only, which + * does not contain the checksum for any block starting at offset 40K or + * higher of our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + + sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, sums->logical); + if (!ret) { + ret = btrfs_del_csums(trans, csum_root, sums->logical, + sums->len); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to delete csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + if (!ret) { + ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to add csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + list_del(&sums->list); + kfree(sums); + } if (ret) goto out; update_inode: + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to set file extent range [%llu, %llu) inode %llu root %llu", + start, extent_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); out: iput(&inode->vfs_inode); return ret; } -static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, +static int unlink_inode_for_log_replay(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { + struct btrfs_trans_handle *trans = wc->trans; int ret; ret = btrfs_unlink_inode(trans, dir, inode, name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to unlink inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); return ret; + } /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exists anymore.
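* (The dir index entry removal is a delayed item, so without flushing * the delayed items here a later search in the subvolume tree could * still see the stale name.)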
*/ - return btrfs_run_delayed_items(trans); + ret = btrfs_run_delayed_items(trans); + if (ret) + btrfs_abort_log_replay(wc, ret, +"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); + + return ret; } /* @@ -914,39 +1072,44 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, * This is a helper function to do the unlink of a specific directory * item */ -static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, +static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; - struct extent_buffer *leaf; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_key location; int ret; - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); - if (ret) - return -ENOMEM; + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); + return ret; + } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to open inode %llu parent dir %llu name %.*s root %llu", + location.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); inode = NULL; goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); out: kfree(name.name); if (inode) @@ -1013,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log, u64 ref_objectid, const struct fscrypt_str *name) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -1021,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log, return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret == 1) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret == 1) + return 0; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], @@ -1035,20 +1196,15 @@ static noinline int backref_in_log(struct btrfs_root *log, else ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); -out: - btrfs_free_path(path); return ret; } -static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *log_root, +static int unlink_refs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, struct btrfs_inode *dir, - struct btrfs_inode *inode, - u64 parent_objectid) + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; unsigned long ptr; unsigned long ptr_end; @@ -1057,8 +1213,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict. 
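* (The log records the complete set of names the inode had at fsync * time, so any name present only in the subvolume tree was deleted * before the fsync and must not survive replay.)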
*/ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]); while (ptr < ptr_end) { struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; @@ -1068,22 +1224,34 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, ret = read_alloc_one_name(leaf, (victim_ref + 1), btrfs_inode_ref_name_len(leaf, victim_ref), &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); ptr = (unsigned long)(victim_ref + 1) + victim_name.len; continue; } inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1093,64 +1261,64 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_root *log_root, +static int unlink_extrefs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, - struct btrfs_inode *inode, - u64 inode_objectid, - u64 parent_objectid) + struct btrfs_inode *dir, + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; - const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); - const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]); u32 cur_offset = 0; while (cur_offset < item_size) { + struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; - struct btrfs_inode *victim_parent; struct fscrypt_str victim_name; int ret; extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name.len = btrfs_inode_extref_name_len(leaf, extref); - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir)) goto next; ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - search_key->objectid = inode_objectid; + search_key->objectid = btrfs_ino(inode); search_key->type = BTRFS_INODE_EXTREF_KEY; - search_key->offset = btrfs_extref_hash(parent_objectid, + search_key->offset = btrfs_extref_hash(btrfs_ino(dir), victim_name.name, victim_name.len); - ret 
= backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); next: cur_offset += victim_name.len + sizeof(*extref); continue; } - victim_parent = btrfs_iget_logging(parent_objectid, root); - if (IS_ERR(victim_parent)) { - kfree(victim_name.name); - return PTR_ERR(victim_parent); - } - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, victim_parent, inode, - &victim_name); - iput(&victim_parent->vfs_inode); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1160,27 +1328,29 @@ next: return 0; } -static inline int __add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_root *log_root, +static inline int __add_inode_ref(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, - u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { int ret; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; again: /* Search old style refs */ - search_key.objectid = inode_objectid; + search_key.objectid = btrfs_ino(inode); search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = parent_objectid; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + search_key.offset = btrfs_ino(dir); + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + search_key.objectid, search_key.type, + search_key.offset, btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1190,52 +1360,60 @@ again: if (search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, - dir, inode, parent_objectid); + ret = unlink_refs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); + extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name, + btrfs_ino(inode), btrfs_ino(dir)); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(trans, path, root, log_root, - &search_key, inode, - inode_objectid, parent_objectid); + ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir), ref_index, name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + 
btrfs_abort_log_replay(wc, ret, +"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(dir), ref_index, name->len, + name->name, btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); + di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + btrfs_ino(dir), name->len, name->name, + btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -1288,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ -static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_inode *inode, - struct extent_buffer *log_eb, - int log_slot, - struct btrfs_key *key) +static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_root *root = wc->root; int ret; unsigned long ref_ptr; unsigned long ref_end; struct extent_buffer *eb; again: - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + btrfs_release_path(wc->subvol_path); + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret > 0) { ret = 0; goto out; } - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); goto out; + } - eb = path->nodes[0]; - ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + eb = wc->subvol_path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]); while (ref_ptr < ref_end) { struct fscrypt_str name; u64 parent_id; - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); + goto out; + } } else { - parent_id = key->offset; + parent_id = wc->log_key.offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_id %llu root %llu", + btrfs_ino(inode), parent_id, + btrfs_root_id(root)); + goto out; + } } - if (ret) - goto out; - if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot, parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); + ret = 
!!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot, + &name); if (!ret) { struct btrfs_inode *dir; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); dir = btrfs_iget_logging(parent_id, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); kfree(name.name); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_id, btrfs_root_id(root)); goto out; } - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); kfree(name.name); iput(&dir->vfs_inode); if (ret) @@ -1354,56 +1548,51 @@ again: kfree(name.name); ref_ptr += name.len; - if (key->type == BTRFS_INODE_EXTREF_KEY) + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else ref_ptr += sizeof(struct btrfs_inode_ref); } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } /* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). + * Replay one inode back reference item found in the log tree. + * Path is for temporary use by this function (it should be released on return). */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int add_inode_ref(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); + const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; int ref_struct_size; - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size(eb, slot); + ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); + ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot); if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); r = (struct btrfs_inode_extref *)ref_ptr; - parent_objectid = btrfs_inode_extref_parent(eb, r); + parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r); } else { ref_struct_size = sizeof(struct btrfs_inode_ref); - parent_objectid = key->offset; + parent_objectid = wc->log_key.offset; } - inode_objectid = key->objectid; + inode_objectid = wc->log_key.objectid; /* * it is possible that we didn't log all the parent directories @@ -1416,6 +1605,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, btrfs_root_id(root)); dir = NULL; goto out; } @@ -1423,16 +1616,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + inode_objectid, btrfs_root_id(root)); inode = NULL; goto out; } while (ref_ptr < ref_end) { if (is_extref_item) { - ret = extref_get_fields(eb, 
ref_ptr, &name, + ret = extref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index, &parent_objectid); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } /* * parent object can change from one array * item to another. @@ -1457,19 +1658,35 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ ret = 0; goto next; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, + btrfs_root_id(root)); } goto out; } } } else { - ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); - if (ret) + ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_objectid %llu root %llu", + btrfs_ino(inode), + parent_objectid, + btrfs_root_id(root)); goto out; + } } - ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - ref_index, &name); + ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir), + btrfs_ino(inode), ref_index, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + ref_index, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (ret == 0) { /* @@ -1479,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, dir, inode, - inode_objectid, parent_objectid, - ref_index, &name); + ret = __add_inode_ref(wc, dir, inode, ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1490,12 +1705,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), + btrfs_ino(dir), ref_index, + name.len, name.name, + btrfs_root_id(root)); goto out; + } ret = btrfs_update_inode(trans, inode); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } } /* Else, ret == 1, we already have a perfect match, we're done. */ @@ -1517,14 +1744,14 @@ next: * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); + ret = unlink_old_inode_refs(wc, inode); if (ret) goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(trans, root, path, eb, slot, key); + ret = overwrite_item(wc); out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); kfree(name.name); if (dir) iput(&dir->vfs_inode); @@ -1642,26 +1869,22 @@ process_slot: * number of back refs found. If it goes down to zero, the iput * will free the inode. 
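* As a rough sketch (not the exact code that follows): nlink = * count_inode_refs() + count_inode_extrefs(); if that differs from * i_nlink, set_nlink() and update the inode item; and if it is 0, the * inode is treated as an orphan (for a directory, after dropping any * dir index entries still pointing at it).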
*/ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +static noinline int fixup_inode_link_count(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; int ret; u64 nlink = 0; const u64 ino = btrfs_ino(inode); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = count_inode_refs(inode, path); + ret = count_inode_refs(inode, wc->subvol_path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(inode, path); + ret = count_inode_extrefs(inode, wc->subvol_path); if (ret < 0) goto out; @@ -1680,7 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, ino, true); + ret = replay_dir_deletes(wc, ino, true); if (ret) goto out; } @@ -1690,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } out: - btrfs_free_path(path); + btrfs_release_path(wc->subvol_path); return ret; } -static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline int fixup_inode_link_counts(struct walk_control *wc) { int ret; struct btrfs_key key; @@ -1705,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *inode; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1); if (ret < 0) break; if (ret == 1) { ret = 0; - if (path->slots[0] == 0) + if (wc->subvol_path->slots[0] == 0) break; - path->slots[0]--; + wc->subvol_path->slots[0]--; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]); if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, wc->subvol_path); if (ret) break; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(trans, inode); + ret = fixup_inode_link_count(wc, inode); iput(&inode->vfs_inode); if (ret) break; /* * fixup on a directory may create new entries, - * make sure we always look for the highset possible + * make sure we always look for the highest possible * offset */ key.offset = (u64)-1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -1756,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, * count when replay is done. 
The link count is incremented here * so the inode won't go away until we check it */ -static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid) +static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_key key; int ret = 0; struct btrfs_inode *inode; struct inode *vfs_inode; inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + objectid, btrfs_root_id(root)); + return ret; + } vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (ret == 0) { if (!vfs_inode->i_nlink) set_nlink(vfs_inode, 1); else inc_nlink(vfs_inode); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + objectid, btrfs_root_id(root)); } else if (ret == -EEXIST) { ret = 0; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to insert fixup item for inode %llu root %llu", + objectid, btrfs_root_id(root)); } iput(vfs_inode); @@ -1826,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } -static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, +static int delete_conflicting_dir_entry(struct walk_control *wc, struct btrfs_inode *dir, - struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, u8 log_flags, @@ -1836,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, { struct btrfs_key found_key; - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key); /* The existing dentry points to the same inode, don't delete it. */ if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) + btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1851,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (!exists) return 0; - return drop_one_dir_item(trans, path, dir, dst_di); + return drop_one_dir_item(wc, dir, dst_di); } /* @@ -1870,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. 
*/ -static noinline int replay_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, - struct btrfs_dir_item *di, - struct btrfs_key *key) +static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; @@ -1891,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(dir)) - return PTR_ERR(dir); + dir = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } - ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + ret = read_alloc_one_name(wc->log_leaf, di + 1, + btrfs_dir_name_len(wc->log_leaf, di), &name); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } - log_flags = btrfs_dir_flags(eb, di); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); - ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); - btrfs_release_path(path); - if (ret < 0) + log_flags = btrfs_dir_flags(wc->log_leaf, di); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key); + ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0); + btrfs_release_path(wc->subvol_path); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + log_key.objectid, btrfs_root_id(root)); goto out; + } exists = (ret == 0); ret = 0; - dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - &name, 1); + dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, + wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } dir_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, key->offset, - &name, 1); + index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, + wc->log_key.objectid, + wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, index_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to 
delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } index_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (dir_dst_matches && index_dst_matches) { ret = 0; @@ -1951,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, */ search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = key->objectid; + search_key.offset = wc->log_key.objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1964,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); - ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); + search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len); + ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1974,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, update_size = false; goto out; } - btrfs_release_path(path); - ret = insert_one_name(trans, root, key->objectid, key->offset, + ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); - if (ret && ret != -ENOENT && ret != -EEXIST) + if (ret && ret != -ENOENT && ret != -EEXIST) { + btrfs_abort_log_replay(wc, ret, + "failed to insert name %.*s for inode %llu dir %llu root %llu", + name.len, name.name, log_key.objectid, + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } if (!ret) name_added = true; update_size = false; @@ -1988,6 +2263,10 @@ out: if (!ret && update_size) { btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); ret = btrfs_update_inode(trans, dir); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update dir inode %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); } kfree(name.name); iput(&dir->vfs_inode); @@ -1997,20 +2276,16 @@ out: } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ -static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_dir_item(struct walk_control *wc) { int ret; struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. 
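* (BTRFS_DIR_ITEM_KEY items can pack several names under one key when * they hash to the same value, while a dir index key's offset is a * unique index number, so it always holds exactly one entry.)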
*/ - ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - ret = replay_one_name(trans, root, path, eb, di, key); + di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); + ret = replay_one_name(wc, di); if (ret < 0) return ret; @@ -2040,17 +2315,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { - struct btrfs_path *fixup_path; + if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) { struct btrfs_key di_key; - fixup_path = btrfs_alloc_path(); - if (!fixup_path) - return -ENOMEM; - - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); - btrfs_free_path(fixup_path); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); + ret = link_to_fixup_dir(wc, di_key.objectid); } return ret; @@ -2143,13 +2412,13 @@ out: * item is not in the log, the item is removed and the inode it points * to is unlinked */ -static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int check_item_in_log(struct walk_control *wc, struct btrfs_path *log_path, struct btrfs_inode *dir, - struct btrfs_key *dir_key) + struct btrfs_key *dir_key, + bool force_remove) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; @@ -2167,21 +2436,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, */ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); - eb = path->nodes[0]; - slot = path->slots[0]; + eb = wc->subvol_path->nodes[0]; + slot = wc->subvol_path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu index %llu root %llu", + btrfs_ino(dir), dir_key->offset, + btrfs_root_id(root)); goto out; + } - if (log) { + if (!force_remove) { struct btrfs_dir_item *log_di; - log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path, dir_key->objectid, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu", + btrfs_ino(dir), dir_key->offset, + name.len, name.name, + btrfs_root_id(root)); goto out; } else if (log_di) { /* The dentry exists in the log, we have nothing to do. 
*/ @@ -2191,28 +2470,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, } btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + location.objectid, btrfs_root_id(root)); goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; inc_nlink(&inode->vfs_inode); - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset * (an index number), so we're done. */ out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); kfree(name.name); if (inode) @@ -2220,59 +2502,67 @@ out: return ret; } -static int replay_xattr_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - const u64 ino) +static int replay_xattr_deletes(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log = wc->log; struct btrfs_key search_key; - struct btrfs_path *log_path; - int i; + BTRFS_PATH_AUTO_FREE(log_path); + const u64 ino = wc->log_key.objectid; int nritems; int ret; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } search_key.objectid = ino; search_key.type = BTRFS_XATTR_ITEM_KEY; search_key.offset = 0; again: - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search xattrs for inode %llu root %llu", + ino, btrfs_root_id(root)); goto out; + } process_leaf: - nritems = btrfs_header_nritems(path->nodes[0]); - for (i = path->slots[0]; i < nritems; i++) { + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + for (int i = wc->subvol_path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di; u32 total_size; u32 cur; - btrfs_item_key_to_cpu(path->nodes[0], &key, i); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { ret = 0; goto out; } - di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size(path->nodes[0], i); + di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size(wc->subvol_path->nodes[0], i); cur = 0; while (cur < total_size) { - u16 name_len = btrfs_dir_name_len(path->nodes[0], di); - u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di); u32 this_len = sizeof(*di) + name_len + data_len; char *name; name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; + btrfs_abort_log_replay(wc, ret, + "failed to allocate memory for name of length %u", + name_len); goto out; } - read_extent_buffer(path->nodes[0], name, 
+ read_extent_buffer(wc->subvol_path->nodes[0], name, (unsigned long)(di + 1), name_len); log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, @@ -2280,40 +2570,59 @@ process_leaf: btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, ino, + btrfs_release_path(wc->subvol_path); + di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino, name, name_len, -1); - kfree(name); if (IS_ERR(di)) { ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, - path, di); - if (ret) + wc->subvol_path, di); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to delete xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; - btrfs_release_path(path); + } + btrfs_release_path(wc->subvol_path); + kfree(name); search_key = key; goto again; } - kfree(name); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } + kfree(name); cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } } - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(root, wc->subvol_path); if (ret > 0) ret = 0; else if (ret == 0) goto process_leaf; + else + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); out: - btrfs_free_path(log_path); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -2328,12 +2637,11 @@ out: * Anything we don't find in the log is unlinked and removed from the * directory. */ -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all) { + struct btrfs_root *root = wc->root; + struct btrfs_root *log = (del_all ? 
NULL : wc->log); u64 range_start; u64 range_end; int ret = 0; @@ -2345,8 +2653,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } dir = btrfs_iget_logging(dirid, root); /* @@ -2358,6 +2668,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + dirid, btrfs_root_id(root)); return ret; } @@ -2367,32 +2681,46 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, + ret = find_dir_range(log, wc->subvol_path, dirid, &range_start, &range_end); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to find range for dir %llu in log tree root %llu", + dirid, btrfs_root_id(root)); goto out; - else if (ret > 0) + } else if (ret > 0) { break; + } } dir_key.offset = range_start; while (1) { int nritems; - ret = btrfs_search_slot(NULL, root, &dir_key, path, - 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &dir_key, + wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search root %llu for key (%llu %u %llu)", + btrfs_root_id(root), + dir_key.objectid, dir_key.type, + dir_key.offset); goto out; + } - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret == 1) + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + if (wc->subvol_path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, wc->subvol_path); + if (ret == 1) { break; - else if (ret < 0) + } else if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); goto out; + } } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key, + wc->subvol_path->slots[0]); if (found_key.objectid != dirid || found_key.type != dir_key.type) { ret = 0; @@ -2402,23 +2730,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, log, path, - log_path, dir, - &found_key); + ret = check_item_in_log(wc, log_path, dir, &found_key, del_all); if (ret) goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (range_end == (u64)-1) break; range_start = range_end + 1; } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; @@ -2435,7 +2761,7 @@ out: * only in the log (references come from either directory items or inode * back refs). 
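* Replay is staged through wc->stage: LOG_WALK_REPLAY_INODES handles * inode items (replaying xattr and dir entry deletions for each inode * on the way), LOG_WALK_REPLAY_DIR_INDEX replays dir index keys, and * the last stage copies the remaining xattr, inode ref/extref and file * extent items.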
*/ -static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, +static int replay_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; @@ -2443,33 +2769,44 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, .transid = gen, .level = level }; - struct btrfs_path *path; - struct btrfs_root *root = wc->replay_dest; - struct btrfs_key key; - int i; + struct btrfs_root *root = wc->root; + struct btrfs_trans_handle *trans = wc->trans; int ret; - ret = btrfs_read_extent_buffer(eb, &check); - if (ret) - return ret; - - level = btrfs_header_level(eb); - if (level != 0) return 0; - path = btrfs_alloc_path(); - if (!path) + /* + * Set to NULL since it was not yet read and in case we abort log replay + * on error, we have no valid log tree leaf to dump. + */ + wc->log_leaf = NULL; + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to read log tree leaf %llu for root %llu", + eb->start, btrfs_root_id(root)); + return ret; + } + + ASSERT(wc->subvol_path == NULL); + wc->subvol_path = btrfs_alloc_path(); + if (!wc->subvol_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } + + wc->log_leaf = eb; nritems = btrfs_header_nritems(eb); - for (i = 0; i < nritems; i++) { + for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) { struct btrfs_inode_item *inode_item; - btrfs_item_key_to_cpu(eb, &key, i); + btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot); - if (key.type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, wc->log_slot, + struct btrfs_inode_item); /* * An inode with no links is either: * @@ -2498,22 +2835,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } /* Inode keys are done during the first stage. 
*/ - if (key.type == BTRFS_INODE_ITEM_KEY && + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); + ret = replay_xattr_deletes(wc); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, root, log, path, - key.objectid, false); + ret = replay_dir_deletes(wc, wc->log_key.objectid, false); if (ret) break; } - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + ret = overwrite_item(wc); if (ret) break; @@ -2530,9 +2865,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = btrfs_iget_logging(key.objectid, root); + inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + wc->log_key.objectid, + btrfs_root_id(root)); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2540,21 +2879,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, inode, - &drop_args); - if (!ret) { + drop_args.path = wc->subvol_path; + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu root %llu offset %llu", + btrfs_ino(inode), + btrfs_root_id(root), + from); + } else { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, inode); + ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); } iput(&inode->vfs_inode); if (ret) break; } - ret = link_to_fixup_dir(wc->trans, root, - path, key.objectid); + ret = link_to_fixup_dir(wc, wc->log_key.objectid); if (ret) break; } @@ -2562,10 +2911,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (wc->ignore_cur_inode) continue; - if (key.type == BTRFS_DIR_INDEX_KEY && + if (wc->log_key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); + ret = replay_one_dir_item(wc); if (ret) break; } @@ -2574,20 +2922,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, continue; /* these keys are simply copied */ - if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY || - key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_INODE_REF_KEY || + wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc); if (ret) break; - } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc->trans, root, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc); if (ret) break; } @@ -2598,55 +2943,55 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * older kernel with such keys, ignore them. 
*/ } - btrfs_free_path(path); + btrfs_free_path(wc->subvol_path); + wc->subvol_path = NULL; return ret; } -/* - * Correctly adjust the reserved bytes occupied by a log tree extent buffer - */ -static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) -{ - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, start); - if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); - return -ENOENT; - } - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->reserved -= fs_info->nodesize; - cache->space_info->bytes_reserved -= fs_info->nodesize; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - return 0; -} - static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_block_group *bg; + btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) - return btrfs_pin_reserved_extent(trans, eb); + if (trans) { + int ret; - return unaccount_log_buffer(eb->fs_info, eb->start); + ret = btrfs_pin_reserved_extent(trans, eb); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; + } + + bg = btrfs_lookup_block_group(fs_info, eb->start); + if (!bg) { + btrfs_err(fs_info, "unable to find block group for %llu", eb->start); + btrfs_handle_fs_error(fs_info, -ENOENT, NULL); + return -ENOENT; + } + + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->reserved -= fs_info->nodesize; + bg->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + + return 0; } -static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_down_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = wc->log->fs_info; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -2674,12 +3019,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), *level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); + if (IS_ERR(next)) { + ret = PTR_ERR(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen, - *level - 1); + ret = wc->process_func(next, wc, ptr_gen, *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2690,6 +3040,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2705,6 +3059,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2721,10 +3079,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return 0; } -static 
noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_up_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { int i; int slot; @@ -2738,14 +3094,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - ret = wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); if (ret) return ret; if (wc->free) { - ret = clean_log_buffer(trans, path->nodes[*level]); + ret = clean_log_buffer(wc->trans, path->nodes[*level]); if (ret) return ret; } @@ -2762,13 +3118,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, * the tree freeing any blocks that have a ref count of zero after being * decremented. */ -static int walk_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct walk_control *wc) +static int walk_log_tree(struct walk_control *wc) { + struct btrfs_root *log = wc->log; int ret = 0; int wret; int level; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int orig_level; path = btrfs_alloc_path(); @@ -2782,36 +3138,30 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_log_tree(trans, log, path, &level, wc); + wret = walk_down_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; - wret = walk_up_log_tree(trans, log, path, &level, wc); + wret = walk_up_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; } /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - ret = wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level]), orig_level); if (ret) - goto out; + return ret; if (wc->free) - ret = clean_log_buffer(trans, path->nodes[orig_level]); + ret = clean_log_buffer(wc->trans, path->nodes[orig_level]); } -out: - btrfs_free_path(path); return ret; } @@ -3220,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); mutex_unlock(&fs_info->tree_log_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; @@ -3272,12 +3622,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, { int ret; struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer + .free = true, + .process_func = process_one_buffer, + .log = log, + .trans = trans, }; if (log->node) { - ret = walk_log_tree(trans, log, &wc); + ret = walk_log_tree(&wc); if (ret) { /* * We weren't able to traverse the entire log tree, the @@ -3476,7 +3828,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans, /* * The inode was previously logged and then evicted, set logged_trans to - * the current transacion's ID, to avoid future tree searches as long as + * the current transaction's ID, to avoid future tree searches as long as * the inode is not evicted again. 
*/ spin_lock(&inode->lock); @@ -3547,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; ret = inode_logged(trans, dir, NULL); if (ret == 0) return; - else if (ret < 0) { + if (ret < 0) { btrfs_set_log_full_commit(trans); return; } @@ -3567,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = join_running_log_trans(root); ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) - goto out; + return; mutex_lock(&dir->log_mutex); @@ -3577,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); -out: - btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3691,8 +4041,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; - ins_data = kmalloc(count * sizeof(u32) + - count * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4255,7 +4604,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only, + struct inode *inode, bool log_inode_only, u64 logged_isize) { u64 flags; @@ -4351,7 +4700,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, - 0, 0); + false, 0); btrfs_release_path(path); return 0; } @@ -4455,8 +4804,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src = src_path->nodes[0]; - ins_data = kmalloc(nr * sizeof(struct btrfs_key) + - nr * sizeof(u32), GFP_NOFS); + ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4857,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_key key; const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); - struct btrfs_path *dst_path = NULL; + BTRFS_PATH_AUTO_FREE(dst_path); bool dropped_extents = false; u64 truncate_offset = i_size; struct extent_buffer *leaf; @@ -4975,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); - btrfs_free_path(dst_path); return ret; } @@ -5348,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u64 *other_ino, u64 *other_parent) { int ret; - struct btrfs_path *search_path; + BTRFS_PATH_AUTO_FREE(search_path); char *name = NULL; u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); @@ -5433,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } ret = 0; out: - btrfs_free_path(search_path); kfree(name); return ret; } @@ -6161,8 +6507,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, if (!first) return 0; - ins_data = kmalloc(max_batch_size * sizeof(u32) + - max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; ins_sizes = (u32 
*)ins_data; @@ -6816,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); @@ -6832,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (true) { struct extent_buffer *leaf = path->nodes[0]; @@ -6844,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6903,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, * at both parents and the old parent B would still * exist. */ - if (IS_ERR(dir_inode)) { - ret = PTR_ERR(dir_inode); - goto out; - } + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); if (!need_log_inode(trans, dir_inode)) { btrfs_add_delayed_iput(dir_inode); @@ -6919,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = log_new_dir_dentries(trans, dir_inode, ctx); btrfs_add_delayed_iput(dir_inode); if (ret) - goto out; + return ret; } path->slots[0]++; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int log_new_ancestors(struct btrfs_trans_handle *trans, @@ -7037,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret; @@ -7058,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) path->slots[0]++; @@ -7070,8 +7410,8 @@ again: if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -7088,10 +7428,8 @@ again: * this loop, etc). So just return some error to fallback to * a transaction commit. 
*/ - if (found_key.type == BTRFS_INODE_EXTREF_KEY) { - ret = -EMLINK; - goto out; - } + if (found_key.type == BTRFS_INODE_EXTREF_KEY) + return -EMLINK; /* * Logging ancestors needs to do more searches on the fs/subvol @@ -7103,14 +7441,11 @@ again: ret = log_new_ancestors(trans, root, path, ctx); if (ret) - goto out; + return ret; btrfs_release_path(path); goto again; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -7290,10 +7625,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } wc.trans = trans; - wc.pin = 1; + wc.pin = true; + wc.log = log_root_tree; - ret = walk_log_tree(trans, log_root_tree, &wc); - if (ret) { + ret = walk_log_tree(&wc); + wc.log = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7304,12 +7641,11 @@ again: key.offset = (u64)-1; while (1) { - struct btrfs_root *log; struct btrfs_key found_key; ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7324,20 +7660,19 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_tree_root(log_root_tree, &found_key); - if (IS_ERR(log)) { - ret = PTR_ERR(log); + wc.log = btrfs_read_tree_root(log_root_tree, &found_key); + if (IS_ERR(wc.log)) { + ret = PTR_ERR(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } - wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, - true); - if (IS_ERR(wc.replay_dest)) { - ret = PTR_ERR(wc.replay_dest); - wc.replay_dest = NULL; - if (ret != -ENOENT) { - btrfs_put_root(log); + wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true); + if (IS_ERR(wc.root)) { + ret = PTR_ERR(wc.root); + wc.root = NULL; + if (unlikely(ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7353,33 +7688,34 @@ again: * block from being modified, and we'll just bail for * each subsequent pass. */ - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - if (ret) { - btrfs_put_root(log); + ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } goto next; } - wc.replay_dest->log_root = log; - ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) { + wc.root->log_root = wc.log; + ret = btrfs_record_root_in_trans(trans, wc.root); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } - ret = walk_log_tree(trans, log, &wc); - if (ret) { + ret = walk_log_tree(&wc); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } if (wc.stage == LOG_WALK_REPLAY_ALL) { - struct btrfs_root *root = wc.replay_dest; + struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - if (ret) { + wc.subvol_path = path; + ret = fixup_inode_link_counts(&wc); + wc.subvol_path = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } @@ -7392,17 +7728,18 @@ again: * could only happen during mount. 
*/ ret = btrfs_init_root_free_objectid(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } } next: - if (wc.replay_dest) { - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); + if (wc.root) { + wc.root->log_root = NULL; + btrfs_put_root(wc.root); } - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; if (ret) goto error; @@ -7414,7 +7751,7 @@ next: /* step one is to pin it all, step two is to replay just inodes */ if (wc.pin) { - wc.pin = 0; + wc.pin = false; wc.process_func = replay_one_buffer; wc.stage = LOG_WALK_REPLAY_INODES; goto again; @@ -7432,14 +7769,13 @@ next: if (ret) return ret; - log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - btrfs_put_root(log_root_tree); return 0; error: if (wc.trans) btrfs_end_transaction(wc.trans); + btrfs_put_root(wc.log); clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4633cbcfcdb9..46bd8ca58670 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode) inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = del_orphan(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) if (ret < 0) return ret; - if (item.reserved[0] != 0 || item.reserved[1] != 0) + if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0)) return -EUCLEAN; true_size = btrfs_stack_verity_descriptor_size(&item); - if (true_size > INT_MAX) + if (unlikely(true_size > INT_MAX)) return -EUCLEAN; if (buf_size == 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6e3efd6f602..2bec544d8ba3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, } /* - * Make sure the last byte of label is properly NUL termiated. We use - * '%s' to print the label, if not properly NUL termiated we can access + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access * beyond the label. 
*/ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) @@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - if (ret == 0) { + if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; @@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_rm_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", @@ -2843,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } @@ -3049,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - else if (ret > 0) { /* Logic error or corruption */ + else if (unlikely(ret > 0)) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); @@ -3058,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; @@ -3283,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); - if (ret) { + if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; @@ -3353,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_info, sys_flags); - if (!space_info) { + if (unlikely(!space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3367,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3386,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3402,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3527,7 +3527,7 
@@ again: mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - if (ret == 0) { + if (unlikely(ret == 0)) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. On subsequent @@ -4269,7 +4269,7 @@ error: * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ -static int alloc_profile_is_valid(u64 flags, int extended) +static int alloc_profile_is_valid(u64 flags, bool extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -4463,7 +4463,7 @@ out_overflow: } /* - * Should be called with balance mutexe held + * Should be called with balance mutex held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, @@ -5041,7 +5041,7 @@ again: /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { @@ -5701,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) { + if (unlikely(!chunk)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -7486,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores - * used for freeze procection of a fs (struct super_block.s_writers), + * used for freeze protection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. 
Since we are mounting the filesystem @@ -7919,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags) return btrfs_raid_array[index].ncopies; } - - static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) @@ -7934,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); - if (!map) { + if (unlikely(!map)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7943,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } stripe_len = btrfs_calc_stripe_length(map); - if (physical_len != stripe_len) { + if (unlikely(physical_len != stripe_len)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, @@ -7963,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->devid == devid && - map->stripes[i].physical == physical_offset) { + if (unlikely(map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset)) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, @@ -7977,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, break; } } - if (!found) { + if (unlikely(!found)) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); @@ -7986,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!dev) { + if (unlikely(!dev)) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - if (physical_offset + physical_len > dev->disk_total_bytes) { + if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, @@ -8004,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; - if (!IS_ALIGNED(physical_offset, zone_size) || - !IS_ALIGNED(physical_len, zone_size)) { + if (unlikely(!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size))) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); @@ -8029,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - if (map->num_stripes != map->verified_stripes) { + if (unlikely(map->num_stripes != map->verified_stripes)) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); @@ -8089,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (ret < 0) goto out; /* No dev extents at all? 
Not good */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -8114,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ - if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a56e873a3029..2cbf8080eade 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -34,7 +34,7 @@ struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* - * Arbitratry maximum size of one discard request to limit potentially long time + * Arbitrary maximum size of one discard request to limit potentially long time * spent in blkdev_issue_discard(). */ #define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) @@ -495,7 +495,7 @@ struct btrfs_discard_stripe { }; /* - * Context for IO subsmission for device stripe. + * Context for IO submission for device stripe. * * - Track the unfinished mirrors for mirror based profiles * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 5292cd341f70..6caba8be7c84 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -34,11 +34,9 @@ struct workspace { int level; }; -static struct workspace_manager wsm; - -struct list_head *zlib_get_workspace(unsigned int level) +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { - struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); + struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zlib_alloc_workspace(unsigned int level) +/* + * For s390 hardware acceleration, the buffer size should be at least + * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance. + * + * But if the block size is larger than the page size, the folios can already + * be large enough for the s390 hardware, so no special buffer is needed. + */ +static bool need_special_buffer(struct btrfs_fs_info *fs_info) +{ + if (!zlib_deflate_dfltcc_enabled()) + return false; + if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE) + return false; + return true; +} + +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; int workspacesize; @@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; - /* - * In case of s390 zlib hardware support, allocate lager workspace - * buffer. If allocator fails, fall back to a single page buffer.
- */ - if (zlib_deflate_dfltcc_enabled()) { + if (need_special_buffer(fs_info)) { workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | GFP_NOIO); workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; } if (!workspace->buf) { - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - workspace->buf_size = PAGE_SIZE; + workspace->buf = kmalloc(blocksize, GFP_KERNEL); + workspace->buf_size = blocksize; } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -133,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; char *cfolio_out; @@ -146,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, struct folio *out_folio = NULL; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const unsigned long max_out = nr_dest_folios << min_folio_shift; + const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; *out_folios = 0; @@ -155,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zlib compression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; @@ -167,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; workspace->strm.next_out = cfolio_out; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; while (workspace->strm.total_in < len) { /* @@ -191,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned int copy_length = min(bytes_left, workspace->buf_size); /* - * This can only happen when hardware zlib compression is - * enabled. + * For s390 hardware accelerated zlib, when our folio is smaller + * than the copy_length, we need to fill the buffer so that + * we can take full advantage of hardware acceleration.
*/ - if (copy_length > PAGE_SIZE) { + if (need_special_buffer(fs_info)) { ret = copy_data_into_buffer(mapping, workspace, start, copy_length); if (ret < 0) @@ -225,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zlib compression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -237,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, } /* we're making it bigger, give up */ - if (workspace->strm.total_in > 8192 && + if (workspace->strm.total_in > blocksize * 2 && workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; @@ -252,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -260,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } /* we're all done */ @@ -278,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_FINISH); if (ret == Z_STREAM_END) break; - if (ret != Z_OK && ret != Z_BUF_ERROR) { + if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; @@ -288,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -296,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } } @@ -322,20 +335,22 @@ out: int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; struct folio **folios_in = cb->compressed_folios; data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; workspace->strm.total_out = 0; @@ -396,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; - 
workspace->strm.avail_in = min(tmp, PAGE_SIZE); + workspace->strm.avail_in = min(tmp, min_folio_size); } } if (unlikely(ret != Z_STREAM_END)) { @@ -484,8 +499,7 @@ out: return ret; } -const struct btrfs_compress_op btrfs_zlib_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_zlib_compress = { .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 71cf9be75c42..e00036672f33 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -274,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return ret; } *nr_zones = ret; - if (!ret) + if (unlikely(!ret)) return -EIO; /* Populate cache */ @@ -315,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; /* No dev extents at all? Not good */ - if (ret > 0) + if (unlikely(ret > 0)) return -EUCLEAN; } @@ -503,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } - if (nreported != zone_info->nr_zones) { + if (unlikely(nreported != zone_info->nr_zones)) { btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_dereference(device->name), nreported, @@ -513,7 +513,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (max_active_zones) { - if (nactive > max_active_zones) { + if (unlikely(nactive > max_active_zones)) { if (bdev_max_active_zones(bdev) == 0) { max_active_zones = 0; zone_info->max_active_zones = 0; @@ -550,7 +550,7 @@ validate: if (ret) goto out; - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) { btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); @@ -568,7 +568,7 @@ validate: ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { + if (unlikely(ret != -ENOENT && ret)) { btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); @@ -901,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zones); if (ret < 0) return ret; - if (ret != BTRFS_NR_SB_LOG_ZONES) + if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES)) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); @@ -1253,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ - if (!ret) + if (unlikely(!ret)) ret = -EUCLEAN; if (ret < 0) return ret; @@ -1274,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, else length = fs_info->nodesize; - if (!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + cache->length)) { + if (unlikely(!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; @@ -1357,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) { btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, 
rcu_dereference(device->name), @@ -1399,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, struct zone_info *info, unsigned long *active) { - if (info->alloc_offset == WP_MISSING_DEV) { + if (unlikely(info->alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", info->physical); @@ -1428,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); - if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } - if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); @@ -1447,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, if (zone_info[1].alloc_offset == WP_CONVENTIONAL) zone_info[1].alloc_offset = last_alloc; - if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else if (test_bit(0, active)) { set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); @@ -1489,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (zone_info[i].alloc_offset == WP_CONVENTIONAL) zone_info[i].alloc_offset = last_alloc; - if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED))) { btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in %s profile", btrfs_bg_type_to_raid_name(map->type)); return -EIO; } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_test_opt(fs_info, DEGRADED) && - !btrfs_zone_activate(bg)) { + if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg))) { return -EIO; } } else { @@ -1554,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1586,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, continue; if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1643,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return 0; /* Sanity check */ - if (!IS_ALIGNED(length, fs_info->zone_size)) { + if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) { btrfs_err(fs_info, "zoned: block group %llu len %llu unaligned to zone size %llu", logical, length, fs_info->zone_size); @@ -1756,7 +1756,7 @@ out: return -EINVAL; } - if (cache->alloc_offset > cache->zone_capacity) { + if (unlikely(cache->alloc_offset > cache->zone_capacity)) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone 
capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, @@ -2087,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); - if (ret || !bioc || mapped_length < PAGE_SIZE) { + if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) { ret = -EIO; goto out_put_bioc; } @@ -2145,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, if (physical_pos == wp) return 0; - if (physical_pos > wp) + if (unlikely(physical_pos > wp)) return -EUCLEAN; length = wp - physical_pos; @@ -2464,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) return ret; } -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) - return; + return 0; block_group = btrfs_lookup_block_group(fs_info, logical); - ASSERT(block_group); + if (WARN_ON_ONCE(!block_group)) + return -ENOENT; /* No MIXED_BG on zoned btrfs. */ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) @@ -2490,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len out: btrfs_put_block_group(block_group); + return 0; } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { + int ret; struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); - btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + ret = do_zone_finish(bg, true); + if (ret) + btrfs_handle_fs_error(bg->fs_info, ret, + "Failed to finish block-group's zone"); btrfs_put_block_group(bg); } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6e11533b8e14..17c5656580dd 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); @@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, return true; } -static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) { } +static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return 0; +} static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index ff0292615e1f..c9cddcfa337b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -77,7 +77,6 @@ struct workspace { */ struct zstd_workspace_manager { - const struct btrfs_compress_op *ops; spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; @@ -86,8 +85,6 @@ struct zstd_workspace_manager { struct timer_list timer; }; -static struct zstd_workspace_manager wsm; - static size_t 
zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) @@ -112,19 +109,19 @@ static inline int clip_level(int level) */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { + struct zstd_workspace_manager *zwsm = + container_of(timer, struct zstd_workspace_manager, timer); unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - ASSERT(timer == &wsm.timer); - - spin_lock(&wsm.lock); + spin_lock(&zwsm->lock); - if (list_empty(&wsm.lru_list)) { - spin_unlock(&wsm.lock); + if (list_empty(&zwsm->lru_list)) { + spin_unlock(&zwsm->lock); return; } - list_for_each_prev_safe(pos, next, &wsm.lru_list) { + list_for_each_prev_safe(pos, next, &zwsm->lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); int level; @@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level])) - clear_bit(level, &wsm.active_map); + if (list_empty(&zwsm->idle_ws[level])) + clear_bit(level, &zwsm->active_map); } - if (!list_empty(&wsm.lru_list)) - mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + if (!list_empty(&zwsm->lru_list)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock(&wsm.lock); + spin_unlock(&zwsm->lock); } /* @@ -182,49 +179,56 @@ static void zstd_calc_ws_mem_sizes(void) } } -void zstd_init_workspace_manager(void) +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm; struct list_head *ws; - int i; + ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL); + zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL); + if (!zwsm) + return -ENOMEM; zstd_calc_ws_mem_sizes(); + spin_lock_init(&zwsm->lock); + init_waitqueue_head(&zwsm->wait); + timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); - wsm.ops = &btrfs_zstd_compress; - spin_lock_init(&wsm.lock); - init_waitqueue_head(&wsm.wait); - timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); - - INIT_LIST_HEAD(&wsm.lru_list); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&wsm.idle_ws[i]); + INIT_LIST_HEAD(&zwsm->lru_list); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&zwsm->idle_ws[i]); + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm; - ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { - set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); - list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map); + list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); } + return 0; } -void zstd_cleanup_workspace_manager(void) +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace; - int i; - spin_lock_bh(&wsm.lock); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&wsm.idle_ws[i])) { - workspace = container_of(wsm.idle_ws[i].next, + if (!zwsm) + return; + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL; + spin_lock_bh(&zwsm->lock); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&zwsm->idle_ws[i])) { + workspace = container_of(zwsm->idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); 
zstd_free_workspace(&workspace->list); } } - spin_unlock_bh(&wsm.lock); - - timer_delete_sync(&wsm.timer); + spin_unlock_bh(&zwsm->lock); + timer_delete_sync(&zwsm->timer); + kfree(zwsm); } /* @@ -239,29 +243,31 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(int level) +static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; struct workspace *workspace; int i = clip_level(level); - spin_lock_bh(&wsm.lock); - for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { - if (!list_empty(&wsm.idle_ws[i])) { - ws = wsm.idle_ws[i].next; + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); + for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&zwsm->idle_ws[i])) { + ws = zwsm->idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); - if (list_empty(&wsm.idle_ws[i])) - clear_bit(i, &wsm.active_map); - spin_unlock_bh(&wsm.lock); + if (list_empty(&zwsm->idle_ws[i])) + clear_bit(i, &zwsm->active_map); + spin_unlock_bh(&zwsm->lock); return ws; } } - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); return NULL; } @@ -276,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -struct list_head *zstd_get_workspace(int level) +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; unsigned int nofs_flag; + ASSERT(zwsm); + /* level == 0 means we can use any workspace */ if (!level) level = 1; again: - ws = zstd_find_workspace(level); + ws = zstd_find_workspace(fs_info, level); if (ws) return ws; nofs_flag = memalloc_nofs_save(); - ws = zstd_alloc_workspace(level); + ws = zstd_alloc_workspace(fs_info, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { DEFINE_WAIT(wait); - prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); - finish_wait(&wsm.wait, &wait); + finish_wait(&zwsm->wait, &wait); goto again; } @@ -318,34 +327,36 @@ again: * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. 
*/ -void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace = list_to_workspace(ws); - spin_lock_bh(&wsm.lock); + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); /* A node is only taken off the lru if we are the corresponding level */ if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ - if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; - list_add(&workspace->lru_list, &wsm.lru_list); - if (!timer_pending(&wsm.timer)) - mod_timer(&wsm.timer, + list_add(&workspace->lru_list, &zwsm->lru_list); + if (!timer_pending(&zwsm->timer)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } - set_bit(workspace->level, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level]); + set_bit(workspace->level, &zwsm->active_map); + list_add(&workspace->list, &zwsm->idle_ws[workspace->level]); workspace->req_level = 0; - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) - cond_wake_up(&wsm.wait); + cond_wake_up(&zwsm->wait); } void zstd_free_workspace(struct list_head *ws) @@ -357,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(int level) +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); @@ -371,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level) workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = kmalloc(blocksize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; @@ -384,11 +396,13 @@ fail: return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; int ret = 0; int nr_folios = 0; @@ -399,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; - unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u32 blocksize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + unsigned long max_out = nr_dest_folios * min_folio_size; unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -411,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + 
btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -431,7 +445,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -439,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); while (1) { size_t ret2; @@ -447,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -459,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, } /* Check to see if we are making it bigger */ - if (tot_in + workspace->in_buf.pos > 8192 && + if (tot_in + workspace->in_buf.pos > blocksize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; @@ -475,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -489,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, - PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } /* We've reached the end of the input */ @@ -522,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -542,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -556,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct 
address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } if (tot_out >= tot_in) { @@ -578,13 +587,16 @@ out: int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; + const u32 blocksize = fs_info->sectorsize; + const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; unsigned long total_out = 0; @@ -602,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; + workspace->out_buf.size = blocksize; while (1) { size_t ret2; @@ -642,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; - if (folio_in_index >= total_folios_in) { + if (unlikely(folio_in_index >= total_folios_in)) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } - srclen -= PAGE_SIZE; + srclen -= min_folio_size; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } } ret = 0; @@ -718,9 +730,7 @@ finish: return ret; } -const struct btrfs_compress_op btrfs_zstd_compress = { - /* ZSTD uses own workspace manager */ - .workspace_manager = NULL, +const struct btrfs_compress_levels btrfs_zstd_compress = { .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb251..0d0ef54fc4de 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + 
rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..cf1fd82dc5a9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. @@ -99,6 +111,8 @@ struct mon_data { * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. 
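With the per-resource evt_list gone, callers are expected to filter the global mon_event_all[] table by resource and enabled state. A minimal sketch of that iteration pattern, using the for_each_mon_event() helper declared above (illustrative only; the helper name below is made up, and the body mirrors how rdt_mon_features_show() uses the macro later in this patch):

	static void show_enabled_events(struct rdt_resource *r, struct seq_file *seq)
	{
		struct mon_evt *mevt;

		/* Visit every event the filesystem knows about ... */
		for_each_mon_event(mevt) {
			/* ... but report only those that belong to @r and that the
			 * architecture enabled via resctrl_enable_mon_event(). */
			if (mevt->rid != r->rid || !mevt->enabled)
				continue;
			seq_printf(seq, "%s\n", mevt->name);
		}
	}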
@@ -113,6 +127,7 @@ struct rmid_read { enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; @@ -226,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show); + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908..4076336fbba6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -346,27 +346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; +} + +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp. + * + * Return: + * Valid counter ID on success, or -ENOENT on failure. + */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. 
+ * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } } + + return -ENOSPC; } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. + */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -377,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. */ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -402,8 +476,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -419,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -428,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. 
*/ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -459,7 +539,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,8 +550,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -602,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -712,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -842,38 +926,819 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, +/* + * All available events. 
Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. + */ +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, }; -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +void resctrl_enable_mon_event(enum resctrl_event_id eventid) +{ + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } + + mon_event_all[eventid].enabled = true; +} + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (e.g. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; }; -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +/* Decoded values for each type of memory transaction. 
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, }; +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +/* + * mbm_cntr_free_all() - Clear all the counter ID configuration details in the + * domain @d. Called when mbm_assign_mode is changed. + */ +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); +} + +/* + * resctrl_reset_rmid_all() - Reset all non-architecture states for all the + * supported RMIDs. + */ +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + enum resctrl_event_id evt; + int idx; + + for_each_mbm_event_id(evt) { + if (!resctrl_is_mon_event_enabled(evt)) + continue; + idx = MBM_STATE_IDX(evt); + memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit); + } +} + +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. 
+ */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + /* - * Initialize the event list for the resource. + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. 
+ */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. 
*/ -static void l3_mon_evt_init(struct rdt_resource *r) +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { - INIT_LIST_HEAD(&r->evt_list); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event() - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. + * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp() - Update the counter assignments for the event + * for all the groups. + * @mevt: MBM monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments. 
+ */ + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + rdtgroup_update_cntr_event(r, prgrp, mevt->evtid); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + rdtgroup_update_cntr_event(r, crgrp, mevt->evtid); + } +} + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + u32 evt_cfg = 0; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + ret = resctrl_parse_mem_transactions(buf, &evt_cfg); + if (!ret && mevt->evt_cfg != evt_cfg) { + mevt->evt_cfg = evt_cfg; + resctrl_update_cntr_allrdtgrp(mevt); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool enabled; + + mutex_lock(&rdtgroup_mutex); + enabled = resctrl_arch_mbm_cntr_assign_enabled(r); + + if (r->mon.mbm_cntr_assignable) { + if (enabled) + seq_puts(s, "[mbm_event]\n"); + else + seq_puts(s, "[default]\n"); + + if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) { + if (enabled) + seq_puts(s, "default\n"); + else + seq_puts(s, "mbm_event\n"); + } + } else { + seq_puts(s, "[default]\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *d; + int ret = 0; + bool enable; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!strcmp(buf, "default")) { + enable = 0; + } else if (!strcmp(buf, "mbm_event")) { + if (r->mon.mbm_cntr_assignable) { + enable = 1; + } else { + ret = -EINVAL; + rdt_last_cmd_puts("mbm_event mode is not supported\n"); + goto out_unlock; + } + } else { + ret = -EINVAL; + rdt_last_cmd_puts("Unsupported assign mode\n"); + goto out_unlock; + } + + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); + if (ret) + goto out_unlock; + + /* Update the visibility of BMEC related files */ + resctrl_bmec_files_show(r, NULL, !enable); + + /* + * Initialize the default memory transaction values for + * total and local events. + */ + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + /* Enable auto assignment when switching to "mbm_event" mode */ + if (enable) + r->mon.mbm_assign_on_mkdir = true; + /* + * Reset all the non-architectural RMID state and assignable counters. 
+ */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + mbm_cntr_free_all(r, d); + resctrl_reset_rmid_all(r, d); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct mon_evt *mevt; + int ret = 0; + bool sep; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out_unlock; + } + + rdt_last_cmd_clear(); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + sep = false; + seq_printf(s, "%s:", mevt->name); + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0) + seq_printf(s, "%d=_", d->hdr.id); + else + seq_printf(s, "%d=e", d->hdr.id); + + sep = true; + } + seq_putc(s, '\n'); + } + +out_unlock: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. 
+ */ +static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name) +{ + struct mon_evt *mevt; + + for_each_mon_event(mevt) { + if (mevt->rid == r->rid && mevt->enabled && + resctrl_is_mbm_event(mevt->evtid) && + !strcmp(mevt->name, name)) + return mevt; + } + + return NULL; +} + +static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int ret = 0; + + if (!assign || strlen(assign) != 1) + return -EINVAL; + + switch (*assign) { + case 'e': + ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt); + break; + case '_': + rdtgroup_unassign_cntr_event(d, rdtgrp, mevt); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, + char *event, char *tok) +{ + struct rdt_mon_domain *d; + unsigned long dom_id = 0; + char *dom_str, *id_str; + struct mon_evt *mevt; + int ret; + + mevt = mbm_get_mon_event_by_name(r, event); + if (!mevt) { + rdt_last_cmd_printf("Invalid event %s\n", event); + return -ENOENT; + } + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + + id_str = strsep(&dom_str, "="); + + /* Check for domain id '*' which means all domains */ + if (id_str && *id_str == '*') { + ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt); + if (ret) + rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n", + event, dom_str); + return ret; + } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing domain id\n"); + return -EINVAL; + } + + /* Verify if the dom_id is valid */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt); + if (ret) { + rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n", + event, dom_id, dom_str); + return ret; + } + goto next; + } + } + + rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id); + return -EINVAL; +} + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdtgroup *rdtgrp; + char *token, *event; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event mode is not enabled\n"); + rdtgroup_kn_unlock(of->kn); + return -EINVAL; + } + + while ((token = strsep(&buf, "\n")) != NULL) { + /* + * The write command has the following format: + * "<Event>:<Domain ID>=<Assignment state>" + * Extract the event name first. 
+ */ + event = strsep(&token, ":"); + + ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token); + if (ret) + break; + } + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; } /** @@ -900,24 +1765,43 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + } + return 0; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d855..0320360cd7a6 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } /* @@ -196,7 +190,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } 
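As an illustration of the interface implemented above: each line written to mbm_L3_assignments takes the form "<event>:<domain>=<state>", with ';' separating multiple domains, '*' standing for all domains, and the states 'e' (assign a counter) or '_' (unassign). A hypothetical user-space sketch; the mount point and the monitor group name are assumptions, not part of this patch:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Assign a counter to mbm_total_bytes in domain 0 and release the
	 * mbm_local_bytes counters in every domain of monitor group "g1".
	 * The write fails with EINVAL unless "mbm_event" mode is enabled. */
	static int assign_counters_example(void)
	{
		static const char req[] = "mbm_total_bytes:0=e\nmbm_local_bytes:*=_\n";
		int ret = -1;
		int fd;

		fd = open("/sys/fs/resctrl/mon_groups/g1/mbm_L3_assignments", O_WRONLY);
		if (fd < 0)
			return -1;
		if (write(fd, req, strlen(req)) == (ssize_t)strlen(req))
			ret = 0;
		close(fd);
		return ret;
	}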
@@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) + if (mevt->configurable && + !resctrl_arch_mbm_cntr_assign_enabled(r)) seq_printf(seq, "%s_config\n", mevt->name); } @@ -1735,9 +1732,9 @@ next: } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } @@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return ret ?: nbytes; } +/* + * resctrl_bmec_files_show() - Controls the visibility of BMEC-related resctrl + * files. When @show is true, the files are displayed; when false, the files + * are hidden. + * Don't treat kernfs_find_and_get failure as an error, since this function may + * be called regardless of whether BMEC is supported or the event is enabled. + */ +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show) +{ + struct kernfs_node *kn_config, *mon_kn = NULL; + char name[32]; + + if (!l3_mon_kn) { + sprintf(name, "%s_MON", r->name); + mon_kn = kernfs_find_and_get(kn_info, name); + if (!mon_kn) + return; + l3_mon_kn = mon_kn; + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + /* Release the reference only if it was acquired */ + if (mon_kn) + kernfs_put(mon_kn); +} + /* rdtgroup information files for one cache resource. 
*/ static struct rftype res_common_files[] = { { @@ -1813,6 +1848,13 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_TOP_INFO, }, { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, + { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1827,6 +1869,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_MON_INFO, }, { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, + { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1841,6 +1889,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, + { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1916,6 +1970,28 @@ static struct rftype res_common_files[] = { .write = mbm_local_bytes_config_write, }, { + .name = "event_filter", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + .write = event_filter_write, + }, + { + .name = "mbm_L3_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_L3_assignments_show, + .write = mbm_L3_assignments_write, + }, + { + .name = "mbm_assign_mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .write = resctrl_mbm_assign_mode_write, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { .name = "cpus", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + /* + * Hide BMEC related files if mbm_event mode + * is enabled. 
+ */ + if (resctrl_arch_mbm_cntr_assign_enabled(r)) + resctrl_bmec_files_show(r, kn_subdir, false); + } + } + + kernfs_activate(kn_subdir); return ret; } @@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc) goto out_root; ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } + if (ret) + goto out_schemata_free; ret = closid_init(); if (ret) @@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2675,15 +2806,16 @@ out_mondata: if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: closid_exit(); out_schemata_free: schemata_list_destroy(); -out_ctx: rdt_disable_ctx(); out_root: rdtgroup_destroy_root(); @@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); @@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? 
d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) @@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* @@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); @@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu);
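Pulling the pieces together: switching modes and checking the counter budget go through the info/L3_MON files added by this series. Writing "mbm_event" to mbm_assign_mode enables assignable counters (resetting all MBM state, as seen above), after which available_mbm_cntrs reports the unassigned counters per domain as "<domain>=<count>" pairs. A rough user-space sketch; the resctrl mount point is an assumption, not part of this patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static void enable_mbm_event_mode(void)
	{
		char buf[256];
		ssize_t n;
		int fd;

		/* Flip L3 monitoring into "mbm_event" assignment mode. */
		fd = open("/sys/fs/resctrl/info/L3_MON/mbm_assign_mode", O_WRONLY);
		if (fd < 0)
			return;
		(void)write(fd, "mbm_event\n", 10);
		close(fd);

		/* Read back the free counters per domain, e.g. "0=30;1=32". */
		fd = open("/sys/fs/resctrl/info/L3_MON/available_mbm_cntrs", O_RDONLY);
		if (fd < 0)
			return;
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("%s", buf);
		}
		close(fd);
	}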