diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 15:45:21 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 15:45:21 -0800 |
| commit | 8912c2fd5830e976c0deaeb0b2a458ce6b4718c7 (patch) | |
| tree | 1e95a844937baf6bba645414e09a6826af5ca62d | |
| parent | b29a7a8eee6a1ca974aaf053c0ffed1173d279c2 (diff) | |
| parent | 161ab30da6899f31f8128cec7c833e99fa4d06d2 (diff) | |
Merge tag 'for-6.20-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"User visible changes, feature updates:
- when using block size > page size, enable direct IO
- fall back to buffered IO if the data profile has duplication, a
workaround to avoid checksum mismatches on block group profiles
with redundancy; real direct IO is possible on single or RAID0
- redo export of zoned statistics, moved from sysfs to
/proc/pid/mountstats due to size limitations of the former
Experimental features:
- remove offload checksum tunable, intended to find best way to do it
but since we've switched to offload to thread for everything we
don't need it anymore
- initial support for the remap-tree feature, a translation layer of
logical block addresses that allows changes without moving/rewriting
blocks, e.g. for relocation or other changes that require COW
Notable fixes:
- automatic removal of accidentally leftover chunks when
free-space-tree is enabled since mkfs.btrfs v6.16.1
- zoned mode:
- do not try to append to conventional zones when RAID is mixing
zoned and conventional drives
- fixup write pointers when mixing zoned and conventional on
DUP/RAID* profiles
- when using squota, relax deletion rules for qgroups with 0 members
to allow easier recovery from accounting bugs, also add more checks
to detect bad accounting
- fix periodic reclaim scanning, properly check boundary conditions
not to trigger it unexpectedly or miss the time to run it
- trim:
- continue after first error
- change reporting to the first detected error
- add more cancellation points
- reduce contention of big device lock that can block other
operations when there's lots of trimmed space
- when chunk allocation is forced (needs experimental build) fix
transaction abort when unexpected space layout is detected
Core:
- switch to crypto library API for checksumming, removed module
dependencies, pointer indirections, etc.
- error handling improvements
- adjust how and where transaction commit or abort are done, and
avoid them where they may not be necessary
- minor compression optimization to skip single block ranges
- improve how compression folios are handled
- new and updated selftests
- cleanups, refactoring:
- auto-freeing and other automatic variable cleanup conversion
- structure size optimizations
- condition annotations"
* tag 'for-6.20-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (137 commits)
btrfs: get rid of compressed_bio::compressed_folios[]
btrfs: get rid of compressed_folios[] usage for encoded writes
btrfs: get rid of compressed_folios[] usage for compressed read
btrfs: remove the old btrfs_compress_folios() infrastructure
btrfs: switch to btrfs_compress_bio() interface for compressed writes
btrfs: introduce btrfs_compress_bio() helper
btrfs: zlib: introduce zlib_compress_bio() helper
btrfs: zstd: introduce zstd_compress_bio() helper
btrfs: lzo: introduce lzo_compress_bio() helper
btrfs: zoned: factor out the zone loading part into a testable function
btrfs: add cleanup function for btrfs_free_chunk_map
btrfs: tests: add cleanup functions for test specific functions
btrfs: raid56: fix memory leak of btrfs_raid_bio::stripe_uptodate_bitmap
btrfs: tests: add unit tests for pending extent walking functions
btrfs: fix EEXIST abort due to non-consecutive gaps in chunk allocation
btrfs: fix transaction commit blocking during trim of unallocated space
btrfs: handle user interrupt properly in btrfs_trim_fs()
btrfs: preserve first error in btrfs_trim_fs()
btrfs: continue trimming remaining devices on failure
btrfs: do not BUG_ON() in btrfs_remove_block_group()
...
71 files changed, 5866 insertions, 2078 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 6d6fc85835d4..ede184b6eda1 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,11 +4,8 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO select CRC32 - select CRYPTO - select CRYPTO_CRC32C - select CRYPTO_XXHASH - select CRYPTO_SHA256 - select CRYPTO_BLAKE2B + select CRYPTO_LIB_BLAKE2B + select CRYPTO_LIB_SHA256 select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS @@ -18,6 +15,7 @@ config BTRFS_FS select FS_IOMAP select RAID6_PQ select XOR_BLOCKS + select XXHASH depends on PAGE_SIZE_LESS_THAN_256KB help @@ -106,9 +104,6 @@ config BTRFS_EXPERIMENTAL - send stream protocol v3 - fs-verity support - - checksum offload mode - sysfs knob to affect when checksums are - calculated (at IO time, or in a thread) - - raid-stripe-tree - additional mapping of extents to devices to support RAID1* profiles on zoned devices, RAID56 not yet supported @@ -121,4 +116,6 @@ config BTRFS_EXPERIMENTAL - asynchronous checksum generation for data writes + - remap-tree - logical address remapping tree + If unsure, say N. 
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 743d7677b175..975104b74486 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,5 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \ + tests/chunk-allocation-tests.o diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 78721412951c..8938357fcb40 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_block_group_item_v2 */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2, + used, 64); +BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags, + struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes, + struct btrfs_block_group_item_v2, remap_bytes, 64); +BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2, + remap_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count, + struct btrfs_block_group_item_v2, identity_remap_count, 32); +BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2, + 
identity_remap_count, 32); + /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); @@ -863,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block, + remap_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block, + remap_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block, + remap_root_level, 8); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, @@ -1010,6 +1036,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); +BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64); +BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item, + address, 64); + /* Cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78da47a3d00e..9bb406f7dd30 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3609,10 +3609,8 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, } rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); - if (unlikely(rb_node)) { + if (unlikely(rb_node)) btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); - return -EUCLEAN; - } list_add_tail(&edge->list[UPPER], &upper->lower); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fa1d321a2fb8..0a69e09bfe28 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,7 +97,13 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bbio->orig_logical = orig_bbio->orig_logical; orig_bbio->orig_logical += map_length; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; + bbio->can_use_append = orig_bbio->can_use_append; + bbio->is_scrub = orig_bbio->is_scrub; + bbio->is_remap = orig_bbio->is_remap; + bbio->async_csum = orig_bbio->async_csum; + atomic_inc(&orig_bbio->pending_ios); return bbio; } @@ -480,6 +486,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(bio) == BTRFS_MAP_WRITE && @@ -494,12 +502,13 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) * For zone append writing, bi_sector must point the beginning of the * zone */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) { u64 zone_start = round_down(physical, dev->fs_info->zone_size); 
ASSERT(btrfs_dev_is_sequential(dev, physical)); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; } btrfs_debug(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -662,11 +671,6 @@ static bool should_async_write(struct btrfs_bio *bbio) bool auto_csum_mode = true; #ifdef CONFIG_BTRFS_EXPERIMENTAL - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); - - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) - return true; /* * Write bios will calculate checksum and submit bio at the same time. * Unless explicitly required don't offload serial csum calculate and bio @@ -747,7 +751,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; u64 map_length = length; - bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; blk_status_t status; @@ -775,8 +778,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) bbio->orig_logical = logical; + bbio->can_use_append = btrfs_use_zone_append(bbio); + map_length = min(map_length, length); - if (use_append) + if (bbio->can_use_append) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { @@ -805,11 +810,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (use_append) { - bio->bi_opf &= ~REQ_OP_WRITE; - bio->bi_opf |= REQ_OP_ZONE_APPEND; - } - if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to @@ -827,7 +827,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) */ if (!(inode->flags & BTRFS_INODE_NODATASUM) && 
!test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { + !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) { if (should_async_write(bbio) && btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) goto done; @@ -836,9 +836,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) status = errno_to_blk_status(ret); if (status) goto fail; - } else if (use_append || - (btrfs_is_zoned(fs_info) && inode && - inode->flags & BTRFS_INODE_NODATASUM)) { + } else if (bbio->can_use_append || + (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); status = errno_to_blk_status(ret); if (status) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 1be74209f0b8..303ed6c7103d 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -68,29 +68,36 @@ struct btrfs_bio { struct btrfs_tree_parent_check parent_check; }; + /* For internal use in read end I/O handling */ + struct work_struct end_io_work; + /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; void *private; - /* For internal use in read end I/O handling */ - unsigned int mirror_num; atomic_t pending_ios; - struct work_struct end_io_work; + u16 mirror_num; /* Save the first error status of split bio. */ blk_status_t status; /* Use the commit root to look up csums (data read bio only). */ - bool csum_search_commit_root; + bool csum_search_commit_root:1; /* * Since scrub will reuse btree inode, we need this flag to distinguish * scrub bios. */ - bool is_scrub; + bool is_scrub:1; + + /* Whether the bio is coming from copy_remapped_data_io(). */ + bool is_remap:1; /* Whether the csum generation for data write is async. */ - bool async_csum; + bool async_csum:1; + + /* Whether the bio is written using zone append. 
*/ + bool can_use_append:1; /* * This member must come last, bio_alloc_bioset will allocate enough diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 08b14449fabe..3186ed4fd26d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -239,7 +239,7 @@ static struct btrfs_block_group *block_group_cache_tree_search( while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); - end = cache->start + cache->length - 1; + end = btrfs_block_group_end(cache) - 1; start = cache->start; if (bytenr < start) { @@ -292,7 +292,7 @@ struct btrfs_block_group *btrfs_next_block_group( /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->start + cache->length; + const u64 next_bytenr = btrfs_block_group_end(cache); read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); @@ -575,28 +575,28 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, /* * Get an arbitrary extent item index / max_index through the block group * - * @block_group the block group to sample from + * @caching_ctl the caching control containing the block group to sample from * @index: the integral step through the block group to grab from * @max_index: the granularity of the sampling * @key: return value parameter for the item we find + * @path: path to use for searching in the extent tree * * Pre-conditions on indices: * 0 <= index <= max_index * 0 < max_index * - * Returns: 0 on success, 1 if the search didn't yield a useful item, negative - * error code on error. + * Returns: 0 on success, 1 if the search didn't yield a useful item. 
*/ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, - struct btrfs_block_group *block_group, int index, int max_index, - struct btrfs_key *found_key) + struct btrfs_key *found_key, + struct btrfs_path *path) { + struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; u64 search_offset; - u64 search_end = block_group->start + block_group->length; - BTRFS_PATH_AUTO_FREE(path); + const u64 search_end = btrfs_block_group_end(block_group); struct btrfs_key search_key; int ret = 0; @@ -606,16 +606,13 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, - BTRFS_SUPER_INFO_OFFSET)); - - path->skip_locking = true; - path->search_commit_root = true; - path->reada = READA_FORWARD; + extent_root = btrfs_extent_root(fs_info, block_group->start); + if (unlikely(!extent_root)) { + btrfs_err(fs_info, + "missing extent root for block group at offset %llu", + block_group->start); + return -EUCLEAN; + } search_offset = index * div_u64(block_group->length, max_index); search_key.objectid = block_group->start + search_offset; @@ -673,27 +670,42 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ * 3, we can either read every file extent, or admit that this is best effort * anyway and try to stay fast. * - * Returns: 0 on success, negative error code on error. + * No errors are returned since failing to determine the size class is not a + * critical error, size classes are just an optimization. 
*/ -static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, - struct btrfs_block_group *block_group) +static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl) { + BTRFS_PATH_AUTO_RELEASE(path); + struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; u64 min_size = block_group->length; enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; - int ret; + + /* + * Since we run in workqueue context, we allocate the path on stack to + * avoid memory allocation failure, as the stack in a work queue task + * is not deep. + */ + ASSERT(current_work() == &caching_ctl->work.normal_work); if (!btrfs_block_group_should_use_size_class(block_group)) - return 0; + return; + + path.skip_locking = true; + path.search_commit_root = true; + path.reada = READA_FORWARD; lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) { - ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + int ret; + + ret = sample_block_group_extent_item(caching_ctl, i, 5, &key, &path); if (ret < 0) - goto out; + return; + btrfs_release_path(&path); if (ret > 0) continue; min_size = min_t(u64, min_size, key.offset); @@ -704,13 +716,12 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl block_group->size_class = size_class; spin_unlock(&block_group->lock); } -out: - return ret; } static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; + const u64 block_group_end = btrfs_block_group_end(block_group); struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; BTRFS_PATH_AUTO_FREE(path); @@ -755,13 +766,13 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) next: ret = btrfs_search_slot(NULL, 
extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { - if (btrfs_fs_closing(fs_info) > 1) { + if (btrfs_fs_closing_done(fs_info)) { last = (u64)-1; break; } @@ -786,7 +797,7 @@ next: ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto out; + return ret; if (ret) break; leaf = path->nodes[0]; @@ -807,7 +818,7 @@ next: continue; } - if (key.objectid >= block_group->start + block_group->length) + if (key.objectid >= block_group_end) break; if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -817,7 +828,7 @@ next: ret = btrfs_add_new_free_space(block_group, last, key.objectid, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + @@ -836,17 +847,13 @@ next: path->slots[0]++; } - ret = btrfs_add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); -out: - return ret; + return btrfs_add_new_free_space(block_group, last, block_group_end, NULL); } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_DIRTY, NULL); + btrfs_block_group_end(bg) - 1, EXTENT_DIRTY, NULL); } static noinline void caching_thread(struct btrfs_work *work) @@ -863,7 +870,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); - load_block_group_size_class(caching_ctl, block_group); + load_block_group_size_class(caching_ctl); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -933,6 +940,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) if (btrfs_is_zoned(fs_info)) return 0; + /* + * No allocations can be done from remapped block groups, so they have + * no entries in the free-space tree. 
+ */ + if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; @@ -1057,8 +1071,25 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - ret = btrfs_del_item(trans, root, path); - return ret; + return btrfs_del_item(trans, root, path); +} + +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg) +{ + int factor = btrfs_bg_type_to_factor(bg->flags); + + spin_lock(&bg->space_info->lock); + if (btrfs_test_opt(bg->fs_info, ENOSPC_DEBUG)) { + WARN_ON(bg->space_info->total_bytes < bg->length); + WARN_ON(bg->space_info->bytes_readonly < bg->length - bg->zone_unusable); + WARN_ON(bg->space_info->bytes_zone_unusable < bg->zone_unusable); + WARN_ON(bg->space_info->disk_total < bg->length * factor); + } + bg->space_info->total_bytes -= bg->length; + bg->space_info->bytes_readonly -= (bg->length - bg->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(bg->space_info, -bg->zone_unusable); + bg->space_info->disk_total -= bg->length * factor; + spin_unlock(&bg->space_info->lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -1072,16 +1103,22 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct kobject *kobj = NULL; int ret; int index; - int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_map; bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - if (!block_group) + if (unlikely(!block_group)) { + btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; + } - BUG_ON(!block_group->ro); + if (unlikely(!block_group->ro && + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; + } trace_btrfs_remove_block_group(block_group); /* @@ -1093,7 +1130,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->length); index = 
btrfs_bg_flags_to_raid_index(block_group->flags); - factor = btrfs_bg_type_to_factor(block_group->flags); /* make sure this block group isn't part of an allocation cluster */ cluster = &fs_info->data_alloc_cluster; @@ -1114,8 +1150,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -1151,8 +1188,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&trans->transaction->cache_write_mutex); ret = btrfs_remove_free_space_inode(trans, inode, block_group); - if (ret) + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); goto out; + } write_lock(&fs_info->block_group_cache_lock); rb_erase_cached(&block_group->cache_node, @@ -1217,26 +1256,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - WARN_ON(block_group->space_info->total_bytes - < block_group->length); - WARN_ON(block_group->space_info->bytes_readonly - < block_group->length - block_group->zone_unusable); - WARN_ON(block_group->space_info->bytes_zone_unusable - < block_group->zone_unusable); - WARN_ON(block_group->space_info->disk_total - < block_group->length * factor); - } - block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= - (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, - -block_group->zone_unusable); - block_group->space_info->disk_total -= block_group->length * factor; - spin_unlock(&block_group->space_info->lock); + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) + btrfs_remove_bg_from_sinfo(block_group); + /* * Remove the free space for the block group from the free space tree * and the block 
group's item from the extent tree before marking the @@ -1247,14 +1271,24 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * deletes the block group item from the extent tree, allowing for * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). + * + * If the REMAPPED flag has been set the block group's free space + * has already been removed, so we can skip the call to + * btrfs_remove_block_group_free_space(). */ - ret = btrfs_remove_block_group_free_space(trans, block_group); - if (ret) - goto out; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { + ret = btrfs_remove_block_group_free_space(trans, block_group); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } ret = remove_block_group_item(trans, path, block_group); - if (ret < 0) + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); goto out; + } spin_lock(&block_group->lock); /* @@ -1377,8 +1411,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) goto out; } - num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->zone_unusable - cache->used; + num_bytes = btrfs_block_group_available_space(cache); /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1564,8 +1597,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (btrfs_is_block_group_used(block_group) || block_group->ro || - list_is_singular(&block_group->list)) { + if (btrfs_is_block_group_used(block_group) || + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) || + list_is_singular(&block_group->list) || + test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &block_group->runtime_flags)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. 
We do @@ -1606,9 +1641,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if ((space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) || - has_unwritten_metadata(block_group)) { + if (((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) && + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1773,6 +1809,9 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) btrfs_get_block_group(bg); trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED && + bg->identity_remap_count == 0) { + /* Leave fully remapped block groups on the fully_remapped_bgs list. */ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. 
*/ trace_btrfs_add_unused_block_group(bg); @@ -1805,6 +1844,12 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return false; + + if (btrfs_fs_closing(fs_info)) + return false; + if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; @@ -1839,12 +1884,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) struct btrfs_space_info *space_info; LIST_HEAD(retry_list); - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) - return; - - if (btrfs_fs_closing(fs_info)) - return; - if (!btrfs_should_reclaim(fs_info)) return; @@ -1872,6 +1911,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) while (!list_empty(&fs_info->reclaim_bgs)) { u64 used; u64 reserved; + u64 old_total; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1937,6 +1977,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_unlock(&bg->lock); + old_total = space_info->total_bytes; spin_unlock(&space_info->lock); /* @@ -1989,14 +2030,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; - if (READ_ONCE(space_info->periodic_reclaim)) - space_info->periodic_reclaim_ready = false; spin_unlock(&space_info->lock); } spin_lock(&space_info->lock); space_info->reclaim_count++; space_info->reclaim_bytes += used; space_info->reclaim_bytes += reserved; + if (space_info->total_bytes < old_total) + btrfs_set_periodic_reclaim_ready(space_info, true); spin_unlock(&space_info->lock); next: @@ -2249,7 +2290,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) while (nr--) { u64 len = min_t(u64, stripe_len, - cache->start + cache->length - logical[nr]); + btrfs_block_group_end(cache) - logical[nr]); cache->bytes_super += len; ret = btrfs_set_extent_bit(&fs_info->excluded_extents, @@ -2266,7 +2307,7 @@ static int 
exclude_super_stripes(struct btrfs_block_group *cache) return 0; } -static struct btrfs_block_group *btrfs_create_block_group_cache( +static struct btrfs_block_group *btrfs_create_block_group( struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; @@ -2360,7 +2401,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) } static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_block_group_item *bgi, + struct btrfs_block_group_item_v2 *bgi, const struct btrfs_key *key, int need_clear) { @@ -2370,16 +2411,21 @@ static int read_one_block_group(struct btrfs_fs_info *info, ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); - cache = btrfs_create_block_group_cache(info, key->objectid); + cache = btrfs_create_block_group(info, key->objectid); if (!cache) return -ENOMEM; cache->length = key->offset; - cache->used = btrfs_stack_block_group_used(bgi); - cache->commit_used = cache->used; - cache->flags = btrfs_stack_block_group_flags(bgi); - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->used = btrfs_stack_block_group_v2_used(bgi); + cache->last_used = cache->used; + cache->flags = btrfs_stack_block_group_v2_flags(bgi); + cache->last_flags = cache->flags; + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); + cache->last_remap_bytes = cache->remap_bytes; + cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi); + cache->last_identity_remap_count = cache->identity_remap_count; btrfs_set_free_space_tree_thresholds(cache); @@ -2444,10 +2490,10 @@ static int read_one_block_group(struct btrfs_fs_info *info, } else if (cache->length == cache->used) { cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); - } else if (cache->used == 0) { + } else if (cache->used == 0 && cache->remap_bytes == 0) { 
cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, - cache->start + cache->length, NULL); + btrfs_block_group_end(cache), NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -2464,7 +2510,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { - if (cache->used == 0) { + if (cache->used == 0 && cache->remap_bytes == 0) { ASSERT(list_empty(&cache->bg_list)); if (btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_discard_queue_work(&info->discard_ctl, cache); @@ -2491,7 +2537,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) struct btrfs_block_group *bg; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - bg = btrfs_create_block_group_cache(fs_info, map->start); + bg = btrfs_create_block_group(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; @@ -2568,9 +2614,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct extent_buffer *leaf; int slot; + size_t size; ret = find_first_block_group(info, path, &key); if (ret > 0) @@ -2581,8 +2628,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) leaf = path->nodes[0]; slot = path->slots[0]; + if (btrfs_fs_incompat(info, REMAP_TREE)) { + size = sizeof(struct btrfs_block_group_item_v2); + } else { + size = sizeof(struct btrfs_block_group_item); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0); + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); + size); btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_release_path(path); @@ -2652,28 +2707,38 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_item bgi; 
+ struct btrfs_block_group_item_v2 bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; - u64 old_commit_used; + u64 old_last_used; + size_t size; int ret; spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&bgi, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - block_group->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, block_group->flags); - old_commit_used = block_group->commit_used; - block_group->commit_used = block_group->used; + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count); + old_last_used = block_group->last_used; + block_group->last_used = block_group->used; + block_group->last_remap_bytes = block_group->remap_bytes; + block_group->last_identity_remap_count = block_group->identity_remap_count; + block_group->last_flags = block_group->flags; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + size = sizeof(struct btrfs_block_group_item_v2); + else + size = sizeof(struct btrfs_block_group_item); + + ret = btrfs_insert_item(trans, root, &key, &bgi, size); if (ret < 0) { spin_lock(&block_group->lock); - block_group->commit_used = old_commit_used; + block_group->last_used = old_last_used; spin_unlock(&block_group->lock); } @@ -2886,7 +2951,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran btrfs_set_log_full_commit(trans); - cache = btrfs_create_block_group_cache(fs_info, chunk_offset); + cache = 
btrfs_create_block_group(fs_info, chunk_offset); if (!cache) return ERR_PTR(-ENOMEM); @@ -3090,7 +3155,6 @@ unlock_out: void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) { struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; BUG_ON(!cache->ro); @@ -3106,10 +3170,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } - num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - - cache->zone_unusable - cache->used; - sinfo->bytes_readonly -= num_bytes; + sinfo->bytes_readonly -= btrfs_block_group_available_space(cache); list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -3125,10 +3186,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_key key; - u64 old_commit_used; - u64 used; + u64 old_last_used, old_last_remap_bytes; + u32 old_last_identity_remap_count; + u64 used, remap_bytes; + u32 identity_remap_count; /* * Block group items update can be triggered out of commit transaction @@ -3137,14 +3200,24 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, * may be changed. */ spin_lock(&cache->lock); - old_commit_used = cache->commit_used; + old_last_used = cache->last_used; + old_last_remap_bytes = cache->last_remap_bytes; + old_last_identity_remap_count = cache->last_identity_remap_count; used = cache->used; - /* No change in used bytes, can safely skip it. */ - if (cache->commit_used == used) { + remap_bytes = cache->remap_bytes; + identity_remap_count = cache->identity_remap_count; + /* No change in values, can safely skip it. 
*/ + if (cache->last_used == used && + cache->last_remap_bytes == remap_bytes && + cache->last_identity_remap_count == identity_remap_count && + cache->last_flags == cache->flags) { spin_unlock(&cache->lock); return 0; } - cache->commit_used = used; + cache->last_used = used; + cache->last_remap_bytes = remap_bytes; + cache->last_identity_remap_count = identity_remap_count; + cache->last_flags = cache->flags; spin_unlock(&cache->lock); key.objectid = cache->start; @@ -3160,25 +3233,37 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - cache->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, cache->flags); - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_set_stack_block_group_v2_used(&bgi, used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags); + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, + cache->identity_remap_count); + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item_v2)); + } else { + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item)); + } + fail: btrfs_release_path(path); /* - * We didn't update the block group item, need to revert commit_used + * We didn't update the block group item, need to revert last_used * unless the block group item didn't exist yet - this is to prevent a * race with a concurrent insertion of the block group item, with * insert_block_group_item(), that happened just after we attempted to - * update. In that case we would reset commit_used to 0 just after the + * update. 
In that case we would reset last_used to 0 just after the * insertion set it to a value greater than 0 - if the block group later * becomes with 0 used bytes, we would incorrectly skip its update. */ if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); - cache->commit_used = old_commit_used; + cache->last_used = old_last_used; + cache->last_remap_bytes = old_last_remap_bytes; + cache->last_identity_remap_count = old_last_identity_remap_count; spin_unlock(&cache->lock); } return ret; @@ -3701,7 +3786,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return -ENOENT; /* An extent can not span multiple block groups. */ - ASSERT(bytenr + num_bytes <= cache->start + cache->length); + ASSERT(bytenr + num_bytes <= btrfs_block_group_end(cache)); space_info = cache->space_info; factor = btrfs_bg_type_to_factor(cache->flags); @@ -4530,6 +4615,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } + + while (!list_empty(&info->fully_remapped_bgs)) { + block_group = list_first_entry(&info->fully_remapped_bgs, + struct btrfs_block_group, bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } spin_unlock(&info->unused_bgs_lock); spin_lock(&info->zone_active_bgs_lock); @@ -4680,6 +4772,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class) { + lockdep_assert_held(&bg->lock); ASSERT(size_class != BTRFS_BG_SZ_NONE); /* The new allocation is in the right size class, do nothing */ @@ -4717,3 +4810,103 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) return false; return true; } + +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, + struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + spin_lock(&bg->lock); + 
set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + + btrfs_discard_queue_work(&fs_info->discard_ctl, bg); + } else { + spin_lock(&fs_info->unused_bgs_lock); + /* + * The block group might already be on the unused_bgs list, + * remove it if it is. It'll get readded after + * btrfs_handle_fully_remapped_bgs() finishes. + */ + if (!list_empty(&bg->bg_list)) + list_del(&bg->bg_list); + else + btrfs_get_block_group(bg); + + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + } +} + +/* + * Compare the block group and chunk trees, and find any fully-remapped block + * groups which haven't yet had their chunk stripes and device extents removed, + * and put them on the fully_remapped_bgs list so this gets done. + * + * This happens when a block group becomes fully remapped, i.e. its last + * identity mapping is removed, and the volume is unmounted before async + * discard has finished. It's important this gets done as until it is the + * chunk's stripes are dead space. 
+ */ +int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node_bg, *node_chunk; + + node_bg = rb_first_cached(&fs_info->block_group_cache_tree); + node_chunk = rb_first_cached(&fs_info->mapping_tree); + + while (node_bg && node_chunk) { + struct btrfs_block_group *bg; + struct btrfs_chunk_map *map; + + bg = rb_entry(node_bg, struct btrfs_block_group, cache_node); + map = rb_entry(node_chunk, struct btrfs_chunk_map, rb_node); + + ASSERT(bg->start == map->start); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) + goto next; + + if (bg->identity_remap_count != 0) + goto next; + + if (map->num_stripes == 0) + goto next; + + spin_lock(&fs_info->unused_bgs_lock); + + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + } else { + list_move_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + } + + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Ideally we'd want to call btrfs_discard_queue_work() here, + * but it'd do nothing as the discard worker hasn't been + * started yet. + * + * The block group will get added to the discard list when + * btrfs_handle_fully_remapped_bgs() gets called, when we + * commit the first transaction. + */ + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + spin_lock(&bg->lock); + set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + } + +next: + node_bg = rb_next(node_bg); + node_chunk = rb_next(node_chunk); + } + + ASSERT(!node_bg && !node_chunk); + + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5f933455118c..c03e04292900 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -49,6 +49,7 @@ enum btrfs_discard_state { BTRFS_DISCARD_EXTENTS, BTRFS_DISCARD_BITMAPS, BTRFS_DISCARD_RESET_CURSOR, + BTRFS_DISCARD_FULLY_REMAPPED, }; /* @@ -92,6 +93,8 @@ enum btrfs_block_group_flags { * transaction. 
 */ BLOCK_GROUP_FLAG_NEW, + BLOCK_GROUP_FLAG_FULLY_REMAPPED, + BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, }; enum btrfs_caching_type { @@ -129,13 +132,22 @@ struct btrfs_block_group { u64 flags; u64 cache_generation; u64 global_root_id; + u64 remap_bytes; + u32 identity_remap_count; /* * The last committed used bytes of this block group, if the above - * is still the same as @commit_used, we don't need to update block + * is still the same as @last_used, we don't need to update block * group item of this block group. */ - u64 commit_used; + u64 last_used; + /* The last committed remap_bytes value of this block group. */ + u64 last_remap_bytes; + /* The last committed identity_remap_count value of this block group. */ + u32 last_identity_remap_count; + /* The last committed flags value for this block group. */ + u64 last_flags; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. @@ -282,7 +294,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || + bg->remap_bytes > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) @@ -295,6 +308,14 @@ static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); } +static inline u64 btrfs_block_group_available_space(const struct btrfs_block_group *bg) +{ + lockdep_assert_held(&bg->lock); + + return (bg->length - bg->used - bg->pinned - bg->reserved - + bg->bytes_super - bg->zone_unusable); +} + #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif @@ -324,6 +345,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct 
btrfs_fs_info *fs_info, const u64 chunk_offset); +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); @@ -395,5 +417,8 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, + struct btrfs_trans_handle *trans); +int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 96cf7a162987..e823230c09b7 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_TREE_LOG_OBJECTID: root->block_rsv = &fs_info->treelog_rsv; break; + case BTRFS_REMAP_TREE_OBJECTID: + root->block_rsv = &fs_info->remap_block_rsv; + break; default: root->block_rsv = NULL; break; @@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP); + fs_info->remap_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; @@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->remap_block_rsv.size > 0); + WARN_ON(fs_info->remap_block_rsv.reserved > 0); 
WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 79ae9d05cd91..8359fb96bc3c 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -22,6 +22,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_REMAP, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, BTRFS_BLOCK_RSV_TREELOG, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6b3357287b42..1e7174ad32e2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -21,7 +21,6 @@ #include <linux/sched/mm.h> #include <linux/log2.h> #include <linux/shrinker.h> -#include <crypto/hash.h> #include "misc.h" #include "ctree.h" #include "fs.h" @@ -87,37 +86,6 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) return false; } -static int compression_compress_pages(int type, struct list_head *ws, - struct btrfs_inode *inode, u64 start, - struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - switch (type) { - case BTRFS_COMPRESS_ZLIB: - return zlib_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_LZO: - return lzo_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_ZSTD: - return zstd_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_NONE: - default: - /* - * This can happen when compression races with remount setting - * it to 'no compress', while caller doesn't call - * inode_need_compress() to check if we really need to - * compress. - * - * Not a big deal, just need to inform caller that we - * haven't allocated any pages yet. 
- */ - *out_folios = 0; - return -E2BIG; - } -} - static int compression_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { @@ -156,13 +124,6 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_folios(struct compressed_bio *cb) -{ - for (unsigned int i = 0; i < cb->nr_folios; i++) - btrfs_free_compr_folio(cb->compressed_folios[i]); - kfree(cb->compressed_folios); -} - static int btrfs_decompress_bio(struct compressed_bio *cb); /* @@ -271,12 +232,14 @@ static void end_bbio_compressed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; + struct folio_iter fi; if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); + bio_for_each_folio_all(fi, &bbio->bio) + btrfs_free_compr_folio(fi.folio); bio_put(&bbio->bio); } @@ -327,6 +290,7 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); + struct folio_iter fi; btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, cb->bbio.bio.bi_status == BLK_STS_OK); @@ -334,29 +298,11 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) if (cb->writeback) end_compressed_writeback(cb); /* Note, our inode could be gone now. 
*/ - btrfs_free_compressed_folios(cb); + bio_for_each_folio_all(fi, &bbio->bio) + btrfs_free_compr_folio(fi.folio); bio_put(&cb->bbio.bio); } -static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) -{ - struct bio *bio = &cb->bbio.bio; - u32 offset = 0; - unsigned int findex = 0; - - while (offset < cb->compressed_len) { - struct folio *folio = cb->compressed_folios[findex]; - u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); - int ret; - - /* Maximum compressed extent is smaller than bio size limit. */ - ret = bio_add_folio(bio, folio, len, 0); - ASSERT(ret); - offset += len; - findex++; - } -} - /* * worker function to build and submit bios for previously compressed pages. * The corresponding pages in the inode should be marked for writeback @@ -367,35 +313,44 @@ static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) * the end io hooks. */ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct folio **compressed_folios, - unsigned int nr_folios, - blk_opf_t write_flags, - bool writeback) + struct compressed_bio *cb) { struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct compressed_bio *cb; ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); + ASSERT(cb->writeback); - cb = alloc_compressed_bio(inode, ordered->file_offset, - REQ_OP_WRITE | write_flags, - end_bbio_compressed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; - cb->writeback = writeback; - cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_folios(cb); btrfs_submit_bbio(&cb->bbio, 0); } /* + * Allocate a compressed write bio for @inode file offset @start length @len. 
+ * + * The caller still needs to properly queue all folios and populate involved + * members. + */ +struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, + u64 start, u64 len) +{ + struct compressed_bio *cb; + + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE, end_bbio_compressed_write); + cb->start = start; + cb->len = len; + cb->writeback = true; + + return cb; +} + +/* * Add extra pages in the same compressed file extent so that we don't need to * re-read the same extent again and again. * @@ -520,7 +475,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, folio_put(folio); break; } - add_size = min(em->start + em->len, page_end + 1) - cur; + add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur; btrfs_free_extent_map(em); btrfs_unlock_extent(tree, cur, page_end, NULL); @@ -571,13 +526,13 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map_tree *em_tree = &inode->extent_tree; struct compressed_bio *cb; unsigned int compressed_len; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 file_offset = bbio->file_offset; u64 em_len; u64 em_start; struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t status; int ret; /* we need the actual starting offset of this extent in the file */ @@ -585,7 +540,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - status = BLK_STS_IOERR; + ret = -EIO; goto out; } @@ -607,27 +562,30 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) btrfs_free_extent_map(em); - cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); - if (!cb->compressed_folios) { - status = BLK_STS_RESOURCE; - goto out_free_bio; - } + for (int i = 0; i * min_folio_size < compressed_len; i++) { + struct folio *folio; + 
u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); + + folio = btrfs_alloc_compr_folio(fs_info); + if (!folio) { + ret = -ENOMEM; + goto out_free_bio; + } - ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, - cb->compressed_folios); - if (ret) { - status = BLK_STS_RESOURCE; - goto out_free_compressed_pages; + ret = bio_add_folio(&cb->bbio.bio, folio, cur_len, 0); + if (unlikely(!ret)) { + folio_put(folio); + ret = -EINVAL; + goto out_free_bio; + } } + ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, &pflags); - /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); @@ -635,12 +593,10 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) btrfs_submit_bbio(&cb->bbio, 0); return; -out_free_compressed_pages: - kfree(cb->compressed_folios); out_free_bio: - bio_put(&cb->bbio.bio); + cleanup_compressed_bio(cb); out: - btrfs_bio_end_io(bbio, status); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); } /* @@ -1027,42 +983,71 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, } /* - * Given an address space and start and length, compress the bytes into @pages - * that are allocated on demand. - * - * @type_level is encoded algorithm and level, where level 0 means whatever - * default the algorithm chooses and is opaque here; - * - compression algo are 0-3 - * - the level are bits 4-7 + * Given an address space and start and length, compress the page cache + * contents into @cb. 
 * - * @out_folios is an in/out parameter, holds maximum number of folios to allocate - * and returns number of actually allocated folios + * @type_level: is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 * - * @total_in is used to return the number of bytes actually read. It - * may be smaller than the input length if we had to exit early because we - * ran out of room in the folios array or because we cross the - * max_out threshold. + * @cb->bbio.bio.bi_iter.bi_size will indicate the compressed data size. + * The bi_size may not be sectorsize aligned, thus the caller still needs + * to do the round up before submission. * - * @total_out is an in/out parameter, must be set to the input length and will - * be also used to return the total number of compressed bytes + * This function will allocate compressed folios with btrfs_alloc_compr_folio(), + * thus callers must make sure the endio function and error handling are using + * btrfs_free_compr_folio() to release those folios. + * This is already done in end_bbio_compressed_write() and cleanup_compressed_bio(). 
*/ -int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) +struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, + u64 start, u32 len, unsigned int type, + int level, blk_opf_t write_flags) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - const unsigned long orig_len = *total_out; struct list_head *workspace; + struct compressed_bio *cb; int ret; + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + end_bbio_compressed_write); + cb->start = start; + cb->len = len; + cb->writeback = true; + cb->compress_type = type; + level = btrfs_compress_set_level(type, level); workspace = get_workspace(fs_info, type, level); - ret = compression_compress_pages(type, workspace, inode, start, folios, - out_folios, total_in, total_out); - /* The total read-in bytes should be no larger than the input. */ - ASSERT(*total_in <= orig_len); + switch (type) { + case BTRFS_COMPRESS_ZLIB: + ret = zlib_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_LZO: + ret = lzo_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_ZSTD: + ret = zstd_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_NONE: + default: + /* + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. 
+ */ + ret = -E2BIG; + } + put_workspace(fs_info, type, workspace); - return ret; + if (ret < 0) { + cleanup_compressed_bio(cb); + return ERR_PTR(ret); + } + return cb; } static int btrfs_decompress_bio(struct compressed_bio *cb) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index e0228017e861..65b8bc4bbe0b 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -42,12 +42,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of compressed folios in the array. */ - unsigned int nr_folios; - - /* The folios with the compressed data on them. */ - struct folio **compressed_folios; - /* starting offset in the inode for our pages */ u64 start; @@ -91,18 +85,15 @@ int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); bool btrfs_compress_level_valid(unsigned int type, int level); -int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); +struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, + u64 start, u64 len); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct folio **compressed_folios, - unsigned int nr_folios, blk_opf_t write_flags, - bool writeback); + struct compressed_bio *cb); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); @@ -146,10 +137,21 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio 
**in_folio_ret); +struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, + u64 start, u32 len, unsigned int type, + int level, blk_opf_t write_flags); + +static inline void cleanup_compressed_bio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + btrfs_free_compr_folio(fi.folio); + bio_put(bio); +} -int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); +int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, @@ -158,9 +160,7 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); -int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); +int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, @@ -168,9 +168,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); +int zstd_compress_bio(struct list_head *ws, struct 
compressed_bio *cb); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a48b4befbee7..7267b2502665 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -249,6 +249,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID); u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && @@ -262,7 +263,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (is_reloc_root) reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, @@ -276,7 +277,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (is_reloc_root) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else btrfs_set_header_owner(cow, new_root_objectid); @@ -291,16 +292,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return ret; } - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_inc_ref(trans, root, cow, 1); - if (unlikely(ret)) - btrfs_abort_transaction(trans, ret); - } else { - ret = btrfs_inc_ref(trans, root, cow, 0); - if (unlikely(ret)) - btrfs_abort_transaction(trans, ret); - } - if (ret) { + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(cow); free_extent_buffer(cow); return ret; @@ -362,6 +356,7 @@ static noinline int update_ref_for_cow(struct 
btrfs_trans_handle *trans, u64 owner; u64 flags; int ret; + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); /* * Backrefs update rules: @@ -397,8 +392,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { refs = 1; - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else flags = 0; @@ -417,18 +411,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } if (refs > 1) { - if ((owner == btrfs_root_id(root) || - btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || is_reloc_root) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, true); if (ret) return ret; - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + if (is_reloc_root) { + ret = btrfs_dec_ref(trans, root, buf, false); if (ret) return ret; - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, true); if (ret) return ret; } @@ -437,23 +430,16 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } else { - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); if (ret) return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); if (ret) return ret; - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, true); if (ret) 
return ret; } @@ -4016,8 +4002,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(trans, path, new_key, split_offset); - return ret; + return split_item(trans, path, new_key, split_offset); } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 692370fc07b2..6de7ad191e04 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,14 @@ struct btrfs_path { struct btrfs_path *path_name __free(btrfs_free_path) = NULL /* + * This defines an on-stack path that will be auto released when exiting the scope. + * + * It is compatible with any existing manual btrfs_release_path() calls. + */ +#define BTRFS_PATH_AUTO_RELEASE(path_name) \ + struct btrfs_path path_name __free(btrfs_release_path) = { 0 } + +/* * The state of btrfs root */ enum { @@ -601,6 +609,7 @@ void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) +DEFINE_FREE(btrfs_release_path, struct btrfs_path, btrfs_release_path(&_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b81e224d4a27..ecf05cd64696 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -609,7 +609,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, { struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; - struct btrfs_path path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(path); struct extent_map *em; struct btrfs_key key; u64 ino = btrfs_ino(inode); @@ -720,16 +720,13 @@ next: if (ret > 0) goto not_found; } - btrfs_release_path(&path); return em; not_found: - btrfs_release_path(&path); btrfs_free_extent_map(em); return NULL; err: - btrfs_release_path(&path); btrfs_free_extent_map(em); return ERR_PTR(ret); } @@ -795,10 +792,11 @@ static bool 
defrag_check_next_extent(struct inode *inode, struct extent_map *em, { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; + const u64 em_end = btrfs_extent_map_end(em); bool ret = false; /* This is the last extent */ - if (em->start + em->len >= i_size_read(inode)) + if (em_end >= i_size_read(inode)) return false; /* @@ -807,7 +805,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * one will not be a target. * This will just cause extra IO without really reducing the fragments. */ - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + next = defrag_lookup_extent(inode, em_end, newer_than, locked); /* No more em or hole */ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) goto out; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4b7d9015e0da..1739a0b29c49 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -232,19 +232,19 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root, + struct btrfs_fs_info *fs_info, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; - spin_lock(&delayed_root->lock); - node = list_first_entry_or_null(&delayed_root->node_list, + spin_lock(&fs_info->delayed_root.lock); + node = list_first_entry_or_null(&fs_info->delayed_root.node_list, struct btrfs_delayed_node, n_list); if (node) { refcount_inc(&node->refs); btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } - spin_unlock(&delayed_root->lock); + spin_unlock(&fs_info->delayed_root.lock); return node; } @@ -257,7 +257,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( struct list_head *p; struct btrfs_delayed_node *next = NULL; - delayed_root = node->root->fs_info->delayed_root; + delayed_root = &node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); if 
(!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { /* not in the list */ @@ -287,7 +287,7 @@ static void __btrfs_release_delayed_node( if (!delayed_node) return; - delayed_root = delayed_node->root->fs_info->delayed_root; + delayed_root = &delayed_node->root->fs_info->delayed_root; mutex_lock(&delayed_node->mutex); if (delayed_node->count) @@ -425,7 +425,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, delayed_node->index_cnt = ins->index + 1; delayed_node->count++; - atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + atomic_inc(&delayed_node->root->fs_info->delayed_root.items); return 0; } @@ -443,7 +443,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; - struct btrfs_delayed_root *delayed_root; /* Not inserted, ignore it. */ if (RB_EMPTY_NODE(&delayed_item->rb_node)) @@ -452,8 +451,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) /* If it's in a rbtree, then we need to have delayed node locked. 
*/ lockdep_assert_held(&delayed_node->mutex); - delayed_root = delayed_node->root->fs_info->delayed_root; - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -462,8 +459,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); delayed_node->count--; - - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -980,30 +976,21 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { - struct btrfs_delayed_root *delayed_root; - if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; - - delayed_root = delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } } static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) { - if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { - struct btrfs_delayed_root *delayed_root; - ASSERT(delayed_node->root); delayed_node->count--; - - delayed_root = delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } } @@ -1137,8 +1124,8 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_record_root_in_trans(trans, node->root); if (ret) return ret; - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - return ret; + + return btrfs_update_delayed_inode(trans, node->root, path, node); } /* @@ -1150,7 +1137,6 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct btrfs_path *path; @@ -1168,9 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; - delayed_root = fs_info->delayed_root; - - curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); @@ -1417,7 +1401,7 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_node *node; - node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); + node = btrfs_first_delayed_node(fs_info, &delayed_node_tracker); if (WARN_ON(node)) { btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); @@ -1440,7 +1424,7 @@ static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; + struct btrfs_delayed_root *delayed_root = &fs_info->delayed_root; if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || btrfs_workqueue_normal_congested(fs_info->delayed_workers)) @@ -1970,7 +1954,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; - atomic_inc(&root->fs_info->delayed_root->items); + atomic_inc(&root->fs_info->delayed_root.items); release_node: 
mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); @@ -2012,7 +1996,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) mutex_lock(&delayed_node->mutex); if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { delayed_node->count++; - atomic_inc(&fs_info->delayed_root->items); + atomic_inc(&fs_info->delayed_root.items); } mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); @@ -2118,8 +2102,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root, - &curr_delayed_node_tracker); + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b09d4ec8c77d..fc752863f89b 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -30,21 +30,6 @@ enum btrfs_delayed_item_type { BTRFS_DELAYED_DELETION_ITEM }; -struct btrfs_delayed_root { - spinlock_t lock; - struct list_head node_list; - /* - * Used for delayed nodes which is waiting to be dealt with by the - * worker. If the delayed node is inserted into the work queue, we - * drop it from this list. 
- */ - struct list_head prepare_list; - atomic_t items; /* for delayed items */ - atomic_t items_seq; /* for delayed items */ - int nodes; /* for delayed nodes */ - wait_queue_head_t wait; -}; - struct btrfs_ref_tracker_dir { #ifdef CONFIG_BTRFS_DEBUG struct ref_tracker_dir dir; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 07e19e88ba4b..9a63200d7a53 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -763,7 +763,7 @@ static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, @@ -772,7 +772,7 @@ static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *it struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -785,19 +785,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; - - /* - * For bs > ps support, we heavily rely on large folios to make sure no - * block will cross large folio boundaries. - * - * But memory provided by direct IO is only virtually contiguous, not - * physically contiguous, and will break the btrfs' large folio requirement. - * - * So for bs > ps support, all direct IOs should fallback to buffered ones. 
- */ - if (fs_info->sectorsize > PAGE_SIZE) - return -EINVAL; - return 0; } @@ -814,6 +801,8 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; unsigned int ilock_flags = 0; struct iomap_dio *dio; + const u64 data_profile = btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_PROFILE_MASK; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -827,6 +816,16 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; + /* + * If our data profile has duplication (either extra mirrors or RAID56), + * we can not trust the direct IO buffer, the content may change during + * writeback and cause different contents written to different mirrors. + * + * Thus only RAID0 and SINGLE can go true zero-copy direct IO. + */ + if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0) + goto buffered; + relock: ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 89fe85778115..1c304bf473e5 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -216,6 +216,25 @@ static struct btrfs_block_group *find_next_block_group( } /* + * Check whether a block group is empty. + * + * "Empty" here means that there are no extents physically located within the + * device extents corresponding to this block group. + * + * For a remapped block group, this means that all of its identity remaps have + * been removed. For a non-remapped block group, this means that no extents + * have an address within its range, and that nothing has been remapped to be + * within it. + */ +static bool block_group_is_empty(const struct btrfs_block_group *bg) +{ + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return bg->identity_remap_count == 0; + + return bg->used == 0 && bg->remap_bytes == 0; +} + +/* * Look up next block group and set it for use. 
* * @discard_ctl: discard control @@ -241,8 +260,10 @@ again: block_group = find_next_block_group(discard_ctl, now); if (block_group && now >= block_group->discard_eligible_time) { + const bool empty = block_group_is_empty(block_group); + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && - block_group->used != 0) { + !empty) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); /* @@ -267,7 +288,12 @@ again: } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { block_group->discard_cursor = block_group->start; - block_group->discard_state = BTRFS_DISCARD_EXTENTS; + + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) { + block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED; + } else { + block_group->discard_state = BTRFS_DISCARD_EXTENTS; + } } } if (block_group) { @@ -373,7 +399,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; - if (block_group->used == 0) + if (block_group_is_empty(block_group)) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); @@ -470,7 +496,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, { remove_from_discard_list(discard_ctl, block_group); - if (block_group->used == 0) { + if (block_group_is_empty(block_group)) { if (btrfs_is_free_space_trimmed(block_group)) btrfs_mark_bg_unused(block_group); else @@ -524,7 +550,8 @@ static void btrfs_discard_workfn(struct work_struct *work) /* Perform discarding */ minlen = discard_minlen[discard_index]; - if (discard_state == BTRFS_DISCARD_BITMAPS) { + switch (discard_state) { + case BTRFS_DISCARD_BITMAPS: { u64 maxlen = 0; /* @@ -541,17 +568,28 @@ static void btrfs_discard_workfn(struct work_struct *work) btrfs_block_group_end(block_group), minlen, maxlen, true); discard_ctl->discard_bitmap_bytes += trimmed; - } else { + + 
break; + } + + case BTRFS_DISCARD_FULLY_REMAPPED: + btrfs_trim_fully_remapped_block_group(block_group); + break; + + default: btrfs_trim_block_group_extents(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, true); discard_ctl->discard_extent_bytes += trimmed; + + break; } /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { - if (discard_state == BTRFS_DISCARD_BITMAPS) { + if (discard_state == BTRFS_DISCARD_BITMAPS || + discard_state == BTRFS_DISCARD_FULLY_REMAPPED) { btrfs_finish_discard_pass(discard_ctl, block_group); } else { block_group->discard_cursor = block_group->start; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2833b44f4b4f..20c405a4789d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -18,11 +18,11 @@ #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <linux/unaligned.h> -#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" +#include "delayed-inode.h" #include "bio.h" #include "print-tree.h" #include "locking.h" @@ -62,12 +62,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - if (fs_info->csum_shash) - crypto_free_shash(fs_info->csum_shash); -} - /* * Compute the csum of a btree block and store the result to provided buffer. */ @@ -76,12 +70,11 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) struct btrfs_fs_info *fs_info = buf->fs_info; int num_pages; u32 first_page_part; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; char *kaddr; int i; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); + btrfs_csum_init(&csum, fs_info->csum_type); if (buf->addr) { /* Pages are contiguous, handle them as a big one. 
*/ @@ -94,21 +87,21 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) num_pages = num_extent_pages(buf); } - crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - first_page_part - BTRFS_CSUM_SIZE); + btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE, + first_page_part - BTRFS_CSUM_SIZE); /* * Multiple single-page folios case would reach here. * * nodesize <= PAGE_SIZE and large folio all handled by above - * crypto_shash_update() already. + * btrfs_csum_update() already. */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = folio_address(buf->folios[i]); - crypto_shash_update(shash, kaddr, PAGE_SIZE); + btrfs_csum_update(&csum, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); + btrfs_csum_final(&csum, result); } /* @@ -160,18 +153,15 @@ static bool btrfs_supported_super_csum(u16 csum_type) int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb) { - char result[BTRFS_CSUM_SIZE]; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - - shash->tfm = fs_info->csum_shash; + u8 result[BTRFS_CSUM_SIZE]; /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. 
*/ - crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); + btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; @@ -186,7 +176,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, const u32 step = min(fs_info->nodesize, PAGE_SIZE); const u32 nr_steps = eb->len / step; phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; - int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; @@ -208,9 +197,8 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } - ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, - paddrs, step, mirror_num); - return ret; + return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, + eb->start, paddrs, step, mirror_num); } /* @@ -382,22 +370,19 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); - ret = -EIO; - goto out; + return -EIO; } if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); - ret = -EIO; - goto out; + return -EIO; } found_level = btrfs_header_level(eb); if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); - ret = -EIO; - goto out; + return -EIO; } csum_tree_block(eb, result); @@ -412,18 +397,15 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? 
", ignored" : ""); - if (unlikely(!ignore_csum)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(!ignore_csum)) + return -EUCLEAN; } if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); - ret = -EIO; - goto out; + return -EIO; } if (unlikely(check->transid && btrfs_header_generation(eb) != check->transid)) { @@ -431,8 +413,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, check->transid, btrfs_header_generation(eb)); - ret = -EIO; - goto out; + return -EIO; } if (check->has_first_key) { const struct btrfs_key *expect_key = &check->first_key; @@ -450,14 +431,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, expect_key->type, expect_key->offset, found_key.objectid, found_key.type, found_key.offset); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } if (check->owner_root) { ret = btrfs_check_eb_owner(eb, check->owner_root); if (ret < 0) - goto out; + return ret; } /* If this is a leaf block and it is corrupt, just return -EIO. 
*/ @@ -471,7 +451,6 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); -out: return ret; } @@ -815,7 +794,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; - struct btrfs_key key; unsigned int nofs_flag; int ret = 0; @@ -864,10 +842,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item); if (ret) goto fail; @@ -1153,6 +1128,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); + case BTRFS_REMAP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->remap_root); default: return NULL; } @@ -1229,11 +1206,9 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) ASSERT(percpu_counter_sum_positive(em_counter) == 0); percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); - btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); @@ -1244,6 +1219,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); + btrfs_put_root(fs_info->remap_root); btrfs_check_leaked_roots(fs_info); 
btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1488,6 +1464,10 @@ static int cleaner_kthread(void *arg) */ btrfs_run_defrag_inodes(fs_info); + if (btrfs_fs_incompat(fs_info, REMAP_TREE) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_handle_fully_remapped_bgs(fs_info); + /* * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation @@ -1796,6 +1776,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); + free_root_extent_buffers(info->remap_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -1983,21 +1964,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) return 0; } -static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) { - struct crypto_shash *csum_shash; - const char *csum_driver = btrfs_super_csum_driver(csum_type); - - csum_shash = crypto_alloc_shash(csum_driver, 0, 0); - - if (IS_ERR(csum_shash)) { - btrfs_err(fs_info, "error allocating %s hash for checksum", - csum_driver); - return PTR_ERR(csum_shash); - } - - fs_info->csum_shash = csum_shash; - /* Check if the checksum implementation is a fast accelerated one. 
*/ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: @@ -2011,10 +1979,8 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) break; } - btrfs_info(fs_info, "using %s (%s) checksum algorithm", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(csum_shash)); - return 0; + btrfs_info(fs_info, "using %s checksum algorithm", + btrfs_super_csum_name(csum_type)); } static int btrfs_replay_log(struct btrfs_fs_info *fs_info, @@ -2172,11 +2138,10 @@ static int load_global_roots(struct btrfs_root *tree_root) return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) return ret; - ret = load_global_roots_objectid(tree_root, path, - BTRFS_FREE_SPACE_TREE_OBJECTID, - "free space"); - return ret; + return load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); } static int btrfs_read_roots(struct btrfs_fs_info *fs_info) @@ -2225,21 +2190,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) goto out; - /* - * This tree can share blocks with some other fs tree during relocation - * and we need a proper setup by btrfs_get_fs_root - */ - root = btrfs_get_fs_root(tree_root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID, true); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - ret = PTR_ERR(root); - goto out; + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* The remap_root has already been loaded in load_important_roots(). */ + root = fs_info->remap_root; + + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + /* Check that data reloc tree doesn't also exist. 
*/ + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root = btrfs_read_tree_root(fs_info->tree_root, &location); + if (!IS_ERR(root)) { + btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); + btrfs_put_root(root); + return -EIO; + } else if (PTR_ERR(root) != -ENOENT) { + btrfs_warn(fs_info, "error %ld when checking for data reloc tree", + PTR_ERR(root)); } } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; + /* + * This tree can share blocks with some other fs tree during + * relocation and we need a proper setup by btrfs_get_fs_root(). + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + } } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; @@ -2479,6 +2467,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* + * Reduce test matrix for remap tree by requiring block-group-tree + * and no-holes. Free-space-tree is a hard requirement. 
+ */ + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + btrfs_err(fs_info, +"remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, ZONED)) { + btrfs_err(fs_info, "remap-tree not supported with zoned devices"); + ret = -EINVAL; + } + + if (sectorsize > PAGE_SIZE) { + btrfs_err(fs_info, "remap-tree not supported when block size > page size"); + ret = -EINVAL; + } + } + /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -2637,6 +2654,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + bytenr = btrfs_super_remap_root(sb); + gen = btrfs_super_remap_root_generation(sb); + level = btrfs_super_remap_root_level(sb); + ret = load_super_root(fs_info->remap_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read remap root"); + return ret; + } + } + return 0; } @@ -2773,6 +2802,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs); INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); @@ -2785,6 +2815,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP); 
btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, @@ -2827,6 +2858,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->remap_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); @@ -2901,11 +2933,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) - return -ENOMEM; - btrfs_init_delayed_root(fs_info->delayed_root); + btrfs_init_delayed_root(&fs_info->delayed_root); if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); @@ -3018,6 +3046,8 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) btrfs_warn(fs_info, "'clear_cache' option is ignored with extent tree v2"); + else if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + btrfs_warn(fs_info, "'clear_cache' option is ignored with remap tree"); else rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && @@ -3032,7 +3062,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to rebuild free space tree: %d", ret); - goto out; + return ret; } } @@ -3043,11 +3073,20 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to disable free space tree: %d", ret); - goto out; + return ret; } } /* + * Before btrfs-progs v6.16.1 mkfs.btrfs can leave free space entries + * for deleted temporary chunks. Delete them if they exist. 
+ */ + ret = btrfs_delete_orphan_free_space_entries(fs_info); + if (ret < 0) { + btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret); + return ret; + } + /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load * them into the fs_info->fs_roots_radix tree. This must be done before @@ -3060,17 +3099,17 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) */ ret = btrfs_find_orphan_roots(fs_info); if (ret) - goto out; + return ret; ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto out; + return ret; down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); - goto out; + return ret; } up_read(&fs_info->cleanup_work_sem); @@ -3079,7 +3118,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); - goto out; + return ret; } if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && @@ -3089,24 +3128,24 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to create free space tree: %d", ret); - goto out; + return ret; } } if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); if (ret) - goto out; + return ret; } ret = btrfs_resume_balance_async(fs_info); if (ret) - goto out; + return ret; ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { btrfs_warn(fs_info, "failed to resume dev_replace"); - goto out; + return ret; } btrfs_qgroup_rescan_resume(fs_info); @@ -3117,12 +3156,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); - goto out; + return ret; } } -out: - return ret; + return 0; } /* @@ -3253,6 +3291,7 
@@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; + struct btrfs_root *remap_root; int ret; int level; @@ -3302,12 +3341,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } fs_info->csum_size = btrfs_super_csum_size(disk_super); + fs_info->csum_type = csum_type; - ret = btrfs_init_csum_hash(fs_info, csum_type); - if (ret) { - btrfs_release_disk_super(disk_super); - goto fail_alloc; - } + btrfs_init_csum_hash(fs_info, csum_type); /* * We want to check superblock checksum, the type is stored inside. @@ -3390,6 +3426,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret < 0) goto fail_alloc; + if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID, + GFP_KERNEL); + fs_info->remap_root = remap_root; + if (!remap_root) { + ret = -ENOMEM; + goto fail_alloc; + } + } + /* * At this point our mount options are validated, if we set ->max_inline * to something non-standard make sure we truncate it to sectorsize. 
@@ -3541,6 +3587,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + ret = btrfs_populate_fully_remapped_bgs_list(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to populate fully_remapped_bgs list: %d", ret); + goto fail_sysfs; + } + } + btrfs_zoned_reserve_data_reloc_bg(fs_info); btrfs_free_zone_cache(fs_info); @@ -3709,7 +3763,6 @@ static int write_dev_supers(struct btrfs_device *device, { struct btrfs_fs_info *fs_info = device->fs_info; struct address_space *mapping = device->bdev->bd_mapping; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int ret; u64 bytenr, bytenr_orig; @@ -3719,8 +3772,6 @@ static int write_dev_supers(struct btrfs_device *device, if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - shash->tfm = fs_info->csum_shash; - for (i = 0; i < max_mirrors; i++) { struct folio *folio; struct bio *bio; @@ -3744,9 +3795,8 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr_orig); - crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, - sb->csum); + btrfs_csum(fs_info->csum_type, (const u8 *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, @@ -3866,7 +3916,7 @@ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; - device->last_flush_error = BLK_STS_OK; + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); @@ -3891,7 +3941,7 @@ static bool wait_dev_flush(struct btrfs_device *device) wait_for_completion_io(&device->flush_wait); if (bio->bi_status) { - device->last_flush_error = bio->bi_status; + set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); 
btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); return true; } @@ -3941,7 +3991,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* - * Checks last_flush_error of disks in order to determine the device + * Checks flush failure of disks in order to determine the device * state. */ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index bb2ca1c9c7b0..d0dd50f7d279 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -187,8 +187,6 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, int set) { - int ret; - if (!changeset) return 0; if (set && (state->state & bits) == bits) @@ -196,9 +194,8 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, if (!set && (state->state & bits) == 0) return 0; changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; + + return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); } static inline struct extent_state *next_state(struct extent_state *state) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e4cae34620d1..03cf9f242c70 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -41,6 +41,7 @@ #include "tree-checker.h" #include "raid-stripe-tree.h" #include "delayed-inode.h" +#include "relocation.h" #undef SCRAMBLE_DELAYED_REFS @@ -476,7 +477,7 @@ again: btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; + return ret; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); @@ -487,12 +488,11 @@ again: btrfs_release_path(path); goto again; } - ret = 0; - break; + return 0; } path->slots[0]++; } -fail: + return ret; } @@ -1380,7 +1380,7 @@ out: } int 
btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) + u64 num_bytes, u64 *actual_bytes, bool do_remap) { int ret = 0; u64 discarded_bytes = 0; @@ -1398,7 +1398,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, int i; num_bytes = end - cur; - stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes, + do_remap); if (IS_ERR(stripes)) { ret = PTR_ERR(stripes); if (ret == -EOPNOTSUPP) @@ -1553,6 +1554,28 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_RSV_DATA); } +static int drop_remap_tree_ref(struct btrfs_trans_handle *trans, + const struct btrfs_delayed_ref_node *node) +{ + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + int ret; + + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, const struct btrfs_delayed_ref_node *node, @@ -1747,7 +1770,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, extent_op); + if (node->ref_root == BTRFS_REMAP_TREE_OBJECTID) + ret = drop_remap_tree_ref(trans, node); + else + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1761,35 +1787,39 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret 
= 0; if (TRANS_ABORTED(trans)) { if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes); - free_head_ref_squota_rsv(trans->fs_info, href); + free_head_ref_squota_rsv(fs_info, href); } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) + node->type == BTRFS_SHARED_BLOCK_REF_KEY) { ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) + } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) { ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); - else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) - ret = 0; - else - BUG(); - if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes); - if (ret < 0) - btrfs_err(trans->fs_info, + } else if (unlikely(node->type != BTRFS_EXTENT_OWNER_REF_KEY)) { + ret = -EUCLEAN; + btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type); + } + + if (unlikely(ret)) { + if (insert_reserved) + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); + btrfs_err(fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", node->bytenr, node->num_bytes, node->type, node->action, node->ref_mod, ret); + } + return ret; } @@ -2470,7 +2500,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int i; int action; int level; - int ret = 0; + int ret; if (btrfs_is_testing(fs_info)) return 0; @@ -2522,7 +2552,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, else ret = btrfs_free_extent(trans, &ref); if (ret) - goto fail; + return ret; } else { /* We don't know the owning_root, leave as 0. 
*/ ref.bytenr = btrfs_node_blockptr(buf, i); @@ -2535,12 +2565,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, else ret = btrfs_free_extent(trans, &ref); if (ret) - goto fail; + return ret; } } return 0; -fail: - return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2559,17 +2587,17 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; - u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; + else if (root == fs_info->remap_root) + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; else flags = BTRFS_BLOCK_GROUP_METADATA; - ret = btrfs_get_alloc_profile(fs_info, flags); - return ret; + return btrfs_get_alloc_profile(fs_info, flags); } static u64 first_logical_byte(struct btrfs_fs_info *fs_info) @@ -2753,8 +2781,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 len; bool readonly; - if (!cache || - start >= cache->start + cache->length) { + if (!cache || start >= btrfs_block_group_end(cache)) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; @@ -2770,7 +2797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, empty_cluster <<= 1; } - len = cache->start + cache->length - start; + len = btrfs_block_group_end(cache) - start; len = min(len, end + 1 - start); if (return_free_space) @@ -2819,6 +2846,75 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, return 0; } +/* + * Complete the remapping of a block group by removing its chunk stripes and + * device extents, and adding it to the unused list if there's no longer any + * extents nominally within it. 
+ */ +int btrfs_complete_bg_remapping(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_chunk_map *map; + int ret; + + map = btrfs_get_chunk_map(fs_info, bg->start, 1); + if (IS_ERR(map)) + return PTR_ERR(map); + + ret = btrfs_last_identity_remap_gone(map, bg); + if (ret) { + btrfs_free_chunk_map(map); + return ret; + } + + /* + * Set num_stripes to 0, so that btrfs_remove_dev_extents() won't run a + * second time. + */ + map->num_stripes = 0; + + btrfs_free_chunk_map(map); + + if (bg->used == 0) { + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&bg->bg_list)) { + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + } + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_mark_bg_unused(bg); + } + + return 0; +} + +void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *bg; + int ret; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->fully_remapped_bgs)) { + bg = list_first_entry(&fs_info->fully_remapped_bgs, + struct btrfs_block_group, bg_list); + list_del_init(&bg->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_discard_extent(fs_info, bg->start, bg->length, NULL, false); + + ret = btrfs_complete_bg_remapping(bg); + if (ret) { + btrfs_put_block_group(bg); + return; + } + + btrfs_put_block_group(bg); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2839,7 +2935,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, - end + 1 - start, NULL); + end + 1 - start, NULL, true); next_state = btrfs_next_extent_state(unpin, cached_state); btrfs_clear_extent_dirty(unpin, start, end, &cached_state); @@ -2897,7 +2993,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle 
*trans) ret = -EROFS; if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, - block_group->length, NULL); + block_group->length, NULL, true); /* * Not strictly necessary to lock, as the block_group should be @@ -2971,11 +3067,22 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, struct btrfs_squota_delta *delta) + u64 bytenr, struct btrfs_squota_delta *delta, + struct btrfs_path *path) { int ret; + bool remapped = false; u64 num_bytes = delta->num_bytes; + /* Returns 1 on success and 0 on no-op. */ + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr, num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } else if (ret == 1) { + remapped = true; + } + if (delta->is_data) { struct btrfs_root *csum_root; @@ -2999,10 +3106,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - return ret; + /* If remapped, FST has already been taken care of in remove_range_from_remap_tree(). 
*/ + if (!remapped) { + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } } ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); @@ -3361,7 +3471,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, &delta); + ret = do_free_extent_accounting(trans, bytenr, &delta, path); } btrfs_release_path(path); @@ -3462,12 +3572,12 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, return 0; if (btrfs_header_generation(buf) != trans->transid) - goto out; + return 0; if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) - goto out; + return 0; } bg = btrfs_lookup_block_group(fs_info, buf->start); @@ -3475,7 +3585,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); - goto out; + return 0; } /* @@ -3499,7 +3609,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, || btrfs_is_zoned(fs_info)) { pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); - goto out; + return 0; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); @@ -3509,7 +3619,6 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); -out: return 0; } @@ -4191,10 +4300,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, else trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - return ret; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); @@ -4288,36 +4395,43 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, struct 
find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info) { + struct btrfs_block_group *block_group; + if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); - } else if (ffe_ctl->for_data_reloc) { + return 0; + } + + if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); - } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { - struct btrfs_block_group *block_group; + return 0; + } - spin_lock(&fs_info->zone_active_bgs_lock); - list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - /* - * No lock is OK here because avail is monotonically - * decreasing, and this is just a hint. - */ - u64 avail = block_group->zone_capacity - block_group->alloc_offset; + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; - if (block_group_bits(block_group, ffe_ctl->flags) && - block_group->space_info == space_info && - avail >= ffe_ctl->num_bytes) { - ffe_ctl->hint_byte = block_group->start; - break; - } + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint. 
+ */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; } - spin_unlock(&fs_info->zone_active_bgs_lock); } + spin_unlock(&fs_info->zone_active_bgs_lock); return 0; } @@ -4441,7 +4555,8 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || - block_group->ro) { + block_group->ro || + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { /* * someone is removing this block group, * we can't jump into the have_block_group @@ -4475,7 +4590,8 @@ search: ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) { + if (unlikely(block_group->ro || + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); if (ffe_ctl->for_data_reloc) @@ -4562,7 +4678,7 @@ have_block_group: /* move on to the next group */ if (ffe_ctl->search_start + ffe_ctl->num_bytes > - block_group->start + block_group->length) { + btrfs_block_group_end(block_group)) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); @@ -4883,6 +4999,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + if (unlikely(node->ref_root == BTRFS_REMAP_TREE_OBJECTID)) + goto skip; + extent_key.objectid = node->bytenr; if (skinny_metadata) { /* The owner of a tree block is the level. 
*/ @@ -4935,6 +5054,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_free_path(path); +skip: return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } @@ -5263,7 +5383,6 @@ struct walk_control { * @root: the root we are currently deleting * @wc: the walk control for this deletion * @eb: the parent eb that we're currently visiting - * @refs: the number of refs for wc->level - 1 * @flags: the flags for wc->level - 1 * @slot: the slot in the eb that we're currently checking * @@ -5458,12 +5577,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, true); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, false); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; @@ -5864,18 +5983,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - ret = btrfs_dec_ref(trans, root, eb, 1); - if (ret) { - btrfs_abort_transaction(trans, ret); - return ret; - } - } else { - ret = btrfs_dec_ref(trans, root, eb, 0); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - return ret; - } + const bool full_backref = (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF); + + ret = btrfs_dec_ref(trans, root, eb, full_backref); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; } if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); @@ -6400,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6 * it while performing the free space search since we have already * held back allocations. 
*/ -static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) +static int btrfs_trim_free_extents_throttle(struct btrfs_device *device, + u64 *trimmed, u64 pos, u64 *ret_next_pos) { - u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; + u64 start = pos; + u64 trim_len = 0; *trimmed = 0; @@ -6423,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) while (1) { struct btrfs_fs_info *fs_info = device->fs_info; + u64 cur_start; + u64 end; + u64 len; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; + cur_start = start; btrfs_find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + start = max(start, cur_start); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { @@ -6457,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) end = min(end, device->total_bytes - 1); len = end - start + 1; + len = min(len, BTRFS_MAX_TRIM_LENGTH); /* We didn't find any extents */ if (!len) { @@ -6477,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; + trim_len += len; + if (trim_len >= BTRFS_MAX_TRIM_LENGTH) { + *ret_next_pos = start; + ret = -EAGAIN; + break; + } if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; @@ -6489,20 +6616,134 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) return ret; } +static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed, + u64 *dev_failed, int *dev_ret) +{ + struct btrfs_device *dev; + struct btrfs_device *working_dev = NULL; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u8 uuid[BTRFS_UUID_SIZE]; + u64 start = BTRFS_DEVICE_RANGE_RESERVED; + + *trimmed = 0; + *dev_failed = 0; + *dev_ret = 0; + + /* Find the device with the smallest UUID to start. 
*/ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!working_dev) + return 0; + + while (1) { + u64 group_trimmed = 0; + u64 next_pos = 0; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + /* Find and trim the current device. */ + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (dev == working_dev) { + ret = btrfs_trim_free_extents_throttle(working_dev, + &group_trimmed, start, &next_pos); + break; + } + } + + /* Throttle: continue the same device from the new position. */ + if (ret == -EAGAIN && next_pos > start) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + start = next_pos; + cond_resched(); + continue; + } + + /* User interrupted. */ + if (ret == -ERESTARTSYS || ret == -EINTR) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + return ret; + } + + /* + * Device completed (ret == 0), failed, or EAGAIN with no progress. + * Record error if any, then move to next device. + */ + if (ret == -EAGAIN) { + /* No progress - log and skip device. */ + btrfs_warn(fs_info, + "trim throttle: no progress, offset=%llu device %s, skipping", + start, btrfs_dev_name(working_dev)); + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } else if (ret) { + /* Device failed with error. */ + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } + + /* + * Find next device: smallest UUID larger than current. + * Devices added during trim with smaller UUID will be skipped. 
+ */ + working_dev = NULL; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + /* Must larger than current UUID. */ + if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0) + continue; + /* Find the smallest. */ + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + + mutex_unlock(&fs_devices->device_list_mutex); + + *trimmed += group_trimmed; + start = BTRFS_DEVICE_RANGE_RESERVED; + + /* No more devices. */ + if (!working_dev) + break; + + cond_resched(); + } + + return 0; +} + /* * Trim the whole filesystem by: * 1) trimming the free space in each block group * 2) trimming the unallocated space on each device * * This will also continue trimming even if a block group or device encounters - * an error. The return value will be the last error, or 0 if nothing bad + * an error. The return value will be the first error, or 0 if nothing bad * happens. 
*/ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; - struct btrfs_device *device; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -6533,14 +6774,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) } start = max(range->start, cache->start); - end = min(range_end, cache->start + cache->length); + end = min(range_end, btrfs_block_group_end(cache)); if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; - bg_ret = ret; + if (!bg_ret) + bg_ret = ret; continue; } } @@ -6551,9 +6793,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) range->minlen); trimmed += group_trimmed; + if (ret == -ERESTARTSYS || ret == -EINTR) { + btrfs_put_block_group(cache); + break; + } if (ret) { bg_failed++; - bg_ret = ret; + if (!bg_ret) + bg_ret = ret; continue; } } @@ -6561,30 +6808,22 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (bg_failed) btrfs_warn(fs_info, - "failed to trim %llu block group(s), last error %d", + "failed to trim %llu block group(s), first error %d", bg_failed, bg_ret); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - continue; + if (ret == -ERESTARTSYS || ret == -EINTR) + return ret; - ret = btrfs_trim_free_extents(device, &group_trimmed); - - trimmed += group_trimmed; - if (ret) { - dev_failed++; - dev_ret = ret; - break; - } - } - mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret); + trimmed += group_trimmed; if (dev_failed) btrfs_warn(fs_info, - "failed to trim %llu device(s), last error %d", + "failed to trim %llu device(s), first error %d", 
dev_failed, dev_ret); range->len = trimmed; + if (ret == -ERESTARTSYS || ret == -EINTR) + return ret; if (bg_ret) return bg_ret; return dev_ret; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 71bb8109c969..ff330d4896d6 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -161,7 +161,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent); void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); + u64 num_bytes, u64 *actual_bytes, bool do_remap); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); +void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info); +int btrfs_complete_bg_remapping(struct btrfs_block_group *bg); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f6cca3c97166..3df399dc8856 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -440,8 +440,7 @@ again: loops = 1; goto again; } else { - found = false; - goto out_failed; + return false; } } @@ -461,7 +460,7 @@ again: } *start = delalloc_start; *end = delalloc_end; -out_failed: + return found; } @@ -970,7 +969,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, { const u64 ra_pos = readahead_pos(ractl); const u64 ra_end = ra_pos + readahead_length(ractl); - const u64 em_end = em->start + em->len; + const u64 em_end = btrfs_extent_map_end(em); /* No expansion for holes and inline extents. 
*/ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) @@ -998,11 +997,17 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; u64 extent_offset; + u64 locked_end; u64 last_byte = i_size_read(inode); struct extent_map *em; int ret = 0; const size_t blocksize = fs_info->sectorsize; + if (bio_ctrl->ractl) + locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1; + else + locked_end = end; + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); @@ -1036,7 +1041,14 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, blocksize); continue; } - em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); + /* + * Search extent map for the whole locked range. + * This will allow btrfs_get_extent() to return a larger hole + * when possible. + * This can reduce duplicated btrfs_get_extent() calls for large + * holes. + */ + em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); @@ -1426,8 +1438,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; + unsigned int start_bit; + unsigned int end_bit; int ret = 0; - int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
*/ if (btrfs_is_subpage(fs_info, folio)) { @@ -1437,10 +1450,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { - u64 start = page_start + (bit << fs_info->sectorsize_bits); + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, + blocks_per_folio) { + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; - btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + btrfs_folio_set_lock(fs_info, folio, start, len); } /* Lock all (subpage) delalloc ranges inside the folio first. */ @@ -1557,10 +1572,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, fs_info->sectorsize_bits, blocks_per_folio); - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) - btrfs_mark_ordered_io_finished(inode, folio, - page_start + (bit << fs_info->sectorsize_bits), - fs_info->sectorsize, false); + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, + bitmap_size) { + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; + + btrfs_mark_ordered_io_finished(inode, folio, start, len, false); + } return ret; } out: @@ -1598,7 +1616,7 @@ out: /* * Return 0 if we have submitted or queued the sector for submission. - * Return <0 for critical errors, and the sector will have its dirty flag cleared. + * Return <0 for critical errors, and the involved sector will be cleaned up. * * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ @@ -1623,6 +1641,13 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) { /* + * bio_ctrl may contain a bio crossing several folios. 
+ * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + + /* * When submission failed, we should still clear the folio dirty. * Or the folio will be written back again but without any * ordered extent. @@ -1630,6 +1655,13 @@ static int submit_one_sector(struct btrfs_inode *inode, btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); + + /* + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, filepos, + fs_info->sectorsize, false); return PTR_ERR(em); } @@ -1714,8 +1746,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return ret; } - for (cur = start; cur < end; cur += fs_info->sectorsize) - set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits, + len >> fs_info->sectorsize_bits); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, blocks_per_folio); @@ -1756,19 +1788,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (unlikely(ret < 0)) { - /* - * bio_ctrl may contain a bio crossing several folios. - * Submit it immediately so that the bio has a chance - * to finish normally, other than marked as error. - */ - submit_one_bio(bio_ctrl); - /* - * Failed to grab the extent map which should be very rare. - * Since there is no bio submitted to finish the ordered - * extent, we have to manually finish this sector. 
- */ - btrfs_mark_ordered_io_finished(inode, folio, cur, - fs_info->sectorsize, false); if (!found_error) found_error = ret; continue; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7e38c23a0c1c..095a561d733f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -319,8 +319,15 @@ static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, /* Internal sanity checks for btrfs debug builds. */ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) { + const u32 blocksize = fs_info->sectorsize; + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) return; + + if (!IS_ALIGNED(em->start, blocksize) || + !IS_ALIGNED(em->len, blocksize)) + dump_extent_map(fs_info, "unaligned start offset or length members", em); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { if (em->disk_num_bytes == 0) dump_extent_map(fs_info, "zero disk_num_bytes", em); @@ -334,6 +341,11 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", em); + if (!IS_ALIGNED(em->disk_bytenr, blocksize) || + !IS_ALIGNED(em->disk_num_bytes, blocksize) || + !IS_ALIGNED(em->offset, blocksize) || + !IS_ALIGNED(em->ram_bytes, blocksize)) + dump_extent_map(fs_info, "unaligned members", em); } else if (em->offset) { dump_extent_map(fs_info, "non-zero offset for hole/inline", em); } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 14e5257f0f04..7bd715442f3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,7 +8,6 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/sched/mm.h> -#include <crypto/hash.h> #include "messages.h" #include "ctree.h" #include "disk-io.h" @@ -769,7 +768,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, 
fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums = bbio->sums; struct bvec_iter iter = *src; @@ -781,8 +779,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) u32 offset = 0; int index = 0; - shash->tfm = fs_info->csum_shash; - btrfs_bio_for_each_block(paddr, bio, &iter, step) { paddrs[(offset / step) % nr_steps] = paddr; offset += step; @@ -1138,7 +1134,7 @@ again: } ret = PTR_ERR(item); if (ret != -EFBIG && ret != -ENOENT) - goto out; + return ret; if (ret == -EFBIG) { u32 item_size; @@ -1154,7 +1150,7 @@ again: /* We didn't find a csum item, insert one. */ ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; found_next = 1; goto insert; } @@ -1182,7 +1178,7 @@ again: csum_size, 1); path->search_for_extension = false; if (ret < 0) - goto out; + return ret; if (ret > 0) { if (path->slots[0] == 0) @@ -1238,14 +1234,14 @@ extend_csum: btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; found_next = 1; goto insert; } ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; if (tmp <= INT_MAX) @@ -1286,7 +1282,7 @@ insert: ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -1311,8 +1307,8 @@ found: cond_resched(); goto again; } -out: - return ret; + + return 0; } void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aca2b541e72d..acaa3dbd2b7b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -566,7 +566,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret = 0; + int ret; u64 ino = btrfs_ino(inode); path 
= btrfs_alloc_path(); @@ -581,7 +581,7 @@ again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -590,20 +590,20 @@ again: if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -633,7 +633,7 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - goto out; + return 0; } } @@ -661,7 +661,7 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - goto out; + return 0; } } @@ -677,7 +677,7 @@ again: } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -705,7 +705,7 @@ again: ret = btrfs_inc_extent_ref(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (split == start) { @@ -714,7 +714,7 @@ again: if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } path->slots[0]--; extent_end = end; @@ -745,7 +745,7 @@ again: ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } other_start = 0; @@ -763,7 +763,7 @@ again: ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } if (del_nr == 0) { @@ -784,11 +784,11 @@ 
again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } -out: - return ret; + + return 0; } /* @@ -860,7 +860,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); struct folio *folio; - int ret = 0; + int ret; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); @@ -877,10 +877,8 @@ again: if (ret) { /* The folio is already unlocked. */ folio_put(folio); - if (!nowait && ret == -EAGAIN) { - ret = 0; + if (!nowait && ret == -EAGAIN) goto again; - } return ret; } *folio_ret = folio; @@ -1275,8 +1273,7 @@ again: btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); - ret = extents_locked; - return ret; + return extents_locked; } copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), @@ -1441,7 +1438,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) + if (btrfs_is_shutdown(inode->root->fs_info)) return -EIO; /* * If the fs flips readonly due to some impossible error, although we @@ -2046,7 +2043,7 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))) return -EIO; if (!mapping->a_ops->read_folio) return -ENOEXEC; @@ -2199,10 +2196,11 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->disk_bytenr == EXTENT_MAP_HOLE) { + 
const u64 em_end = btrfs_extent_map_end(em); + ret = 1; - *len = em->start + em->len > *start + *len ? - 0 : *start + *len - em->start - em->len; - *start = em->start + em->len; + *len = (em_end > *start + *len) ? 0 : (*start + *len - em_end); + *start = em_end; } btrfs_free_extent_map(em); return ret; @@ -2951,7 +2949,7 @@ static int btrfs_zero_range(struct inode *inode, * new prealloc extent, so that we get a larger contiguous disk extent. */ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { - const u64 em_end = em->start + em->len; + const u64 em_end = btrfs_extent_map_end(em); if (em_end >= offset + len) { /* @@ -3117,7 +3115,7 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + if (btrfs_is_shutdown(inode_to_fs_info(inode))) return -EIO; /* Do not allow fallocate in ZONED mode */ @@ -3811,7 +3809,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + if (btrfs_is_shutdown(inode_to_fs_info(inode))) return -EIO; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; @@ -3826,7 +3824,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))) return -EIO; if (iocb->ki_flags & IOCB_DIRECT) { @@ -3843,7 +3841,7 @@ static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f0f72850fab2..cc075a460a22 
100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,7 @@ #include "file-item.h" #include "file.h" #include "super.h" +#include "relocation.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -1079,7 +1080,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) { + if (!list_empty(&block_group->cluster_list)) { cluster = list_first_entry(&block_group->cluster_list, struct btrfs_free_cluster, block_group_list); } @@ -1161,7 +1162,7 @@ update_cache_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); - goto fail; + return ret; } leaf = path->nodes[0]; if (ret > 0) { @@ -1175,7 +1176,7 @@ update_cache_item(struct btrfs_trans_handle *trans, inode->i_size - 1, EXTENT_DELALLOC, NULL); btrfs_release_path(path); - goto fail; + return -ENOENT; } } @@ -1188,9 +1189,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; - -fail: - return -1; } static noinline_for_stack int write_pinned_extent_entries( @@ -1200,12 +1198,10 @@ static noinline_for_stack int write_pinned_extent_entries( int *entries) { u64 start, extent_start, extent_end, len; + const u64 block_group_end = btrfs_block_group_end(block_group); struct extent_io_tree *unpin = NULL; int ret; - if (!block_group) - return 0; - /* * We want to add any pinned extents to our free space cache * so we don't leak the space @@ -1217,19 +1213,18 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; - while (start < block_group->start + block_group->length) { + while (start < block_group_end) { if (!btrfs_find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of 
our range */ - if (extent_start >= block_group->start + block_group->length) + if (extent_start >= block_group_end) return 0; extent_start = max(extent_start, start); - extent_end = min(block_group->start + block_group->length, - extent_end + 1); + extent_end = min(block_group_end, extent_end + 1); len = extent_end - extent_start; *entries += 1; @@ -1374,9 +1369,9 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, - struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { + struct btrfs_io_ctl *io_ctl = &block_group->io_ctl; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1393,7 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) return ret; - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { down_write(&block_group->data_rwsem); spin_lock(&block_group->lock); if (block_group->delalloc_bytes) { @@ -1465,7 +1460,7 @@ static int __btrfs_write_out_cache(struct inode *inode, goto out_nospc; } - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) up_write(&block_group->data_rwsem); /* * Release the pages and unlock the extent, we will flush @@ -1500,7 +1495,7 @@ out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); out_unlock: - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) up_write(&block_group->data_rwsem); out: @@ -1536,8 +1531,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(inode, ctl, block_group, - &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for 
block group %llu error %d", @@ -2020,7 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, int ret; if (!ctl->free_space_offset.rb_node) - goto out; + return NULL; again: if (use_bytes_index) { node = rb_first_cached(&ctl->free_space_bytes); @@ -2028,7 +2022,7 @@ again: entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) - goto out; + return NULL; node = &entry->offset_index; } @@ -2112,7 +2106,7 @@ again: *bytes = entry->bytes - align_off; return entry; } -out: + return NULL; } @@ -2756,6 +2750,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return 0; + if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); @@ -2894,7 +2891,7 @@ again: old_end - (offset + bytes), info->trim_state); WARN_ON(ret); - goto out; + return ret; } } @@ -2906,7 +2903,7 @@ again: out_lock: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); -out: + return ret; } @@ -3063,6 +3060,12 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) struct rb_node *node; bool ret = true; + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && + !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &block_group->runtime_flags) && + block_group->identity_remap_count == 0) { + return true; + } + spin_lock(&ctl->tree_lock); node = rb_first(&ctl->free_space_offset); @@ -3674,7 +3677,7 @@ static int do_trimming(struct btrfs_block_group *block_group, } spin_unlock(&space_info->lock); - ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); + ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false); if (!ret) { *total_trimmed += trimmed; trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -3831,6 +3834,50 @@ out_unlock: return ret; } +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group 
*bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int ret = 0; + u64 bytes, trimmed; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); + u64 end = btrfs_block_group_end(bg); + + if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) { + bg->discard_cursor = end; + + if (bg->used == 0) { + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&bg->bg_list)) { + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + } + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_mark_bg_unused(bg); + } + + return; + } + + bytes = end - bg->discard_cursor; + + if (max_discard_size && + bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) + bytes = max_discard_size; + + ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false); + if (ret) + return; + + bg->discard_cursor += trimmed; + + if (bg->discard_cursor < end) + return; + + btrfs_complete_bg_remapping(bg); +} + /* * If we break out of trimming a bitmap prematurely, we should reset the * trimming bit. 
In a rather contrived case, it's possible to race here so @@ -3956,7 +4003,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, if (async && *total_trimmed) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); - goto out; + return ret; } bytes = min(bytes, end - start); @@ -4017,7 +4064,6 @@ next: if (offset >= end) block_group->discard_cursor = end; -out: return ret; } @@ -4110,20 +4156,20 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { + int ret; + block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); if (ret) - goto out; + return ret; node = rb_next(node); } -out: - return ret; + return 0; } int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 9f1dbfdee8ca..33fc3b245648 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async); +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg); bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 1ad2ad384b9e..ecddfca92b2b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct 
btrfs_path *path); -static struct btrfs_root *btrfs_free_space_root( - struct btrfs_block_group *block_group) +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group) { struct btrfs_key key = { .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, @@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, return 0; } -EXPORT_FOR_TESTS struct btrfs_free_space_info *btrfs_search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, @@ -220,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, return 0; start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; key.type = (u8)-1; @@ -360,7 +358,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, return 0; start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; key.type = (u8)-1; @@ -667,7 +665,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately after the extent of space if * that block is within the block group. */ - if (end < block_group->start + block_group->length) { + if (end < btrfs_block_group_end(block_group)) { /* The next block may be in the next bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (end >= key.objectid + key.offset) { @@ -940,7 +938,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, right: /* Search for a neighbor on the right. */ - if (end == block_group->start + block_group->length) + if (end == btrfs_block_group_end(block_group)) goto insert; key.objectid = end; key.type = (u8)-1; @@ -1106,7 +1104,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * highest, block group). 
*/ start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -1396,9 +1394,9 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, * can use multiple transactions, every time btrfs_end_transaction() is * called at btrfs_rebuild_free_space_tree() we finish the creation of * new block groups by calling btrfs_create_pending_block_groups(), and - * that in turn calls us, through add_block_group_free_space(), to add - * a free space info item and a free space extent item for the block - * group. + * that in turn calls us, through btrfs_add_block_group_free_space(), + * to add a free space info item and a free space extent item for the + * block group. * * Then later btrfs_rebuild_free_space_tree() may find such new block * groups and processes them with populate_free_space_tree(), which can @@ -1479,7 +1477,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; key.type = (u8)-1; @@ -1525,33 +1523,28 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - ret = 0; - - return ret; + return 0; } static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group *block_group; - struct btrfs_fs_info *fs_info; + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root; struct btrfs_key key; bool prev_bit_set = false; /* Initialize to silence GCC. 
*/ u64 extent_start = 0; - u64 end, offset; + const u64 end = btrfs_block_group_end(block_group); + u64 offset; u64 total_found = 0; u32 extent_count = 0; int ret; - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); - end = block_group->start + block_group->length; - while (1) { ret = btrfs_next_item(root, path); if (ret < 0) @@ -1617,21 +1610,17 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group *block_group; - struct btrfs_fs_info *fs_info; + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root; struct btrfs_key key; - u64 end; + const u64 end = btrfs_block_group_end(block_group); u64 total_found = 0; u32 extent_count = 0; int ret; - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); - end = block_group->start + block_group->length; - while (1) { u64 space_added; @@ -1712,3 +1701,106 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) else return load_free_space_extents(caching_ctl, path, extent_count); } + +static int delete_orphan_free_space_entries(struct btrfs_root *fst_root, + struct btrfs_path *path, + u64 first_bg_bytenr) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(fst_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + while (true) { + struct btrfs_key key = { 0 }; + int i; + + ret = btrfs_search_slot(trans, fst_root, &key, path, -1, 1); + if (ret < 0) + break; + ASSERT(ret > 0); + ret = 0; + for (i = 0; i < btrfs_header_nritems(path->nodes[0]); i++) { + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid >= first_bg_bytenr) { + /* + * Only break the for() loop and continue to + * delete items. 
+ */ + break; + } + } + /* No items to delete, finished. */ + if (i == 0) + break; + + ret = btrfs_del_items(trans, fst_root, path, 0, i); + if (ret < 0) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + btrfs_end_transaction(trans); + if (ret == 0) + btrfs_info(fst_root->fs_info, "deleted orphan free space tree entries"); + return ret; +} + +/* Remove any free space entry before the first block group. */ +int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info) +{ + BTRFS_PATH_AUTO_RELEASE(path); + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *root; + struct btrfs_block_group *bg; + u64 first_bg_bytenr; + int ret; + + /* + * Extent tree v2 has multiple global roots based on the block group. + * This means we cannot easily grab the global free space tree and locate + * orphan items. Furthermore this is still experimental, all users + * should use the latest btrfs-progs anyway. + */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + root = btrfs_global_root(fs_info, &key); + if (!root) + return 0; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + bg = btrfs_lookup_first_block_group(fs_info, 0); + if (unlikely(!bg)) { + btrfs_err(fs_info, "no block group found"); + return -EUCLEAN; + } + first_bg_bytenr = bg->start; + btrfs_put_block_group(bg); + + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + return ret; + /* There should not be an all-zero key in fst. */ + ASSERT(ret > 0); + + /* Empty free space tree. 
*/ + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + return 0; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid >= first_bg_bytenr) + return 0; + btrfs_release_path(&path); + return delete_orphan_free_space_entries(root, &path, first_bg_bytenr); +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 3d9a5d4477fc..709730e36888 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -35,12 +35,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); - -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info); struct btrfs_free_space_info * btrfs_search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow); +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index feb0a2faa837..14d83565cdee 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/crc32.h> #include "messages.h" #include "fs.h" #include "accessors.h" @@ -8,13 +9,11 @@ static const struct btrfs_csums { u16 size; const char name[10]; - const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b" }, }; /* This exists for btrfs-progs 
usages. */ @@ -37,21 +36,94 @@ const char *btrfs_super_csum_name(u16 csum_type) return btrfs_csums[csum_type].name; } -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name. - */ -const char *btrfs_super_csum_driver(u16 csum_type) +size_t __attribute_const__ btrfs_get_num_csums(void) { - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; + return ARRAY_SIZE(btrfs_csums); } -size_t __attribute_const__ btrfs_get_num_csums(void) +void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out) { - return ARRAY_SIZE(btrfs_csums); + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + put_unaligned_le32(~crc32c(~0, data, len), out); + break; + case BTRFS_CSUM_TYPE_XXHASH: + put_unaligned_le64(xxh64(data, len, 0), out); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256(data, len, out); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b(NULL, 0, data, len, out, 32); + break; + default: + /* Checksum type is validated at mount time. */ + BUG(); + } +} + +void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type) +{ + ctx->csum_type = csum_type; + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + ctx->crc32 = ~0; + break; + case BTRFS_CSUM_TYPE_XXHASH: + xxh64_reset(&ctx->xxh64, 0); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_init(&ctx->sha256); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_init(&ctx->blake2b, 32); + break; + default: + /* Checksume type is validated at mount time. 
*/ + BUG(); + } +} + +void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len) +{ + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + ctx->crc32 = crc32c(ctx->crc32, data, len); + break; + case BTRFS_CSUM_TYPE_XXHASH: + xxh64_update(&ctx->xxh64, data, len); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_update(&ctx->sha256, data, len); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_update(&ctx->blake2b, data, len); + break; + default: + /* Checksum type is validated at mount time. */ + BUG(); + } +} + +void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out) +{ + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + put_unaligned_le32(~ctx->crc32, out); + break; + case BTRFS_CSUM_TYPE_XXHASH: + put_unaligned_le64(xxh64_digest(&ctx->xxh64), out); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_final(&ctx->sha256, out); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_final(&ctx->blake2b, out); + break; + default: + /* Checksum type is validated at mount time. 
*/ + BUG(); + } } /* diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8ffbc40ebe45..3de3b517810e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,8 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include <crypto/blake2b.h> +#include <crypto/sha2.h> #include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/time64.h> @@ -24,6 +26,7 @@ #include <linux/wait_bit.h> #include <linux/sched.h> #include <linux/rbtree.h> +#include <linux/xxhash.h> #include <uapi/linux/btrfs.h> #include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" @@ -35,14 +38,12 @@ struct inode; struct super_block; struct kobject; struct reloc_control; -struct crypto_shash; struct ulist; struct btrfs_device; struct btrfs_block_group; struct btrfs_root; struct btrfs_fs_devices; struct btrfs_transaction; -struct btrfs_delayed_root; struct btrfs_balance_control; struct btrfs_subpage_info; struct btrfs_stripe_hash_table; @@ -64,6 +65,12 @@ struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Maximum length to trim in a single iteration to avoid holding device list + * mutex for too long. + */ +#define BTRFS_MAX_TRIM_LENGTH SZ_2G + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 @@ -313,7 +320,8 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \ + BTRFS_FEATURE_INCOMPAT_REMAP_TREE) #else @@ -461,6 +469,21 @@ struct btrfs_commit_stats { u64 critical_section_start_time; }; +struct btrfs_delayed_root { + spinlock_t lock; + int nodes; /* for delayed nodes */ + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. 
+ */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ + wait_queue_head_t wait; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -473,6 +496,7 @@ struct btrfs_fs_info { struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; struct btrfs_root *stripe_root; + struct btrfs_root *remap_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -507,6 +531,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* Block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for remap tree. */ + struct btrfs_block_rsv remap_block_rsv; /* Block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ @@ -581,6 +607,7 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; + struct mutex remap_mutex; /* * This is taken to make sure we don't set block groups ro after the @@ -810,7 +837,7 @@ struct btrfs_fs_info { /* Filesystem state */ unsigned long fs_state; - struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_root delayed_root; /* Entries are eb->start >> nodesize_bits */ struct xarray buffer_tree; @@ -834,10 +861,11 @@ struct btrfs_fs_info { struct list_head reclaim_bgs; int bg_reclaim_threshold; - /* Protects the lists unused_bgs and reclaim_bgs. */ + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */ spinlock_t unused_bgs_lock; /* Protected by unused_bgs_lock. 
*/ struct list_head unused_bgs; + struct list_head fully_remapped_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ struct mutex reclaim_bgs_lock; @@ -850,9 +878,10 @@ struct btrfs_fs_info { u32 sectorsize_bits; u32 block_min_order; u32 block_max_order; + u32 stripesize; u32 csum_size; u32 csums_per_leaf; - u32 stripesize; + u32 csum_type; /* * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular @@ -864,8 +893,6 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; - struct crypto_shash *csum_shash; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; @@ -1057,8 +1084,20 @@ int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); +struct btrfs_csum_ctx { + u16 csum_type; + union { + u32 crc32; + struct xxh64_state xxh64; + struct sha256_ctx sha256; + struct blake2b_ctx blake2b; + }; +}; +void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out); +void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type); +void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len); +void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out); static inline bool btrfs_is_empty_uuid(const u8 *uuid) { @@ -1105,15 +1144,17 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_fs_closing(const struct btrfs_fs_info *fs_info) +{ + return unlikely(test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)); +} + +static inline bool 
btrfs_fs_closing_done(const struct btrfs_fs_info *fs_info) { - /* Do it this way so we only ever do one test_bit in the normal case. */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; + if (btrfs_fs_closing(fs_info) && test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return true; + + return false; } /* @@ -1141,9 +1182,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) -static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_shutdown(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)); } static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b73e1dd97208..a864f8c99729 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -371,14 +371,13 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 objectid) { struct btrfs_key key; - int ret; + key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(struct btrfs_inode_item)); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); } int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ab356b50119c..82df115bd0c5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3,7 +3,6 @@ * Copyright (C) 2007 Oracle. All rights reserved. 
*/ -#include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> #include <linux/blk-cgroup.h> @@ -219,7 +218,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off int mirror_num) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_path path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(path); struct btrfs_key found_key = { 0 }; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -257,7 +256,6 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", logical, ret); - btrfs_release_path(&path); return; } eb = path.nodes[0]; @@ -287,11 +285,14 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off (ref_level ? "node" : "leaf"), ref_level, ref_root); } - btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; struct data_reloc_warn reloc_warn = { 0 }; + /* + * Do not hold the path as later iterate_extent_inodes() call + * can be time consuming. 
+ */ btrfs_release_path(&path); ctx.bytenr = found_key.objectid; @@ -507,7 +508,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) - goto fail; + return ret; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -546,7 +547,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ret = btrfs_inode_set_file_extent_range(inode, 0, ALIGN(size, root->fs_info->sectorsize)); if (ret) - goto fail; + return ret; /* * We're an inline extent, so nobody can extend the file past i_size @@ -562,8 +563,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } inode->disk_i_size = i_size; -fail: - return ret; + return 0; } static bool can_cow_file_range_inline(struct btrfs_inode *inode, @@ -690,8 +690,8 @@ out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. - * And at reserve time, it's always aligned to page size, so - * just free one page here. + * And at reserve time, it's always aligned to sector size, so + * just free one sector here. * * If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need * to keep the data reservation. 
@@ -756,10 +756,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, struct async_extent { u64 start; u64 ram_size; - u64 compressed_size; - struct folio **folios; - unsigned long nr_folios; - int compress_type; + struct compressed_bio *cb; struct list_head list; }; @@ -780,24 +777,18 @@ struct async_cow { struct async_chunk chunks[]; }; -static noinline int add_async_extent(struct async_chunk *cow, - u64 start, u64 ram_size, - u64 compressed_size, - struct folio **folios, - unsigned long nr_folios, - int compress_type) +static int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, + struct compressed_bio *cb) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); if (!async_extent) return -ENOMEM; + ASSERT(ram_size < U32_MAX); async_extent->start = start; async_extent->ram_size = ram_size; - async_extent->compressed_size = compressed_size; - async_extent->folios = folios; - async_extent->nr_folios = nr_folios; - async_extent->compress_type = compress_type; + async_extent->cb = cb; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -816,6 +807,13 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } + /* + * If the delalloc range is only one fs block and can not be inlined, + * do not even bother try compression, as there will be no space saving + * and will always fallback to regular write later. + */ + if (start != 0 && end + 1 - start <= fs_info->sectorsize) + return 0; /* Defrag ioctl takes precedence over mount options and properties. */ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) return 0; @@ -864,6 +862,61 @@ static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, return ret; } +static struct folio *compressed_bio_last_folio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct bio_vec *bvec; + phys_addr_t paddr; + + /* + * Make sure all folios have the same min_folio_size. 
+ * + * Otherwise we cannot simply use offset_in_offset(folio, bi_size) to + * calculate the end of the last folio. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + ASSERT(folio_size(fi.folio) == min_folio_size); + } + + /* The bio must not be empty. */ + ASSERT(bio->bi_vcnt); + + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1; + return page_folio(phys_to_page(paddr)); +} + +static void zero_last_folio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct folio *last_folio = compressed_bio_last_folio(cb); + const u32 bio_size = bio->bi_iter.bi_size; + const u32 foffset = offset_in_folio(last_folio, bio_size); + + folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset); +} + +static void round_up_last_block(struct compressed_bio *cb, u32 blocksize) +{ + struct bio *bio = &cb->bbio.bio; + struct folio *last_folio = compressed_bio_last_folio(cb); + const u32 bio_size = bio->bi_iter.bi_size; + const u32 foffset = offset_in_folio(last_folio, bio_size); + bool ret; + + if (IS_ALIGNED(bio_size, blocksize)) + return; + + ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset); + /* The remaining part should be merged thus never fail. */ + ASSERT(ret); +} + /* * Work queue call back to started compression on a file and pages. 
* @@ -884,24 +937,22 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + struct compressed_bio *cb = NULL; const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; + u32 cur_len; int ret = 0; - struct folio **folios = NULL; - unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int loff; - int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; - if (unlikely(btrfs_is_shutdown(fs_info))) + if (btrfs_is_shutdown(fs_info)) goto cleanup_and_bail_uncompressed; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -916,7 +967,7 @@ static void compress_file_range(struct btrfs_work *work) /* * All the folios should have been locked thus no failure. * - * And even if some folios are missing, btrfs_compress_folios() + * And even if some folios are missing, btrfs_compress_bio() * would handle them correctly, so here just do an ASSERT() check for * early logic errors. 
*/ @@ -936,9 +987,10 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - folios = NULL; - nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); + total_in = 0; + cur_len = min(end + 1 - start, BTRFS_MAX_UNCOMPRESSED); + ret = 0; + cb = NULL; /* * we don't want to send crud past the end of i_size through @@ -953,21 +1005,6 @@ again: if (actual_end <= start) goto cleanup_and_bail_uncompressed; - total_compressed = actual_end - start; - - /* - * Skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it doesn't save disk space at all. - */ - if (total_compressed <= blocksize && - (start > 0 || end + 1 < inode->disk_i_size)) - goto cleanup_and_bail_uncompressed; - - total_compressed = min_t(unsigned long, total_compressed, - BTRFS_MAX_UNCOMPRESSED); - total_in = 0; - ret = 0; - /* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we @@ -976,15 +1013,6 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); - if (!folios) { - /* - * Memory allocation failure is not a fatal error, we can fall - * back to uncompressed code. - */ - goto cleanup_and_bail_uncompressed; - } - if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; compress_level = inode->defrag_compress_level; @@ -993,11 +1021,15 @@ again: } /* Compression level is applied here. 
*/ - ret = btrfs_compress_folios(compress_type, compress_level, - inode, start, folios, &nr_folios, &total_in, - &total_compressed); - if (ret) + cb = btrfs_compress_bio(inode, start, cur_len, compress_type, + compress_level, async_chunk->write_flags); + if (IS_ERR(cb)) { + cb = NULL; goto mark_incompressible; + } + + total_compressed = cb->bbio.bio.bi_iter.bi_size; + total_in = cur_len; /* * Zero the tail end of the last folio, as we might be sending it down @@ -1005,7 +1037,7 @@ again: */ loff = (total_compressed & (min_folio_size - 1)); if (loff) - folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); + zero_last_folio(cb); /* * Try to create an inline extent. @@ -1021,11 +1053,13 @@ again: BTRFS_COMPRESS_NONE, NULL, false); else ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, - compress_type, folios[0], false); + compress_type, + bio_first_folio_all(&cb->bbio.bio), false); if (ret <= 0) { + cleanup_compressed_bio(cb); if (ret < 0) mapping_set_error(mapping, -EIO); - goto free_pages; + return; } /* @@ -1033,6 +1067,7 @@ again: * block size boundary so the allocator does sane things. */ total_compressed = ALIGN(total_compressed, blocksize); + round_up_last_block(cb, blocksize); /* * One last check to make sure the compression is really a win, compare @@ -1043,12 +1078,12 @@ again: if (total_compressed + blocksize > total_in) goto mark_incompressible; + /* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. 
*/ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, - nr_folios, compress_type); + ret = add_async_extent(async_chunk, start, total_in, cb); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1061,33 +1096,10 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, NULL); BUG_ON(ret); -free_pages: - if (folios) { - for (i = 0; i < nr_folios; i++) { - WARN_ON(folios[i]->mapping); - btrfs_free_compr_folio(folios[i]); - } - kfree(folios); - } -} - -static void free_async_extent_pages(struct async_extent *async_extent) -{ - int i; - - if (!async_extent->folios) - return; - - for (i = 0; i < async_extent->nr_folios; i++) { - WARN_ON(async_extent->folios[i]->mapping); - btrfs_free_compr_folio(async_extent->folios[i]); - } - kfree(async_extent->folios); - async_extent->nr_folios = 0; - async_extent->folios = NULL; + if (cb) + cleanup_compressed_bio(cb); } static void submit_uncompressed_range(struct btrfs_inode *inode, @@ -1134,7 +1146,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; - bool free_pages = false; + u32 compressed_size; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1154,17 +1166,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, locked_folio = async_chunk->locked_folio; } - if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - ASSERT(!async_extent->folios); - ASSERT(async_extent->nr_folios == 0); + if (!async_extent->cb) { submit_uncompressed_range(inode, async_extent, locked_folio); - free_pages = true; goto done; } + compressed_size = 
async_extent->cb->bbio.bio.bi_iter.bi_size; ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, + compressed_size, compressed_size, 0, *alloc_hint, &ins, true, true); if (ret) { /* @@ -1174,7 +1183,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); - free_pages = true; + cleanup_compressed_bio(async_extent->cb); + async_extent->cb = NULL; goto done; } @@ -1186,7 +1196,9 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, file_extent.ram_bytes = async_extent->ram_size; file_extent.num_bytes = async_extent->ram_size; file_extent.offset = 0; - file_extent.compression = async_extent->compress_type; + file_extent.compression = async_extent->cb->compress_type; + + async_extent->cb->bbio.bio.bi_iter.bi_sector = ins.objectid >> SECTOR_SHIFT; em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { @@ -1202,22 +1214,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(ordered); goto out_free_reserve; } + async_extent->cb->bbio.ordered = ordered; btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* Clear dirty, set writeback and unlock the pages. 
*/ extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - btrfs_submit_compressed_write(ordered, - async_extent->folios, /* compressed_folios */ - async_extent->nr_folios, - async_chunk->write_flags, true); + btrfs_submit_bbio(&async_extent->cb->bbio, 0); + async_extent->cb = NULL; + *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); - if (free_pages) - free_async_extent_pages(async_extent); kfree(async_extent); return; @@ -1232,7 +1242,8 @@ out_free_reserve: EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - free_async_extent_pages(async_extent); + if (async_extent->cb) + cleanup_compressed_bio(async_extent->cb); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); btrfs_debug(fs_info, @@ -1275,6 +1286,133 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, } /* + * Handle COW for one range. + * + * @ins: The key representing the allocated range. + * @file_offset: The file offset of the COW range + * @num_bytes: The expected length of the COW range + * The actually allocated length can be smaller than it. + * @min_alloc_size: The minimal extent size. + * @alloc_hint: The hint for the extent allocator. + * @ret_alloc_size: The COW range handles by this function. + * + * Return 0 if everything is fine and update @ret_alloc_size updated. The + * range is still locked, and caller should unlock the range after everything + * is done or for error handling. + * + * Return <0 for error and @is updated for where the extra cleanup should + * happen. The range [file_offset, file_offset + ret_alloc_size) will be + * cleaned up by this function. 
+ */ +static int cow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct btrfs_key *ins, struct extent_state **cached, + u64 file_offset, u32 num_bytes, u32 min_alloc_size, + u64 alloc_hint, u32 *ret_alloc_size) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; + struct extent_map *em; + u32 cur_len = 0; + u64 cur_end; + int ret; + + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, + 0, alloc_hint, ins, true, true); + if (ret < 0) { + *ret_alloc_size = cur_len; + return ret; + } + + cur_len = ins->offset; + cur_end = file_offset + cur_len - 1; + + file_extent.disk_bytenr = ins->objectid; + file_extent.disk_num_bytes = ins->offset; + file_extent.num_bytes = ins->offset; + file_extent.ram_bytes = ins->offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up (inside + * this function or by the caller for previously successful ranges) or + * after the whole range is finished. + */ + btrfs_lock_extent(&inode->io_tree, file_offset, cur_end, cached); + em = btrfs_create_io_em(inode, file_offset, &file_extent, BTRFS_ORDERED_REGULAR); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto free_reserved; + } + btrfs_free_extent_map(em); + + ordered = btrfs_alloc_ordered_extent(inode, file_offset, &file_extent, + 1U << BTRFS_ORDERED_REGULAR); + if (IS_ERR(ordered)) { + btrfs_drop_extent_map_range(inode, file_offset, cur_end, false); + ret = PTR_ERR(ordered); + goto free_reserved; + } + + if (btrfs_is_data_reloc_root(root)) { + ret = btrfs_reloc_clone_csums(ordered); + + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() at + * free_reserved label to free meta of this ordered extent, as + * its meta should be freed by btrfs_finish_ordered_io(). 
+ * + * So we must continue until @start is increased to + * skip current ordered extent. + */ + if (ret) + btrfs_drop_extent_map_range(inode, file_offset, + cur_end, false); + } + btrfs_put_ordered_extent(ordered); + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + /* + * Error handling for btrfs_reloc_clone_csums(). + * + * Treat the range as finished, thus only clear EXTENT_LOCKED | EXTENT_DELALLOC. + * The accounting will be done by ordered extents. + */ + if (unlikely(ret < 0)) { + btrfs_cleanup_ordered_extents(inode, file_offset, cur_len); + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); + } + *ret_alloc_size = cur_len; + return ret; + +free_reserved: + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL); + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + btrfs_free_reserved_extent(fs_info, ins->objectid, ins->offset, true); + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); + *ret_alloc_size = cur_len; + /* + * We should not return -EAGAIN where it's a special return code for + * zoned to catch btrfs_reserved_extent(). + */ + ASSERT(ret != -EAGAIN); + return ret; +} + +/* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. 
The basic idea is to * allocate extents on disk for the range, and create ordered data structs @@ -1310,16 +1448,15 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - u64 cur_alloc_size = 0; - u64 min_alloc_size; - u64 blocksize = fs_info->sectorsize; + u32 min_alloc_size; + u32 blocksize = fs_info->sectorsize; + u32 cur_alloc_size = 0; struct btrfs_key ins; - struct extent_map *em; unsigned clear_bits; unsigned long page_ops; int ret = 0; - if (unlikely(btrfs_is_shutdown(fs_info))) { + if (btrfs_is_shutdown(fs_info)) { ret = -EIO; goto out_unlock; } @@ -1383,16 +1520,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { - struct btrfs_ordered_extent *ordered; - struct btrfs_file_extent file_extent; + ret = cow_one_range(inode, locked_folio, &ins, &cached, start, + num_bytes, min_alloc_size, alloc_hint, &cur_alloc_size); - ret = btrfs_reserve_extent(root, num_bytes, num_bytes, - min_alloc_size, 0, alloc_hint, - &ins, true, true); if (ret == -EAGAIN) { /* - * btrfs_reserve_extent only returns -EAGAIN for zoned - * file systems, which is an indication that there are + * cow_one_range() only returns -EAGAIN for zoned + * file systems (from btrfs_reserve_extent()), which + * is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at @@ -1421,79 +1556,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } if (ret < 0) goto out_unlock; - cur_alloc_size = ins.offset; - - file_extent.disk_bytenr = ins.objectid; - file_extent.disk_num_bytes = ins.offset; - file_extent.num_bytes = ins.offset; - file_extent.ram_bytes = ins.offset; - file_extent.offset = 0; - file_extent.compression = BTRFS_COMPRESS_NONE; - /* - * Locked range will be released either during error clean up or - * after the whole range is finished. 
- */ - btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); - - em = btrfs_create_io_em(inode, start, &file_extent, - BTRFS_ORDERED_REGULAR); - if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); - ret = PTR_ERR(em); - goto out_reserve; - } - btrfs_free_extent_map(em); - - ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1U << BTRFS_ORDERED_REGULAR); - if (IS_ERR(ordered)) { - btrfs_unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); - ret = PTR_ERR(ordered); - goto out_drop_extent_cache; - } - - if (btrfs_is_data_reloc_root(root)) { - ret = btrfs_reloc_clone_csums(ordered); - - /* - * Only drop cache here, and process as normal. - * - * We must not allow extent_clear_unlock_delalloc() - * at out_unlock label to free meta of this ordered - * extent, as its meta should be freed by - * btrfs_finish_ordered_io(). - * - * So we must continue until @start is increased to - * skip current ordered extent. - */ - if (ret) - btrfs_drop_extent_map_range(inode, start, - start + cur_alloc_size - 1, - false); - } - btrfs_put_ordered_extent(ordered); + /* We should not allocate an extent larger than requested.*/ + ASSERT(cur_alloc_size <= num_bytes); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - - if (num_bytes < cur_alloc_size) - num_bytes = 0; - else - num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; cur_alloc_size = 0; - - /* - * btrfs_reloc_clone_csums() error, since start is increased - * extent_clear_unlock_delalloc() at out_unlock label won't - * free metadata of current ordered extent, we're OK to exit. 
- */ - if (ret) - goto out_unlock; } extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1502,11 +1572,6 @@ done: *done_offset = end; return ret; -out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); -out_reserve: - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1543,24 +1608,9 @@ out_unlock: page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* - * For the range (2). If we reserved an extent for our delalloc range - * (or a subrange) and failed to create the respective ordered extent, - * then it means that when we reserved the extent we decremented the - * extent's size from the data space_info's bytes_may_use counter and - * incremented the space_info's bytes_reserved counter by the same - * amount. We must make sure extent_clear_unlock_delalloc() does not try - * to decrement again the data space_info's bytes_may_use counter, - * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. - */ - if (cur_alloc_size) { - extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size - 1, - locked_folio, &cached, clear_bits, - page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - } - - /* + * For the range (2) the error handling is done by cow_one_range() itself. + * Nothing needs to be done. + * * For the range (3). We never touched the region. 
In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in @@ -1575,7 +1625,7 @@ out_unlock: end - start - cur_alloc_size + 1, NULL); } btrfs_err(fs_info, -"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%u: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), orig_start, end + 1 - orig_start, start, cur_alloc_size, ret); @@ -2072,7 +2122,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); - if (unlikely(btrfs_is_shutdown(fs_info))) { + if (btrfs_is_shutdown(fs_info)) { ret = -EIO; goto error; } @@ -2372,7 +2422,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); - int ret; /* * The range must cover part of the @locked_folio, or a return of 1 @@ -2381,10 +2430,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_next_pos(locked_folio))); - if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_folio, start, end); - return ret; - } + if (should_nocow(inode, start, end)) + return run_delalloc_nocow(inode, locked_folio, start, end); if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && @@ -2392,11 +2439,9 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, - true); + return run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); - return ret; + return cow_file_range(inode, locked_folio, start, 
end, NULL, 0); } void btrfs_split_delalloc_extent(struct btrfs_inode *inode, @@ -3007,7 +3052,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) - goto out; + return ret; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); @@ -3017,7 +3062,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) - goto out; + return ret; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); @@ -3052,13 +3097,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) - goto out; + return ret; - ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos - offset, - qgroup_reserved, &ins); -out: - return ret; + return btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), + file_pos - offset, + qgroup_reserved, &ins); } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, @@ -3226,19 +3269,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); - if (!ret) { - clear_reserved_extent = false; - btrfs_release_delalloc_bytes(fs_info, - ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; } - } - if (unlikely(ret < 0)) { - btrfs_abort_transaction(trans, ret); - goto out; + clear_reserved_extent = false; + 
btrfs_release_delalloc_bytes(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, @@ -3336,7 +3381,7 @@ out: btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, - NULL); + NULL, true); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, true); @@ -3418,20 +3463,19 @@ void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, const u32 blocksize = fs_info->sectorsize; const u32 step = min(blocksize, PAGE_SIZE); const u32 nr_steps = blocksize / step; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); + btrfs_csum_init(&csum, fs_info->csum_type); for (int i = 0; i < nr_steps; i++) { const phys_addr_t paddr = paddrs[i]; void *kaddr; ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE); kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); - crypto_shash_update(shash, kaddr, step); + btrfs_csum_update(&csum, kaddr, step); kunmap_local(kaddr); } - crypto_shash_final(shash, dest); + btrfs_csum_final(&csum, dest); } /* @@ -7137,7 +7181,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, read_unlock(&em_tree->lock); if (em) { - if (em->start > start || em->start + em->len <= start) + if (em->start > start || btrfs_extent_map_end(em) <= start) btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) btrfs_free_extent_map(em); @@ -9790,12 +9834,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; + struct compressed_bio *cb = NULL; int compression; size_t orig_count; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long 
nr_folios, i; - struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -9884,39 +9928,46 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); - if (!folios) - return -ENOMEM; - for (i = 0; i < nr_folios; i++) { - size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + + cb = btrfs_alloc_compressed_write(inode, start, num_bytes); + for (int i = 0; i * min_folio_size < disk_num_bytes; i++) { + struct folio *folio; + size_t bytes = min(min_folio_size, iov_iter_count(from)); char *kaddr; - folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); - if (!folios[i]) { + folio = btrfs_alloc_compr_folio(fs_info); + if (!folio) { ret = -ENOMEM; - goto out_folios; + goto out_cb; } - kaddr = kmap_local_folio(folios[i], 0); - if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap_local(kaddr); + kaddr = kmap_local_folio(folio, 0); + ret = copy_from_iter(kaddr, bytes, from); + kunmap_local(kaddr); + if (ret != bytes) { + folio_put(folio); ret = -EFAULT; - goto out_folios; + goto out_cb; + } + if (bytes < min_folio_size) + folio_zero_range(folio, bytes, min_folio_size - bytes); + ret = bio_add_folio(&cb->bbio.bio, folio, folio_size(folio), 0); + if (unlikely(!ret)) { + folio_put(folio); + ret = -EINVAL; + goto out_cb; } - if (bytes < PAGE_SIZE) - memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_local(kaddr); } + ASSERT(cb->bbio.bio.bi_iter.bi_size == disk_num_bytes); for (;;) { ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_folios; + goto out_cb; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_folios; + goto out_cb; btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = 
btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -9948,7 +9999,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { ret = __cow_file_range_inline(inode, encoded->len, - orig_count, compression, folios[0], + orig_count, compression, + bio_first_folio_all(&cb->bbio.bio), true); if (ret <= 0) { if (ret == 0) @@ -9993,7 +10045,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); + btrfs_submit_compressed_write(ordered, cb); ret = orig_count; goto out; @@ -10015,12 +10067,9 @@ out_free_data_space: btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: btrfs_unlock_extent(io_tree, start, end, &cached_state); -out_folios: - for (i = 0; i < nr_folios; i++) { - if (folios[i]) - folio_put(folios[i]); - } - kvfree(folios); +out_cb: + if (cb) + cleanup_compressed_bio(cb); out: if (ret >= 0) iocb->ki_pos += encoded->len; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d1ab03691606..a6cc2d3b414c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1107,7 +1107,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, bool readonly, struct btrfs_qgroup_inherit *inherit) { - int ret = 0; + int ret; struct qstr qname = QSTR_INIT(name, strlen(name)); if (!S_ISDIR(file_inode(file)->i_mode)) @@ -1115,7 +1115,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = mnt_want_write_file(file); if (ret) - goto out; + return ret; if (strchr(name, '/')) { ret = -EINVAL; @@ -1167,7 +1167,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } out_drop_write: mnt_drop_write_file(file); -out: return ret; } @@ -1283,14 +1282,14 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, struct btrfs_trans_handle 
*trans; u64 root_flags; u64 flags; - int ret = 0; + int ret; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret) - goto out; + return ret; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; @@ -1359,7 +1358,6 @@ out_drop_sem: up_write(&fs_info->subvol_sem); out_drop_write: mnt_drop_write_file(file); -out: return ret; } @@ -1425,10 +1423,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, continue; if (sizeof(sh) + item_len > *buf_size) { - if (*num_found) { - ret = 1; - goto out; - } + if (*num_found) + return 1; /* * return one empty item back for v1, which does not @@ -1440,10 +1436,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, ret = -EOVERFLOW; } - if (sizeof(sh) + item_len + *sk_offset > *buf_size) { - ret = 1; - goto out; - } + if (sizeof(sh) + item_len + *sk_offset > *buf_size) + return 1; sh.objectid = key->objectid; sh.type = key->type; @@ -1457,10 +1451,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, * problem. Otherwise we'll fault and then copy the buffer in * properly this next time through */ - if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = 0; - goto out; - } + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) + return 0; *sk_offset += sizeof(sh); @@ -1472,22 +1464,20 @@ static noinline int copy_to_sk(struct btrfs_path *path, */ if (read_extent_buffer_to_user_nofault(leaf, up, item_off, item_len)) { - ret = 0; *sk_offset -= sizeof(sh); - goto out; + return 0; } *sk_offset += item_len; } (*num_found)++; - if (ret) /* -EOVERFLOW from above */ - goto out; + /* -EOVERFLOW from above. 
*/ + if (ret) + return ret; - if (*num_found >= sk->nr_items) { - ret = 1; - goto out; - } + if (*num_found >= sk->nr_items) + return 1; } advance_key: ret = 0; @@ -1507,7 +1497,7 @@ advance_key: key->objectid++; } else ret = 1; -out: + /* * 0: all items from this leaf copied, continue with next * 1: * more items can be copied, but unused buffer is too small @@ -4931,7 +4921,7 @@ out_acct: int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))) return -EIO; switch (cmd->cmd_op) { diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0035851d72b0..e3df5ca0b552 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap") }, { .id = 0, DEFINE_NAME("tree") }, }; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 4758f66da449..8e20497afffe 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -123,126 +123,188 @@ static inline size_t read_compress_length(const char *buf) } /* + * Write data into @out_folio and queue it into @out_bio. + * + * Return 0 if everything is fine and @total_out will be increased. + * Return <0 for error. + * + * The @out_folio can be NULL after a full folio is queued. + * Thus the caller should check and allocate a new folio when needed. + */ +static int write_and_queue_folio(struct bio *out_bio, struct folio **out_folio, + u32 *total_out, u32 write_len) +{ + const u32 fsize = folio_size(*out_folio); + const u32 foffset = offset_in_folio(*out_folio, *total_out); + + ASSERT(out_folio && *out_folio); + /* Should not cross folio boundary. 
*/ + ASSERT(foffset + write_len <= fsize); + + /* We can not use bio_add_folio_nofail() which doesn't do any merge. */ + if (!bio_add_folio(out_bio, *out_folio, write_len, foffset)) { + /* + * We have allocated a bio that has BTRFS_MAX_COMPRESSED_PAGES + * vecs, and all ranges inside the same folio should have been + * merged. If bio_add_folio() still failed, that means we have + * reached the bvec limits. + * + * This should only happen at the beginning of a folio, and + * caller is responsible for releasing the folio, since it's + * not yet queued into the bio. + */ + ASSERT(IS_ALIGNED(*total_out, fsize)); + return -E2BIG; + } + + *total_out += write_len; + /* + * The full folio has been filled and queued, reset @out_folio to NULL, + * so that error handling is fully handled by the bio. + */ + if (IS_ALIGNED(*total_out, fsize)) + *out_folio = NULL; + return 0; +} + +/* + * Copy compressed data to bio. + * + * @out_bio: The bio that will contain all the compressed data. + * @compressed_data: The compressed data of this segment. + * @compressed_size: The size of the compressed data. + * @out_folio: The current output folio, will be updated if a new + * folio is allocated. + * @total_out: The total bytes of current output. + * @max_out: The maximum size of the compressed data. + * * Will do: * * - Write a segment header into the destination * - Copy the compressed buffer into the destination * - Make sure we have enough space in the last sector to fit a segment header * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * - If a full folio is filled, it will be queued into @out_bio, and @out_folio + * will be updated. * * Will allocate new pages when needed. 
*/ -static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, - char *compressed_data, - size_t compressed_size, - struct folio **out_folios, - unsigned long max_nr_folio, - u32 *cur_out) +static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, + struct bio *out_bio, + const char *compressed_data, + size_t compressed_size, + struct folio **out_folio, + u32 *total_out, u32 max_out) { const u32 sectorsize = fs_info->sectorsize; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 sectorsize_bits = fs_info->sectorsize_bits; + const u32 fsize = btrfs_min_folio_size(fs_info); + const u32 old_size = out_bio->bi_iter.bi_size; + u32 copy_start; u32 sector_bytes_left; - u32 orig_out; - struct folio *cur_folio; char *kaddr; + int ret; - if ((*cur_out >> min_folio_shift) >= max_nr_folio) - return -E2BIG; + ASSERT(out_folio); + + /* There should be at least a lzo header queued. */ + ASSERT(old_size); + ASSERT(old_size == *total_out); /* * We never allow a segment header crossing sector boundary, previous * run should ensure we have enough space left inside the sector. */ - ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); - cur_folio = out_folios[*cur_out >> min_folio_shift]; - /* Allocate a new page */ - if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(fs_info); - if (!cur_folio) + if (!*out_folio) { + *out_folio = btrfs_alloc_compr_folio(fs_info); + if (!*out_folio) return -ENOMEM; - out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); + /* Write the segment header first. 
*/ + kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); write_compress_length(kaddr, compressed_size); - *cur_out += LZO_LEN; - - orig_out = *cur_out; + kunmap_local(kaddr); + ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); + if (ret < 0) + return ret; - /* Copy compressed data */ - while (*cur_out - orig_out < compressed_size) { - u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, - orig_out + compressed_size - *cur_out); + copy_start = *total_out; - kunmap_local(kaddr); + /* Copy compressed data. */ + while (*total_out - copy_start < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *total_out % sectorsize, + copy_start + compressed_size - *total_out); + u32 foffset = *total_out & (fsize - 1); - if ((*cur_out >> min_folio_shift) >= max_nr_folio) + /* With the range copied, we're larger than the original range. */ + if (((*total_out + copy_len) >> sectorsize_bits) >= + max_out >> sectorsize_bits) return -E2BIG; - cur_folio = out_folios[*cur_out >> min_folio_shift]; - /* Allocate a new page */ - if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(fs_info); - if (!cur_folio) + if (!*out_folio) { + *out_folio = btrfs_alloc_compr_folio(fs_info); + if (!*out_folio) return -ENOMEM; - out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, 0); - memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), - compressed_data + *cur_out - orig_out, copy_len); - - *cur_out += copy_len; + kaddr = kmap_local_folio(*out_folio, foffset); + memcpy(kaddr, compressed_data + *total_out - copy_start, copy_len); + kunmap_local(kaddr); + ret = write_and_queue_folio(out_bio, out_folio, total_out, copy_len); + if (ret < 0) + return ret; } /* * Check if we can fit the next segment header into the remaining space * of the sector. 
*/ - sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + sector_bytes_left = round_up(*total_out, sectorsize) - *total_out; if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) - goto out; + return 0; - /* The remaining size is not enough, pad it with zeros */ - memset(kaddr + offset_in_page(*cur_out), 0, - sector_bytes_left); - *cur_out += sector_bytes_left; + ASSERT(*out_folio); -out: - kunmap_local(kaddr); - return 0; + /* The remaining size is not enough, pad it with zeros */ + folio_zero_range(*out_folio, offset_in_folio(*out_folio, *total_out), sector_bytes_left); + return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left); } -int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) +int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_inode *inode = cb->bbio.inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct bio *bio = &cb->bbio.bio; + const u64 start = cb->start; + const u32 len = cb->len; const u32 sectorsize = fs_info->sectorsize; const u32 min_folio_size = btrfs_min_folio_size(fs_info); struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; + struct folio *folio_out = NULL; char *sizes_ptr; - const unsigned long max_nr_folio = *out_folios; int ret = 0; - /* Points to the file offset of input data */ + /* Points to the file offset of input data. */ u64 cur_in = start; - /* Points to the current output byte */ - u32 cur_out = 0; - u32 len = *total_out; + /* Points to the current output byte. 
*/ + u32 total_out = 0; - ASSERT(max_nr_folio > 0); - *out_folios = 0; - *total_out = 0; - *total_in = 0; + ASSERT(bio->bi_iter.bi_size == 0); + ASSERT(len); + + folio_out = btrfs_alloc_compr_folio(fs_info); + if (!folio_out) + return -ENOMEM; + + /* Queue a segment header first. */ + ret = write_and_queue_folio(bio, &folio_out, &total_out, LZO_LEN); + /* The first header should not fail. */ + ASSERT(ret == 0); - /* - * Skip the header for now, we will later come back and write the total - * compressed size - */ - cur_out += LZO_LEN; while (cur_in < start + len) { char *data_in; const u32 sectorsize_mask = sectorsize - 1; @@ -250,19 +312,18 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u32 in_len; size_t out_len; - /* Get the input page first */ + /* Get the input page first. */ if (!folio_in) { ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); if (ret < 0) goto out; } - /* Compress at most one sector of data each time */ + /* Compress at most one sector of data each time. */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); - ret = lzo1x_1_compress(data_in, in_len, - workspace->cbuf, &out_len, + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); if (unlikely(ret < 0)) { @@ -271,9 +332,8 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, - folios, max_nr_folio, - &cur_out); + ret = copy_compressed_data_to_bio(fs_info, bio, workspace->cbuf, out_len, + &folio_out, &total_out, len); if (ret < 0) goto out; @@ -283,50 +343,80 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, * Check if we're making it bigger after two sectors. And if * it is so, give up. 
*/ - if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { + if (cur_in - start > sectorsize * 2 && cur_in - start < total_out) { ret = -E2BIG; goto out; } - /* Check if we have reached folio boundary. */ + /* Check if we have reached input folio boundary. */ if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } } + /* + * The last folio is already queued. Bio is responsible for freeing + * those folios now. + */ + folio_out = NULL; /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_folio(folios[0], 0); - write_compress_length(sizes_ptr, cur_out); + sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); + write_compress_length(sizes_ptr, total_out); kunmap_local(sizes_ptr); - - ret = 0; - *total_out = cur_out; - *total_in = cur_in - start; out: + /* + * We can only free the folio that has no part queued into the bio. + * + * As any folio that is already queued into bio will be released by + * the endio function of bio. + */ + if (folio_out && IS_ALIGNED(total_out, min_folio_size)) { + btrfs_free_compr_folio(folio_out); + folio_out = NULL; + } if (folio_in) folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); return ret; } +static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi, + u32 *cur_folio_index, u32 cur_in) +{ + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + + ASSERT(cur_folio_index); + + /* Need to switch to the next folio. */ + if (cur_in >> min_folio_shift != *cur_folio_index) { + /* We can only do the switch one folio a time. */ + ASSERT(cur_in >> min_folio_shift == *cur_folio_index + 1); + + bio_next_folio(fi, &cb->bbio.bio); + (*cur_folio_index)++; + } + return fi->folio; +} + /* * Copy the compressed segment payload into @dest. * * For the payload there will be no padding, just need to do page switching. 
*/ static void copy_compressed_segment(struct compressed_bio *cb, + struct folio_iter *fi, u32 *cur_folio_index, char *dest, u32 len, u32 *cur_in) { - struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; - u32 copy_len = min_t(u32, orig_in + len - *cur_in, - folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); + struct folio *cur_folio = get_current_folio(cb, fi, cur_folio_index, *cur_in); + u32 copy_len; + ASSERT(cur_folio); + copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, @@ -341,7 +431,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + struct folio_iter fi; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -350,8 +440,15 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) u32 cur_in = 0; /* Bytes decompressed so far */ u32 cur_out = 0; - - kaddr = kmap_local_folio(cb->compressed_folios[0], 0); + /* The current folio index number inside the bio. */ + u32 cur_folio_index = 0; + + bio_first_folio(&fi, &cb->bbio.bio, 0); + /* There must be a compressed folio and matches the sectorsize. 
*/ + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == sectorsize); + kaddr = kmap_local_folio(fi.folio, 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -388,7 +485,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; + cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); @@ -410,7 +507,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Copy the compressed segment payload into workspace */ - copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf, + seg_len, &cur_in); /* Decompress the data */ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, @@ -456,7 +554,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, size_t in_len; size_t out_len; size_t max_segment_len = workspace_buf_length(fs_info); - int ret = 0; + int ret; if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; @@ -467,10 +565,8 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (unlikely(in_len != srclen - LZO_LEN * 2)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(in_len != srclen - LZO_LEN * 2)) + return -EUCLEAN; data_in += LZO_LEN; out_len = sectorsize; @@ -482,19 +578,18 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(dest_folio)); - ret = -EIO; - goto out; + return -EIO; } ASSERT(out_len <= sectorsize); memcpy_to_folio(dest_folio, dest_pgoff, 
workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { - ret = -EIO; folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); + return -EIO; } -out: - return ret; + + return 0; } const struct btrfs_compress_levels btrfs_lzo_compress = { diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 2f853de44473..6190777924bf 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -211,33 +211,19 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +__printf(3, 4) __cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...) { - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; + const char *type = logtypes[level]; + struct ratelimit_state *ratelimit = &printk_limits[level]; #ifdef CONFIG_PRINTK_INDEX printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); #endif va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - vaf.fmt = fmt; vaf.va = &args; @@ -247,10 +233,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, char statestr[STATE_STRING_BUF_LEN]; btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + _printk(KERN_SOH "%dBTRFS %s (device %s%s): %pV\n", level, type, fs_info->sb->s_id, statestr, &vaf); } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + _printk(KERN_SOH "%dBTRFS %s: %pV\n", 
level, type, &vaf); } } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index d8c0bd17dcda..943e53980945 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -23,69 +23,74 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_PRINTK -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +__printf(3, 4) __cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...); #else -#define btrfs_printk(fs_info, fmt, args...) \ +#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) + +#define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) + #endif /* * Print a message with filesystem info, enclosed in RCU protection. */ #define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) #define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) #define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) /* * Wrappers that use a ratelimited printk */ #define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) 
\ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) + fs_info, LOGLEVEL_DEBUG, fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) + fs_info, LOGLEVEL_DEBUG, fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) #else /* When printk() is no_printk(), expand to no-op. */ #define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #endif -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ +#ifdef CONFIG_PRINTK + +#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ +do { \ + rcu_read_lock(); \ + _btrfs_printk(fs_info, level, fmt, ##args); \ + rcu_read_unlock(); \ } while (0) -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) 
\ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ @@ -93,10 +98,12 @@ do { \ \ rcu_read_lock(); \ if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ + _btrfs_printk(fs_info, level, fmt, ##args); \ rcu_read_unlock(); \ } while (0) +#endif + #ifdef CONFIG_BTRFS_ASSERT __printf(1, 2) @@ -113,7 +120,6 @@ static inline void verify_assert_printk_format(const char *fmt, ...) { */ #define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ -#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 /* * Assertion with optional printk() format. * @@ -152,22 +158,6 @@ do { \ } while(0) #else - -/* For GCC < 8.x only the simple output. */ - -#define ASSERT(cond, args...) \ -do { \ - verify_assert_printk_format("check the format string" args); \ - if (!likely(cond)) { \ - pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ - #cond, (long)(cond), __FILE__, __LINE__); \ - BUG(); \ - } \ -} while(0) - -#endif - -#else /* Compile check the @cond expression but don't generate any code. */ #define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond) #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 206587820fec..f53c313ab6e4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -346,6 +346,42 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid } #endif +static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *parent) +{ + u64 excl_sum = 0; + u64 rfer_sum = 0; + u64 excl_cmpr_sum = 0; + u64 rfer_cmpr_sum = 0; + struct btrfs_qgroup_list *glist; + int nr_members = 0; + bool mismatch; + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return false; + if (btrfs_qgroup_level(parent->qgroupid) == 0) + return false; + + /* Eligible parent qgroup. Squota; level > 0; empty members list. 
*/ + list_for_each_entry(glist, &parent->members, next_member) { + excl_sum += glist->member->excl; + rfer_sum += glist->member->rfer; + excl_cmpr_sum += glist->member->excl_cmpr; + rfer_cmpr_sum += glist->member->rfer_cmpr; + nr_members++; + } + mismatch = (parent->excl != excl_sum || parent->rfer != rfer_sum || + parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum); + + WARN(mismatch, + "parent squota qgroup %hu/%llu has mismatched usage from its %d members. " + "%llu %llu %llu %llu vs %llu %llu %llu %llu\n", + btrfs_qgroup_level(parent->qgroupid), + btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl, + parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum, + rfer_sum, excl_cmpr_sum, rfer_cmpr_sum); + return mismatch; +} + __printf(2, 3) static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) { @@ -658,7 +694,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { - int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; @@ -671,8 +706,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; - ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - return ret; + return btrfs_insert_empty_item(trans, quota_root, path, &key, 0); } static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, @@ -797,9 +831,7 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) if (ret > 0) return -ENOENT; - ret = btrfs_del_item(trans, quota_root, path); - - return ret; + return btrfs_del_item(trans, quota_root, path); } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, @@ -1562,6 +1594,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst goto out; } ret = 
quick_update_accounting(fs_info, src, dst, 1); + squota_check_parent_usage(fs_info, parent); spin_unlock(&fs_info->qgroup_lock); out: kfree(prealloc); @@ -1580,10 +1613,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, int ret = 0; int ret2; - if (!fs_info->quota_root) { - ret = -ENOTCONN; - goto out; - } + if (!fs_info->quota_root) + return -ENOTCONN; member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); @@ -1605,10 +1636,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, delete_item: ret = del_qgroup_relation_item(trans, src, dst); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; ret2 = del_qgroup_relation_item(trans, dst, src); if (ret2 < 0 && ret2 != -ENOENT) - goto out; + return ret2; /* At least one deletion succeeded, return 0 */ if (!ret || !ret2) @@ -1618,9 +1649,11 @@ delete_item: spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); ret = quick_update_accounting(fs_info, src, dst, -1); + ASSERT(parent); + squota_check_parent_usage(fs_info, parent); spin_unlock(&fs_info->qgroup_lock); } -out: + return ret; } @@ -1679,6 +1712,36 @@ out: return ret; } +static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup) +{ + ASSERT(btrfs_qgroup_level(qgroup->qgroupid)); + return list_empty(&qgroup->members); +} + +/* + * Return true if we can delete the squota qgroup and false otherwise. + * + * Rules for whether we can delete: + * + * A subvolume qgroup can be removed iff the subvolume is fully deleted, which + * is iff there is 0 usage in the qgroup. + * + * A higher level qgroup can be removed iff it has no members. + * Note: We audit its usage to warn on inconsistencies without blocking deletion. 
+ */ +static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) +{ + ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); + + if (btrfs_qgroup_level(qgroup->qgroupid) > 0) { + squota_check_parent_usage(fs_info, qgroup); + return can_delete_parent_qgroup(qgroup); + } + + return !(qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr); +} + /* * Return 0 if we can not delete the qgroup (not empty or has children etc). * Return >0 if we can delete the qgroup. @@ -1689,23 +1752,13 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); - /* - * Squota would never be inconsistent, but there can still be case - * where a dropped subvolume still has qgroup numbers, and squota - * relies on such qgroup for future accounting. - * - * So for squota, do not allow dropping any non-zero qgroup. - */ - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && - (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) - return 0; + /* Since squotas cannot be inconsistent, they have special rules for deletion. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return can_delete_squota_qgroup(fs_info, qgroup); /* For higher level qgroup, we can only delete it if it has no child. 
*/ - if (btrfs_qgroup_level(qgroup->qgroupid)) { - if (!list_empty(&qgroup->members)) - return 0; - return 1; - } + if (btrfs_qgroup_level(qgroup->qgroupid)) + return can_delete_parent_qgroup(qgroup); /* * For level-0 qgroups, we can only delete it if it has no subvolume @@ -2433,13 +2486,11 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, /* This node is old, no need to trace */ if (child_gen < last_snapshot) - goto out; + return ret; eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); dst_path->nodes[cur_level] = eb; dst_path->slots[cur_level] = 0; @@ -2484,7 +2535,7 @@ cleanup: dst_path->slots[cur_level] = 0; dst_path->locks[cur_level] = 0; } -out: + return ret; } @@ -2596,10 +2647,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return ret; } - if (root_level == 0) { - ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); - return ret; - } + if (root_level == 0) + return btrfs_qgroup_trace_leaf_items(trans, root_eb); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 58dc3e5057ce..314cb95ba846 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -754,8 +754,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, /* * We may have copied an inline extent into a page of the destination - * range, so wait for writeback to complete before invalidating pages - * from the page cache. This is a rare case. + * range. So flush delalloc and wait for ordered extent completion. + * This is to ensure the invalidation below does not fail, as if for + * example it finds a dirty folio, our folio release callback + * (btrfs_release_folio()) returns false, which makes the invalidation + * return an -EBUSY error. 
We can't ignore such failures since they + * could come from some range other than the copied inline extent's + * destination range and we have no way to know that. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); if (ret < 0) @@ -873,7 +878,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, bool same_inode = dst_inode == src_inode; int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))) return -EIO; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5bfefc3e9c06..fcd0a2ba3554 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -37,6 +37,7 @@ #include "super.h" #include "tree-checker.h" #include "raid-stripe-tree.h" +#include "free-space-tree.h" /* * Relocation overview @@ -3254,7 +3255,6 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, struct btrfs_key key; bool found = false; int i; - int ret; if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) return 0; @@ -3278,8 +3278,8 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(block_group, NULL, space_cache_ino); - return ret; + + return delete_block_group_cache(block_group, NULL, space_cache_ino); } /* @@ -3616,7 +3616,7 @@ restart: btrfs_btree_balance_dirty(fs_info); } - if (!err) { + if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { ret = relocate_file_extent_cluster(rc); if (ret < 0) err = ret; @@ -3860,6 +3860,1433 @@ static const char *stage_to_string(enum reloc_stage stage) return "unknown"; } +static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path, + struct btrfs_key *entries, unsigned int num_entries) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_item_batch batch; + u32 *data_sizes; + u32 max_items; + + max_items = 
BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item); + + data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items), GFP_NOFS); + if (!data_sizes) + return -ENOMEM; + + while (true) { + batch.keys = entries; + batch.data_sizes = data_sizes; + batch.total_data_size = 0; + batch.nr = min_t(u32, num_entries, max_items); + + ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch); + btrfs_release_path(path); + + if (num_entries <= max_items) + break; + + num_entries -= max_items; + entries += max_items; + } + + kfree(data_sizes); + + return ret; +} + +struct space_run { + u64 start; + u64 end; +}; + +static void parse_bitmap(u64 block_size, const unsigned long *bitmap, + unsigned long size, u64 address, struct space_run *space_runs, + unsigned int *num_space_runs) +{ + unsigned long pos, end; + u64 run_start, run_length; + + pos = find_first_bit(bitmap, size); + if (pos == size) + return; + + while (true) { + end = find_next_zero_bit(bitmap, size, pos); + + run_start = address + (pos * block_size); + run_length = (end - pos) * block_size; + + if (*num_space_runs != 0 && + space_runs[*num_space_runs - 1].end == run_start) { + space_runs[*num_space_runs - 1].end += run_length; + } else { + space_runs[*num_space_runs].start = run_start; + space_runs[*num_space_runs].end = run_start + run_length; + + (*num_space_runs)++; + } + + if (end == size) + break; + + pos = find_next_bit(bitmap, size, end + 1); + if (pos == size) + break; + } +} + +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg, s64 diff) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + bool bg_already_dirty = true; + bool mark_unused = false; + + spin_lock(&bg->lock); + bg->remap_bytes += diff; + if (bg->used == 0 && bg->remap_bytes == 0) + mark_unused = true; + spin_unlock(&bg->lock); + + if (mark_unused) + btrfs_mark_bg_unused(bg); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if 
(list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); +} + +/* Private structure for I/O from copy_remapped_data(). */ +struct reloc_io_private { + struct completion done; + refcount_t pending_refs; + blk_status_t status; +}; + +static void reloc_endio(struct btrfs_bio *bbio) +{ + struct reloc_io_private *priv = bbio->private; + + if (bbio->bio.bi_status) + WRITE_ONCE(priv->status, bbio->bio.bi_status); + + if (refcount_dec_and_test(&priv->pending_refs)) + complete(&priv->done); + + bio_put(&bbio->bio); +} + +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info, + struct reloc_io_private *priv, + struct page **pages, u64 addr, u64 length, + blk_opf_t op) +{ + struct btrfs_bio *bbio; + int i; + + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode), + addr, reloc_endio, priv); + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); + bbio->is_remap = true; + + i = 0; + do { + size_t bytes = min_t(u64, length, PAGE_SIZE); + + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, + BTRFS_I(fs_info->btree_inode), + addr, reloc_endio, priv); + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); + bbio->is_remap = true; + continue; + } + + i++; + addr += bytes; + length -= bytes; + } while (length); + + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); + + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); + + return blk_status_to_errno(READ_ONCE(priv->status)); +} + +static int 
copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr, + u64 new_addr, u64 length) +{ + int ret; + u64 copy_len = min_t(u64, length, SZ_1M); + struct page **pages; + struct reloc_io_private priv; + unsigned int nr_pages = DIV_ROUND_UP(copy_len, PAGE_SIZE); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto end; + } + + /* Copy 1MB at a time, to avoid using too much memory. */ + do { + u64 to_copy = min_t(u64, length, copy_len); + + /* Limit to one bio. */ + to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT); + + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, + to_copy, REQ_OP_READ); + if (ret) + goto end; + + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, + to_copy, REQ_OP_WRITE); + if (ret) + goto end; + + if (to_copy == length) + break; + + old_addr += to_copy; + new_addr += to_copy; + length -= to_copy; + } while (true); + + ret = 0; +end: + for (int i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + + return ret; +} + +static int add_remap_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 new_addr, u64 length, + u64 old_addr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, + &key, sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + leaf = path->nodes[0]; + btrfs_set_stack_remap_address(&remap, new_addr); + write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + return 0; +} + +static int add_remap_backref_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, 
u64 new_addr, + u64 length, u64 old_addr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, &key, + sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + leaf = path->nodes[0]; + btrfs_set_stack_remap_address(&remap, old_addr); + write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + return 0; +} + +static int move_existing_remap(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_block_group *bg, u64 new_addr, + u64 length, u64 old_addr) +{ + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap_ptr; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key, ins; + u64 dest_addr, dest_length, min_size; + struct btrfs_block_group *dest_bg; + int ret; + const bool is_data = (bg->flags & BTRFS_BLOCK_GROUP_DATA); + struct btrfs_space_info *sinfo = bg->space_info; + bool mutex_taken = false; + bool bg_needs_free_space; + + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, length); + spin_unlock(&sinfo->lock); + + if (is_data) + min_size = fs_info->sectorsize; + else + min_size = fs_info->nodesize; + + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size, + 0, 0, &ins, is_data, false); + if (unlikely(ret)) { + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, -length); + spin_unlock(&sinfo->lock); + return ret; + } + + dest_addr = ins.objectid; + dest_length = ins.offset; + + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) { + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize); + + btrfs_free_reserved_extent(fs_info, dest_addr + new_length, + dest_length - 
new_length, 0); + + dest_length = new_length; + } + + trans = btrfs_join_transaction(fs_info->remap_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto end; + } + + mutex_lock(&fs_info->remap_mutex); + mutex_taken = true; + + /* Find old remap entry. */ + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret == 1) { + /* + * Not a problem if the remap entry wasn't found: that means + * that another transaction has deallocated the data. + * move_existing_remaps() loops until the BG contains no + * remaps, so we can just return 0 in this case. + */ + btrfs_release_path(path); + ret = 0; + goto end; + } else if (unlikely(ret)) { + goto end; + } + + ret = copy_remapped_data(fs_info, new_addr, dest_addr, dest_length); + if (unlikely(ret)) + goto end; + + /* Change data of old remap entry. */ + leaf = path->nodes[0]; + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + btrfs_set_remap_address(leaf, remap_ptr, dest_addr); + btrfs_mark_buffer_dirty(trans, leaf); + + if (dest_length != length) { + key.offset = dest_length; + btrfs_set_item_key_safe(trans, path, &key); + } + + btrfs_release_path(path); + + if (dest_length != length) { + /* Add remap item for remainder. */ + ret = add_remap_item(trans, path, new_addr + dest_length, + length - dest_length, old_addr + dest_length); + if (unlikely(ret)) + goto end; + } + + /* Change or remove old backref. 
*/ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (unlikely(ret)) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + goto end; + } + + leaf = path->nodes[0]; + + if (dest_length == length) { + ret = btrfs_del_item(trans, fs_info->remap_root, path); + if (unlikely(ret)) { + btrfs_release_path(path); + goto end; + } + } else { + key.objectid += dest_length; + key.offset -= dest_length; + btrfs_set_item_key_safe(trans, path, &key); + btrfs_set_stack_remap_address(&remap, old_addr + dest_length); + + write_extent_buffer(leaf, &remap, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + } + + btrfs_release_path(path); + + /* Add new backref. */ + ret = add_remap_backref_item(trans, path, dest_addr, dest_length, old_addr); + if (unlikely(ret)) + goto end; + + adjust_block_group_remap_bytes(trans, bg, -dest_length); + + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length); + if (unlikely(ret)) + goto end; + + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr); + + adjust_block_group_remap_bytes(trans, dest_bg, dest_length); + + mutex_lock(&dest_bg->free_space_lock); + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &dest_bg->runtime_flags); + mutex_unlock(&dest_bg->free_space_lock); + + if (bg_needs_free_space) + ret = btrfs_add_block_group_free_space(trans, dest_bg); + + /* Drop the ref only after the last use of dest_bg. */ + btrfs_put_block_group(dest_bg); + + if (unlikely(ret)) + goto end; + + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length); + if (unlikely(ret)) { + btrfs_remove_from_free_space_tree(trans, new_addr, dest_length); + goto end; + } + + ret = 0; + +end: + if (mutex_taken) + mutex_unlock(&fs_info->remap_mutex); + + btrfs_dec_block_group_reservations(fs_info, dest_addr); + + if (unlikely(ret)) { + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0); + + if (trans) { 
btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + } else { + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr); + btrfs_free_reserved_bytes(dest_bg, dest_length, 0); + btrfs_put_block_group(dest_bg); + + ret = btrfs_commit_transaction(trans); + } + + return ret; +} + +static int move_existing_remaps(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *bg, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap; + u64 old_addr; + + /* Look for backrefs in remap tree. */ + while (bg->remap_bytes > 0) { + key.objectid = bg->start; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + if (ret) { + btrfs_release_path(path); + break; + } + + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type != BTRFS_REMAP_BACKREF_KEY) { + path->slots[0]++; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + if (ret) { + btrfs_release_path(path); + break; + } + + leaf = path->nodes[0]; + } + } + + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + old_addr = btrfs_remap_address(leaf, remap); + + btrfs_release_path(path); + + ret = move_existing_remap(fs_info, path, bg, key.objectid, + key.offset, old_addr); + if (ret) + return ret; + } + + ASSERT(bg->remap_bytes == 0); + + return 0; +} + +static int create_remap_tree_entries(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct 
btrfs_free_space_info *fsi; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_root *space_root; + u32 extent_count; + struct space_run *space_runs = NULL; + unsigned int num_space_runs = 0; + struct btrfs_key *entries = NULL; + unsigned int max_entries, num_entries; + int ret; + + mutex_lock(&bg->free_space_lock); + + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) { + mutex_unlock(&bg->free_space_lock); + + ret = btrfs_add_block_group_free_space(trans, bg); + if (ret) + return ret; + + mutex_lock(&bg->free_space_lock); + } + + fsi = btrfs_search_free_space_info(trans, bg, path, 0); + if (IS_ERR(fsi)) { + mutex_unlock(&bg->free_space_lock); + return PTR_ERR(fsi); + } + + extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi); + + btrfs_release_path(path); + + space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS); + if (!space_runs) { + mutex_unlock(&bg->free_space_lock); + return -ENOMEM; + } + + key.objectid = bg->start; + key.type = 0; + key.offset = 0; + + space_root = btrfs_free_space_root(bg); + + ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0); + if (ret < 0) { + mutex_unlock(&bg->free_space_lock); + goto out; + } + + ret = 0; + + while (true) { + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid >= bg->start + bg->length) + break; + + if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + if (num_space_runs != 0 && + space_runs[num_space_runs - 1].end == found_key.objectid) { + space_runs[num_space_runs - 1].end = + found_key.objectid + found_key.offset; + } else { + ASSERT(num_space_runs < extent_count); + + space_runs[num_space_runs].start = found_key.objectid; + space_runs[num_space_runs].end = + found_key.objectid + found_key.offset; + + num_space_runs++; + } + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + void *bitmap; + unsigned long offset; + u32 data_size; + + offset = 
btrfs_item_ptr_offset(leaf, path->slots[0]); + data_size = btrfs_item_size(leaf, path->slots[0]); + + if (data_size != 0) { + bitmap = kmalloc(data_size, GFP_NOFS); + if (!bitmap) { + mutex_unlock(&bg->free_space_lock); + ret = -ENOMEM; + goto out; + } + + read_extent_buffer(leaf, bitmap, offset, data_size); + + parse_bitmap(fs_info->sectorsize, bitmap, + data_size * BITS_PER_BYTE, + found_key.objectid, space_runs, + &num_space_runs); + + ASSERT(num_space_runs <= extent_count); + + kfree(bitmap); + } + } + + path->slots[0]++; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(space_root, path); + if (ret != 0) { + if (ret == 1) + ret = 0; + break; + } + leaf = path->nodes[0]; + } + } + + btrfs_release_path(path); + + mutex_unlock(&bg->free_space_lock); + + max_entries = extent_count + 2; + entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS); + if (!entries) { + ret = -ENOMEM; + goto out; + } + + num_entries = 0; + + if (num_space_runs == 0) { + entries[num_entries].objectid = bg->start; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = bg->length; + num_entries++; + } else { + if (space_runs[0].start > bg->start) { + entries[num_entries].objectid = bg->start; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = space_runs[0].start - bg->start; + num_entries++; + } + + for (unsigned int i = 1; i < num_space_runs; i++) { + entries[num_entries].objectid = space_runs[i - 1].end; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = + space_runs[i].start - space_runs[i - 1].end; + num_entries++; + } + + if (space_runs[num_space_runs - 1].end < bg->start + bg->length) { + entries[num_entries].objectid = + space_runs[num_space_runs - 1].end; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = + bg->start + bg->length - space_runs[num_space_runs - 1].end; + num_entries++; + } + + if (num_entries 
== 0) + goto out; + } + + bg->identity_remap_count = num_entries; + + ret = add_remap_tree_entries(trans, path, entries, num_entries); + +out: + kfree(entries); + kfree(space_runs); + + return ret; +} + +static int find_next_identity_remap(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 bg_end, + u64 last_start, u64 *start, u64 *length) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_root *remap_root = trans->fs_info->remap_root; + struct extent_buffer *leaf; + + key.objectid = last_start; + key.type = BTRFS_IDENTITY_REMAP_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + while (true) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(remap_root, path); + + if (ret != 0) { + if (ret == 1) + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid >= bg_end) { + ret = -ENOENT; + goto out; + } + + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) { + *start = found_key.objectid; + *length = found_key.offset; + ret = 0; + goto out; + } + + path->slots[0]++; + } + +out: + btrfs_release_path(path); + + return ret; +} + +static int remove_chunk_stripes(struct btrfs_trans_handle *trans, + struct btrfs_chunk_map *chunk_map, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + int ret; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_map->start; + + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1); + if (ret) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + btrfs_trans_release_chunk_metadata(trans); + return ret; + } + + leaf = path->nodes[0]; + + chunk = 
btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); + btrfs_set_chunk_num_stripes(leaf, chunk, 0); + btrfs_set_chunk_sub_stripes(leaf, chunk, 0); + + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe), 1); + + btrfs_mark_buffer_dirty(trans, leaf); + + btrfs_release_path(path); + btrfs_trans_release_chunk_metadata(trans); + + return 0; +} + +int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_trans_handle *trans; + int ret; + unsigned int num_items; + BTRFS_PATH_AUTO_FREE(path); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * One item for each entry we're removing in the dev extents tree, and + * another for each device. DUP chunks are all on one device, + * everything else has one device per stripe. + */ + if (bg->flags & BTRFS_BLOCK_GROUP_DUP) + num_items = chunk_map->num_stripes + 1; + else + num_items = 2 * chunk_map->num_stripes; + + trans = btrfs_start_transaction_fallback_global_rsv(fs_info->tree_root, num_items); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_remove_dev_extents(trans, chunk_map); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + mutex_lock(&trans->fs_info->chunk_mutex); + for (unsigned int i = 0; i < chunk_map->num_stripes; i++) { + ret = btrfs_update_device(trans, chunk_map->stripes[i].dev); + if (unlikely(ret)) { + mutex_unlock(&trans->fs_info->chunk_mutex); + btrfs_abort_transaction(trans, ret); + return ret; + } + } + mutex_unlock(&trans->fs_info->chunk_mutex); + + write_lock(&trans->fs_info->mapping_tree_lock); + btrfs_chunk_map_device_clear_bits(chunk_map, CHUNK_ALLOCATED); + write_unlock(&trans->fs_info->mapping_tree_lock); + + btrfs_remove_bg_from_sinfo(bg); + + spin_lock(&bg->lock); + clear_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + + ret = remove_chunk_stripes(trans, 
chunk_map, path); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + return 0; +} + +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg, int delta) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + bool bg_already_dirty = true; + bool mark_fully_remapped = false; + + WARN_ON(delta < 0 && -delta > bg->identity_remap_count); + + spin_lock(&bg->lock); + + bg->identity_remap_count += delta; + + if (bg->identity_remap_count == 0 && + !test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags)) { + set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags); + mark_fully_remapped = true; + } + + spin_unlock(&bg->lock); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); + + if (mark_fully_remapped) + btrfs_mark_bg_fully_remapped(bg, trans); +} + +static int add_remap_entry(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *src_bg, u64 old_addr, + u64 new_addr, u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key, new_key; + int ret; + int identity_count_delta = 0; + + key.objectid = old_addr; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret < 0) + goto end; + + if (path->slots[0] == 0) { + ret = -ENOENT; + goto end; + } + + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_IDENTITY_REMAP_KEY || + key.objectid > old_addr || + key.objectid + key.offset <= old_addr) { + ret = -ENOENT; + goto end; + } + + /* Shorten or delete identity mapping entry. */ + if (key.objectid == old_addr) { + ret = btrfs_del_item(trans, fs_info->remap_root, path); + if (ret) + goto end; + + identity_count_delta--; + } else { + new_key.objectid = key.objectid; + new_key.type = BTRFS_IDENTITY_REMAP_KEY; + new_key.offset = old_addr - key.objectid; + + btrfs_set_item_key_safe(trans, path, &new_key); + } + + btrfs_release_path(path); + + /* Create new remap entry. */ + ret = add_remap_item(trans, path, new_addr, length, old_addr); + if (ret) + goto end; + + /* Add entry for remainder of identity mapping, if necessary. */ + if (key.objectid + key.offset != old_addr + length) { + new_key.objectid = old_addr + length; + new_key.type = BTRFS_IDENTITY_REMAP_KEY; + new_key.offset = key.objectid + key.offset - old_addr - length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &new_key, 0); + if (ret) + goto end; + + btrfs_release_path(path); + + identity_count_delta++; + } + + /* Add backref. 
*/ + ret = add_remap_backref_item(trans, path, new_addr, length, old_addr); + if (ret) + goto end; + + if (identity_count_delta != 0) + adjust_identity_remap_count(trans, src_bg, identity_count_delta); + +end: + btrfs_release_path(path); + + return ret; +} + +static int mark_chunk_remapped(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 start) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_chunk_map *chunk_map; + struct btrfs_key key; + u64 type; + int ret; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + + read_lock(&fs_info->mapping_tree_lock); + + chunk_map = btrfs_find_chunk_map_nolock(fs_info, start, 1); + if (!chunk_map) { + read_unlock(&fs_info->mapping_tree_lock); + return -ENOENT; + } + + chunk_map->type |= BTRFS_BLOCK_GROUP_REMAPPED; + type = chunk_map->type; + + read_unlock(&fs_info->mapping_tree_lock); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = start; + + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1); + if (ret == 1) { + ret = -ENOENT; + goto end; + } else if (ret < 0) + goto end; + + leaf = path->nodes[0]; + + chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); + btrfs_set_chunk_type(leaf, chunk, type); + btrfs_mark_buffer_dirty(trans, leaf); + + ret = 0; +end: + btrfs_free_chunk_map(chunk_map); + btrfs_release_path(path); + + return ret; +} + +static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *src_bg, + struct btrfs_path *path, u64 *last_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *extent_root; + struct btrfs_key ins; + struct btrfs_block_group *dest_bg = NULL; + u64 start = 0, remap_length = 0; + u64 length, new_addr, min_size; + int ret; + const bool is_data = (src_bg->flags & BTRFS_BLOCK_GROUP_DATA); + bool no_more = false; + bool made_reservation = false, bg_needs_free_space; + struct btrfs_space_info *sinfo = src_bg->space_info; + + 
extent_root = btrfs_extent_root(fs_info, src_bg->start); + + trans = btrfs_start_transaction(extent_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + mutex_lock(&fs_info->remap_mutex); + + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length, + *last_start, &start, &remap_length); + if (ret == -ENOENT) { + no_more = true; + goto next; + } else if (ret) { + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + return ret; + } + + /* Try to reserve enough space for block. */ + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, remap_length); + spin_unlock(&sinfo->lock); + + if (is_data) + min_size = fs_info->sectorsize; + else + min_size = fs_info->nodesize; + + /* + * We're using btrfs_reserve_extent() to allocate a contiguous + * logical address range, but this will become a remap item rather than + * an extent in the extent tree. + * + * Short allocations are fine: it means that we chop off the beginning + * of the identity remap that we're processing, and will tackle the + * rest of it the next time round. 
+ */ + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length, remap_length, + min_size, 0, 0, &ins, is_data, false); + if (ret) { + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length); + spin_unlock(&sinfo->lock); + + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + return ret; + } + + made_reservation = true; + + new_addr = ins.objectid; + length = ins.offset; + + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) { + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize); + + btrfs_free_reserved_extent(fs_info, new_addr + new_length, + length - new_length, 0); + + length = new_length; + } + + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); + + mutex_lock(&dest_bg->free_space_lock); + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &dest_bg->runtime_flags); + mutex_unlock(&dest_bg->free_space_lock); + + if (bg_needs_free_space) { + ret = btrfs_add_block_group_free_space(trans, dest_bg); + if (ret) + goto fail; + } + + ret = copy_remapped_data(fs_info, start, new_addr, length); + if (ret) + goto fail; + + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length); + if (ret) + goto fail; + + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length); + if (ret) { + btrfs_add_to_free_space_tree(trans, new_addr, length); + goto fail; + } + + adjust_block_group_remap_bytes(trans, dest_bg, length); + btrfs_free_reserved_bytes(dest_bg, length, 0); + + spin_lock(&sinfo->lock); + sinfo->bytes_readonly += length; + spin_unlock(&sinfo->lock); + +next: + if (dest_bg) + btrfs_put_block_group(dest_bg); + + if (made_reservation) + btrfs_dec_block_group_reservations(fs_info, new_addr); + + mutex_unlock(&fs_info->remap_mutex); + + if (src_bg->identity_remap_count == 0) { + bool mark_fully_remapped = false; + + spin_lock(&src_bg->lock); + if (!test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags)) { + mark_fully_remapped = true; + 
set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags); + } + spin_unlock(&src_bg->lock); + + if (mark_fully_remapped) + btrfs_mark_bg_fully_remapped(src_bg, trans); + } + + ret = btrfs_end_transaction(trans); + if (ret) + return ret; + + if (no_more) + return 1; + + *last_start = start; + + return 0; + +fail: + if (dest_bg) + btrfs_put_block_group(dest_bg); + + btrfs_free_reserved_extent(fs_info, new_addr, length, 0); + + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + + return ret; +} + +static int do_remap_reloc(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + u64 last_start = bg->start; + int ret; + + while (true) { + ret = do_remap_reloc_trans(fs_info, bg, path, &last_start); + if (ret) { + if (ret == 1) + ret = 0; + break; + } + } + + return ret; +} + +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) +{ + int ret; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap; + BTRFS_PATH_AUTO_FREE(path); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = *logical; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + if (path->slots[0] == 0) + return -ENOENT; + + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.type != BTRFS_REMAP_KEY && + found_key.type != BTRFS_IDENTITY_REMAP_KEY) { + return -ENOENT; + } + + if (found_key.objectid > *logical || + found_key.objectid + found_key.offset <= *logical) { + return -ENOENT; + } + + if (*logical + *length > found_key.objectid + found_key.offset) + *length = found_key.objectid + found_key.offset - *logical; + + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) + return 0; + + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + *logical += 
btrfs_remap_address(leaf, remap) - found_key.objectid; + + return 0; +} + +static int start_block_group_remapping(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + struct btrfs_trans_handle *trans; + bool bg_already_dirty = true; + int ret, ret2; + + ret = btrfs_cache_block_group(bg, true); + if (ret) + return ret; + + trans = btrfs_start_transaction(fs_info->remap_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* We need to run delayed refs, to make sure FST is up to date. */ + ret = btrfs_run_delayed_refs(trans, U64_MAX); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + + mutex_lock(&fs_info->remap_mutex); + + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) { + ret = 0; + goto end; + } + + ret = create_remap_tree_entries(trans, path, bg); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + spin_lock(&bg->lock); + bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED; + spin_unlock(&bg->lock); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); + + ret = mark_chunk_remapped(trans, path, bg->start); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + ret = btrfs_remove_block_group_free_space(trans, bg); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + btrfs_remove_free_space_cache(bg); + +end: + mutex_unlock(&fs_info->remap_mutex); + + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; + + return ret; +} + +static int do_nonremap_reloc(struct btrfs_fs_info *fs_info, bool verbose, + struct reloc_control *rc) +{ + int ret; + + while (1) { + enum reloc_stage finishes_stage; + + mutex_lock(&fs_info->cleaner_mutex); + ret = relocate_block_group(rc); + mutex_unlock(&fs_info->cleaner_mutex); + + finishes_stage = rc->stage; + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + int wb_ret; + + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), + 0, (u64)-1); + if (wb_ret && ret == 0) + ret = wb_ret; + invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + + if (ret < 0) + return ret; + + if (rc->extents_found == 0) + break; + + if (verbose) + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); + } + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(rc->block_group->used > 0); + + return 0; +} + /* * function to relocate all extents in a block group. 
*/ @@ -3870,7 +5297,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; - struct btrfs_path *path; + struct btrfs_path *path = NULL; int ret; bool bg_is_ro = false; @@ -3932,7 +5359,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, } inode = lookup_free_space_inode(rc->block_group, path); - btrfs_free_path(path); + btrfs_release_path(path); if (!IS_ERR(inode)) ret = delete_block_group_cache(rc->block_group, inode, 0); @@ -3942,11 +5369,13 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, if (ret && ret != -ENOENT) goto out; - rc->data_inode = create_reloc_inode(rc->block_group); - if (IS_ERR(rc->data_inode)) { - ret = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - goto out; + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) { + rc->data_inode = create_reloc_inode(rc->block_group); + if (IS_ERR(rc->data_inode)) { + ret = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } } if (verbose) @@ -3959,54 +5388,31 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); - while (1) { - enum reloc_stage finishes_stage; - - mutex_lock(&fs_info->cleaner_mutex); - ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); - - finishes_stage = rc->stage; - /* - * We may have gotten ENOSPC after we already dirtied some - * extents. If writeout happens while we're relocating a - * different block group we could end up hitting the - * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in - * btrfs_reloc_cow_block. Make sure we write everything out - * properly so we don't trip over this problem, and then break - * out of the loop if we hit an error. 
- */ - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - int wb_ret; - - wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, - (u64)-1); - if (wb_ret && ret == 0) - ret = wb_ret; - invalidate_mapping_pages(rc->data_inode->i_mapping, - 0, -1); - rc->stage = UPDATE_DATA_PTRS; + if (should_relocate_using_remap_tree(bg)) { + if (bg->remap_bytes != 0) { + ret = move_existing_remaps(fs_info, bg, path); + if (ret) + goto out; } - - if (ret < 0) + ret = start_block_group_remapping(fs_info, path, bg); + if (ret) goto out; - if (rc->extents_found == 0) - break; + ret = do_remap_reloc(fs_info, path, rc->block_group); + if (ret) + goto out; - if (verbose) - btrfs_info(fs_info, "found %llu extents, stage: %s", - rc->extents_found, - stage_to_string(finishes_stage)); + btrfs_delete_unused_bgs(fs_info); + } else { + ret = do_nonremap_reloc(fs_info, verbose, rc); } - WARN_ON(rc->block_group->pinned > 0); - WARN_ON(rc->block_group->reserved > 0); - WARN_ON(rc->block_group->used > 0); out: if (ret && bg_is_ro) btrfs_dec_block_group_ro(rc->block_group); - iput(rc->data_inode); + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) + iput(rc->data_inode); + btrfs_free_path(path); reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); @@ -4200,7 +5606,7 @@ out: btrfs_free_path(path); - if (ret == 0) { + if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { /* cleanup orphan inode in data relocation tree */ fs_root = btrfs_grab_root(fs_info->data_reloc_root); ASSERT(fs_root); @@ -4414,3 +5820,260 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) logical = fs_info->reloc_ctl->block_group->start; return logical; } + +static int insert_remap_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, + u64 old_addr, u64 length, u64 new_addr) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key; + struct btrfs_remap_item remap = { 0 }; + + if (old_addr == new_addr) { + /* Add new identity remap item. 
*/ + key.objectid = old_addr; + key.type = BTRFS_IDENTITY_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, + &key, 0); + if (ret) + return ret; + } else { + /* Add new remap item. */ + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &key, sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + btrfs_set_stack_remap_address(&remap, new_addr); + + write_extent_buffer(path->nodes[0], &remap, + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + /* Add new backref item. */ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &key, + sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + btrfs_set_stack_remap_address(&remap, old_addr); + + write_extent_buffer(path->nodes[0], &remap, + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), + sizeof(struct btrfs_remap_item)); + } + + btrfs_release_path(path); + + return 0; +} + +/* + * Punch a hole in the remap item or identity remap item pointed to by path, + * for the range [hole_start, hole_start + hole_length). 
+ */ +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *bg, + u64 hole_start, u64 hole_length) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + u64 hole_end, new_addr, remap_start, remap_length, remap_end; + u64 overlap_length; + bool is_identity_remap; + int identity_count_delta = 0; + + hole_end = hole_start + hole_length; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + is_identity_remap = (key.type == BTRFS_IDENTITY_REMAP_KEY); + + remap_start = key.objectid; + remap_length = key.offset; + remap_end = remap_start + remap_length; + + if (is_identity_remap) { + new_addr = remap_start; + } else { + struct btrfs_remap_item *remap_ptr; + + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + new_addr = btrfs_remap_address(leaf, remap_ptr); + } + + /* Delete old item. */ + ret = btrfs_del_item(trans, fs_info->remap_root, path); + btrfs_release_path(path); + if (ret) + return ret; + + if (is_identity_remap) { + identity_count_delta = -1; + } else { + /* Remove backref. */ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = remap_length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + return ret; + } + + ret = btrfs_del_item(trans, fs_info->remap_root, path); + + btrfs_release_path(path); + + if (ret) + return ret; + } + + /* If hole_start > remap_start, re-add the start of the remap item. */ + if (hole_start > remap_start) { + ret = insert_remap_item(trans, path, remap_start, + hole_start - remap_start, new_addr); + if (ret) + return ret; + + if (is_identity_remap) + identity_count_delta++; + } + + /* If hole_end < remap_end, re-add the end of the remap item. 
*/ + if (hole_end < remap_end) { + ret = insert_remap_item(trans, path, hole_end, + remap_end - hole_end, + hole_end - remap_start + new_addr); + if (ret) + return ret; + + if (is_identity_remap) + identity_count_delta++; + } + + if (identity_count_delta != 0) + adjust_identity_remap_count(trans, bg, identity_count_delta); + + overlap_length = min_t(u64, hole_end, remap_end) - + max_t(u64, hole_start, remap_start); + + if (!is_identity_remap) { + struct btrfs_block_group *dest_bg; + + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); + adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length); + btrfs_put_block_group(dest_bg); + ret = btrfs_add_to_free_space_tree(trans, + hole_start - remap_start + new_addr, + overlap_length); + if (ret) + return ret; + } + + ret = overlap_length; + + return ret; +} + +/* + * Return 1 if remove_range_from_remap_tree() has been called successfully, + * 0 if block group wasn't remapped, and a negative number on error. + */ +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_block_group *bg; + int ret, length; + + if (!(btrfs_super_incompat_flags(fs_info->super_copy) & + BTRFS_FEATURE_INCOMPAT_REMAP_TREE)) + return 0; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return 0; + + mutex_lock(&fs_info->remap_mutex); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { + mutex_unlock(&fs_info->remap_mutex); + btrfs_put_block_group(bg); + return 0; + } + + do { + key.objectid = bytenr; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret < 0) + goto end; + + leaf = path->nodes[0]; + if (path->slots[0] == 0) { + ret = -ENOENT; + goto end; + } + + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + 
if (found_key.type != BTRFS_IDENTITY_REMAP_KEY && + found_key.type != BTRFS_REMAP_KEY) { + ret = -ENOENT; + goto end; + } + + if (bytenr < found_key.objectid || + bytenr >= found_key.objectid + found_key.offset) { + ret = -ENOENT; + goto end; + } + + length = remove_range_from_remap_tree(trans, path, bg, bytenr, num_bytes); + if (length < 0) { + ret = length; + goto end; + } + + bytenr += length; + num_bytes -= length; + } while (num_bytes > 0); + + ret = 1; + +end: + mutex_unlock(&fs_info->remap_mutex); + + btrfs_put_block_group(bg); + btrfs_release_path(path); + + return ret; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5c36b3f84b57..d647823b5d13 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -12,6 +12,17 @@ struct btrfs_trans_handle; struct btrfs_ordered_extent; struct btrfs_pending_snapshot; +static inline bool should_relocate_using_remap_tree(const struct btrfs_block_group *bg) +{ + if (!btrfs_fs_incompat(bg->fs_info, REMAP_TREE)) + return false; + + if (bg->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) + return false; + + return true; +} + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, bool verbose); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -31,5 +42,11 @@ int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length); +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes); +int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, + struct btrfs_block_group *bg); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c 
index 6a7e297ab0a7..37a4173c0a0b 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -217,8 +217,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; - int err = 0; - int ret; path = btrfs_alloc_path(); if (!path) @@ -230,20 +228,19 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) while (1) { u64 root_objectid; + int ret; ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } + if (ret < 0) + return ret; leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(tree_root, path); if (ret < 0) - err = ret; - if (ret != 0) - break; + return ret; + else if (ret > 0) + return 0; leaf = path->nodes[0]; } @@ -252,34 +249,32 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) if (key.objectid != BTRFS_ORPHAN_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) - break; + return 0; root_objectid = key.offset; key.offset++; root = btrfs_get_fs_root(fs_info, root_objectid, false); - err = PTR_ERR_OR_ZERO(root); - if (err && err != -ENOENT) { + ret = PTR_ERR_OR_ZERO(root); + if (ret && ret != -ENOENT) { break; - } else if (err == -ENOENT) { + } else if (ret == -ENOENT) { struct btrfs_trans_handle *trans; - btrfs_release_path(path); - trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); - btrfs_handle_fs_error(fs_info, err, - "Failed to start trans to delete orphan item"); - break; + ret = PTR_ERR(trans); + btrfs_err(fs_info, + "failed to join transaction to delete orphan item: %d", + ret); + return ret; } - err = btrfs_del_orphan_item(trans, tree_root, - root_objectid); + ret = btrfs_del_orphan_item(trans, tree_root, root_objectid); btrfs_end_transaction(trans); - if (err) { - btrfs_handle_fs_error(fs_info, err, - "Failed to delete root orphan item"); - break; + if (ret) { + btrfs_err(fs_info, + "failed to delete root orphan item: %d", ret); + 
return ret; } continue; } @@ -307,7 +302,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - return err; + return 0; } /* drop the root item for 'key' from the tree root */ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a40ee41f42c6..2a64e2d50ced 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -6,7 +6,6 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> -#include <crypto/hash.h> #include "ctree.h" #include "discard.h" #include "volumes.h" @@ -718,7 +717,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); struct btrfs_header *header = first_kaddr; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; @@ -760,17 +759,16 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } /* Now check tree block csum. 
*/ - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, - fs_info->sectorsize - BTRFS_CSUM_SIZE); + btrfs_csum_init(&csum, fs_info->csum_type); + btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), - fs_info->sectorsize); + btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i), + fs_info->sectorsize); } - crypto_shash_final(shash, calculated_csum); + btrfs_csum_final(&csum, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); @@ -1690,15 +1688,15 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. */ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", - bg->start, logical_start, logical_end, bg->start + bg->length); + bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); /* Either error or not found. 
*/ if (ret) - goto out; + return ret; get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) @@ -1731,7 +1729,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) - goto out; + return ret; if (ret > 0) { ret = 0; break; @@ -1765,7 +1763,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, stripe->logical, stripe_end, stripe->csums, &csum_bitmap); if (ret < 0) - goto out; + return ret; if (ret > 0) ret = 0; @@ -1775,7 +1773,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); -out: + return ret; } @@ -2173,8 +2171,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, u64 full_stripe_start) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_path extent_path = { 0 }; - struct btrfs_path csum_path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(extent_path); + BTRFS_PATH_AUTO_RELEASE(csum_path); struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); @@ -2226,7 +2224,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) - goto out; + return ret; /* * No extent in this data stripe, need to manually mark them * initialized to make later read submission happy. 
@@ -2248,10 +2246,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, break; } } - if (all_empty) { - ret = 0; - goto out; - } + if (all_empty) + return 0; for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; @@ -2292,20 +2288,15 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); - ret = -EIO; - goto out; + return ret; } bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ - ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, - &extent_bitmap); -out: - btrfs_release_path(&extent_path); - btrfs_release_path(&csum_path); - return ret; + return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, + &extent_bitmap); } /* @@ -2328,7 +2319,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, int ret = 0; /* The range must be inside the bg */ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { @@ -2420,12 +2411,13 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, const u64 logical_increment = simple_stripe_full_stripe_len(map); const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); const u64 orig_physical = map->stripes[stripe_index].physical; + const u64 end = btrfs_block_group_end(bg); const int mirror_num = simple_stripe_mirror_num(map, stripe_index); u64 cur_logical = orig_logical; u64 cur_physical = orig_physical; int ret = 0; - while (cur_logical < bg->start + bg->length) { + while (cur_logical < end) { /* * Inside each stripe, RAID0 is just SINGLE, and RAID10 is * just RAID1, so we can reuse scrub_simple_mirror() to scrub diff 
--git a/fs/btrfs/send.c b/fs/btrfs/send.c index d8127a7120c2..3dcfdba018b5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6449,11 +6449,9 @@ static int process_extent(struct send_ctx *sctx, if (sctx->parent_root && !sctx->cur_inode_new) { ret = is_extent_unchanged(sctx, path, key); if (ret < 0) - goto out; - if (ret) { - ret = 0; + return ret; + if (ret) goto out_hole; - } } else { struct btrfs_file_extent_item *ei; u8 type; @@ -6469,31 +6467,25 @@ static int process_extent(struct send_ctx *sctx, * we have enough commands queued up to justify rev'ing * the send spec. */ - if (type == BTRFS_FILE_EXTENT_PREALLOC) { - ret = 0; - goto out; - } + if (type == BTRFS_FILE_EXTENT_PREALLOC) + return 0; /* Have a hole, just skip it. */ - if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { - ret = 0; - goto out; - } + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) + return 0; } } ret = find_extent_clone(sctx, path, key->objectid, key->offset, sctx->cur_inode_size, &found_clone); if (ret != -ENOENT && ret < 0) - goto out; + return ret; ret = send_write_or_clone(sctx, path, key, found_clone); if (ret) - goto out; + return ret; out_hole: - ret = maybe_send_hole(sctx, path, key); -out: - return ret; + return maybe_send_hole(sctx, path, key); } static int process_all_extents(struct send_ctx *sctx) @@ -6535,23 +6527,24 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { - int ret = 0; + int ret; if (sctx->cur_ino == 0) - goto out; + return 0; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) - goto out; + return 0; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) - goto out; + return 0; ret = process_recorded_refs(sctx, pending_move); if (ret < 0) - goto out; + return ret; *refs_processed = 1; -out: - return ret; + return 0; } static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) @@ 
-6768,7 +6761,7 @@ static void close_current_inode(struct send_ctx *sctx) static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; + int ret; struct btrfs_key *key = sctx->cmp_key; struct btrfs_inode_item *left_ii = NULL; struct btrfs_inode_item *right_ii = NULL; @@ -6860,7 +6853,7 @@ static int changed_inode(struct send_ctx *sctx, if (result == BTRFS_COMPARE_TREE_NEW) { if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; - goto out; + return 0; } sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; @@ -6888,7 +6881,7 @@ static int changed_inode(struct send_ctx *sctx, old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); if (new_nlinks == 0 && old_nlinks == 0) { sctx->ignore_cur_inode = true; - goto out; + return 0; } else if (new_nlinks == 0 || old_nlinks == 0) { sctx->cur_inode_new_gen = 1; } @@ -6914,7 +6907,7 @@ static int changed_inode(struct send_ctx *sctx, ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_DELETED); if (ret < 0) - goto out; + return ret; } /* @@ -6935,11 +6928,11 @@ static int changed_inode(struct send_ctx *sctx, left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) - goto out; + return ret; ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); if (ret < 0) - goto out; + return ret; /* * Advance send_progress now as we did not get * into process_recorded_refs_if_needed in the @@ -6953,10 +6946,10 @@ static int changed_inode(struct send_ctx *sctx, */ ret = process_all_extents(sctx); if (ret < 0) - goto out; + return ret; ret = process_all_new_xattrs(sctx); if (ret < 0) - goto out; + return ret; } } else { sctx->cur_inode_gen = left_gen; @@ -6970,8 +6963,7 @@ static int changed_inode(struct send_ctx *sctx, } } -out: - return ret; + return 0; } /* @@ -7104,20 +7096,20 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, u32 item_size; u32 cur_offset = 0; int ref_name_len; - int ret = 0; /* Easy case, 
just check this one dirid */ if (key->type == BTRFS_INODE_REF_KEY) { dirid = key->offset; - ret = dir_changed(sctx, dirid); - goto out; + return dir_changed(sctx, dirid); } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { + int ret; + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); dirid = btrfs_inode_extref_parent(leaf, extref); @@ -7127,11 +7119,10 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, continue; ret = dir_changed(sctx, dirid); if (ret) - break; + return ret; last_dirid = dirid; } -out: - return ret; + return 0; } /* @@ -7212,12 +7203,12 @@ static int changed_cb(struct btrfs_path *left_path, ret = finish_inode_if_needed(sctx, 0); if (ret < 0) - goto out; + return ret; /* Ignore non-FS objects */ if (key->objectid == BTRFS_FREE_INO_OBJECTID || key->objectid == BTRFS_FREE_SPACE_OBJECTID) - goto out; + return 0; if (key->type == BTRFS_INODE_ITEM_KEY) { ret = changed_inode(sctx, result); @@ -7234,7 +7225,6 @@ static int changed_cb(struct btrfs_path *left_path, ret = changed_verity(sctx, result); } -out: return ret; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3f08e450f796..bb5aac7ee9d2 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) return SZ_32M; /* Handle BTRFS_BLOCK_GROUP_METADATA */ @@ -329,7 +329,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) struct btrfs_super_block *disk_super; u64 features; u64 flags; - int mixed = 0; + bool mixed = false; int ret; disk_super = fs_info->super_copy; @@ -338,26 +338,35 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) features = 
btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; + mixed = true; flags = BTRFS_BLOCK_GROUP_SYSTEM; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + return ret; } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + return ret; + } + + if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; + ret = create_space_info(fs_info, flags); } -out: + return ret; } @@ -370,8 +379,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, factor = btrfs_bg_type_to_factor(block_group->flags); spin_lock(&space_info->lock); - space_info->total_bytes += block_group->length; - space_info->disk_total += block_group->length * factor; + + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) || + block_group->identity_remap_count != 0) { + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + } + space_info->bytes_used += block_group->used; space_info->disk_used += block_group->used * factor; space_info->bytes_readonly += block_group->bytes_super; @@ -606,27 +620,12 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) -static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) -{ - switch (space_info->flags) { - case BTRFS_BLOCK_GROUP_SYSTEM: - return "SYSTEM"; - case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: - return "DATA+METADATA"; - case BTRFS_BLOCK_GROUP_DATA: - return "DATA"; - case BTRFS_BLOCK_GROUP_METADATA: - return "METADATA"; - default: - return "UNKNOWN"; - } -} - static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) { DUMP_BLOCK_RSV(fs_info, global_block_rsv); 
DUMP_BLOCK_RSV(fs_info, trans_block_rsv); DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, remap_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } @@ -634,7 +633,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) static void __btrfs_dump_space_info(const struct btrfs_space_info *info) { const struct btrfs_fs_info *fs_info = info->fs_info; - const char *flag_str = space_info_flag_to_str(info); + const char *flag_str = btrfs_space_info_type_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ @@ -672,8 +671,7 @@ again: u64 avail; spin_lock(&cache->lock); - avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->bytes_super - cache->zone_unusable; + avail = btrfs_block_group_available_space(cache); btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, @@ -2099,11 +2097,11 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) +static bool do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; - bool try_again = true; + bool will_reclaim = false; bool urgent; spin_lock(&space_info->lock); @@ -2121,7 +2119,7 @@ again: spin_lock(&bg->lock); thresh = mult_perc(bg->length, thresh_pct); if (bg->used < thresh && bg->reclaim_mark) { - try_again = false; + will_reclaim = true; reclaim = true; } bg->reclaim_mark++; @@ -2138,12 +2136,13 @@ again: * If we have any staler groups, we don't touch the fresher ones, but if we * really need a block group, do take a fresh one. 
*/ - if (try_again && urgent) { - try_again = false; + if (!will_reclaim && urgent) { + urgent = false; goto again; } up_read(&space_info->groups_sem); + return will_reclaim; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2153,7 +2152,8 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6 lockdep_assert_held(&space_info->lock); space_info->reclaimable_bytes += bytes; - if (space_info->reclaimable_bytes >= chunk_sz) + if (space_info->reclaimable_bytes > 0 && + space_info->reclaimable_bytes >= chunk_sz) btrfs_set_periodic_reclaim_ready(space_info, true); } @@ -2180,7 +2180,6 @@ static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); ret = space_info->periodic_reclaim_ready; - btrfs_set_periodic_reclaim_ready(space_info, false); spin_unlock(&space_info->lock); return ret; @@ -2194,8 +2193,10 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(space_info, raid); + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { + if (do_reclaim_sweep(space_info, raid)) + btrfs_set_periodic_reclaim_ready(space_info, false); + } } } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 446c0614ad4a..0703f24b23f7 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -307,4 +307,20 @@ int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); +static inline const char *btrfs_space_info_type_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | 
BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index af56fdbba65d..d64d303b6edc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2483,6 +2483,18 @@ static void btrfs_shutdown(struct super_block *sb) } #endif +static int btrfs_show_stats(struct seq_file *seq, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + + if (btrfs_is_zoned(fs_info)) { + btrfs_show_zoned_stats(fs_info, seq); + return 0; + } + + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2498,6 +2510,7 @@ static const struct super_operations btrfs_super_ops = { .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, + .show_stats = btrfs_show_stats, #ifdef CONFIG_BTRFS_EXPERIMENTAL .remove_bdev = btrfs_remove_bdev, .shutdown = btrfs_shutdown, @@ -2700,7 +2713,3 @@ module_exit(exit_btrfs_fs) MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: xxhash64"); -MODULE_SOFTDEP("pre: sha256"); -MODULE_SOFTDEP("pre: blake2b-256"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4b3c2acac51a..27bfb7b55ec4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -11,7 +11,6 @@ #include <linux/bug.h> #include <linux/list.h> #include <linux/string_choices.h> -#include <crypto/hash.h> #include "messages.h" #include "ctree.h" #include "discard.h" @@ -300,6 +299,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. 
*/ BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); +/* Remove once support for remap tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -332,6 +333,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), + BTRFS_FEAT_ATTR_PTR(remap_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -1253,10 +1255,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); + const char *csum_name = btrfs_super_csum_name(csum_type); - return sysfs_emit(buf, "%s (%s)\n", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(fs_info->csum_shash)); + return sysfs_emit(buf, "%s (%s-lib)\n", csum_name, csum_name); } BTRFS_ATTR(, checksum, btrfs_checksum_show); @@ -1540,47 +1541,6 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_EXPERIMENTAL -static ssize_t btrfs_offload_csum_show(struct kobject *kobj, - struct kobj_attribute *a, char *buf) -{ - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - - switch (READ_ONCE(fs_devices->offload_csum_mode)) { - case BTRFS_OFFLOAD_CSUM_AUTO: - return sysfs_emit(buf, "auto\n"); - case BTRFS_OFFLOAD_CSUM_FORCE_ON: - return sysfs_emit(buf, "1\n"); - case BTRFS_OFFLOAD_CSUM_FORCE_OFF: - return sysfs_emit(buf, "0\n"); - default: - WARN_ON(1); - return -EINVAL; - } -} - -static ssize_t btrfs_offload_csum_store(struct kobject *kobj, - struct kobj_attribute *a, const char *buf, - size_t len) -{ - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret == 0) - 
WRITE_ONCE(fs_devices->offload_csum_mode, - val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); - else if (ret == -EINVAL && sysfs_streq(buf, "auto")) - WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); - else - return -EINVAL; - - return len; -} -BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); -#endif - /* * Per-filesystem information and stats. * @@ -1600,9 +1560,6 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_EXPERIMENTAL - BTRFS_ATTR_PTR(, offload_csum), -#endif NULL, }; @@ -1972,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info) case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; + case BTRFS_BLOCK_GROUP_METADATA_REMAP: + return "metadata-remap"; default: WARN_ON(1); return "invalid-combination"; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b576897d71cc..7f13c05d3736 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -301,6 +301,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_delayed_refs(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_chunk_allocation(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 4307bdaa6749..b03d85a6e5ef 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -7,8 +7,10 @@ #define BTRFS_TESTS_H #include <linux/types.h> +#include <linux/cleanup.h> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) @@ -45,13 +47,18 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); +int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); +DEFINE_FREE(btrfs_free_dummy_fs_info, struct btrfs_fs_info *, + btrfs_free_dummy_fs_info(_T)) void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); +DEFINE_FREE(btrfs_free_dummy_block_group, struct btrfs_block_group *, + btrfs_free_dummy_block_group(_T)); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tests/chunk-allocation-tests.c b/fs/btrfs/tests/chunk-allocation-tests.c new file mode 100644 index 000000000000..9beb0602fc8c --- /dev/null +++ b/fs/btrfs/tests/chunk-allocation-tests.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 Meta. All rights reserved. + */ + +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../extent-io-tree.h" + +/* + * Tests for chunk allocator pending extent internals. + * These two functions form the core of searching the chunk allocation pending + * extent bitmap and have relatively easily definable semantics, so unit + * testing them can help ensure the correctness of chunk allocation. 
+ */ + +/* + * Describes the inputs to the system and expected results + * when testing btrfs_find_hole_in_pending_extents(). + */ +struct pending_extent_test_case { + const char *name; + /* Input range to search. */ + u64 hole_start; + u64 hole_len; + /* The size of hole we are searching for. */ + u64 min_hole_size; + /* + * Pending extents to set up (up to 2 for up to 3 holes) + * If len == 0, then it is skipped. + */ + struct { + u64 start; + u64 len; + } pending_extents[2]; + /* Expected outputs. */ + bool expected_found; + u64 expected_start; + u64 expected_len; +}; + +static const struct pending_extent_test_case find_hole_tests[] = { + { + .name = "no pending extents", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { }, + .expected_found = true, + .expected_start = 0, + .expected_len = 10ULL * SZ_1G, + }, + { + .name = "pending extent at start of range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_1G, + .expected_len = 9ULL * SZ_1G, + }, + { + .name = "pending extent overlapping start of range", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = SZ_2G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 8ULL * SZ_1G, + }, + { + .name = "two holes; first hole is exactly big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = SZ_1G, + }, + { + .name = "two holes; first hole is big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "two 
holes; second hole is big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 8ULL * SZ_1G, + }, + { + .name = "three holes; first hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "three holes; second hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 5ULL * SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 3ULL * SZ_1G, + }, + { + .name = "three holes; third hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = true, + .expected_start = 8ULL * SZ_1G, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 6ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 0, + .expected_len = SZ_1G, + }, + { + .name = "three holes; all holes too small; first biggest", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small; second biggest", + .hole_start = 0, + .hole_len = 10ULL * 
SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = SZ_2G, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small; third biggest", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 8ULL * SZ_1G, + .expected_len = SZ_2G, + }, + { + .name = "hole entirely allocated by pending", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = 10ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 10ULL * SZ_1G, + .expected_len = 0, + }, + { + .name = "pending extent at end of range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 9ULL * SZ_1G, .len = SZ_2G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = 9ULL * SZ_1G, + }, + { + .name = "zero length input", + .hole_start = SZ_1G, + .hole_len = 0, + .min_hole_size = SZ_1G, + .pending_extents = { }, + .expected_found = false, + .expected_start = SZ_1G, + .expected_len = 0, + }, +}; + +static int test_find_hole_in_pending(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_device *device; + int ret = 0; + + test_msg("running find_hole_in_pending_extents tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + device = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(device)) { + test_err("failed to allocate dummy device"); + ret = PTR_ERR(device); + goto out_free_fs_info; + } + device->fs_info = fs_info; + + for (int i = 0; i < ARRAY_SIZE(find_hole_tests); i++) { + const struct 
pending_extent_test_case *test_case = &find_hole_tests[i]; + u64 hole_start = test_case->hole_start; + u64 hole_len = test_case->hole_len; + bool found; + + for (int j = 0; j < ARRAY_SIZE(test_case->pending_extents); j++) { + u64 start = test_case->pending_extents[j].start; + u64 len = test_case->pending_extents[j].len; + + if (!len) + continue; + btrfs_set_extent_bit(&device->alloc_state, + start, start + len - 1, + CHUNK_ALLOCATED, NULL); + } + + mutex_lock(&fs_info->chunk_mutex); + found = btrfs_find_hole_in_pending_extents(device, &hole_start, &hole_len, + test_case->min_hole_size); + mutex_unlock(&fs_info->chunk_mutex); + + if (found != test_case->expected_found) { + test_err("%s: expected found=%d, got found=%d", + test_case->name, test_case->expected_found, found); + ret = -EINVAL; + goto out_clear_pending_extents; + } + if (hole_start != test_case->expected_start || + hole_len != test_case->expected_len) { + test_err("%s: expected [%llu, %llu), got [%llu, %llu)", + test_case->name, test_case->expected_start, + test_case->expected_start + + test_case->expected_len, + hole_start, hole_start + hole_len); + ret = -EINVAL; + goto out_clear_pending_extents; + } +out_clear_pending_extents: + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, + CHUNK_ALLOCATED, NULL); + if (ret) + break; + } + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +/* + * Describes the inputs to the system and expected results + * when testing btrfs_first_pending_extent(). + */ +struct first_pending_test_case { + const char *name; + /* The range to look for a pending extent in. */ + u64 hole_start; + u64 hole_len; + /* The pending extent to look for. */ + struct { + u64 start; + u64 len; + } pending_extent; + /* Expected outputs. 
*/ + bool expected_found; + u64 expected_pending_start; + u64 expected_pending_end; +}; + +static const struct first_pending_test_case first_pending_tests[] = { + { + .name = "no pending extent", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .pending_extent = { 0, 0 }, + .expected_found = false, + }, + { + .name = "pending extent at search start", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .pending_extent = { SZ_1G, SZ_1G }, + .expected_found = true, + .expected_pending_start = SZ_1G, + .expected_pending_end = SZ_2G - 1, + }, + { + .name = "pending extent overlapping search start", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .pending_extent = { 0, SZ_2G }, + .expected_found = true, + .expected_pending_start = 0, + .expected_pending_end = SZ_2G - 1, + }, + { + .name = "pending extent inside search range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .pending_extent = { SZ_2G, SZ_1G }, + .expected_found = true, + .expected_pending_start = SZ_2G, + .expected_pending_end = 3ULL * SZ_1G - 1, + }, + { + .name = "pending extent outside search range", + .hole_start = 0, + .hole_len = SZ_1G, + .pending_extent = { SZ_2G, SZ_1G }, + .expected_found = false, + }, + { + .name = "pending extent overlapping end of search range", + .hole_start = 0, + .hole_len = SZ_2G, + .pending_extent = { SZ_1G, SZ_2G }, + .expected_found = true, + .expected_pending_start = SZ_1G, + .expected_pending_end = 3ULL * SZ_1G - 1, + }, +}; + +static int test_first_pending_extent(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_device *device; + int ret = 0; + + test_msg("running first_pending_extent tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + device = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(device)) { + test_err("failed to allocate dummy device"); + ret = PTR_ERR(device); + goto out_free_fs_info; + } + + device->fs_info = 
fs_info; + + for (int i = 0; i < ARRAY_SIZE(first_pending_tests); i++) { + const struct first_pending_test_case *test_case = &first_pending_tests[i]; + u64 start = test_case->pending_extent.start; + u64 len = test_case->pending_extent.len; + u64 pending_start, pending_end; + bool found; + + if (len) { + btrfs_set_extent_bit(&device->alloc_state, + start, start + len - 1, + CHUNK_ALLOCATED, NULL); + } + + mutex_lock(&fs_info->chunk_mutex); + found = btrfs_first_pending_extent(device, test_case->hole_start, + test_case->hole_len, + &pending_start, &pending_end); + mutex_unlock(&fs_info->chunk_mutex); + + if (found != test_case->expected_found) { + test_err("%s: expected found=%d, got found=%d", + test_case->name, test_case->expected_found, found); + ret = -EINVAL; + goto out_clear_pending_extents; + } + if (!found) + goto out_clear_pending_extents; + + if (pending_start != test_case->expected_pending_start || + pending_end != test_case->expected_pending_end) { + test_err("%s: expected pending [%llu, %llu], got [%llu, %llu]", + test_case->name, + test_case->expected_pending_start, + test_case->expected_pending_end, + pending_start, pending_end); + ret = -EINVAL; + goto out_clear_pending_extents; + } + +out_clear_pending_extents: + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, + CHUNK_ALLOCATED, NULL); + if (ret) + break; + } + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize) +{ + int ret; + + test_msg("running chunk allocation tests"); + + ret = test_first_pending_extent(sectorsize, nodesize); + if (ret) + return ret; + + ret = test_find_hole_in_pending(sectorsize, nodesize); + if (ret) + return ret; + + return 0; +} diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index aabf825e8d7b..811f36d41101 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -173,9 +173,12 @@ static int test_case_2(struct 
btrfs_fs_info *fs_info, struct btrfs_inode *inode) return -ENOMEM; } - /* Add [0, 1K) */ + /* + * Add [0, 1K) which is inlined. And the extent map length must + * be one block. + */ em->start = 0; - em->len = SZ_1K; + em->len = SZ_4K; em->disk_bytenr = EXTENT_MAP_INLINE; em->disk_num_bytes = 0; em->ram_bytes = SZ_1K; @@ -219,7 +222,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Add [0, 1K) */ em->start = 0; - em->len = SZ_1K; + em->len = SZ_4K; em->disk_bytenr = EXTENT_MAP_INLINE; em->disk_num_bytes = 0; em->ram_bytes = SZ_1K; @@ -235,7 +238,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_4K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", @@ -1131,8 +1134,11 @@ int btrfs_test_extent_map(void) /* * Note: the fs_info is not set up completely, we only need * fs_info::fsid for the tracepoint. + * + * And all the immediate numbers are based on 4K blocksize, + * thus we have to use 4K as sectorsize no matter the page size. 
*/ - fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + fs_info = btrfs_alloc_dummy_fs_info(SZ_4K, SZ_4K); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index c8822edd32e2..8dee057f41fd 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -49,7 +49,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { if (path->slots[0] != 0) goto invalid; - end = cache->start + cache->length; + end = btrfs_block_group_end(cache); i = 0; while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -216,7 +216,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans, int ret; ret = __btrfs_remove_from_free_space_tree(trans, cache, path, - cache->start + cache->length - alignment, + btrfs_block_group_end(cache) - alignment, alignment); if (ret) { test_err("could not remove free space"); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a4c2b7748b95..b04fbcaf0a1d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -81,17 +81,20 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [inline][hole but no extent][ hole ][ regular ][regular1 split] + * The numbers are using 4K fs block size as an example, the real test will scale + * all the extent maps (except the inlined one) according to the block size. 
* - * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] - * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] + * [ 0 - 6 ][ 6 - 4K ][ 4K - 8K ][ 8K - 12K ] + * [ inline ][ implied hole ][ regular ][ regular1 split ] * - * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] - * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] + * [ 12K - 16K ][ 16K - 24K ][ 24K - 28K ][ 28K - 32K ][ 32K - 36K ] + * [ hole ][ regular1 split ][ prealloc ][ prealloc1 ][ prealloc1 written ] * - * [69635-73731][ 73731 - 86019 ][86019-90115] - * [ regular ][ hole but no extent][ regular ] + * [ 36K - 44K ][ 44K - 52K ][ 52K - 56K ][ 56K - 60K ][ 60K - 68 K ] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1 ] + * + * [ 68K - 72K ][ 72K - 84K ][ 84K - 88K ] + * [ regular ][ hole but no extent ][ regular ] */ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { @@ -100,6 +103,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 offset = 0; /* + * Start 0, length 6, inlined. + * * Tree-checker has strict limits on inline extents that they can only * exist at file offset 0, thus we can only have one inline file extent * at most. @@ -109,20 +114,18 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) slot++; offset = sectorsize; - /* Now another hole */ - insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); + /* Start 1 * blocksize, length 1 * blocksize, regular. */ + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4; - /* Now for a regular extent */ - insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, - disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot); - slot++; - disk_bytenr += sectorsize; - offset += sectorsize - 1; + /* We don't want the regular em merged with the next one. 
*/ + disk_bytenr += 2 * sectorsize; + offset += sectorsize; /* + * Start 2 * blocksize, length 1 * blocksize, regular. + * * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ @@ -130,10 +133,14 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 3 * blocksize, length 1 * blocksize, regular, explicit hole. */ insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 4 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); @@ -141,7 +148,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 4 * sectorsize; - /* Now for a unwritten prealloc extent */ + /* Start 6 * blocksize, length 1 * blocksize, preallocated. */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; @@ -154,6 +161,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) disk_bytenr += 2 * sectorsize; /* + * Start 7 * blocksize, length 1 * blocksize, prealloc. + * * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. @@ -162,11 +171,15 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; offset += sectorsize; + + /* Start 8 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 9 * blocksize, length 2 * blocksize, prealloc. 
*/ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); @@ -174,7 +187,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 4 * sectorsize; - /* Now a normal compressed extent */ + /* Start 11 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); @@ -183,17 +196,21 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) /* No merges */ disk_bytenr += 2 * sectorsize; - /* Now a split compressed extent */ + /* Start 13 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; offset += sectorsize; + + /* Start 14 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 15 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); @@ -201,12 +218,19 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 2 * sectorsize; - /* Now extents that have a hole but no hole extent */ + /* Start 17 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += 4 * sectorsize; disk_bytenr += sectorsize; + + /* + * Start 18 * blocksize, length 3 * blocksize, implied hole (aka no + * file extent item). + * + * Start 21 * blocksize, length 1 * blocksize, regular. 
+ */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } @@ -313,29 +337,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * unless we have a page for it to write into. Maybe we should change * this? */ - offset = em->start + em->len; - btrfs_free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } - if (em->disk_bytenr != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->disk_bytenr); - goto out; - } - if (em->start != offset || em->len != 4) { - test_err( - "unexpected extent wanted start %llu len 4, got start %llu len %llu", - offset, em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %u", em->flags); - goto out; - } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Regular extent */ @@ -348,10 +350,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } - if (em->start != offset || em->len != sectorsize - 1) { + if (em->start != offset || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 4095, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -362,7 +364,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* The next 3 are split extents */ @@ -391,7 +393,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = 
btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -413,7 +415,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -446,7 +448,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Prealloc extent */ @@ -474,7 +476,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ @@ -504,7 +506,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -536,7 +538,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -569,7 +571,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = 
btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Now for the compressed extent */ @@ -602,7 +604,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Split compressed extent */ @@ -637,7 +639,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -663,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -697,7 +699,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ @@ -724,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); @@ -756,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); diff --git 
a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bd03f465e2d3..0b2498749b1e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -950,7 +950,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (transid) { if (transid <= btrfs_get_last_trans_committed(fs_info)) - goto out; + return 0; /* find specified transaction */ spin_lock(&fs_info->trans_lock); @@ -975,7 +975,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (!cur_trans) { if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; - goto out; + return ret; } } else { /* find newest transaction that is committing | committed */ @@ -991,14 +991,15 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) } } spin_unlock(&fs_info->trans_lock); + /* Nothing committing or committed. */ if (!cur_trans) - goto out; /* nothing committing|committed */ + return ret; } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); -out: + return ret; } @@ -1515,7 +1516,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); ret2 = btrfs_update_reloc_root(trans, root); - if (ret2) + if (unlikely(ret2)) return ret2; /* see comments in should_cow_block() */ @@ -1532,7 +1533,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); - if (ret2) + if (unlikely(ret2)) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); } @@ -1621,9 +1622,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); - if (ret) - btrfs_handle_fs_error(fs_info, ret, - "Error while writing out transaction for qgroup"); + if (unlikely(ret)) + btrfs_err(fs_info, +"error while writing out transaction during qgroup snapshot accounting: %d", ret); out: /* @@ 
-1687,11 +1688,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); - if (pending->error) + if (unlikely(pending->error)) goto free_pending; pending->error = btrfs_get_free_objectid(tree_root, &objectid); - if (pending->error) + if (unlikely(pending->error)) goto free_fname; /* @@ -1707,7 +1708,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); - if (pending->error) + if (unlikely(pending->error)) goto clear_skip_qgroup; } @@ -1719,7 +1720,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->bytes_reserved, 1); parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); - if (ret) + if (unlikely(ret)) goto fail; cur_time = current_time(&parent_inode->vfs_inode); @@ -1736,7 +1737,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(parent_inode), &fname.disk_name, 0); - if (dir_item != NULL && !IS_ERR(dir_item)) { + if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) { pending->error = -EEXIST; goto dir_item_existed; } else if (IS_ERR(dir_item)) { @@ -1873,7 +1874,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, btrfs_root_id(parent_root), pending->inherit); - if (ret < 0) + if (unlikely(ret < 0)) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, @@ -1939,7 +1940,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); ret = create_pending_snapshot(trans, pending); - if (ret) + if (unlikely(ret)) break; } return ret; @@ -1967,6 +1968,13 
@@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + root_item = &fs_info->remap_root->root_item; + super->remap_root = root_item->bytenr; + super->remap_root_generation = root_item->generation; + super->remap_root_level = root_item->level; + } } int btrfs_transaction_blocked(struct btrfs_fs_info *info) @@ -2258,7 +2266,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - if (ret) + if (unlikely(ret)) goto lockdep_trans_commit_start_release; } } @@ -2308,7 +2316,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); - if (ret) + if (unlikely(ret)) goto lockdep_release; spin_lock(&fs_info->trans_lock); } @@ -2338,11 +2346,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); - if (ret) + if (unlikely(ret)) goto lockdep_release; ret = btrfs_run_delayed_items(trans); - if (ret) + if (unlikely(ret)) goto lockdep_release; /* @@ -2357,7 +2365,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); - if (ret) { + if (unlikely(ret)) { btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; } @@ -2429,7 +2437,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2443,11 +2451,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * the nodes and leaves. 
*/ ret = btrfs_run_delayed_items(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2459,7 +2467,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) WARN_ON(cur_trans != trans->transaction); ret = commit_fs_roots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* commit_fs_roots gets rid of all the tree log roots, it is now @@ -2472,11 +2480,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * new_roots. So let's do quota accounting. */ ret = btrfs_qgroup_account_extents(trans); - if (ret < 0) + if (unlikely(ret < 0)) goto unlock_reloc; ret = commit_cowonly_roots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2500,13 +2508,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_set_root_node(&fs_info->block_group_root->root_item, - fs_info->block_group_root->node); - list_add_tail(&fs_info->block_group_root->dirty_list, - &cur_trans->switch_commits); - } - switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); @@ -2550,9 +2551,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up_process(fs_info->cleaner_kthread); ret = btrfs_write_and_wait_transaction(trans); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Error while writing out transaction"); + if (unlikely(ret)) { + btrfs_err(fs_info, "error while writing out transaction: %d", ret); mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2563,7 +2563,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); - if (ret) + if (unlikely(ret)) goto scrub_continue; update_commit_stats(fs_info); @@ -2576,7 +2576,7 @@ int btrfs_commit_transaction(struct 
btrfs_trans_handle *trans) btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); ret = btrfs_finish_extent_commit(trans); - if (ret) + if (unlikely(ret)) goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c21c21adf61e..452394b34d01 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf, u64 chunk_objectid; u64 flags; u64 type; + size_t exp_size; /* * Here we don't really care about alignment since extent allocator can @@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (unlikely(item_size != sizeof(bgi))) { + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + exp_size = sizeof(struct btrfs_block_group_item_v2); + else + exp_size = sizeof(struct btrfs_block_group_item); + + if (unlikely(item_size != exp_size)) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", - item_size, sizeof(bgi)); + item_size, exp_size); return -EUCLEAN; } @@ -748,17 +754,26 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP && + !btrfs_fs_incompat(fs_info, REMAP_TREE))) { + block_group_err(leaf, slot, +"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag", + flags); + return -EUCLEAN; + } + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && + type != BTRFS_BLOCK_GROUP_METADATA_REMAP && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, 
hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } @@ -807,6 +822,32 @@ static void chunk_err(const struct btrfs_fs_info *fs_info, va_end(args); } +static bool valid_stripe_count(u64 profile, u16 num_stripes, u16 sub_stripes) +{ + switch (profile) { + case BTRFS_BLOCK_GROUP_RAID0: + return true; + case BTRFS_BLOCK_GROUP_RAID10: + return sub_stripes == btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes; + case BTRFS_BLOCK_GROUP_RAID1: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1].devs_min; + case BTRFS_BLOCK_GROUP_RAID1C3: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min; + case BTRFS_BLOCK_GROUP_RAID1C4: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min; + case BTRFS_BLOCK_GROUP_RAID5: + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID5].devs_min; + case BTRFS_BLOCK_GROUP_RAID6: + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID6].devs_min; + case BTRFS_BLOCK_GROUP_DUP: + return num_stripes == btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes; + case 0: /* SINGLE */ + return num_stripes == btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes; + default: + BUG(); + } +} + /* * The common chunk check which could also work on super block sys chunk array. 
* @@ -830,6 +871,7 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, u64 features; u32 chunk_sector_size; bool mixed = false; + bool remapped; int raid_index; int nparity; int ncopies; @@ -852,13 +894,14 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; + remapped = (type & BTRFS_BLOCK_GROUP_REMAPPED); - if (unlikely(!num_stripes)) { + if (unlikely(!remapped && !num_stripes)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } - if (unlikely(num_stripes < ncopies)) { + if (unlikely(num_stripes != 0 && num_stripes < ncopies)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); @@ -913,12 +956,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) { chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); + type & ~BTRFS_BLOCK_GROUP_VALID); return -EUCLEAN; } @@ -958,22 +999,9 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, } } - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && - sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || - (type & BTRFS_BLOCK_GROUP_RAID1 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID1C3 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID1C4 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID5 && - num_stripes < 
btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID6 && - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || - (type & BTRFS_BLOCK_GROUP_DUP && - num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { + if (!remapped && + !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK, + num_stripes, sub_stripes)) { chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -997,11 +1025,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; - if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { + if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) { chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), - sizeof(struct btrfs_chunk), + offsetof(struct btrfs_chunk, stripe), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index eb201f4ec3c7..833e2fd989eb 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -57,6 +57,11 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; + +#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \ + BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_BLOCK_GROUP_REMAPPED) + /* * Exported simply for btrfs-progs which wants to have the * btrfs_tree_block_status return codes. 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6cffcf0c3e7a..e1bd03ebfd98 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5160,7 +5160,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; - drop_args.end = em->start + em->len; + drop_args.end = btrfs_extent_map_end(em); drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index e3a1310fa7d5..f24c14b9bb2f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -207,15 +207,11 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, /* 1 - for the uuid item */ trans = btrfs_start_transaction(uuid_root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); btrfs_end_transaction(trans); - -out: return ret; } @@ -235,14 +231,14 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, if (type != BTRFS_UUID_KEY_SUBVOL && type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; + return 0; subvol_root = btrfs_get_fs_root(fs_info, subvolid, true); if (IS_ERR(subvol_root)) { ret = PTR_ERR(subvol_root); if (ret == -ENOENT) - ret = 1; - goto out; + return 1; + return ret; } switch (type) { @@ -257,7 +253,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, break; } btrfs_put_root(subvol_root); -out: + return ret; } diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index a2ac3fb68bc8..06cbd6f00a78 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -525,23 +525,21 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, (const char *)&item, sizeof(item)); if (ret) - goto out; + return ret; /* Write out the descriptor 
itself */ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, desc, desc_size); if (ret) - goto out; + return ret; /* * 1 for updating the inode flag * 1 for deleting the orphan */ trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); @@ -554,8 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, btrfs_set_fs_compat_ro(root->fs_info, VERITY); end_trans: btrfs_end_transaction(trans); -out: - return ret; + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8a08412f3529..f281d113519b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,10 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + /* Block groups containing the remap tree. */ + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap"); + /* Block group that has been remapped. */ + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) @@ -1169,7 +1173,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) * any transaction and set the error state, guaranteeing no commits of * unsafe super blocks. 
*/ - device->last_flush_error = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); /* Verify the device is back in a pristine state */ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); @@ -1505,30 +1509,158 @@ error_bdev_put: } /* - * Try to find a chunk that intersects [start, start + len] range and when one - * such is found, record the end of it in *start + * Find the first pending extent intersecting a range. + * + * @device: the device to search + * @start: start of the range to check + * @len: length of the range to check + * @pending_start: output pointer for the start of the found pending extent + * @pending_end: output pointer for the end of the found pending extent (inclusive) + * + * Search for a pending chunk allocation that intersects the half-open range + * [start, start + len). + * + * Return: true if a pending extent was found, false otherwise. + * If the return value is true, store the first pending extent in + * [*pending_start, *pending_end]. Otherwise, the two output variables + * may still be modified, to something outside the range and should not + * be used. 
*/ -static bool contains_pending_extent(struct btrfs_device *device, u64 *start, - u64 len) +bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, + u64 *pending_start, u64 *pending_end) { - u64 physical_start, physical_end; - lockdep_assert_held(&device->fs_info->chunk_mutex); - if (btrfs_find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, + if (btrfs_find_first_extent_bit(&device->alloc_state, start, + pending_start, pending_end, CHUNK_ALLOCATED, NULL)) { - if (in_range(physical_start, *start, len) || - in_range(*start, physical_start, - physical_end + 1 - physical_start)) { - *start = physical_end + 1; + if (in_range(*pending_start, start, len) || + in_range(start, *pending_start, *pending_end + 1 - *pending_start)) { return true; } } return false; } +/* + * Find the first real hole accounting for pending extents. + * + * @device: the device containing the candidate hole + * @start: input/output pointer for the hole start position + * @len: input/output pointer for the hole length + * @min_hole_size: the size of hole we are looking for + * + * Given a potential hole specified by [*start, *start + *len), check for pending + * chunk allocations within that range. If pending extents are found, the hole is + * adjusted to represent the first true free space that is large enough when + * accounting for pending chunks. + * + * Note that this function must handle various cases involving non consecutive + * pending extents. + * + * Returns: true if a suitable hole was found and false otherwise. + * If the return value is true, then *start and *len are set to represent the hole. + * If the return value is false, then *start is set to the largest hole we + * found and *len is set to its length. + * If there are no holes at all, then *start is set to the end of the range and + * *len is set to 0. 
+ */ +bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, u64 *start, + u64 *len, u64 min_hole_size) +{ + u64 pending_start, pending_end; + u64 end; + u64 max_hole_start = 0; + u64 max_hole_len = 0; + + lockdep_assert_held(&device->fs_info->chunk_mutex); + + if (*len == 0) + return false; + + end = *start + *len - 1; + + /* + * Loop until we either see a large enough hole or check every pending + * extent overlapping the candidate hole. + * At every hole that we observe, record it if it is the new max. + * At the end of the iteration, set the output variables to the max hole. + */ + while (true) { + if (btrfs_first_pending_extent(device, *start, *len, &pending_start, &pending_end)) { + /* + * Case 1: the pending extent overlaps the start of + * candidate hole. That means the true hole is after the + * pending extent, but we need to find the next pending + * extent to properly size the hole. In the next loop, + * we will reduce to case 2 or 3. + * e.g., + * + * |----pending A----| real hole |----pending B----| + * | candidate hole | + * *start end + */ + if (pending_start <= *start) { + *start = pending_end + 1; + goto next; + } + /* + * Case 2: The pending extent starts after *start (and overlaps + * [*start, end), so the first hole just goes up to the start + * of the pending extent. + * e.g., + * + * | real hole |----pending A----| + * | candidate hole | + * *start end + */ + *len = pending_start - *start; + if (*len > max_hole_len) { + max_hole_start = *start; + max_hole_len = *len; + } + if (*len >= min_hole_size) + break; + /* + * If the hole wasn't big enough, then we advance past + * the pending extent and keep looking. + */ + *start = pending_end + 1; + goto next; + } else { + /* + * Case 3: There is no pending extent overlapping the + * range [*start, *start + *len - 1], so the only remaining + * hole is the remaining range. 
+ * e.g., + * + * | candidate hole | + * | real hole | + * *start end + */ + + if (*len > max_hole_len) { + max_hole_start = *start; + max_hole_len = *len; + } + break; + } +next: + if (*start > end) + break; + *len = end - *start + 1; + } + if (max_hole_len) { + *start = max_hole_start; + *len = max_hole_len; + } else { + *start = end + 1; + *len = 0; + } + return max_hole_len >= min_hole_size; +} + static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { @@ -1593,59 +1725,57 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, } /* - * Check if specified hole is suitable for allocation. + * Validate and adjust a hole for chunk allocation + * + * @device: the device containing the candidate hole + * @hole_start: input/output pointer for the hole start position + * @hole_size: input/output pointer for the hole size + * @num_bytes: minimum allocation size required + * + * Check if the specified hole is suitable for allocation and adjust it if + * necessary. The hole may be modified to skip over pending chunk allocations + * and to satisfy stricter zoned requirements on zoned filesystems. * - * @device: the device which we have the hole - * @hole_start: starting position of the hole - * @hole_size: the size of the hole - * @num_bytes: the size of the free space that we need + * For regular (non-zoned) allocation, if the hole after adjustment is smaller + * than @num_bytes, the search continues past additional pending extents until + * either a sufficiently large hole is found or no more pending extents exist. * - * This function may modify @hole_start and @hole_size to reflect the suitable - * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + * Return: true if a suitable hole was found and false otherwise. + * If the return value is true, then *hole_start and *hole_size are set to + * represent the hole we found. 
+ * If the return value is false, then *hole_start is set to the largest + * hole we found and *hole_size is set to its length. + * If there are no holes at all, then *hole_start is set to the end of the range + * and *hole_size is set to 0. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { - bool changed = false; - u64 hole_end = *hole_start + *hole_size; + bool found = false; + const u64 hole_end = *hole_start + *hole_size - 1; - for (;;) { - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + ASSERT(*hole_size > 0); - switch (device->fs_devices->chunk_alloc_policy) { - default: - btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); - fallthrough; - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ - break; - case BTRFS_CHUNK_ALLOC_ZONED: - if (dev_extent_hole_check_zoned(device, hole_start, - hole_size, num_bytes)) { - changed = true; - /* - * The changed hole can contain pending extent. - * Loop again to check that. 
- */ - continue; - } - break; - } +again: + *hole_size = hole_end - *hole_start + 1; + found = btrfs_find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes); + if (!found) + return found; + ASSERT(*hole_size >= num_bytes); + switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; + case BTRFS_CHUNK_ALLOC_REGULAR: + return found; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes)) + goto again; break; } - return changed; + return found; } /* @@ -1704,7 +1834,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, ret = -ENOMEM; goto out; } -again: + if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; @@ -1791,11 +1921,7 @@ next: */ if (search_end > search_start) { hole_size = search_end - search_start; - if (dev_extent_hole_check(device, &search_start, &hole_size, - num_bytes)) { - btrfs_release_path(path); - goto again; - } + dev_extent_hole_check(device, &search_start, &hole_size, num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -2316,9 +2442,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } - ret = btrfs_commit_transaction(trans); - - return ret; + return btrfs_commit_transaction(trans); error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -2923,8 +3047,7 @@ error: return ret; } -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device) +int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; BTRFS_PATH_AUTO_FREE(path); @@ -3222,25 +3345,12 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, return btrfs_free_chunk(trans, chunk_offset); } -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +int 
btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_chunk_map *map; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 dev_extent_len = 0; int i, ret = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(map)) { - /* - * This is a logic error, but we don't want to just rely on the - * user having built with ASSERT enabled, so if ASSERT doesn't - * do anything we still error out. - */ - DEBUG_WARN("errr %ld reading chunk map at offset %llu", - PTR_ERR(map), chunk_offset); - return PTR_ERR(map); - } /* * First delete the device extent items from the devices btree. @@ -3261,7 +3371,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (device->bytes_used > 0) { @@ -3281,6 +3391,26 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } mutex_unlock(&fs_devices->device_list_mutex); + return 0; +} + +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_chunk_map *map; + int ret; + + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { + DEBUG_WARN("errr %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); + return PTR_ERR(map); + } + + ret = btrfs_remove_dev_extents(trans, map); + if (ret) + goto out; + /* * We acquire fs_info->chunk_mutex for 2 reasons: * @@ -3376,11 +3506,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); + /* On error, btrfs_remove_block_group() aborts the transaction. 
*/ ret = btrfs_remove_block_group(trans, map); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(ret)) + ASSERT(BTRFS_FS_ERROR(fs_info) != 0); out: if (trans->removing_chunk) { @@ -3392,15 +3521,50 @@ out: return ret; } -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, - bool verbose) +static int btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *bg) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; - struct btrfs_block_group *block_group; u64 length; int ret; + btrfs_discard_cancel_work(&fs_info->discard_ctl, bg); + length = bg->length; + btrfs_put_block_group(bg); + + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, bg->start, length, NULL, true); + if (ret) + btrfs_info(fs_info, "failed to reset zone %llu after relocation", + bg->start); + } + + trans = btrfs_start_trans_remove_block_group(root->fs_info, bg->start); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, NULL); + return ret; + } + + /* Step two, delete the device extents and the chunk tree entries. 
*/ + ret = btrfs_remove_chunk(trans, bg->start); + btrfs_end_transaction(trans); + + return ret; +} + +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool verbose) +{ + struct btrfs_block_group *block_group; + int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "relocate: not supported on extent tree v2 yet"); @@ -3438,38 +3602,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, block_group = btrfs_lookup_block_group(fs_info, chunk_offset); if (!block_group) return -ENOENT; - btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - length = block_group->length; - btrfs_put_block_group(block_group); - /* - * On a zoned file system, discard the whole block group, this will - * trigger a REQ_OP_ZONE_RESET operation on the device zone. If - * resetting the zone fails, don't treat it as a fatal problem from the - * filesystem's point of view. - */ - if (btrfs_is_zoned(fs_info)) { - ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); - if (ret) - btrfs_info(fs_info, - "failed to reset zone %llu after relocation", - chunk_offset); - } - - trans = btrfs_start_trans_remove_block_group(root->fs_info, - chunk_offset); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_handle_fs_error(root->fs_info, ret, NULL); - return ret; + if (should_relocate_using_remap_tree(block_group)) { + /* If we're relocating using the remap tree we're now done. 
*/ + btrfs_put_block_group(block_group); + ret = 0; + } else { + ret = btrfs_relocate_chunk_finish(fs_info, block_group); } - /* - * step two, delete the device extents and the - * chunk tree entries - */ - ret = btrfs_remove_chunk(trans, chunk_offset); - btrfs_end_transaction(trans); return ret; } @@ -3646,7 +3787,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; - int ret, err; + int ret; path = btrfs_alloc_path(); if (!path) @@ -3681,9 +3822,11 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_flags(leaf, item, bctl->flags); out: btrfs_free_path(path); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret == 0) + ret = btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + return ret; } @@ -3693,7 +3836,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_key key; - int ret, err; + int ret; path = btrfs_alloc_path(); if (!path) @@ -3720,9 +3863,11 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret == 0) + ret = btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + return ret; } @@ -3966,6 +4111,12 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); + /* Treat METADATA_REMAP chunks as METADATA. 
*/ + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { + chunk_type &= ~BTRFS_BLOCK_GROUP_METADATA_REMAP; + chunk_type |= BTRFS_BLOCK_GROUP_METADATA; + } + /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { @@ -4047,6 +4198,107 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk return true; } +struct remap_chunk_info { + struct list_head list; + u64 offset; + struct btrfs_block_group *bg; + bool made_ro; +}; + +static int cow_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key = { 0 }; + int ret; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret < 0) + return ret; + + while (true) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + ret = 0; + break; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + btrfs_release_path(path); + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret < 0) + break; + } + + return ret; +} + +static int balance_remap_chunks(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct list_head *chunks) +{ + struct remap_chunk_info *rci, *tmp; + struct btrfs_trans_handle *trans; + int ret; + + list_for_each_entry_safe(rci, tmp, chunks, list) { + rci->bg = btrfs_lookup_block_group(fs_info, rci->offset); + if (!rci->bg) { + list_del(&rci->list); + kfree(rci); + continue; + } + + ret = btrfs_inc_block_group_ro(rci->bg, false); + if (ret) + goto end; + + rci->made_ro = true; + } + + if (list_empty(chunks)) + return 0; + + trans = btrfs_start_transaction(fs_info->remap_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto end; + } + + mutex_lock(&fs_info->remap_mutex); + ret = cow_remap_tree(trans, path); + mutex_unlock(&fs_info->remap_mutex); + + btrfs_release_path(path); + 
btrfs_commit_transaction(trans); + +end: + while (!list_empty(chunks)) { + bool is_unused; + + rci = list_first_entry(chunks, struct remap_chunk_info, list); + + spin_lock(&rci->bg->lock); + is_unused = !btrfs_is_block_group_used(rci->bg); + spin_unlock(&rci->bg->lock); + + if (is_unused) + btrfs_mark_bg_unused(rci->bg); + + if (rci->made_ro) + btrfs_dec_block_group_ro(rci->bg); + + btrfs_put_block_group(rci->bg); + + list_del(&rci->list); + kfree(rci); + } + + return ret; +} + static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4069,6 +4321,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + struct remap_chunk_info *rci; + unsigned int num_remap_chunks = 0; + LIST_HEAD(remap_chunks); path = btrfs_alloc_path(); if (!path) { @@ -4135,6 +4390,14 @@ again: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); + /* Check if chunk has already been fully relocated. */ + if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED && + btrfs_chunk_num_stripes(leaf, chunk) == 0) { + btrfs_release_path(path); + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto loop; + } + if (!counting) { spin_lock(&fs_info->balance_lock); bctl->stat.considered++; @@ -4159,7 +4422,8 @@ again: count_data++; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) count_sys++; - else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_METADATA_REMAP)) count_meta++; goto loop; @@ -4179,6 +4443,29 @@ again: goto loop; } + /* + * Balancing METADATA_REMAP chunks takes place separately - add + * the details to a list so it can be processed later. 
+ */ + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + + rci = kmalloc(sizeof(struct remap_chunk_info), GFP_NOFS); + if (!rci) { + ret = -ENOMEM; + goto error; + } + + rci->offset = found_key.offset; + rci->bg = NULL; + rci->made_ro = false; + list_add_tail(&rci->list, &remap_chunks); + + num_remap_chunks++; + + goto loop; + } + if (!chunk_reserved) { /* * We may be relocating the only data chunk we have, @@ -4218,11 +4505,24 @@ loop: key.offset = found_key.offset - 1; } + btrfs_release_path(path); + if (counting) { - btrfs_release_path(path); counting = false; goto again; } + + if (!list_empty(&remap_chunks)) { + ret = balance_remap_chunks(fs_info, path, &remap_chunks); + if (ret == -ENOSPC) + enospc_errors++; + + if (!ret) { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed += num_remap_chunks; + spin_unlock(&fs_info->balance_lock); + } + } error: if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", @@ -4844,6 +5144,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 diff; u64 start; u64 free_diff = 0; + u64 pending_start, pending_end; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -4889,7 +5190,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) * in-memory chunks are synced to disk so that the loop below sees them * and relocates them accordingly. 
*/ - if (contains_pending_extent(device, &start, diff)) { + if (btrfs_first_pending_extent(device, start, diff, &pending_start, &pending_end)) { mutex_unlock(&fs_info->chunk_mutex); ret = btrfs_commit_transaction(trans); if (ret) @@ -5410,7 +5711,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int } } -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; @@ -5427,7 +5728,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma write_lock(&fs_info->mapping_tree_lock); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); write_unlock(&fs_info->mapping_tree_lock); /* Once for the tree reference. */ @@ -5463,7 +5764,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m return -EEXIST; } chunk_map_device_set_bits(map, CHUNK_ALLOCATED); - chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); return 0; @@ -5819,7 +6120,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) map = rb_entry(node, struct btrfs_chunk_map, rb_node); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); /* Once for the tree ref. 
*/ btrfs_free_chunk_map(map); cond_resched_rwlock_write(&fs_info->mapping_tree_lock); @@ -6066,7 +6367,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) */ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - u32 *num_stripes) + u32 *num_stripes, bool do_remap) { struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; @@ -6090,6 +6391,24 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, if (IS_ERR(map)) return ERR_CAST(map); + if (do_remap && (map->type & BTRFS_BLOCK_GROUP_REMAPPED)) { + u64 new_logical = logical; + + ret = btrfs_translate_remap(fs_info, &new_logical, &length); + if (ret) + goto out_free_map; + + if (new_logical != logical) { + btrfs_free_chunk_map(map); + + map = btrfs_get_chunk_map(fs_info, new_logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); + + logical = new_logical; + } + } + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -6577,6 +6896,24 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (IS_ERR(map)) return PTR_ERR(map); + if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) { + u64 new_logical = logical; + + ret = btrfs_translate_remap(fs_info, &new_logical, length); + if (ret) + return ret; + + if (new_logical != logical) { + btrfs_free_chunk_map(map); + + map = btrfs_get_chunk_map(fs_info, new_logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); + + logical = new_logical; + } + } + num_copies = btrfs_chunk_map_num_copies(map); if (io_geom.mirror_num > num_copies) return -EINVAL; @@ -7041,7 +7378,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - map->stripe_size = btrfs_calc_stripe_length(map); + + if (num_stripes > 0) + map->stripe_size = btrfs_calc_stripe_length(map); + else + map->stripe_size = 0; + for (i = 0; i < 
num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7167,7 +7509,6 @@ static int read_one_dev(struct extent_buffer *leaf, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; - int ret; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; @@ -7267,8 +7608,8 @@ static int read_one_dev(struct extent_buffer *leaf, atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); } - ret = 0; - return ret; + + return 0; } int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) @@ -7357,10 +7698,9 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? Return false anyway */ - if (!map) { - ret = false; - goto out; - } + if (!map) + return false; + while (map) { int missing = 0; int max_tolerated; @@ -7374,7 +7714,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - dev->last_flush_error) + test_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &dev->dev_state)) missing++; else if (failing_dev && failing_dev == dev) missing++; @@ -7385,15 +7725,14 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", map->start, missing, max_tolerated); btrfs_free_chunk_map(map); - ret = false; - goto out; + return false; } next_start = map->start + map->chunk_len; btrfs_free_chunk_map(map); map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } -out: + return ret; } @@ -8025,7 +8364,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - path->reada = READA_FORWARD; + path->reada = READA_FORWARD_ALWAYS; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f20abeb16bce..8288d79372a5 100644 --- 
a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -30,6 +30,7 @@ struct btrfs_block_group; struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_zoned_device_info; +struct btrfs_space_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -58,7 +59,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ @@ -80,6 +80,15 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; +static_assert(BTRFS_RAID_RAID0 == 1); +static_assert(BTRFS_RAID_RAID1 == 2); +static_assert(BTRFS_RAID_DUP == 3); +static_assert(BTRFS_RAID_RAID10 == 4); +static_assert(BTRFS_RAID_RAID5 == 5); +static_assert(BTRFS_RAID_RAID6 == 6); +static_assert(BTRFS_RAID_RAID1C3 == 7); +static_assert(BTRFS_RAID_RAID1C4 == 8); + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. @@ -99,6 +108,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_REPLACE_TGT (3) #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +#define BTRFS_DEV_STATE_FLUSH_FAILED (6) /* Set when the device item is found in chunk tree, used to catch unexpected registered device. */ #define BTRFS_DEV_STATE_ITEM_FOUND (7) @@ -125,13 +135,7 @@ struct btrfs_device { struct btrfs_zoned_device_info *zone_info; - /* - * Device's major-minor number. Must be set even if the device is not - * opened (bdev == NULL), unless the device is missing. - */ - dev_t devt; unsigned long dev_state; - blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -195,6 +199,12 @@ struct btrfs_device { atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + /* + * Device's major-minor number. 
Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. + */ + dev_t devt; + struct extent_io_tree alloc_state; struct completion kobj_unregister; @@ -321,25 +331,6 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_EXPERIMENTAL -/* - * Checksum mode - offload it to workqueues or do it synchronously in - * btrfs_submit_chunk(). - */ -enum btrfs_offload_csum_mode { - /* - * Choose offloading checksum or do it synchronously automatically. - * Do it synchronously if the checksum is fast, or offload to workqueues - * otherwise. - */ - BTRFS_OFFLOAD_CSUM_AUTO, - /* Always offload checksum to workqueues. */ - BTRFS_OFFLOAD_CSUM_FORCE_ON, - /* Never offload checksum to workqueues. */ - BTRFS_OFFLOAD_CSUM_FORCE_OFF, -}; -#endif - struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -466,9 +457,6 @@ struct btrfs_fs_devices { /* Device to be used for reading in case of RAID1. */ u64 read_devid; - - /* Checksum mode - offload it or do it synchronously. 
*/ - enum btrfs_offload_csum_mode offload_csum_mode; #endif }; @@ -646,6 +634,7 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) kfree(map); } } +DEFINE_FREE(btrfs_free_chunk_map, struct btrfs_chunk_map *, btrfs_free_chunk_map(_T)) struct btrfs_balance_control { struct btrfs_balance_args data; @@ -727,7 +716,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, u32 length, int mirror_num); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - u32 *num_stripes); + u32 *num_stripes, bool do_remap); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, @@ -789,6 +778,7 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -901,6 +891,13 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits); + +bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, + u64 *pending_start, u64 *pending_end); +bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, + u64 *start, u64 *len, u64 min_hole_size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, diff --git 
a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 10ed48d4a846..0a8fcee16428 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -145,31 +145,24 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) +int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_inode *inode = cb->bbio.inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + struct bio *bio = &cb->bbio.bio; + u64 start = cb->start; + u32 len = cb->len; const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; char *cfolio_out; - int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long len = *total_out; - unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios << min_folio_shift; const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; - *out_folios = 0; - *total_out = 0; - *total_in = 0; - ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { btrfs_err(fs_info, @@ -188,8 +181,6 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } cfolio_out = folio_address(out_folio); - folios[0] = out_folio; - nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; @@ -198,8 +189,8 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, while (workspace->strm.total_in < len) { /* - * Get next input pages and copy the contents to - * the workspace buffer if required. 
+ * Get next input pages and copy the contents to the workspace + * buffer if required. */ if (workspace->strm.avail_in == 0) { unsigned long bytes_left = len - workspace->strm.total_in; @@ -250,40 +241,39 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - /* we're making it bigger, give up */ + /* We're making it bigger, give up. */ if (workspace->strm.total_in > blocksize * 2 && - workspace->strm.total_in < - workspace->strm.total_out) { + workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; goto out; } - /* we need another page for writing out. Test this - * before the total_in so we will pull in a new page for - * the stream end if required - */ + if (workspace->strm.total_out >= len) { + ret = -E2BIG; + goto out; + } + /* Queue the full folio and allocate a new one. */ if (workspace->strm.avail_out == 0) { - if (nr_folios == nr_dest_folios) { + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { ret = -E2BIG; goto out; } + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } cfolio_out = folio_address(out_folio); - folios[nr_folios] = out_folio; - nr_folios++; workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } - /* we're all done */ + /* We're all done. */ if (workspace->strm.total_in >= len) break; - if (workspace->strm.total_out > max_out) - break; } + workspace->strm.avail_in = 0; + /* * Call deflate with Z_FINISH flush parameter providing more output * space but no more input data, until it returns with Z_STREAM_END. @@ -297,23 +287,39 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* Get another folio for the stream end. 
*/ - if (nr_folios == nr_dest_folios) { + if (workspace->strm.total_out >= len) { + ret = -E2BIG; + goto out; + } + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { ret = -E2BIG; goto out; } + /* Get another folio for the stream end. */ out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } cfolio_out = folio_address(out_folio); - folios[nr_folios] = out_folio; - nr_folios++; workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } } + /* Queue the remaining part of the folio. */ + if (workspace->strm.total_out > bio->bi_iter.bi_size) { + u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); + + if (!bio_add_folio(bio, out_folio, cur_len, 0)) { + ret = -E2BIG; + goto out; + } + } else { + /* The last folio hasn't' been utilized. */ + btrfs_free_compr_folio(out_folio); + } + out_folio = NULL; + ASSERT(bio->bi_iter.bi_size == workspace->strm.total_out); zlib_deflateEnd(&workspace->strm); if (workspace->strm.total_out >= workspace->strm.total_in) { @@ -322,10 +328,9 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, } ret = 0; - *total_out = workspace->strm.total_out; - *total_in = workspace->strm.total_in; out: - *out_folios = nr_folios; + if (out_folio) + btrfs_free_compr_folio(out_folio); if (data_in) { kunmap_local(data_in); folio_put(in_folio); @@ -338,18 +343,23 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + struct folio_iter fi; const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; - struct folio **folios_in = cb->compressed_folios; - 
data_in = kmap_local_folio(folios_in[folio_in_index], 0); + bio_first_folio(&fi, &cb->bbio.bio, 0); + + /* We must have at least one folio here, that has the correct size. */ + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == min_folio_size); + + data_in = kmap_local_folio(fi.folio, 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; @@ -404,12 +414,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - folio_in_index++; - if (folio_in_index >= total_folios_in) { + bio_next_folio(&fi, &cb->bbio.bio); + if (!fi.folio) { data_in = NULL; break; } - data_in = kmap_local_folio(folios_in[folio_in_index], 0); + ASSERT(folio_size(fi.folio) == min_folio_size); + data_in = kmap_local_folio(fi.folio, 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, min_folio_size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 359a98e6de85..ad8621587fd2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1231,6 +1231,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; + const u64 bg_end = btrfs_block_group_end(cache); int ret; u64 length; @@ -1253,7 +1254,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!path) return -ENOMEM; - key.objectid = cache->start + cache->length; + key.objectid = bg_end; key.type = 0; key.offset = 0; @@ -1282,7 +1283,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, length = fs_info->nodesize; if (unlikely(!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + cache->length))) { + found_key.objectid + length <= bg_end))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - 
cache->start; @@ -1437,18 +1438,32 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); return -EIO; } + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. + */ + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + if (last_alloc <= zone_info[i].alloc_offset) { + last_alloc = zone_info[i].alloc_offset; + break; + } + } + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) zone_info[0].alloc_offset = last_alloc; @@ -1456,7 +1471,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].alloc_offset = last_alloc; if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } @@ -1490,6 +1505,21 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, /* In case a device is missing we have a cap of 0, so don't use it. */ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. 
+ */ + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + if (last_alloc <= zone_info[i].alloc_offset) { + last_alloc = zone_info[i].alloc_offset; + break; + } + } + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; @@ -1531,7 +1561,9 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, { struct btrfs_fs_info *fs_info = bg->fs_info; u64 stripe_nr = 0, stripe_offset = 0; + u64 prev_offset = 0; u32 stripe_index = 0; + bool has_partial = false, has_conventional = false; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1539,6 +1571,35 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. + */ + for (int i = 0; i < map->num_stripes; i++) { + u64 alloc; + + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + stripe_nr = zone_info[i].alloc_offset >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK; + if (stripe_offset == 0 && stripe_nr > 0) { + stripe_nr--; + stripe_offset = BTRFS_STRIPE_LEN; + } + alloc = ((stripe_nr * map->num_stripes + i) << BTRFS_STRIPE_LEN_SHIFT) + + stripe_offset; + last_alloc = max(last_alloc, alloc); + + /* Partially written stripe found. It should be last. 
*/ + if (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) + break; + } + stripe_nr = 0; + stripe_offset = 0; + if (last_alloc) { u32 factor = map->num_stripes; @@ -1552,7 +1613,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - + has_conventional = true; zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) @@ -1561,6 +1622,28 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, zone_info[i].alloc_offset += stripe_offset; } + /* Verification */ + if (i != 0) { + if (unlikely(prev_offset < zone_info[i].alloc_offset)) { + btrfs_err(fs_info, + "zoned: stripe position disorder found in block group %llu", + bg->start); + return -EIO; + } + + if (unlikely(has_partial && + (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK))) { + btrfs_err(fs_info, + "zoned: multiple partial written stripe found in block group %llu", + bg->start); + return -EIO; + } + } + prev_offset = zone_info[i].alloc_offset; + + if ((zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) != 0) + has_partial = true; + if (test_bit(0, active) != test_bit(i, active)) { if (unlikely(!btrfs_zone_activate(bg))) return -EIO; @@ -1572,6 +1655,19 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, bg->alloc_offset += zone_info[i].alloc_offset; } + /* Check if all devices stay in the same stripe row. 
*/ + if (unlikely(zone_info[0].alloc_offset - + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", bg->start); + return -EIO; + } + + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", + bg->alloc_offset, last_alloc); + return -EIO; + } + return 0; } @@ -1582,8 +1678,11 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 AUTO_KFREE(raid0_allocs); u64 stripe_nr = 0, stripe_offset = 0; u32 stripe_index = 0; + bool has_partial = false, has_conventional = false; + u64 prev_offset = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1591,6 +1690,60 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs), + GFP_NOFS); + if (!raid0_allocs) + return -ENOMEM; + + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. 
+ */ + for (int i = 0; i < map->num_stripes; i += map->sub_stripes) { + u64 alloc = zone_info[i].alloc_offset; + + for (int j = 1; j < map->sub_stripes; j++) { + int idx = i + j; + + if (zone_info[idx].alloc_offset == WP_MISSING_DEV || + zone_info[idx].alloc_offset == WP_CONVENTIONAL) + continue; + if (alloc == WP_MISSING_DEV || alloc == WP_CONVENTIONAL) { + alloc = zone_info[idx].alloc_offset; + } else if (unlikely(zone_info[idx].alloc_offset != alloc)) { + btrfs_err(fs_info, + "zoned: write pointer mismatch found in block group %llu", + bg->start); + return -EIO; + } + } + + raid0_allocs[i / map->sub_stripes] = alloc; + if (alloc == WP_CONVENTIONAL) + continue; + if (unlikely(alloc == WP_MISSING_DEV)) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer of block group %llu due to missing device", + bg->start); + return -EIO; + } + + stripe_nr = alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = alloc & BTRFS_STRIPE_LEN_MASK; + if (stripe_offset == 0 && stripe_nr > 0) { + stripe_nr--; + stripe_offset = BTRFS_STRIPE_LEN; + } + + alloc = ((stripe_nr * (map->num_stripes / map->sub_stripes) + + (i / map->sub_stripes)) << + BTRFS_STRIPE_LEN_SHIFT) + stripe_offset; + last_alloc = max(last_alloc, alloc); + } + stripe_nr = 0; + stripe_offset = 0; + if (last_alloc) { u32 factor = map->num_stripes / map->sub_stripes; @@ -1600,24 +1753,51 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV) - continue; + int idx = i / map->sub_stripes; - if (test_bit(0, active) != test_bit(i, active)) { - if (unlikely(!btrfs_zone_activate(bg))) - return -EIO; - } else { - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + if (raid0_allocs[idx] == WP_CONVENTIONAL) { + has_conventional = true; + raid0_allocs[idx] = btrfs_stripe_nr_to_offset(stripe_nr); + + if (stripe_index > idx) + raid0_allocs[idx] += BTRFS_STRIPE_LEN; + else 
if (stripe_index == idx) + raid0_allocs[idx] += stripe_offset; } - if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); + if ((i % map->sub_stripes) == 0) { + /* Verification */ + if (i != 0) { + if (unlikely(prev_offset < raid0_allocs[idx])) { + btrfs_err(fs_info, + "zoned: stripe position disorder found in block group %llu", + bg->start); + return -EIO; + } - if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; - else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += stripe_offset; + if (unlikely(has_partial && + (raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK))) { + btrfs_err(fs_info, + "zoned: multiple partial written stripe found in block group %llu", + bg->start); + return -EIO; + } + } + prev_offset = raid0_allocs[idx]; + + if ((raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK) != 0) + has_partial = true; + } + + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = raid0_allocs[idx]; + + if (test_bit(0, active) != test_bit(i, active)) { + if (unlikely(!btrfs_zone_activate(bg))) + return -EIO; + } else if (test_bit(0, active)) { + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } if ((i % map->sub_stripes) == 0) { @@ -1626,9 +1806,79 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } } + /* Check if all devices stay in the same stripe row. 
*/ + if (unlikely(zone_info[0].alloc_offset - + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", + bg->start); + return -EIO; + } + + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", + bg->alloc_offset, last_alloc); + return -EIO; + } + return 0; } +EXPORT_FOR_TESTS +int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active, u64 last_alloc) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + u64 profile; + int ret; + + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { + case 0: /* single */ + ret = btrfs_load_block_group_single(bg, &zone_info[0], active); + break; + case BTRFS_BLOCK_GROUP_DUP: + ret = btrfs_load_block_group_dup(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. Relocating this block group will fix the + * mismatch. 
+ * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. + */ + bg->alloc_offset = bg->zone_capacity; + } + + return ret; +} + int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -1641,7 +1891,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; - u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1701,53 +1950,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; - switch (profile) { - case 0: /* single */ - ret = btrfs_load_block_group_single(cache, &zone_info[0], active); - break; - case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active, - last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID1: - case BTRFS_BLOCK_GROUP_RAID1C3: - case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID5: - case BTRFS_BLOCK_GROUP_RAID6: - default: - btrfs_err(fs_info, "zoned: profile %s not yet supported", - btrfs_bg_type_to_raid_name(map->type)); - ret = -EINVAL; - goto out; - } - - if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && - profile != BTRFS_BLOCK_GROUP_RAID10) { - /* - * Detected broken write pointer. Make this block group - * unallocatable by setting the allocation pointer at the end of - * allocatable region. 
Relocating this block group will fix the - * mismatch. - * - * Currently, we cannot handle RAID0 or RAID10 case like this - * because we don't have a proper zone_capacity value. But, - * reading from this block group won't work anyway by a missing - * stripe. - */ - cache->alloc_offset = cache->zone_capacity; - } + ret = btrfs_load_block_group_by_raid_type(cache, map, zone_info, active, last_alloc); out: /* Reject non SINGLE data profiles without RST */ @@ -2028,7 +2231,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group) { if (block_group->start > eb->start || - block_group->start + block_group->length <= eb->start) { + btrfs_block_group_end(block_group) <= eb->start) { btrfs_put_block_group(block_group); block_group = NULL; ctx->zoned_bg = NULL; @@ -2248,7 +2451,7 @@ out_unlock: static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - const u64 end = block_group->start + block_group->length; + const u64 end = btrfs_block_group_end(block_group); struct extent_buffer *eb; unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); @@ -2984,3 +3187,58 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num return 0; } + +void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) +{ + struct btrfs_block_group *bg; + u64 data_reloc_bg; + u64 treelog_bg; + + seq_puts(seq, "\n zoned statistics:\n"); + + spin_lock(&fs_info->zone_active_bgs_lock); + seq_printf(seq, "\tactive block-groups: %zu\n", + list_count_nodes(&fs_info->zone_active_bgs)); + spin_unlock(&fs_info->zone_active_bgs_lock); + + spin_lock(&fs_info->unused_bgs_lock); + seq_printf(seq, "\t reclaimable: %zu\n", + list_count_nodes(&fs_info->reclaim_bgs)); + seq_printf(seq, "\t unused: %zu\n", list_count_nodes(&fs_info->unused_bgs)); + spin_unlock(&fs_info->unused_bgs_lock); + + seq_printf(seq,"\t need reclaim: %s\n", + 
str_true_false(btrfs_zoned_should_reclaim(fs_info))); + + data_reloc_bg = data_race(fs_info->data_reloc_bg); + if (data_reloc_bg) + seq_printf(seq, "\tdata relocation block-group: %llu\n", + data_reloc_bg); + treelog_bg = data_race(fs_info->treelog_bg); + if (treelog_bg) + seq_printf(seq, "\ttree-log block-group: %llu\n", treelog_bg); + + spin_lock(&fs_info->zone_active_bgs_lock); + seq_puts(seq, "\tactive zones:\n"); + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { + u64 start; + u64 alloc_offset; + u64 used; + u64 reserved; + u64 zone_unusable; + const char *typestr = btrfs_space_info_type_str(bg->space_info); + + spin_lock(&bg->lock); + start = bg->start; + alloc_offset = bg->alloc_offset; + used = bg->used; + reserved = bg->reserved; + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); + + seq_printf(seq, + "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu (%s)\n", + start, alloc_offset, used, reserved, zone_unusable, typestr); + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5cefdeb08b7b..8e21a836f858 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,6 +10,7 @@ #include <linux/errno.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/seq_file.h> #include "messages.h" #include "volumes.h" #include "disk-io.h" @@ -96,6 +97,17 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); +void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct zone_info; + +int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long 
*active, u64 last_alloc); +#endif + #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -275,6 +287,11 @@ static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space return 0; } +static inline int btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index c9cddcfa337b..32fd7f5454d3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -396,36 +396,31 @@ fail: return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) +int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_inode *inode = cb->bbio.inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; + struct bio *bio = &cb->bbio.bio; zstd_cstream *stream; int ret = 0; - int nr_folios = 0; - struct folio *in_folio = NULL; /* The current folio to read. */ - struct folio *out_folio = NULL; /* The current folio to write to. */ + /* The current folio to read. */ + struct folio *in_folio = NULL; + /* The current folio to write to. 
*/ + struct folio *out_folio = NULL; unsigned long tot_in = 0; unsigned long tot_out = 0; - unsigned long len = *total_out; - const unsigned long nr_dest_folios = *out_folios; - const u64 orig_end = start + len; + const u64 start = cb->start; + const u32 len = cb->len; + const u64 end = start + len; const u32 blocksize = fs_info->sectorsize; const u32 min_folio_size = btrfs_min_folio_size(fs_info); - unsigned long max_out = nr_dest_folios * min_folio_size; - unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_folios = 0; - *total_out = 0; - *total_in = 0; - /* Initialize the stream */ - stream = zstd_init_cstream(&workspace->params, len, workspace->mem, - workspace->size); + /* Initialize the stream. */ + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", @@ -435,99 +430,95 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, goto out; } - /* map in the first page of input data */ + /* Map in the first page of input data. */ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; - workspace->in_buf.size = cur_len; + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); - /* Allocate and map in the output buffer */ + /* Allocate and map in the output buffer. 
*/ out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } - folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); + workspace->out_buf.size = min_folio_size; while (1) { size_t ret2; - ret2 = zstd_compress_stream(stream, &workspace->out_buf, - &workspace->in_buf); + ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), - start); + start + tot_in); ret = -EIO; goto out; } - /* Check to see if we are making it bigger */ + /* Check to see if we are making it bigger. */ if (tot_in + workspace->in_buf.pos > blocksize * 2 && - tot_in + workspace->in_buf.pos < - tot_out + workspace->out_buf.pos) { + tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } - /* We've reached the end of our output range */ - if (workspace->out_buf.pos >= max_out) { - tot_out += workspace->out_buf.pos; - ret = -E2BIG; - goto out; - } - - /* Check if we need more output space */ - if (workspace->out_buf.pos == workspace->out_buf.size) { + /* Check if we need more output space. */ + if (workspace->out_buf.pos >= workspace->out_buf.size) { tot_out += min_folio_size; - max_out -= min_folio_size; - if (nr_folios == nr_dest_folios) { + if (tot_out >= len) { ret = -E2BIG; goto out; } + /* Queue the current foliot into the bio. 
*/ + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { + ret = -E2BIG; + goto out; + } + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } - folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); + workspace->out_buf.size = min_folio_size; } - /* We've reached the end of the input */ - if (workspace->in_buf.pos >= len) { + /* We've reached the end of the input. */ + if (tot_in + workspace->in_buf.pos >= len) { tot_in += workspace->in_buf.pos; break; } - /* Check if we need more input */ - if (workspace->in_buf.pos == workspace->in_buf.size) { + /* Check if we need more input. */ + if (workspace->in_buf.pos >= workspace->in_buf.size) { + u64 cur; + tot_in += workspace->in_buf.size; + cur = start + tot_in; + kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); - start += cur_len; - len -= cur_len; - ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &in_folio); if (ret < 0) goto out; - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); workspace->in_buf.src = kmap_local_folio(in_folio, - offset_in_folio(in_folio, start)); + offset_in_folio(in_folio, cur)); workspace->in_buf.pos = 0; - workspace->in_buf.size = cur_len; + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, cur); } } + while (1) { size_t ret2; @@ -537,23 +528,30 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), - start); + start + tot_in); ret = -EIO; goto out; } + /* Queue the remaining part of the output folio into bio. 
*/ if (ret2 == 0) { tot_out += workspace->out_buf.pos; + if (tot_out >= len) { + ret = -E2BIG; + goto out; + } + if (!bio_add_folio(bio, out_folio, workspace->out_buf.pos, 0)) { + ret = -E2BIG; + goto out; + } + out_folio = NULL; break; } - if (workspace->out_buf.pos >= max_out) { - tot_out += workspace->out_buf.pos; + tot_out += min_folio_size; + if (tot_out >= len) { ret = -E2BIG; goto out; } - - tot_out += min_folio_size; - max_out -= min_folio_size; - if (nr_folios == nr_dest_folios) { + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { ret = -E2BIG; goto out; } @@ -562,10 +560,9 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, ret = -ENOMEM; goto out; } - folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); + workspace->out_buf.size = min_folio_size; } if (tot_out >= tot_in) { @@ -574,10 +571,10 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, } ret = 0; - *total_in = tot_in; - *total_out = tot_out; + ASSERT(tot_out == bio->bi_iter.bi_size); out: - *out_folios = nr_folios; + if (out_folio) + btrfs_free_compr_folio(out_folio); if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); folio_put(in_folio); @@ -589,7 +586,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); - struct folio **folios_in = cb->compressed_folios; + struct folio_iter fi; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; @@ -600,6 +597,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; unsigned long total_out = 0; + bio_first_folio(&fi, &cb->bbio.bio, 0); + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == blocksize); + stream = 
zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { @@ -612,7 +614,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); @@ -660,8 +662,9 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } srclen -= min_folio_size; - workspace->in_buf.src = - kmap_local_folio(folios_in[folio_in_index], 0); + bio_next_folio(&fi, &cb->bbio.bio); + ASSERT(fi.folio); + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e8fd92789423..9165154a274d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17) struct btrfs_ioctl_feature_flags { __u64 compat_flags; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d..f7843e6bb978 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -76,6 +76,9 @@ /* Tracks RAID stripes in block groups. */ #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL +/* Holds details of remapped addresses after relocation. 
*/ +#define BTRFS_REMAP_TREE_OBJECTID 13ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL @@ -282,6 +285,10 @@ #define BTRFS_RAID_STRIPE_KEY 230 +#define BTRFS_IDENTITY_REMAP_KEY 234 +#define BTRFS_REMAP_KEY 235 +#define BTRFS_REMAP_BACKREF_KEY 236 + /* * Records the overall state of the qgroups. * There's only one instance of this key present, @@ -714,9 +721,12 @@ struct btrfs_super_block { __u8 metadata_uuid[BTRFS_FSID_SIZE]; __u64 nr_global_roots; + __le64 remap_root; + __le64 remap_root_generation; + __u8 remap_root_level; /* Future expansion */ - __le64 reserved[27]; + __u8 reserved[199]; __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -1161,12 +1171,15 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) +#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) + BTRFS_BLOCK_GROUP_METADATA | \ + BTRFS_BLOCK_GROUP_METADATA_REMAP) #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ @@ -1219,6 +1232,14 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_block_group_item_v2 { + __le64 used; + __le64 chunk_objectid; + __le64 flags; + __le64 remap_bytes; + __le32 identity_remap_count; +} __attribute__ ((__packed__)); + struct btrfs_free_space_info { __le32 extent_count; __le32 flags; @@ -1323,4 +1344,13 @@ struct btrfs_verity_descriptor_item { __u8 encryption; } __attribute__ ((__packed__)); +/* + * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives + * the address that the 
start of the range will get remapped to. This + * structure is also shared by BTRFS_REMAP_BACKREF_KEY. + */ +struct btrfs_remap_item { + __le64 address; +} __attribute__ ((__packed__)); + #endif /* _BTRFS_CTREE_H_ */ |
