summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 15:45:21 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 15:45:21 -0800
commit8912c2fd5830e976c0deaeb0b2a458ce6b4718c7 (patch)
tree1e95a844937baf6bba645414e09a6826af5ca62d
parentb29a7a8eee6a1ca974aaf053c0ffed1173d279c2 (diff)
parent161ab30da6899f31f8128cec7c833e99fa4d06d2 (diff)
Merge tag 'for-6.20-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "User visible changes, feature updates: - when using block size > page size, enable direct IO - fallback to buffered IO if the data profile has duplication, workaround to avoid checksum mismatches on block group profiles with redundancy, real direct IO is possible on single or RAID0 - redo export of zoned statistics, moved from sysfs to /proc/pid/mountstats due to size limitations of the former Experimental features: - remove offload checksum tunable, intended to find best way to do it but since we've switched to offload to thread for everything we don't need it anymore - initial support for remap-tree feature, a translation layer of logical block addresses that allow changes without moving/rewriting blocks to do eg. relocation, or other changes that require COW Notable fixes: - automatic removal of accidentally leftover chunks when free-space-tree is enabled since mkfs.btrfs v6.16.1 - zoned mode: - do not try to append to conventional zones when RAID is mixing zoned and conventional drives - fixup write pointers when mixing zoned and conventional on DUP/RAID* profiles - when using squota, relax deletion rules for qgroups with 0 members to allow easier recovery from accounting bugs, also add more checks to detect bad accounting - fix periodic reclaim scanning, properly check boundary conditions not to trigger it unexpectedly or miss the time to run it - trim: - continue after first error - change reporting to the first detected error - add more cancellation points - reduce contention of big device lock that can block other operations when there's lots of trimmed space - when chunk allocation is forced (needs experimental build) fix transaction abort when unexpected space layout is detected Core: - switch to crypto library API for checksumming, removed module dependencies, pointer indirections, etc. 
- error handling improvements - adjust how and where transaction commit or abort are done and are maybe not necessary - minor compression optimization to skip single block ranges - improve how compression folios are handled - new and updated selftests - cleanups, refactoring: - auto-freeing and other automatic variable cleanup conversion - structure size optimizations - condition annotations" * tag 'for-6.20-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (137 commits) btrfs: get rid of compressed_bio::compressed_folios[] btrfs: get rid of compressed_folios[] usage for encoded writes btrfs: get rid of compressed_folios[] usage for compressed read btrfs: remove the old btrfs_compress_folios() infrastructure btrfs: switch to btrfs_compress_bio() interface for compressed writes btrfs: introduce btrfs_compress_bio() helper btrfs: zlib: introduce zlib_compress_bio() helper btrfs: zstd: introduce zstd_compress_bio() helper btrfs: lzo: introduce lzo_compress_bio() helper btrfs: zoned: factor out the zone loading part into a testable function btrfs: add cleanup function for btrfs_free_chunk_map btrfs: tests: add cleanup functions for test specific functions btrfs: raid56: fix memory leak of btrfs_raid_bio::stripe_uptodate_bitmap btrfs: tests: add unit tests for pending extent walking functions btrfs: fix EEXIST abort due to non-consecutive gaps in chunk allocation btrfs: fix transaction commit blocking during trim of unallocated space btrfs: handle user interrupt properly in btrfs_trim_fs() btrfs: preserve first error in btrfs_trim_fs() btrfs: continue trimming remaining devices on failure btrfs: do not BUG_ON() in btrfs_remove_block_group() ...
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/accessors.h30
-rw-r--r--fs/btrfs/backref.c4
-rw-r--r--fs/btrfs/bio.c35
-rw-r--r--fs/btrfs/bio.h19
-rw-r--r--fs/btrfs/block-group.c455
-rw-r--r--fs/btrfs/block-group.h31
-rw-r--r--fs/btrfs/block-rsv.c8
-rw-r--r--fs/btrfs/block-rsv.h1
-rw-r--r--fs/btrfs/compression.c217
-rw-r--r--fs/btrfs/compression.h40
-rw-r--r--fs/btrfs/ctree.c49
-rw-r--r--fs/btrfs/ctree.h9
-rw-r--r--fs/btrfs/defrag.c10
-rw-r--r--fs/btrfs/delayed-inode.c53
-rw-r--r--fs/btrfs/delayed-inode.h15
-rw-r--r--fs/btrfs/direct-io.c29
-rw-r--r--fs/btrfs/discard.c52
-rw-r--r--fs/btrfs/disk-io.c284
-rw-r--r--fs/btrfs/extent-io-tree.c7
-rw-r--r--fs/btrfs/extent-tree.c451
-rw-r--r--fs/btrfs/extent-tree.h4
-rw-r--r--fs/btrfs/extent_io.c77
-rw-r--r--fs/btrfs/extent_map.c12
-rw-r--r--fs/btrfs/file-item.c20
-rw-r--r--fs/btrfs/file.c60
-rw-r--r--fs/btrfs/free-space-cache.c108
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/free-space-tree.c150
-rw-r--r--fs/btrfs/free-space-tree.h6
-rw-r--r--fs/btrfs/fs.c100
-rw-r--r--fs/btrfs/fs.h79
-rw-r--r--fs/btrfs/inode-item.c7
-rw-r--r--fs/btrfs/inode.c597
-rw-r--r--fs/btrfs/ioctl.c46
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/lzo.c297
-rw-r--r--fs/btrfs/messages.c26
-rw-r--r--fs/btrfs/messages.h76
-rw-r--r--fs/btrfs/qgroup.c125
-rw-r--r--fs/btrfs/reflink.c11
-rw-r--r--fs/btrfs/relocation.c1765
-rw-r--r--fs/btrfs/relocation.h17
-rw-r--r--fs/btrfs/root-tree.c47
-rw-r--r--fs/btrfs/scrub.c56
-rw-r--r--fs/btrfs/send.c76
-rw-r--r--fs/btrfs/space-info.c73
-rw-r--r--fs/btrfs/space-info.h16
-rw-r--r--fs/btrfs/super.c17
-rw-r--r--fs/btrfs/sysfs.c55
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/btrfs/tests/btrfs-tests.h7
-rw-r--r--fs/btrfs/tests/chunk-allocation-tests.c476
-rw-r--r--fs/btrfs/tests/extent-map-tests.c16
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c4
-rw-r--r--fs/btrfs/tests/inode-tests.c126
-rw-r--r--fs/btrfs/transaction.c78
-rw-r--r--fs/btrfs/tree-checker.c84
-rw-r--r--fs/btrfs/tree-checker.h5
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/uuid-tree.c16
-rw-r--r--fs/btrfs/verity.c13
-rw-r--r--fs/btrfs/volumes.c631
-rw-r--r--fs/btrfs/volumes.h57
-rw-r--r--fs/btrfs/zlib.c97
-rw-r--r--fs/btrfs/zoned.c398
-rw-r--r--fs/btrfs/zoned.h17
-rw-r--r--fs/btrfs/zstd.c139
-rw-r--r--include/uapi/linux/btrfs.h1
-rw-r--r--include/uapi/linux/btrfs_tree.h34
71 files changed, 5866 insertions, 2078 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 6d6fc85835d4..ede184b6eda1 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,11 +4,8 @@ config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
select CRC32
- select CRYPTO
- select CRYPTO_CRC32C
- select CRYPTO_XXHASH
- select CRYPTO_SHA256
- select CRYPTO_BLAKE2B
+ select CRYPTO_LIB_BLAKE2B
+ select CRYPTO_LIB_SHA256
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
@@ -18,6 +15,7 @@ config BTRFS_FS
select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
+ select XXHASH
depends on PAGE_SIZE_LESS_THAN_256KB
help
@@ -106,9 +104,6 @@ config BTRFS_EXPERIMENTAL
- send stream protocol v3 - fs-verity support
- - checksum offload mode - sysfs knob to affect when checksums are
- calculated (at IO time, or in a thread)
-
- raid-stripe-tree - additional mapping of extents to devices to
support RAID1* profiles on zoned devices,
RAID56 not yet supported
@@ -121,4 +116,6 @@ config BTRFS_EXPERIMENTAL
- asynchronous checksum generation for data writes
+ - remap-tree - logical address remapping tree
+
If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 743d7677b175..975104b74486 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -44,4 +44,5 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
tests/free-space-tree-tests.o tests/extent-map-tests.o \
- tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o
+ tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \
+ tests/chunk-allocation-tests.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 78721412951c..8938357fcb40 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_block_group_item_v2 */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2,
+ used, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid,
+ struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid,
+ struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags,
+ struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes,
+ struct btrfs_block_group_item_v2, remap_bytes, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2,
+ remap_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count,
+ struct btrfs_block_group_item_v2, identity_remap_count, 32);
+BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2,
+ identity_remap_count, 32);
+
/* struct btrfs_free_space_info */
BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
extent_count, 32);
@@ -863,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
nr_global_roots, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block,
+ remap_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block,
+ remap_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block,
+ remap_root_level, 8);
/* struct btrfs_file_extent_item */
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
@@ -1010,6 +1036,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
struct btrfs_verity_descriptor_item, size, 64);
+BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item,
+ address, 64);
+
/* Cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78da47a3d00e..9bb406f7dd30 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -3609,10 +3609,8 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
}
rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node);
- if (unlikely(rb_node)) {
+ if (unlikely(rb_node))
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
- return -EUCLEAN;
- }
list_add_tail(&edge->list[UPPER], &upper->lower);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index fa1d321a2fb8..0a69e09bfe28 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -97,7 +97,13 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
bbio->orig_logical = orig_bbio->orig_logical;
orig_bbio->orig_logical += map_length;
}
+
bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
+ bbio->can_use_append = orig_bbio->can_use_append;
+ bbio->is_scrub = orig_bbio->is_scrub;
+ bbio->is_remap = orig_bbio->is_remap;
+ bbio->async_csum = orig_bbio->async_csum;
+
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
@@ -480,6 +486,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
if (!dev || !dev->bdev ||
test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
(btrfs_op(bio) == BTRFS_MAP_WRITE &&
@@ -494,12 +502,13 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
* For zone append writing, bi_sector must point the beginning of the
* zone
*/
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
u64 zone_start = round_down(physical, dev->fs_info->zone_size);
ASSERT(btrfs_dev_is_sequential(dev, physical));
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ bio->bi_opf &= ~REQ_OP_WRITE;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
btrfs_debug(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
@@ -662,11 +671,6 @@ static bool should_async_write(struct btrfs_bio *bbio)
bool auto_csum_mode = true;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
-
- if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON)
- return true;
/*
* Write bios will calculate checksum and submit bio at the same time.
* Unless explicitly required don't offload serial csum calculate and bio
@@ -747,7 +751,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
- bool use_append = btrfs_use_zone_append(bbio);
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
blk_status_t status;
@@ -775,8 +778,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
bbio->orig_logical = logical;
+ bbio->can_use_append = btrfs_use_zone_append(bbio);
+
map_length = min(map_length, length);
- if (use_append)
+ if (bbio->can_use_append)
map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
@@ -805,11 +810,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
}
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (use_append) {
- bio->bi_opf &= ~REQ_OP_WRITE;
- bio->bi_opf |= REQ_OP_ZONE_APPEND;
- }
-
if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
/*
* No locking for the list update, as we only add to
@@ -827,7 +827,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
*/
if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
- !btrfs_is_data_reloc_root(inode->root)) {
+ !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
goto done;
@@ -836,9 +836,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
status = errno_to_blk_status(ret);
if (status)
goto fail;
- } else if (use_append ||
- (btrfs_is_zoned(fs_info) && inode &&
- inode->flags & BTRFS_INODE_NODATASUM)) {
+ } else if (bbio->can_use_append ||
+ (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
status = errno_to_blk_status(ret);
if (status)
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 1be74209f0b8..303ed6c7103d 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -68,29 +68,36 @@ struct btrfs_bio {
struct btrfs_tree_parent_check parent_check;
};
+ /* For internal use in read end I/O handling */
+ struct work_struct end_io_work;
+
/* End I/O information supplied to btrfs_bio_alloc */
btrfs_bio_end_io_t end_io;
void *private;
- /* For internal use in read end I/O handling */
- unsigned int mirror_num;
atomic_t pending_ios;
- struct work_struct end_io_work;
+ u16 mirror_num;
/* Save the first error status of split bio. */
blk_status_t status;
/* Use the commit root to look up csums (data read bio only). */
- bool csum_search_commit_root;
+ bool csum_search_commit_root:1;
/*
* Since scrub will reuse btree inode, we need this flag to distinguish
* scrub bios.
*/
- bool is_scrub;
+ bool is_scrub:1;
+
+ /* Whether the bio is coming from copy_remapped_data_io(). */
+ bool is_remap:1;
/* Whether the csum generation for data write is async. */
- bool async_csum;
+ bool async_csum:1;
+
+ /* Whether the bio is written using zone append. */
+ bool can_use_append:1;
/*
* This member must come last, bio_alloc_bioset will allocate enough
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 08b14449fabe..3186ed4fd26d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -239,7 +239,7 @@ static struct btrfs_block_group *block_group_cache_tree_search(
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
- end = cache->start + cache->length - 1;
+ end = btrfs_block_group_end(cache) - 1;
start = cache->start;
if (bytenr < start) {
@@ -292,7 +292,7 @@ struct btrfs_block_group *btrfs_next_block_group(
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
- const u64 next_bytenr = cache->start + cache->length;
+ const u64 next_bytenr = btrfs_block_group_end(cache);
read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
@@ -575,28 +575,28 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
/*
* Get an arbitrary extent item index / max_index through the block group
*
- * @block_group the block group to sample from
+ * @caching_ctl the caching control containing the block group to sample from
* @index: the integral step through the block group to grab from
* @max_index: the granularity of the sampling
* @key: return value parameter for the item we find
+ * @path: path to use for searching in the extent tree
*
* Pre-conditions on indices:
* 0 <= index <= max_index
* 0 < max_index
*
- * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
- * error code on error.
+ * Returns: 0 on success, 1 if the search didn't yield a useful item.
*/
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
- struct btrfs_block_group *block_group,
int index, int max_index,
- struct btrfs_key *found_key)
+ struct btrfs_key *found_key,
+ struct btrfs_path *path)
{
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
u64 search_offset;
- u64 search_end = block_group->start + block_group->length;
- BTRFS_PATH_AUTO_FREE(path);
+ const u64 search_end = btrfs_block_group_end(block_group);
struct btrfs_key search_key;
int ret = 0;
@@ -606,16 +606,13 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
- BTRFS_SUPER_INFO_OFFSET));
-
- path->skip_locking = true;
- path->search_commit_root = true;
- path->reada = READA_FORWARD;
+ extent_root = btrfs_extent_root(fs_info, block_group->start);
+ if (unlikely(!extent_root)) {
+ btrfs_err(fs_info,
+ "missing extent root for block group at offset %llu",
+ block_group->start);
+ return -EUCLEAN;
+ }
search_offset = index * div_u64(block_group->length, max_index);
search_key.objectid = block_group->start + search_offset;
@@ -673,27 +670,42 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
* 3, we can either read every file extent, or admit that this is best effort
* anyway and try to stay fast.
*
- * Returns: 0 on success, negative error code on error.
+ * No errors are returned since failing to determine the size class is not a
+ * critical error, size classes are just an optimization.
*/
-static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
- struct btrfs_block_group *block_group)
+static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl)
{
+ BTRFS_PATH_AUTO_RELEASE(path);
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_key key;
int i;
u64 min_size = block_group->length;
enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
- int ret;
+
+ /*
+ * Since we run in workqueue context, we allocate the path on stack to
+ * avoid memory allocation failure, as the stack in a work queue task
+ * is not deep.
+ */
+ ASSERT(current_work() == &caching_ctl->work.normal_work);
if (!btrfs_block_group_should_use_size_class(block_group))
- return 0;
+ return;
+
+ path.skip_locking = true;
+ path.search_commit_root = true;
+ path.reada = READA_FORWARD;
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
for (i = 0; i < 5; ++i) {
- ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
+ int ret;
+
+ ret = sample_block_group_extent_item(caching_ctl, i, 5, &key, &path);
if (ret < 0)
- goto out;
+ return;
+ btrfs_release_path(&path);
if (ret > 0)
continue;
min_size = min_t(u64, min_size, key.offset);
@@ -704,13 +716,12 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl
block_group->size_class = size_class;
spin_unlock(&block_group->lock);
}
-out:
- return ret;
}
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
+ const u64 block_group_end = btrfs_block_group_end(block_group);
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
BTRFS_PATH_AUTO_FREE(path);
@@ -755,13 +766,13 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
- if (btrfs_fs_closing(fs_info) > 1) {
+ if (btrfs_fs_closing_done(fs_info)) {
last = (u64)-1;
break;
}
@@ -786,7 +797,7 @@ next:
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
leaf = path->nodes[0];
@@ -807,7 +818,7 @@ next:
continue;
}
- if (key.objectid >= block_group->start + block_group->length)
+ if (key.objectid >= block_group_end)
break;
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -817,7 +828,7 @@ next:
ret = btrfs_add_new_free_space(block_group, last,
key.objectid, &space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
@@ -836,17 +847,13 @@ next:
path->slots[0]++;
}
- ret = btrfs_add_new_free_space(block_group, last,
- block_group->start + block_group->length,
- NULL);
-out:
- return ret;
+ return btrfs_add_new_free_space(block_group, last, block_group_end, NULL);
}
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
- bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
+ btrfs_block_group_end(bg) - 1, EXTENT_DIRTY, NULL);
}
static noinline void caching_thread(struct btrfs_work *work)
@@ -863,7 +870,7 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
- load_block_group_size_class(caching_ctl, block_group);
+ load_block_group_size_class(caching_ctl);
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
ret = load_free_space_cache(block_group);
if (ret == 1) {
@@ -933,6 +940,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
if (btrfs_is_zoned(fs_info))
return 0;
+ /*
+ * No allocations can be done from remapped block groups, so they have
+ * no entries in the free-space tree.
+ */
+ if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -1057,8 +1071,25 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- ret = btrfs_del_item(trans, root, path);
- return ret;
+ return btrfs_del_item(trans, root, path);
+}
+
+void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg)
+{
+ int factor = btrfs_bg_type_to_factor(bg->flags);
+
+ spin_lock(&bg->space_info->lock);
+ if (btrfs_test_opt(bg->fs_info, ENOSPC_DEBUG)) {
+ WARN_ON(bg->space_info->total_bytes < bg->length);
+ WARN_ON(bg->space_info->bytes_readonly < bg->length - bg->zone_unusable);
+ WARN_ON(bg->space_info->bytes_zone_unusable < bg->zone_unusable);
+ WARN_ON(bg->space_info->disk_total < bg->length * factor);
+ }
+ bg->space_info->total_bytes -= bg->length;
+ bg->space_info->bytes_readonly -= (bg->length - bg->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(bg->space_info, -bg->zone_unusable);
+ bg->space_info->disk_total -= bg->length * factor;
+ spin_unlock(&bg->space_info->lock);
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -1072,16 +1103,22 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct kobject *kobj = NULL;
int ret;
int index;
- int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_map;
bool remove_rsv = false;
block_group = btrfs_lookup_block_group(fs_info, map->start);
- if (!block_group)
+ if (unlikely(!block_group)) {
+ btrfs_abort_transaction(trans, -ENOENT);
return -ENOENT;
+ }
- BUG_ON(!block_group->ro);
+ if (unlikely(!block_group->ro &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
trace_btrfs_remove_block_group(block_group);
/*
@@ -1093,7 +1130,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->length);
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- factor = btrfs_bg_type_to_factor(block_group->flags);
/* make sure this block group isn't part of an allocation cluster */
cluster = &fs_info->data_alloc_cluster;
@@ -1114,8 +1150,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1151,8 +1188,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_unlock(&trans->transaction->cache_write_mutex);
ret = btrfs_remove_free_space_inode(trans, inode, block_group);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_lock(&fs_info->block_group_cache_lock);
rb_erase_cached(&block_group->cache_node,
@@ -1217,26 +1256,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- WARN_ON(block_group->space_info->total_bytes
- < block_group->length);
- WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length - block_group->zone_unusable);
- WARN_ON(block_group->space_info->bytes_zone_unusable
- < block_group->zone_unusable);
- WARN_ON(block_group->space_info->disk_total
- < block_group->length * factor);
- }
- block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -=
- (block_group->length - block_group->zone_unusable);
- btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
- -block_group->zone_unusable);
- block_group->space_info->disk_total -= block_group->length * factor;
-
spin_unlock(&block_group->space_info->lock);
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
+ btrfs_remove_bg_from_sinfo(block_group);
+
/*
* Remove the free space for the block group from the free space tree
* and the block group's item from the extent tree before marking the
@@ -1247,14 +1271,24 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* deletes the block group item from the extent tree, allowing for
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
+ *
+ * If the REMAPPED flag has been set the block group's free space
+ * has already been removed, so we can skip the call to
+ * btrfs_remove_block_group_free_space().
*/
- ret = btrfs_remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
ret = remove_block_group_item(trans, path, block_group);
- if (ret < 0)
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
spin_lock(&block_group->lock);
/*
@@ -1377,8 +1411,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
goto out;
}
- num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->zone_unusable - cache->used;
+ num_bytes = btrfs_block_group_available_space(cache);
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1564,8 +1597,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (btrfs_is_block_group_used(block_group) || block_group->ro ||
- list_is_singular(&block_group->list)) {
+ if (btrfs_is_block_group_used(block_group) ||
+ (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
+ list_is_singular(&block_group->list) ||
+ test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &block_group->runtime_flags)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -1606,9 +1641,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if ((space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) ||
- has_unwritten_metadata(block_group)) {
+ if (((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1773,6 +1809,9 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
btrfs_get_block_group(bg);
trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ } else if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ bg->identity_remap_count == 0) {
+ /* Leave fully remapped block groups on the fully_remapped_bgs list. */
} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
/* Pull out the block group from the reclaim_bgs list. */
trace_btrfs_add_unused_block_group(bg);
@@ -1805,6 +1844,12 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
{
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return false;
+
+ if (btrfs_fs_closing(fs_info))
+ return false;
+
if (btrfs_is_zoned(fs_info))
return btrfs_zoned_should_reclaim(fs_info);
return true;
@@ -1839,12 +1884,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
struct btrfs_space_info *space_info;
LIST_HEAD(retry_list);
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
- return;
-
- if (btrfs_fs_closing(fs_info))
- return;
-
if (!btrfs_should_reclaim(fs_info))
return;
@@ -1872,6 +1911,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 used;
u64 reserved;
+ u64 old_total;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1937,6 +1977,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_unlock(&bg->lock);
+ old_total = space_info->total_bytes;
spin_unlock(&space_info->lock);
/*
@@ -1989,14 +2030,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++;
- if (READ_ONCE(space_info->periodic_reclaim))
- space_info->periodic_reclaim_ready = false;
spin_unlock(&space_info->lock);
}
spin_lock(&space_info->lock);
space_info->reclaim_count++;
space_info->reclaim_bytes += used;
space_info->reclaim_bytes += reserved;
+ if (space_info->total_bytes < old_total)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
spin_unlock(&space_info->lock);
next:
@@ -2249,7 +2290,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
while (nr--) {
u64 len = min_t(u64, stripe_len,
- cache->start + cache->length - logical[nr]);
+ btrfs_block_group_end(cache) - logical[nr]);
cache->bytes_super += len;
ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
@@ -2266,7 +2307,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static struct btrfs_block_group *btrfs_create_block_group_cache(
+static struct btrfs_block_group *btrfs_create_block_group(
struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
@@ -2360,7 +2401,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
}
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_block_group_item *bgi,
+ struct btrfs_block_group_item_v2 *bgi,
const struct btrfs_key *key,
int need_clear)
{
@@ -2370,16 +2411,21 @@ static int read_one_block_group(struct btrfs_fs_info *info,
ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
- cache = btrfs_create_block_group_cache(info, key->objectid);
+ cache = btrfs_create_block_group(info, key->objectid);
if (!cache)
return -ENOMEM;
cache->length = key->offset;
- cache->used = btrfs_stack_block_group_used(bgi);
- cache->commit_used = cache->used;
- cache->flags = btrfs_stack_block_group_flags(bgi);
- cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->used = btrfs_stack_block_group_v2_used(bgi);
+ cache->last_used = cache->used;
+ cache->flags = btrfs_stack_block_group_v2_flags(bgi);
+ cache->last_flags = cache->flags;
+ cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
cache->space_info = btrfs_find_space_info(info, cache->flags);
+ cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
+ cache->last_remap_bytes = cache->remap_bytes;
+ cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi);
+ cache->last_identity_remap_count = cache->identity_remap_count;
btrfs_set_free_space_tree_thresholds(cache);
@@ -2444,10 +2490,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
} else if (cache->length == cache->used) {
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
- } else if (cache->used == 0) {
+ } else if (cache->used == 0 && cache->remap_bytes == 0) {
cache->cached = BTRFS_CACHE_FINISHED;
ret = btrfs_add_new_free_space(cache, cache->start,
- cache->start + cache->length, NULL);
+ btrfs_block_group_end(cache), NULL);
btrfs_free_excluded_extents(cache);
if (ret)
goto error;
@@ -2464,7 +2510,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
- if (cache->used == 0) {
+ if (cache->used == 0 && cache->remap_bytes == 0) {
ASSERT(list_empty(&cache->bg_list));
if (btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_discard_queue_work(&info->discard_ctl, cache);
@@ -2491,7 +2537,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
struct btrfs_block_group *bg;
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
- bg = btrfs_create_block_group_cache(fs_info, map->start);
+ bg = btrfs_create_block_group(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
@@ -2568,9 +2614,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct extent_buffer *leaf;
int slot;
+ size_t size;
ret = find_first_block_group(info, path, &key);
if (ret > 0)
@@ -2581,8 +2628,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
leaf = path->nodes[0];
slot = path->slots[0];
+ if (btrfs_fs_incompat(info, REMAP_TREE)) {
+ size = sizeof(struct btrfs_block_group_item_v2);
+ } else {
+ size = sizeof(struct btrfs_block_group_item);
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0);
+ }
+
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
+ size);
btrfs_item_key_to_cpu(leaf, &key, slot);
btrfs_release_path(path);
@@ -2652,28 +2707,38 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
- u64 old_commit_used;
+ u64 old_last_used;
+ size_t size;
int ret;
spin_lock(&block_group->lock);
- btrfs_set_stack_block_group_used(&bgi, block_group->used);
- btrfs_set_stack_block_group_chunk_objectid(&bgi,
- block_group->global_root_id);
- btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
- old_commit_used = block_group->commit_used;
- block_group->commit_used = block_group->used;
+ btrfs_set_stack_block_group_v2_used(&bgi, block_group->used);
+ btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id);
+ btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags);
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count);
+ old_last_used = block_group->last_used;
+ block_group->last_used = block_group->used;
+ block_group->last_remap_bytes = block_group->remap_bytes;
+ block_group->last_identity_remap_count = block_group->identity_remap_count;
+ block_group->last_flags = block_group->flags;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ size = sizeof(struct btrfs_block_group_item_v2);
+ else
+ size = sizeof(struct btrfs_block_group_item);
+
+ ret = btrfs_insert_item(trans, root, &key, &bgi, size);
if (ret < 0) {
spin_lock(&block_group->lock);
- block_group->commit_used = old_commit_used;
+ block_group->last_used = old_last_used;
spin_unlock(&block_group->lock);
}
@@ -2886,7 +2951,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
btrfs_set_log_full_commit(trans);
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
+ cache = btrfs_create_block_group(fs_info, chunk_offset);
if (!cache)
return ERR_PTR(-ENOMEM);
@@ -3090,7 +3155,6 @@ unlock_out:
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
- u64 num_bytes;
BUG_ON(!cache->ro);
@@ -3106,10 +3170,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
- num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super -
- cache->zone_unusable - cache->used;
- sinfo->bytes_readonly -= num_bytes;
+ sinfo->bytes_readonly -= btrfs_block_group_available_space(cache);
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -3125,10 +3186,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct btrfs_key key;
- u64 old_commit_used;
- u64 used;
+ u64 old_last_used, old_last_remap_bytes;
+ u32 old_last_identity_remap_count;
+ u64 used, remap_bytes;
+ u32 identity_remap_count;
/*
* Block group items update can be triggered out of commit transaction
@@ -3137,14 +3200,24 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
* may be changed.
*/
spin_lock(&cache->lock);
- old_commit_used = cache->commit_used;
+ old_last_used = cache->last_used;
+ old_last_remap_bytes = cache->last_remap_bytes;
+ old_last_identity_remap_count = cache->last_identity_remap_count;
used = cache->used;
- /* No change in used bytes, can safely skip it. */
- if (cache->commit_used == used) {
+ remap_bytes = cache->remap_bytes;
+ identity_remap_count = cache->identity_remap_count;
+ /* No change in values, can safely skip it. */
+ if (cache->last_used == used &&
+ cache->last_remap_bytes == remap_bytes &&
+ cache->last_identity_remap_count == identity_remap_count &&
+ cache->last_flags == cache->flags) {
spin_unlock(&cache->lock);
return 0;
}
- cache->commit_used = used;
+ cache->last_used = used;
+ cache->last_remap_bytes = remap_bytes;
+ cache->last_identity_remap_count = identity_remap_count;
+ cache->last_flags = cache->flags;
spin_unlock(&cache->lock);
key.objectid = cache->start;
@@ -3160,25 +3233,37 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- btrfs_set_stack_block_group_used(&bgi, used);
- btrfs_set_stack_block_group_chunk_objectid(&bgi,
- cache->global_root_id);
- btrfs_set_stack_block_group_flags(&bgi, cache->flags);
- write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
+ btrfs_set_stack_block_group_v2_used(&bgi, used);
+ btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id);
+ btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags);
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
+ cache->identity_remap_count);
+ write_extent_buffer(leaf, &bgi, bi,
+ sizeof(struct btrfs_block_group_item_v2));
+ } else {
+ write_extent_buffer(leaf, &bgi, bi,
+ sizeof(struct btrfs_block_group_item));
+ }
+
fail:
btrfs_release_path(path);
/*
- * We didn't update the block group item, need to revert commit_used
+ * We didn't update the block group item, need to revert last_used
* unless the block group item didn't exist yet - this is to prevent a
* race with a concurrent insertion of the block group item, with
* insert_block_group_item(), that happened just after we attempted to
- * update. In that case we would reset commit_used to 0 just after the
+ * update. In that case we would reset last_used to 0 just after the
* insertion set it to a value greater than 0 - if the block group later
* becomes with 0 used bytes, we would incorrectly skip its update.
*/
if (ret < 0 && ret != -ENOENT) {
spin_lock(&cache->lock);
- cache->commit_used = old_commit_used;
+ cache->last_used = old_last_used;
+ cache->last_remap_bytes = old_last_remap_bytes;
+ cache->last_identity_remap_count = old_last_identity_remap_count;
spin_unlock(&cache->lock);
}
return ret;
@@ -3701,7 +3786,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
return -ENOENT;
/* An extent can not span multiple block groups. */
- ASSERT(bytenr + num_bytes <= cache->start + cache->length);
+ ASSERT(bytenr + num_bytes <= btrfs_block_group_end(cache));
space_info = cache->space_info;
factor = btrfs_bg_type_to_factor(cache->flags);
@@ -4530,6 +4615,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
+
+ while (!list_empty(&info->fully_remapped_bgs)) {
+ block_group = list_first_entry(&info->fully_remapped_bgs,
+ struct btrfs_block_group, bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->zone_active_bgs_lock);
@@ -4680,6 +4772,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class)
{
+ lockdep_assert_held(&bg->lock);
ASSERT(size_class != BTRFS_BG_SZ_NONE);
/* The new allocation is in the right size class, do nothing */
@@ -4717,3 +4810,103 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
return false;
return true;
}
+
+void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ spin_lock(&bg->lock);
+ set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags);
+ spin_unlock(&bg->lock);
+
+ btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
+ } else {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * The block group might already be on the unused_bgs list,
+ * remove it if it is. It'll get readded after
+ * btrfs_handle_fully_remapped_bgs() finishes.
+ */
+ if (!list_empty(&bg->bg_list))
+ list_del(&bg->bg_list);
+ else
+ btrfs_get_block_group(bg);
+
+ list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ }
+}
+
+/*
+ * Compare the block group and chunk trees, and find any fully-remapped block
+ * groups which haven't yet had their chunk stripes and device extents removed,
+ * and put them on the fully_remapped_bgs list so this gets done.
+ *
+ * This happens when a block group becomes fully remapped, i.e. its last
+ * identity mapping is removed, and the volume is unmounted before async
+ * discard has finished. It's important this gets done as until it is the
+ * chunk's stripes are dead space.
+ */
+int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node_bg, *node_chunk;
+
+ node_bg = rb_first_cached(&fs_info->block_group_cache_tree);
+ node_chunk = rb_first_cached(&fs_info->mapping_tree);
+
+ while (node_bg && node_chunk) {
+ struct btrfs_block_group *bg;
+ struct btrfs_chunk_map *map;
+
+ bg = rb_entry(node_bg, struct btrfs_block_group, cache_node);
+ map = rb_entry(node_chunk, struct btrfs_chunk_map, rb_node);
+
+ ASSERT(bg->start == map->start);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED))
+ goto next;
+
+ if (bg->identity_remap_count != 0)
+ goto next;
+
+ if (map->num_stripes == 0)
+ goto next;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
+ } else {
+ list_move_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
+ }
+
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * Ideally we'd want to call btrfs_discard_queue_work() here,
+ * but it'd do nothing as the discard worker hasn't been
+ * started yet.
+ *
+ * The block group will get added to the discard list when
+ * btrfs_handle_fully_remapped_bgs() gets called, when we
+ * commit the first transaction.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ spin_lock(&bg->lock);
+ set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags);
+ spin_unlock(&bg->lock);
+ }
+
+next:
+ node_bg = rb_next(node_bg);
+ node_chunk = rb_next(node_chunk);
+ }
+
+ ASSERT(!node_bg && !node_chunk);
+
+ return 0;
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 5f933455118c..c03e04292900 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -49,6 +49,7 @@ enum btrfs_discard_state {
BTRFS_DISCARD_EXTENTS,
BTRFS_DISCARD_BITMAPS,
BTRFS_DISCARD_RESET_CURSOR,
+ BTRFS_DISCARD_FULLY_REMAPPED,
};
/*
@@ -92,6 +93,8 @@ enum btrfs_block_group_flags {
* transaction.
*/
BLOCK_GROUP_FLAG_NEW,
+ BLOCK_GROUP_FLAG_FULLY_REMAPPED,
+ BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING,
};
enum btrfs_caching_type {
@@ -129,13 +132,22 @@ struct btrfs_block_group {
u64 flags;
u64 cache_generation;
u64 global_root_id;
+ u64 remap_bytes;
+ u32 identity_remap_count;
/*
* The last committed used bytes of this block group, if the above @used
- * is still the same as @commit_used, we don't need to update block
+ * is still the same as @last_used, we don't need to update block
* group item of this block group.
*/
- u64 commit_used;
+ u64 last_used;
+ /* The last committed remap_bytes value of this block group. */
+ u64 last_remap_bytes;
+ /* The last committed identity_remap_count value of this block group. */
+ u32 last_identity_remap_count;
+ /* The last committed flags value for this block group. */
+ u64 last_flags;
+
/*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
@@ -282,7 +294,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
{
lockdep_assert_held(&bg->lock);
- return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 ||
+ bg->remap_bytes > 0);
}
static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
@@ -295,6 +308,14 @@ static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group
!(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
}
+static inline u64 btrfs_block_group_available_space(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->length - bg->used - bg->pinned - bg->reserved -
+ bg->bytes_super - bg->zone_unusable);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif
@@ -324,6 +345,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
+void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
@@ -395,5 +417,8 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class);
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
+void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
+ struct btrfs_trans_handle *trans);
+int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info);
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 96cf7a162987..e823230c09b7 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_TREE_LOG_OBJECTID:
root->block_rsv = &fs_info->treelog_rsv;
break;
+ case BTRFS_REMAP_TREE_OBJECTID:
+ root->block_rsv = &fs_info->remap_block_rsv;
+ break;
default:
root->block_rsv = NULL;
break;
@@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP);
+ fs_info->remap_block_rsv.space_info = space_info;
+
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
@@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ WARN_ON(fs_info->remap_block_rsv.size > 0);
+ WARN_ON(fs_info->remap_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_block_rsv.size > 0);
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 79ae9d05cd91..8359fb96bc3c 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -22,6 +22,7 @@ enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_REMAP,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
BTRFS_BLOCK_RSV_TREELOG,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6b3357287b42..1e7174ad32e2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -21,7 +21,6 @@
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <linux/shrinker.h>
-#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
@@ -87,37 +86,6 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
return false;
}
-static int compression_compress_pages(int type, struct list_head *ws,
- struct btrfs_inode *inode, u64 start,
- struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out)
-{
- switch (type) {
- case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_folios(ws, inode, start, folios,
- out_folios, total_in, total_out);
- case BTRFS_COMPRESS_LZO:
- return lzo_compress_folios(ws, inode, start, folios,
- out_folios, total_in, total_out);
- case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_folios(ws, inode, start, folios,
- out_folios, total_in, total_out);
- case BTRFS_COMPRESS_NONE:
- default:
- /*
- * This can happen when compression races with remount setting
- * it to 'no compress', while caller doesn't call
- * inode_need_compress() to check if we really need to
- * compress.
- *
- * Not a big deal, just need to inform caller that we
- * haven't allocated any pages yet.
- */
- *out_folios = 0;
- return -E2BIG;
- }
-}
-
static int compression_decompress_bio(struct list_head *ws,
struct compressed_bio *cb)
{
@@ -156,13 +124,6 @@ static int compression_decompress(int type, struct list_head *ws,
}
}
-static void btrfs_free_compressed_folios(struct compressed_bio *cb)
-{
- for (unsigned int i = 0; i < cb->nr_folios; i++)
- btrfs_free_compr_folio(cb->compressed_folios[i]);
- kfree(cb->compressed_folios);
-}
-
static int btrfs_decompress_bio(struct compressed_bio *cb);
/*
@@ -271,12 +232,14 @@ static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
+ struct folio_iter fi;
if (!status)
status = errno_to_blk_status(btrfs_decompress_bio(cb));
- btrfs_free_compressed_folios(cb);
btrfs_bio_end_io(cb->orig_bbio, status);
+ bio_for_each_folio_all(fi, &bbio->bio)
+ btrfs_free_compr_folio(fi.folio);
bio_put(&bbio->bio);
}
@@ -327,6 +290,7 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
+ struct folio_iter fi;
btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
cb->bbio.bio.bi_status == BLK_STS_OK);
@@ -334,29 +298,11 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio)
if (cb->writeback)
end_compressed_writeback(cb);
/* Note, our inode could be gone now. */
- btrfs_free_compressed_folios(cb);
+ bio_for_each_folio_all(fi, &bbio->bio)
+ btrfs_free_compr_folio(fi.folio);
bio_put(&cb->bbio.bio);
}
-static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
-{
- struct bio *bio = &cb->bbio.bio;
- u32 offset = 0;
- unsigned int findex = 0;
-
- while (offset < cb->compressed_len) {
- struct folio *folio = cb->compressed_folios[findex];
- u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio));
- int ret;
-
- /* Maximum compressed extent is smaller than bio size limit. */
- ret = bio_add_folio(bio, folio, len, 0);
- ASSERT(ret);
- offset += len;
- findex++;
- }
-}
-
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
@@ -367,35 +313,44 @@ static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
* the end io hooks.
*/
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct folio **compressed_folios,
- unsigned int nr_folios,
- blk_opf_t write_flags,
- bool writeback)
+ struct compressed_bio *cb)
{
struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct compressed_bio *cb;
ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));
+ ASSERT(cb->writeback);
- cb = alloc_compressed_bio(inode, ordered->file_offset,
- REQ_OP_WRITE | write_flags,
- end_bbio_compressed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
- cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
- cb->writeback = writeback;
- cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
- btrfs_add_compressed_bio_folios(cb);
btrfs_submit_bbio(&cb->bbio, 0);
}
/*
+ * Allocate a compressed write bio for @inode file offset @start length @len.
+ *
+ * The caller still needs to properly queue all folios and populate involved
+ * members.
+ */
+struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode,
+ u64 start, u64 len)
+{
+ struct compressed_bio *cb;
+
+ cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE, end_bbio_compressed_write);
+ cb->start = start;
+ cb->len = len;
+ cb->writeback = true;
+
+ return cb;
+}
+
+/*
* Add extra pages in the same compressed file extent so that we don't need to
* re-read the same extent again and again.
*
@@ -520,7 +475,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
folio_put(folio);
break;
}
- add_size = min(em->start + em->len, page_end + 1) - cur;
+ add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur;
btrfs_free_extent_map(em);
btrfs_unlock_extent(tree, cur, page_end, NULL);
@@ -571,13 +526,13 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
struct extent_map_tree *em_tree = &inode->extent_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 file_offset = bbio->file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
unsigned long pflags;
int memstall = 0;
- blk_status_t status;
int ret;
/* we need the actual starting offset of this extent in the file */
@@ -585,7 +540,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
- status = BLK_STS_IOERR;
+ ret = -EIO;
goto out;
}
@@ -607,27 +562,30 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
btrfs_free_extent_map(em);
- cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
- cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
- if (!cb->compressed_folios) {
- status = BLK_STS_RESOURCE;
- goto out_free_bio;
- }
+ for (int i = 0; i * min_folio_size < compressed_len; i++) {
+ struct folio *folio;
+ u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size);
+
+ folio = btrfs_alloc_compr_folio(fs_info);
+ if (!folio) {
+ ret = -ENOMEM;
+ goto out_free_bio;
+ }
- ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
- cb->compressed_folios);
- if (ret) {
- status = BLK_STS_RESOURCE;
- goto out_free_compressed_pages;
+ ret = bio_add_folio(&cb->bbio.bio, folio, cur_len, 0);
+ if (unlikely(!ret)) {
+ folio_put(folio);
+ ret = -EINVAL;
+ goto out_free_bio;
+ }
}
+ ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len);
add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
&pflags);
- /* include any pages we added in add_ra-bio_pages */
cb->len = bbio->bio.bi_iter.bi_size;
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
- btrfs_add_compressed_bio_folios(cb);
if (memstall)
psi_memstall_leave(&pflags);
@@ -635,12 +593,10 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
btrfs_submit_bbio(&cb->bbio, 0);
return;
-out_free_compressed_pages:
- kfree(cb->compressed_folios);
out_free_bio:
- bio_put(&cb->bbio.bio);
+ cleanup_compressed_bio(cb);
out:
- btrfs_bio_end_io(bbio, status);
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}
/*
@@ -1027,42 +983,71 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
}
/*
- * Given an address space and start and length, compress the bytes into @pages
- * that are allocated on demand.
- *
- * @type_level is encoded algorithm and level, where level 0 means whatever
- * default the algorithm chooses and is opaque here;
- * - compression algo are 0-3
- * - the level are bits 4-7
+ * Given an address space and start and length, compress the page cache
+ * contents into @cb.
*
- * @out_folios is an in/out parameter, holds maximum number of folios to allocate
- * and returns number of actually allocated folios
+ * @type_level: is encoded algorithm and level, where level 0 means whatever
+ * default the algorithm chooses and is opaque here;
+ * - compression algo are 0-3
+ * - the level are bits 4-7
*
- * @total_in is used to return the number of bytes actually read. It
- * may be smaller than the input length if we had to exit early because we
- * ran out of room in the folios array or because we cross the
- * max_out threshold.
+ * @cb->bbio.bio.bi_iter.bi_size will indicate the compressed data size.
+ * The bi_size may not be sectorsize aligned, thus the caller still needs
+ * to do the round up before submission.
*
- * @total_out is an in/out parameter, must be set to the input length and will
- * be also used to return the total number of compressed bytes
+ * This function will allocate compressed folios with btrfs_alloc_compr_folio(),
+ * thus callers must make sure the endio function and error handling are using
+ * btrfs_free_compr_folio() to release those folios.
+ * This is already done in end_bbio_compressed_write() and cleanup_compressed_bio().
*/
-int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out)
+struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode,
+ u64 start, u32 len, unsigned int type,
+ int level, blk_opf_t write_flags)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const unsigned long orig_len = *total_out;
struct list_head *workspace;
+ struct compressed_bio *cb;
int ret;
+ cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags,
+ end_bbio_compressed_write);
+ cb->start = start;
+ cb->len = len;
+ cb->writeback = true;
+ cb->compress_type = type;
+
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(fs_info, type, level);
- ret = compression_compress_pages(type, workspace, inode, start, folios,
- out_folios, total_in, total_out);
- /* The total read-in bytes should be no larger than the input. */
- ASSERT(*total_in <= orig_len);
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ ret = zlib_compress_bio(workspace, cb);
+ break;
+ case BTRFS_COMPRESS_LZO:
+ ret = lzo_compress_bio(workspace, cb);
+ break;
+ case BTRFS_COMPRESS_ZSTD:
+ ret = zstd_compress_bio(workspace, cb);
+ break;
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
+ */
+ ret = -E2BIG;
+ }
+
put_workspace(fs_info, type, workspace);
- return ret;
+ if (ret < 0) {
+ cleanup_compressed_bio(cb);
+ return ERR_PTR(ret);
+ }
+ return cb;
}
static int btrfs_decompress_bio(struct compressed_bio *cb)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index e0228017e861..65b8bc4bbe0b 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -42,12 +42,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of compressed folios in the array. */
- unsigned int nr_folios;
-
- /* The folios with the compressed data on them. */
- struct folio **compressed_folios;
-
/* starting offset in the inode for our pages */
u64 start;
@@ -91,18 +85,15 @@ int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
bool btrfs_compress_level_valid(unsigned int type, int level);
-int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
+struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode,
+ u64 start, u64 len);
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct folio **compressed_folios,
- unsigned int nr_folios, blk_opf_t write_flags,
- bool writeback);
+ struct compressed_bio *cb);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
@@ -146,10 +137,21 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
+struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode,
+ u64 start, u32 len, unsigned int type,
+ int level, blk_opf_t write_flags);
+
+static inline void cleanup_compressed_bio(struct compressed_bio *cb)
+{
+ struct bio *bio = &cb->bbio.bio;
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ btrfs_free_compr_folio(fi.folio);
+ bio_put(bio);
+}
-int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out);
+int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
@@ -158,9 +160,7 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i
void zlib_free_workspace(struct list_head *ws);
struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
-int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out);
+int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
@@ -168,9 +168,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out);
+int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a48b4befbee7..7267b2502665 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -249,6 +249,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
+ const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID);
u64 reloc_src_root = 0;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
@@ -262,7 +263,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (is_reloc_root)
reloc_src_root = btrfs_header_owner(buf);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
@@ -276,7 +277,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (is_reloc_root)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
btrfs_set_header_owner(cow, new_root_objectid);
@@ -291,16 +292,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return ret;
}
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_inc_ref(trans, root, cow, 1);
- if (unlikely(ret))
- btrfs_abort_transaction(trans, ret);
- } else {
- ret = btrfs_inc_ref(trans, root, cow, 0);
- if (unlikely(ret))
- btrfs_abort_transaction(trans, ret);
- }
- if (ret) {
+ ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
return ret;
@@ -362,6 +356,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
u64 owner;
u64 flags;
int ret;
+ const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
/*
* Backrefs update rules:
@@ -397,8 +392,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
refs = 1;
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
- btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
else
flags = 0;
@@ -417,18 +411,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
if (refs > 1) {
- if ((owner == btrfs_root_id(root) ||
- btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
+ if ((owner == btrfs_root_id(root) || is_reloc_root) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1);
+ ret = btrfs_inc_ref(trans, root, buf, true);
if (ret)
return ret;
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0);
+ if (is_reloc_root) {
+ ret = btrfs_dec_ref(trans, root, buf, false);
if (ret)
return ret;
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, true);
if (ret)
return ret;
}
@@ -437,23 +430,16 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
} else {
-
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
- else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
if (ret)
return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
- else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
if (ret)
return ret;
- ret = btrfs_dec_ref(trans, root, buf, 1);
+ ret = btrfs_dec_ref(trans, root, buf, true);
if (ret)
return ret;
}
@@ -4016,8 +4002,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(trans, path, new_key, split_offset);
- return ret;
+ return split_item(trans, path, new_key, split_offset);
}
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 692370fc07b2..6de7ad191e04 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,14 @@ struct btrfs_path {
struct btrfs_path *path_name __free(btrfs_free_path) = NULL
/*
+ * This defines an on-stack path that will be auto released when exiting the scope.
+ *
+ * It is compatible with any existing manual btrfs_release_path() calls.
+ */
+#define BTRFS_PATH_AUTO_RELEASE(path_name) \
+ struct btrfs_path path_name __free(btrfs_release_path) = { 0 }
+
+/*
* The state of btrfs root
*/
enum {
@@ -601,6 +609,7 @@ void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
+DEFINE_FREE(btrfs_release_path, struct btrfs_path, btrfs_release_path(&_T))
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index b81e224d4a27..ecf05cd64696 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -609,7 +609,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
+ BTRFS_PATH_AUTO_RELEASE(path);
struct extent_map *em;
struct btrfs_key key;
u64 ino = btrfs_ino(inode);
@@ -720,16 +720,13 @@ next:
if (ret > 0)
goto not_found;
}
- btrfs_release_path(&path);
return em;
not_found:
- btrfs_release_path(&path);
btrfs_free_extent_map(em);
return NULL;
err:
- btrfs_release_path(&path);
btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
@@ -795,10 +792,11 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *next;
+ const u64 em_end = btrfs_extent_map_end(em);
bool ret = false;
/* This is the last extent */
- if (em->start + em->len >= i_size_read(inode))
+ if (em_end >= i_size_read(inode))
return false;
/*
@@ -807,7 +805,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
* one will not be a target.
* This will just cause extra IO without really reducing the fragments.
*/
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ next = defrag_lookup_extent(inode, em_end, newer_than, locked);
/* No more em or hole */
if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
goto out;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 4b7d9015e0da..1739a0b29c49 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -232,19 +232,19 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
}
static struct btrfs_delayed_node *btrfs_first_delayed_node(
- struct btrfs_delayed_root *delayed_root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
- spin_lock(&delayed_root->lock);
- node = list_first_entry_or_null(&delayed_root->node_list,
+ spin_lock(&fs_info->delayed_root.lock);
+ node = list_first_entry_or_null(&fs_info->delayed_root.node_list,
struct btrfs_delayed_node, n_list);
if (node) {
refcount_inc(&node->refs);
btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
}
- spin_unlock(&delayed_root->lock);
+ spin_unlock(&fs_info->delayed_root.lock);
return node;
}
@@ -257,7 +257,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
struct list_head *p;
struct btrfs_delayed_node *next = NULL;
- delayed_root = node->root->fs_info->delayed_root;
+ delayed_root = &node->root->fs_info->delayed_root;
spin_lock(&delayed_root->lock);
if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
/* not in the list */
@@ -287,7 +287,7 @@ static void __btrfs_release_delayed_node(
if (!delayed_node)
return;
- delayed_root = delayed_node->root->fs_info->delayed_root;
+ delayed_root = &delayed_node->root->fs_info->delayed_root;
mutex_lock(&delayed_node->mutex);
if (delayed_node->count)
@@ -425,7 +425,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
- atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+ atomic_inc(&delayed_node->root->fs_info->delayed_root.items);
return 0;
}
@@ -443,7 +443,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
struct rb_root_cached *root;
- struct btrfs_delayed_root *delayed_root;
/* Not inserted, ignore it. */
if (RB_EMPTY_NODE(&delayed_item->rb_node))
@@ -452,8 +451,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
/* If it's in a rbtree, then we need to have delayed node locked. */
lockdep_assert_held(&delayed_node->mutex);
- delayed_root = delayed_node->root->fs_info->delayed_root;
-
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
else
@@ -462,8 +459,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
rb_erase_cached(&delayed_item->rb_node, root);
RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_node->count--;
-
- finish_one_item(delayed_root);
+ finish_one_item(&delayed_node->root->fs_info->delayed_root);
}
static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
@@ -980,30 +976,21 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
- struct btrfs_delayed_root *delayed_root;
-
if (delayed_node &&
test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
ASSERT(delayed_node->root);
clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count--;
-
- delayed_root = delayed_node->root->fs_info->delayed_root;
- finish_one_item(delayed_root);
+ finish_one_item(&delayed_node->root->fs_info->delayed_root);
}
}
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
-
if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
- struct btrfs_delayed_root *delayed_root;
-
ASSERT(delayed_node->root);
delayed_node->count--;
-
- delayed_root = delayed_node->root->fs_info->delayed_root;
- finish_one_item(delayed_root);
+ finish_one_item(&delayed_node->root->fs_info->delayed_root);
}
}
@@ -1137,8 +1124,8 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = btrfs_record_root_in_trans(trans, node->root);
if (ret)
return ret;
- ret = btrfs_update_delayed_inode(trans, node->root, path, node);
- return ret;
+
+ return btrfs_update_delayed_inode(trans, node->root, path, node);
}
/*
@@ -1150,7 +1137,6 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
struct btrfs_path *path;
@@ -1168,9 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
- delayed_root = fs_info->delayed_root;
-
- curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker);
+ curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker);
while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
@@ -1417,7 +1401,7 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_node *node;
- node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker);
+ node = btrfs_first_delayed_node(fs_info, &delayed_node_tracker);
if (WARN_ON(node)) {
btrfs_delayed_node_ref_tracker_free(node,
&delayed_node_tracker);
@@ -1440,7 +1424,7 @@ static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
+ struct btrfs_delayed_root *delayed_root = &fs_info->delayed_root;
if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
btrfs_workqueue_normal_congested(fs_info->delayed_workers))
@@ -1970,7 +1954,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
- atomic_inc(&root->fs_info->delayed_root->items);
+ atomic_inc(&root->fs_info->delayed_root.items);
release_node:
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
@@ -2012,7 +1996,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
mutex_lock(&delayed_node->mutex);
if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
delayed_node->count++;
- atomic_inc(&fs_info->delayed_root->items);
+ atomic_inc(&fs_info->delayed_root.items);
}
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
@@ -2118,8 +2102,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
- curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
- &curr_delayed_node_tracker);
+ curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b09d4ec8c77d..fc752863f89b 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -30,21 +30,6 @@ enum btrfs_delayed_item_type {
BTRFS_DELAYED_DELETION_ITEM
};
-struct btrfs_delayed_root {
- spinlock_t lock;
- struct list_head node_list;
- /*
- * Used for delayed nodes which is waiting to be dealt with by the
- * worker. If the delayed node is inserted into the work queue, we
- * drop it from this list.
- */
- struct list_head prepare_list;
- atomic_t items; /* for delayed items */
- atomic_t items_seq; /* for delayed items */
- int nodes; /* for delayed nodes */
- wait_queue_head_t wait;
-};
-
struct btrfs_ref_tracker_dir {
#ifdef CONFIG_BTRFS_DEBUG
struct ref_tracker_dir dir;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 07e19e88ba4b..9a63200d7a53 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -763,7 +763,7 @@ static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_dio_data data = { 0 };
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
+ IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
}
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -772,7 +772,7 @@ static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *it
struct btrfs_dio_data data = { 0 };
return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
+ IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -785,19 +785,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
-
- /*
- * For bs > ps support, we heavily rely on large folios to make sure no
- * block will cross large folio boundaries.
- *
- * But memory provided by direct IO is only virtually contiguous, not
- * physically contiguous, and will break the btrfs' large folio requirement.
- *
- * So for bs > ps support, all direct IOs should fallback to buffered ones.
- */
- if (fs_info->sectorsize > PAGE_SIZE)
- return -EINVAL;
-
return 0;
}
@@ -814,6 +801,8 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
unsigned int ilock_flags = 0;
struct iomap_dio *dio;
+ const u64 data_profile = btrfs_data_alloc_profile(fs_info) &
+ BTRFS_BLOCK_GROUP_PROFILE_MASK;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -827,6 +816,16 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
ilock_flags |= BTRFS_ILOCK_SHARED;
+ /*
+ * If our data profile has duplication (either extra mirrors or RAID56),
+ * we cannot trust the direct IO buffer, as the content may change during
+ * writeback and cause different contents to be written to different mirrors.
+ *
+ * Thus only RAID0 and SINGLE can do true zero-copy direct IO.
+ */
+ if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0)
+ goto buffered;
+
relock:
ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (ret < 0)
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 89fe85778115..1c304bf473e5 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -216,6 +216,25 @@ static struct btrfs_block_group *find_next_block_group(
}
/*
+ * Check whether a block group is empty.
+ *
+ * "Empty" here means that there are no extents physically located within the
+ * device extents corresponding to this block group.
+ *
+ * For a remapped block group, this means that all of its identity remaps have
+ * been removed. For a non-remapped block group, this means that no extents
+ * have an address within its range, and that nothing has been remapped to be
+ * within it.
+ */
+static bool block_group_is_empty(const struct btrfs_block_group *bg)
+{
+ if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return bg->identity_remap_count == 0;
+
+ return bg->used == 0 && bg->remap_bytes == 0;
+}
+
+/*
* Look up next block group and set it for use.
*
* @discard_ctl: discard control
@@ -241,8 +260,10 @@ again:
block_group = find_next_block_group(discard_ctl, now);
if (block_group && now >= block_group->discard_eligible_time) {
+ const bool empty = block_group_is_empty(block_group);
+
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
- block_group->used != 0) {
+ !empty) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
/*
@@ -267,7 +288,12 @@ again:
}
if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
block_group->discard_cursor = block_group->start;
- block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) {
+ block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED;
+ } else {
+ block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+ }
}
}
if (block_group) {
@@ -373,7 +399,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
return;
- if (block_group->used == 0)
+ if (block_group_is_empty(block_group))
add_to_discard_unused_list(discard_ctl, block_group);
else
add_to_discard_list(discard_ctl, block_group);
@@ -470,7 +496,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
{
remove_from_discard_list(discard_ctl, block_group);
- if (block_group->used == 0) {
+ if (block_group_is_empty(block_group)) {
if (btrfs_is_free_space_trimmed(block_group))
btrfs_mark_bg_unused(block_group);
else
@@ -524,7 +550,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
/* Perform discarding */
minlen = discard_minlen[discard_index];
- if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ switch (discard_state) {
+ case BTRFS_DISCARD_BITMAPS: {
u64 maxlen = 0;
/*
@@ -541,17 +568,28 @@ static void btrfs_discard_workfn(struct work_struct *work)
btrfs_block_group_end(block_group),
minlen, maxlen, true);
discard_ctl->discard_bitmap_bytes += trimmed;
- } else {
+
+ break;
+ }
+
+ case BTRFS_DISCARD_FULLY_REMAPPED:
+ btrfs_trim_fully_remapped_block_group(block_group);
+ break;
+
+ default:
btrfs_trim_block_group_extents(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, true);
discard_ctl->discard_extent_bytes += trimmed;
+
+ break;
}
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
- if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ if (discard_state == BTRFS_DISCARD_BITMAPS ||
+ discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
btrfs_finish_discard_pass(discard_ctl, block_group);
} else {
block_group->discard_cursor = block_group->start;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2833b44f4b4f..20c405a4789d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -18,11 +18,11 @@
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <linux/unaligned.h>
-#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
+#include "delayed-inode.h"
#include "bio.h"
#include "print-tree.h"
#include "locking.h"
@@ -62,12 +62,6 @@
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
-{
- if (fs_info->csum_shash)
- crypto_free_shash(fs_info->csum_shash);
-}
-
/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
@@ -76,12 +70,11 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
struct btrfs_fs_info *fs_info = buf->fs_info;
int num_pages;
u32 first_page_part;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct btrfs_csum_ctx csum;
char *kaddr;
int i;
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ btrfs_csum_init(&csum, fs_info->csum_type);
if (buf->addr) {
/* Pages are contiguous, handle them as a big one. */
@@ -94,21 +87,21 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
num_pages = num_extent_pages(buf);
}
- crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- first_page_part - BTRFS_CSUM_SIZE);
+ btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE,
+ first_page_part - BTRFS_CSUM_SIZE);
/*
* Multiple single-page folios case would reach here.
*
* nodesize <= PAGE_SIZE and large folio all handled by above
- * crypto_shash_update() already.
+ * btrfs_csum_update() already.
*/
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
kaddr = folio_address(buf->folios[i]);
- crypto_shash_update(shash, kaddr, PAGE_SIZE);
+ btrfs_csum_update(&csum, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
- crypto_shash_final(shash, result);
+ btrfs_csum_final(&csum, result);
}
/*
@@ -160,18 +153,15 @@ static bool btrfs_supported_super_csum(u16 csum_type)
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb)
{
- char result[BTRFS_CSUM_SIZE];
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-
- shash->tfm = fs_info->csum_shash;
+ u8 result[BTRFS_CSUM_SIZE];
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
+ btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
@@ -186,7 +176,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
const u32 step = min(fs_info->nodesize, PAGE_SIZE);
const u32 nr_steps = eb->len / step;
phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
- int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
@@ -208,9 +197,8 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
}
- ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start,
- paddrs, step, mirror_num);
- return ret;
+ return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len,
+ eb->start, paddrs, step, mirror_num);
}
/*
@@ -382,22 +370,19 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
eb->read_mirror, eb->start, found_start);
- ret = -EIO;
- goto out;
+ return -EIO;
}
if (unlikely(check_tree_block_fsid(eb))) {
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
eb->start, eb->read_mirror);
- ret = -EIO;
- goto out;
+ return -EIO;
}
found_level = btrfs_header_level(eb);
if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
btrfs_err(fs_info,
"bad tree block level, mirror %u level %d on logical %llu",
eb->read_mirror, btrfs_header_level(eb), eb->start);
- ret = -EIO;
- goto out;
+ return -EIO;
}
csum_tree_block(eb, result);
@@ -412,18 +397,15 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
BTRFS_CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb),
ignore_csum ? ", ignored" : "");
- if (unlikely(!ignore_csum)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(!ignore_csum))
+ return -EUCLEAN;
}
if (unlikely(found_level != check->level)) {
btrfs_err(fs_info,
"level verify failed on logical %llu mirror %u wanted %u found %u",
eb->start, eb->read_mirror, check->level, found_level);
- ret = -EIO;
- goto out;
+ return -EIO;
}
if (unlikely(check->transid &&
btrfs_header_generation(eb) != check->transid)) {
@@ -431,8 +413,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
eb->start, eb->read_mirror, check->transid,
btrfs_header_generation(eb));
- ret = -EIO;
- goto out;
+ return -EIO;
}
if (check->has_first_key) {
const struct btrfs_key *expect_key = &check->first_key;
@@ -450,14 +431,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
expect_key->type, expect_key->offset,
found_key.objectid, found_key.type,
found_key.offset);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
}
if (check->owner_root) {
ret = btrfs_check_eb_owner(eb, check->owner_root);
if (ret < 0)
- goto out;
+ return ret;
}
/* If this is a leaf block and it is corrupt, just return -EIO. */
@@ -471,7 +451,6 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
btrfs_err(fs_info,
"read time tree block corruption detected on logical %llu mirror %u",
eb->start, eb->read_mirror);
-out:
return ret;
}
@@ -815,7 +794,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
- struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
@@ -864,10 +842,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(leaf);
- key.objectid = objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = 0;
- ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+ ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item);
if (ret)
goto fail;
@@ -1153,6 +1128,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
case BTRFS_RAID_STRIPE_TREE_OBJECTID:
return btrfs_grab_root(fs_info->stripe_root);
+ case BTRFS_REMAP_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->remap_root);
default:
return NULL;
}
@@ -1229,11 +1206,9 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
ASSERT(percpu_counter_sum_positive(em_counter) == 0);
percpu_counter_destroy(em_counter);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
- btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
kfree(fs_info->balance_ctl);
- kfree(fs_info->delayed_root);
free_global_roots(fs_info);
btrfs_put_root(fs_info->tree_root);
btrfs_put_root(fs_info->chunk_root);
@@ -1244,6 +1219,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
btrfs_put_root(fs_info->stripe_root);
+ btrfs_put_root(fs_info->remap_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1488,6 +1464,10 @@ static int cleaner_kthread(void *arg)
*/
btrfs_run_defrag_inodes(fs_info);
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE) &&
+ !btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_handle_fully_remapped_bgs(fs_info);
+
/*
* Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
@@ -1796,6 +1776,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
free_root_extent_buffers(info->stripe_root);
+ free_root_extent_buffers(info->remap_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -1983,21 +1964,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
return 0;
}
-static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
+static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
- struct crypto_shash *csum_shash;
- const char *csum_driver = btrfs_super_csum_driver(csum_type);
-
- csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
-
- if (IS_ERR(csum_shash)) {
- btrfs_err(fs_info, "error allocating %s hash for checksum",
- csum_driver);
- return PTR_ERR(csum_shash);
- }
-
- fs_info->csum_shash = csum_shash;
-
/* Check if the checksum implementation is a fast accelerated one. */
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
@@ -2011,10 +1979,8 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
break;
}
- btrfs_info(fs_info, "using %s (%s) checksum algorithm",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(csum_shash));
- return 0;
+ btrfs_info(fs_info, "using %s checksum algorithm",
+ btrfs_super_csum_name(csum_type));
}
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
@@ -2172,11 +2138,10 @@ static int load_global_roots(struct btrfs_root *tree_root)
return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
return ret;
- ret = load_global_roots_objectid(tree_root, path,
- BTRFS_FREE_SPACE_TREE_OBJECTID,
- "free space");
- return ret;
+ return load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
}
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
@@ -2225,21 +2190,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
goto out;
- /*
- * This tree can share blocks with some other fs tree during relocation
- * and we need a proper setup by btrfs_get_fs_root
- */
- root = btrfs_get_fs_root(tree_root->fs_info,
- BTRFS_DATA_RELOC_TREE_OBJECTID, true);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- ret = PTR_ERR(root);
- goto out;
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ /* The remap_root has already been loaded in load_important_roots(). */
+ root = fs_info->remap_root;
+
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+ root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+
+ /* Check that data reloc tree doesn't also exist. */
+ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ root = btrfs_read_tree_root(fs_info->tree_root, &location);
+ if (!IS_ERR(root)) {
+ btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled");
+ btrfs_put_root(root);
+ return -EIO;
+ } else if (PTR_ERR(root) != -ENOENT) {
+ btrfs_warn(fs_info, "error %ld when checking for data reloc tree",
+ PTR_ERR(root));
}
} else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->data_reloc_root = root;
+ /*
+ * This tree can share blocks with some other fs tree during
+ * relocation and we need a proper setup by btrfs_get_fs_root().
+ */
+ root = btrfs_get_fs_root(tree_root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
+ }
}
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
@@ -2479,6 +2467,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ /*
+ * Reduce test matrix for remap tree by requiring block-group-tree
+ * and no-holes. Free-space-tree is a hard requirement.
+ */
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ btrfs_err(fs_info,
+"remap-tree feature requires free-space-tree, no-holes, and block-group-tree");
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "remap-tree not supported with mixed-bg");
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, ZONED)) {
+ btrfs_err(fs_info, "remap-tree not supported with zoned devices");
+ ret = -EINVAL;
+ }
+
+ if (sectorsize > PAGE_SIZE) {
+ btrfs_err(fs_info, "remap-tree not supported when block size > page size");
+ ret = -EINVAL;
+ }
+ }
+
/*
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
@@ -2637,6 +2654,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ bytenr = btrfs_super_remap_root(sb);
+ gen = btrfs_super_remap_root_generation(sb);
+ level = btrfs_super_remap_root_level(sb);
+ ret = load_super_root(fs_info->remap_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read remap root");
+ return ret;
+ }
+ }
+
return 0;
}
@@ -2773,6 +2802,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
@@ -2785,6 +2815,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP);
btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
@@ -2827,6 +2858,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->remap_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
@@ -2901,11 +2933,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
- fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
- GFP_KERNEL);
- if (!fs_info->delayed_root)
- return -ENOMEM;
- btrfs_init_delayed_root(fs_info->delayed_root);
+ btrfs_init_delayed_root(&fs_info->delayed_root);
if (sb_rdonly(sb))
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
@@ -3018,6 +3046,8 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
btrfs_warn(fs_info,
"'clear_cache' option is ignored with extent tree v2");
+ else if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ btrfs_warn(fs_info, "'clear_cache' option is ignored with remap tree");
else
rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
@@ -3032,7 +3062,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (ret) {
btrfs_warn(fs_info,
"failed to rebuild free space tree: %d", ret);
- goto out;
+ return ret;
}
}
@@ -3043,11 +3073,20 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (ret) {
btrfs_warn(fs_info,
"failed to disable free space tree: %d", ret);
- goto out;
+ return ret;
}
}
/*
+ * Before btrfs-progs v6.16.1 mkfs.btrfs can leave free space entries
+ * for deleted temporary chunks. Delete them if they exist.
+ */
+ ret = btrfs_delete_orphan_free_space_entries(fs_info);
+ if (ret < 0) {
+ btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret);
+ return ret;
+ }
+ /*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
* them into the fs_info->fs_roots_radix tree. This must be done before
@@ -3060,17 +3099,17 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_find_orphan_roots(fs_info);
if (ret)
- goto out;
+ return ret;
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
- goto out;
+ return ret;
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
up_read(&fs_info->cleanup_work_sem);
- goto out;
+ return ret;
}
up_read(&fs_info->cleanup_work_sem);
@@ -3079,7 +3118,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
- goto out;
+ return ret;
}
if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
@@ -3089,24 +3128,24 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (ret) {
btrfs_warn(fs_info,
"failed to create free space tree: %d", ret);
- goto out;
+ return ret;
}
}
if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
if (ret)
- goto out;
+ return ret;
}
ret = btrfs_resume_balance_async(fs_info);
if (ret)
- goto out;
+ return ret;
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
btrfs_warn(fs_info, "failed to resume dev_replace");
- goto out;
+ return ret;
}
btrfs_qgroup_rescan_resume(fs_info);
@@ -3117,12 +3156,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (ret) {
btrfs_warn(fs_info,
"failed to create the UUID tree %d", ret);
- goto out;
+ return ret;
}
}
-out:
- return ret;
+ return 0;
}
/*
@@ -3253,6 +3291,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
+ struct btrfs_root *remap_root;
int ret;
int level;
@@ -3302,12 +3341,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
fs_info->csum_size = btrfs_super_csum_size(disk_super);
+ fs_info->csum_type = csum_type;
- ret = btrfs_init_csum_hash(fs_info, csum_type);
- if (ret) {
- btrfs_release_disk_super(disk_super);
- goto fail_alloc;
- }
+ btrfs_init_csum_hash(fs_info, csum_type);
/*
* We want to check superblock checksum, the type is stored inside.
@@ -3390,6 +3426,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (ret < 0)
goto fail_alloc;
+ if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+ remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->remap_root = remap_root;
+ if (!remap_root) {
+ ret = -ENOMEM;
+ goto fail_alloc;
+ }
+ }
+
/*
* At this point our mount options are validated, if we set ->max_inline
* to something non-standard make sure we truncate it to sectorsize.
@@ -3541,6 +3587,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ ret = btrfs_populate_fully_remapped_bgs_list(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to populate fully_remapped_bgs list: %d", ret);
+ goto fail_sysfs;
+ }
+ }
+
btrfs_zoned_reserve_data_reloc_bg(fs_info);
btrfs_free_zone_cache(fs_info);
@@ -3709,7 +3763,6 @@ static int write_dev_supers(struct btrfs_device *device,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct address_space *mapping = device->bdev->bd_mapping;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int ret;
u64 bytenr, bytenr_orig;
@@ -3719,8 +3772,6 @@ static int write_dev_supers(struct btrfs_device *device,
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
- shash->tfm = fs_info->csum_shash;
-
for (i = 0; i < max_mirrors; i++) {
struct folio *folio;
struct bio *bio;
@@ -3744,9 +3795,8 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_set_super_bytenr(sb, bytenr_orig);
- crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
- sb->csum);
+ btrfs_csum(fs_info->csum_type, (const u8 *)sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum);
folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
@@ -3866,7 +3916,7 @@ static void write_dev_flush(struct btrfs_device *device)
{
struct bio *bio = &device->flush_bio;
- device->last_flush_error = BLK_STS_OK;
+ clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
bio_init(bio, device->bdev, NULL, 0,
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
@@ -3891,7 +3941,7 @@ static bool wait_dev_flush(struct btrfs_device *device)
wait_for_completion_io(&device->flush_wait);
if (bio->bi_status) {
- device->last_flush_error = bio->bi_status;
+ set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
return true;
}
@@ -3941,7 +3991,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
}
/*
- * Checks last_flush_error of disks in order to determine the device
+ * Checks flush failure of disks in order to determine the device
* state.
*/
if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index bb2ca1c9c7b0..d0dd50f7d279 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -187,8 +187,6 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
- int ret;
-
if (!changeset)
return 0;
if (set && (state->state & bits) == bits)
@@ -196,9 +194,8 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
if (!set && (state->state & bits) == 0)
return 0;
changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
+
+ return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC);
}
static inline struct extent_state *next_state(struct extent_state *state)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e4cae34620d1..03cf9f242c70 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -41,6 +41,7 @@
#include "tree-checker.h"
#include "raid-stripe-tree.h"
#include "delayed-inode.h"
+#include "relocation.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -476,7 +477,7 @@ again:
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr ||
key.type != BTRFS_EXTENT_DATA_REF_KEY)
- goto fail;
+ return ret;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
@@ -487,12 +488,11 @@ again:
btrfs_release_path(path);
goto again;
}
- ret = 0;
- break;
+ return 0;
}
path->slots[0]++;
}
-fail:
+
return ret;
}
@@ -1380,7 +1380,7 @@ out:
}
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+ u64 num_bytes, u64 *actual_bytes, bool do_remap)
{
int ret = 0;
u64 discarded_bytes = 0;
@@ -1398,7 +1398,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
int i;
num_bytes = end - cur;
- stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes,
+ do_remap);
if (IS_ERR(stripes)) {
ret = PTR_ERR(stripes);
if (ret == -EOPNOTSUPP)
@@ -1553,6 +1554,28 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
BTRFS_QGROUP_RSV_DATA);
}
+static int drop_remap_tree_ref(struct btrfs_trans_handle *trans,
+ const struct btrfs_delayed_ref_node *node)
+{
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
+ int ret;
+
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
const struct btrfs_delayed_ref_node *node,
@@ -1747,7 +1770,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, extent_op);
+ if (node->ref_root == BTRFS_REMAP_TREE_OBJECTID)
+ ret = drop_remap_tree_ref(trans, node);
+ else
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1761,35 +1787,39 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
if (TRANS_ABORTED(trans)) {
if (insert_reserved) {
btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
- free_head_ref_squota_rsv(trans->fs_info, href);
+ free_head_ref_squota_rsv(fs_info, href);
}
return 0;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
- node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
ret = run_delayed_tree_ref(trans, href, node, extent_op,
insert_reserved);
- else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
- node->type == BTRFS_SHARED_DATA_REF_KEY)
+ } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY) {
ret = run_delayed_data_ref(trans, href, node, extent_op,
insert_reserved);
- else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
- ret = 0;
- else
- BUG();
- if (ret && insert_reserved)
- btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
- if (ret < 0)
- btrfs_err(trans->fs_info,
+ } else if (unlikely(node->type != BTRFS_EXTENT_OWNER_REF_KEY)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type);
+ }
+
+ if (unlikely(ret)) {
+ if (insert_reserved)
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
+ btrfs_err(fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
node->bytenr, node->num_bytes, node->type,
node->action, node->ref_mod, ret);
+ }
+
return ret;
}
@@ -2470,7 +2500,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int i;
int action;
int level;
- int ret = 0;
+ int ret;
if (btrfs_is_testing(fs_info))
return 0;
@@ -2522,7 +2552,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
else
ret = btrfs_free_extent(trans, &ref);
if (ret)
- goto fail;
+ return ret;
} else {
/* We don't know the owning_root, leave as 0. */
ref.bytenr = btrfs_node_blockptr(buf, i);
@@ -2535,12 +2565,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
else
ret = btrfs_free_extent(trans, &ref);
if (ret)
- goto fail;
+ return ret;
}
}
return 0;
-fail:
- return ret;
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2559,17 +2587,17 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 flags;
- u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
else if (root == fs_info->chunk_root)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else if (root == fs_info->remap_root)
+ flags = BTRFS_BLOCK_GROUP_METADATA_REMAP;
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = btrfs_get_alloc_profile(fs_info, flags);
- return ret;
+ return btrfs_get_alloc_profile(fs_info, flags);
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
@@ -2753,8 +2781,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 len;
bool readonly;
- if (!cache ||
- start >= cache->start + cache->length) {
+ if (!cache || start >= btrfs_block_group_end(cache)) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
@@ -2770,7 +2797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
empty_cluster <<= 1;
}
- len = cache->start + cache->length - start;
+ len = btrfs_block_group_end(cache) - start;
len = min(len, end + 1 - start);
if (return_free_space)
@@ -2819,6 +2846,75 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Complete the remapping of a block group by removing its chunk stripes and
+ * device extents, and adding it to the unused list if there are no longer
+ * any extents nominally within it.
+ */
+int btrfs_complete_bg_remapping(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_chunk_map *map;
+ int ret;
+
+ map = btrfs_get_chunk_map(fs_info, bg->start, 1);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ ret = btrfs_last_identity_remap_gone(map, bg);
+ if (ret) {
+ btrfs_free_chunk_map(map);
+ return ret;
+ }
+
+ /*
+ * Set num_stripes to 0, so that btrfs_remove_dev_extents() won't run a
+ * second time.
+ */
+ map->num_stripes = 0;
+
+ btrfs_free_chunk_map(map);
+
+ if (bg->used == 0) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&bg->bg_list)) {
+ list_del_init(&bg->bg_list);
+ btrfs_put_block_group(bg);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ btrfs_mark_bg_unused(bg);
+ }
+
+ return 0;
+}
+
+void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->fully_remapped_bgs)) {
+ bg = list_first_entry(&fs_info->fully_remapped_bgs,
+ struct btrfs_block_group, bg_list);
+ list_del_init(&bg->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ btrfs_discard_extent(fs_info, bg->start, bg->length, NULL, false);
+
+ ret = btrfs_complete_bg_remapping(bg);
+ if (ret) {
+ btrfs_put_block_group(bg);
+ return;
+ }
+
+ btrfs_put_block_group(bg);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2839,7 +2935,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
- end + 1 - start, NULL);
+ end + 1 - start, NULL, true);
next_state = btrfs_next_extent_state(unpin, cached_state);
btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
@@ -2897,7 +2993,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = -EROFS;
if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info, block_group->start,
- block_group->length, NULL);
+ block_group->length, NULL, true);
/*
* Not strictly necessary to lock, as the block_group should be
@@ -2971,11 +3067,22 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
}
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, struct btrfs_squota_delta *delta)
+ u64 bytenr, struct btrfs_squota_delta *delta,
+ struct btrfs_path *path)
{
int ret;
+ bool remapped = false;
u64 num_bytes = delta->num_bytes;
+ /* Returns 1 on success and 0 on no-op. */
+ ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr, num_bytes);
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ } else if (ret == 1) {
+ remapped = true;
+ }
+
if (delta->is_data) {
struct btrfs_root *csum_root;
@@ -2999,10 +3106,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
- if (unlikely(ret)) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ /* If remapped, FST has already been taken care of in remove_range_from_remap_tree(). */
+ if (!remapped) {
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
@@ -3361,7 +3471,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, &delta);
+ ret = do_free_extent_accounting(trans, bytenr, &delta, path);
}
btrfs_release_path(path);
@@ -3462,12 +3572,12 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
return 0;
if (btrfs_header_generation(buf) != trans->transid)
- goto out;
+ return 0;
if (root_id != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
if (!ret)
- goto out;
+ return 0;
}
bg = btrfs_lookup_block_group(fs_info, buf->start);
@@ -3475,7 +3585,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
- goto out;
+ return 0;
}
/*
@@ -3499,7 +3609,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
|| btrfs_is_zoned(fs_info)) {
pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
- goto out;
+ return 0;
}
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
@@ -3509,7 +3619,6 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_put_block_group(bg);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
-out:
return 0;
}
@@ -4191,10 +4300,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
else
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- return ret;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags,
CHUNK_ALLOC_FORCE_FOR_EXTENT);
@@ -4288,36 +4395,43 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info)
{
+ struct btrfs_block_group *block_group;
+
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
if (fs_info->treelog_bg)
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
- } else if (ffe_ctl->for_data_reloc) {
+ return 0;
+ }
+
+ if (ffe_ctl->for_data_reloc) {
spin_lock(&fs_info->relocation_bg_lock);
if (fs_info->data_reloc_bg)
ffe_ctl->hint_byte = fs_info->data_reloc_bg;
spin_unlock(&fs_info->relocation_bg_lock);
- } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
- struct btrfs_block_group *block_group;
+ return 0;
+ }
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- /*
- * No lock is OK here because avail is monotonically
- * decreasing, and this is just a hint.
- */
- u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
- if (block_group_bits(block_group, ffe_ctl->flags) &&
- block_group->space_info == space_info &&
- avail >= ffe_ctl->num_bytes) {
- ffe_ctl->hint_byte = block_group->start;
- break;
- }
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotonically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
}
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return 0;
}
@@ -4441,7 +4555,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
- block_group->ro) {
+ block_group->ro ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
/*
* someone is removing this block group,
* we can't jump into the have_block_group
@@ -4475,7 +4590,8 @@ search:
ffe_ctl->hinted = false;
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro)) {
+ if (unlikely(block_group->ro ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) {
if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
if (ffe_ctl->for_data_reloc)
@@ -4562,7 +4678,7 @@ have_block_group:
/* move on to the next group */
if (ffe_ctl->search_start + ffe_ctl->num_bytes >
- block_group->start + block_group->length) {
+ btrfs_block_group_end(block_group)) {
btrfs_add_free_space_unused(block_group,
ffe_ctl->found_offset,
ffe_ctl->num_bytes);
@@ -4883,6 +4999,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ if (unlikely(node->ref_root == BTRFS_REMAP_TREE_OBJECTID))
+ goto skip;
+
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
/* The owner of a tree block is the level. */
@@ -4935,6 +5054,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
+skip:
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
@@ -5263,7 +5383,6 @@ struct walk_control {
* @root: the root we are currently deleting
* @wc: the walk control for this deletion
* @eb: the parent eb that we're currently visiting
- * @refs: the number of refs for wc->level - 1
* @flags: the flags for wc->level - 1
* @slot: the slot in the eb that we're currently checking
*
@@ -5458,12 +5577,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1);
+ ret = btrfs_inc_ref(trans, root, eb, true);
if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = btrfs_dec_ref(trans, root, eb, 0);
+ ret = btrfs_dec_ref(trans, root, eb, false);
if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -5864,18 +5983,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- ret = btrfs_dec_ref(trans, root, eb, 1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
- } else {
- ret = btrfs_dec_ref(trans, root, eb, 0);
- if (unlikely(ret)) {
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
+ const bool full_backref = (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF);
+
+ ret = btrfs_dec_ref(trans, root, eb, full_backref);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
if (btrfs_is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
@@ -6400,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
* it while performing the free space search since we have already
* held back allocations.
*/
-static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
+static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
+ u64 *trimmed, u64 pos, u64 *ret_next_pos)
{
- u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
+ u64 start = pos;
+ u64 trim_len = 0;
*trimmed = 0;
@@ -6423,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
while (1) {
struct btrfs_fs_info *fs_info = device->fs_info;
+ u64 cur_start;
+ u64 end;
+ u64 len;
u64 bytes;
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
break;
+ cur_start = start;
btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ start = max(start, cur_start);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
@@ -6457,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
end = min(end, device->total_bytes - 1);
len = end - start + 1;
+ len = min(len, BTRFS_MAX_TRIM_LENGTH);
/* We didn't find any extents */
if (!len) {
@@ -6477,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
start += len;
*trimmed += bytes;
+ trim_len += len;
+ if (trim_len >= BTRFS_MAX_TRIM_LENGTH) {
+ *ret_next_pos = start;
+ ret = -EAGAIN;
+ break;
+ }
if (btrfs_trim_interrupted()) {
ret = -ERESTARTSYS;
@@ -6489,20 +6616,134 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
return ret;
}
+static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed,
+ u64 *dev_failed, int *dev_ret)
+{
+ struct btrfs_device *dev;
+ struct btrfs_device *working_dev = NULL;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED;
+
+ *trimmed = 0;
+ *dev_failed = 0;
+ *dev_ret = 0;
+
+ /* Find the device with the smallest UUID to start. */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!working_dev ||
+ memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
+ working_dev = dev;
+ }
+ if (working_dev)
+ memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!working_dev)
+ return 0;
+
+ while (1) {
+ u64 group_trimmed = 0;
+ u64 next_pos = 0;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ /* Find and trim the current device. */
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (dev == working_dev) {
+ ret = btrfs_trim_free_extents_throttle(working_dev,
+ &group_trimmed, start, &next_pos);
+ break;
+ }
+ }
+
+ /* Throttle: continue the same device from the new position. */
+ if (ret == -EAGAIN && next_pos > start) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ *trimmed += group_trimmed;
+ start = next_pos;
+ cond_resched();
+ continue;
+ }
+
+ /* User interrupted. */
+ if (ret == -ERESTARTSYS || ret == -EINTR) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ *trimmed += group_trimmed;
+ return ret;
+ }
+
+ /*
+ * Device completed (ret == 0), failed, or EAGAIN with no progress.
+ * Record error if any, then move to next device.
+ */
+ if (ret == -EAGAIN) {
+ /* No progress - log and skip device. */
+ btrfs_warn(fs_info,
+ "trim throttle: no progress, offset=%llu device %s, skipping",
+ start, btrfs_dev_name(working_dev));
+ (*dev_failed)++;
+ if (!*dev_ret)
+ *dev_ret = ret;
+ } else if (ret) {
+ /* Device failed with error. */
+ (*dev_failed)++;
+ if (!*dev_ret)
+ *dev_ret = ret;
+ }
+
+ /*
+ * Find next device: smallest UUID larger than current.
+ * Devices added during trim with a smaller UUID will be skipped.
+ */
+ working_dev = NULL;
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ /* Must be larger than the current UUID. */
+ if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0)
+ continue;
+ /* Find the smallest. */
+ if (!working_dev ||
+ memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
+ working_dev = dev;
+ }
+ if (working_dev)
+ memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ *trimmed += group_trimmed;
+ start = BTRFS_DEVICE_RANGE_RESERVED;
+
+ /* No more devices. */
+ if (!working_dev)
+ break;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
/*
* Trim the whole filesystem by:
* 1) trimming the free space in each block group
* 2) trimming the unallocated space on each device
*
* This will also continue trimming even if a block group or device encounters
- * an error. The return value will be the last error, or 0 if nothing bad
+ * an error. The return value will be the first error, or 0 if nothing bad
* happens.
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
- struct btrfs_device *device;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -6533,14 +6774,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
}
start = max(range->start, cache->start);
- end = min(range_end, cache->start + cache->length);
+ end = min(range_end, btrfs_block_group_end(cache));
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
- bg_ret = ret;
+ if (!bg_ret)
+ bg_ret = ret;
continue;
}
}
@@ -6551,9 +6793,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
range->minlen);
trimmed += group_trimmed;
+ if (ret == -ERESTARTSYS || ret == -EINTR) {
+ btrfs_put_block_group(cache);
+ break;
+ }
if (ret) {
bg_failed++;
- bg_ret = ret;
+ if (!bg_ret)
+ bg_ret = ret;
continue;
}
}
@@ -6561,30 +6808,22 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (bg_failed)
btrfs_warn(fs_info,
- "failed to trim %llu block group(s), last error %d",
+ "failed to trim %llu block group(s), first error %d",
bg_failed, bg_ret);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- continue;
+ if (ret == -ERESTARTSYS || ret == -EINTR)
+ return ret;
- ret = btrfs_trim_free_extents(device, &group_trimmed);
-
- trimmed += group_trimmed;
- if (ret) {
- dev_failed++;
- dev_ret = ret;
- break;
- }
- }
- mutex_unlock(&fs_devices->device_list_mutex);
+ ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret);
+ trimmed += group_trimmed;
if (dev_failed)
btrfs_warn(fs_info,
- "failed to trim %llu device(s), last error %d",
+ "failed to trim %llu device(s), first error %d",
dev_failed, dev_ret);
range->len = trimmed;
+ if (ret == -ERESTARTSYS || ret == -EINTR)
+ return ret;
if (bg_ret)
return bg_ret;
return dev_ret;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 71bb8109c969..ff330d4896d6 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -161,7 +161,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent);
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
+ u64 num_bytes, u64 *actual_bytes, bool do_remap);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
+void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info);
+int btrfs_complete_bg_remapping(struct btrfs_block_group *bg);
#endif
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f6cca3c97166..3df399dc8856 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -440,8 +440,7 @@ again:
loops = 1;
goto again;
} else {
- found = false;
- goto out_failed;
+ return false;
}
}
@@ -461,7 +460,7 @@ again:
}
*start = delalloc_start;
*end = delalloc_end;
-out_failed:
+
return found;
}
@@ -970,7 +969,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
{
const u64 ra_pos = readahead_pos(ractl);
const u64 ra_end = ra_pos + readahead_length(ractl);
- const u64 em_end = em->start + em->len;
+ const u64 em_end = btrfs_extent_map_end(em);
/* No expansion for holes and inline extents. */
if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
@@ -998,11 +997,17 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
+ u64 locked_end;
u64 last_byte = i_size_read(inode);
struct extent_map *em;
int ret = 0;
const size_t blocksize = fs_info->sectorsize;
+ if (bio_ctrl->ractl)
+ locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1;
+ else
+ locked_end = end;
+
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
folio_unlock(folio);
@@ -1036,7 +1041,14 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
end_folio_read(folio, true, cur, blocksize);
continue;
}
- em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
+ /*
+ * Search extent map for the whole locked range.
+ * This will allow btrfs_get_extent() to return a larger hole
+ * when possible.
+ * This can reduce duplicated btrfs_get_extent() calls for large
+ * holes.
+ */
+ em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
@@ -1426,8 +1438,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
+ unsigned int start_bit;
+ unsigned int end_bit;
int ret = 0;
- int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
if (btrfs_is_subpage(fs_info, folio)) {
@@ -1437,10 +1450,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
- u64 start = page_start + (bit << fs_info->sectorsize_bits);
+ for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+ blocks_per_folio) {
+ u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
+ u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
- btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
+ btrfs_folio_set_lock(fs_info, folio, start, len);
}
/* Lock all (subpage) delalloc ranges inside the folio first. */
@@ -1557,10 +1572,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
fs_info->sectorsize_bits,
blocks_per_folio);
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
- btrfs_mark_ordered_io_finished(inode, folio,
- page_start + (bit << fs_info->sectorsize_bits),
- fs_info->sectorsize, false);
+ for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+ bitmap_size) {
+ u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
+ u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
+
+ btrfs_mark_ordered_io_finished(inode, folio, start, len, false);
+ }
return ret;
}
out:
@@ -1598,7 +1616,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors, and the sector will have its dirty flag cleared.
+ * Return <0 for critical errors, and the involved sector will be cleaned up.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1623,6 +1641,13 @@ static int submit_one_sector(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
if (IS_ERR(em)) {
/*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+ * to finish normally, rather than being marked as an error.
+ */
+ submit_one_bio(bio_ctrl);
+
+ /*
* When submission failed, we should still clear the folio dirty.
* Or the folio will be written back again but without any
* ordered extent.
@@ -1630,6 +1655,13 @@ static int submit_one_sector(struct btrfs_inode *inode,
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
+
+ /*
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, filepos,
+ fs_info->sectorsize, false);
return PTR_ERR(em);
}
@@ -1714,8 +1746,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
return ret;
}
- for (cur = start; cur < end; cur += fs_info->sectorsize)
- set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
+ bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits,
+ len >> fs_info->sectorsize_bits);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
blocks_per_folio);
@@ -1756,19 +1788,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
if (unlikely(ret < 0)) {
- /*
- * bio_ctrl may contain a bio crossing several folios.
- * Submit it immediately so that the bio has a chance
- * to finish normally, other than marked as error.
- */
- submit_one_bio(bio_ctrl);
- /*
- * Failed to grab the extent map which should be very rare.
- * Since there is no bio submitted to finish the ordered
- * extent, we have to manually finish this sector.
- */
- btrfs_mark_ordered_io_finished(inode, folio, cur,
- fs_info->sectorsize, false);
if (!found_error)
found_error = ret;
continue;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7e38c23a0c1c..095a561d733f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -319,8 +319,15 @@ static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
/* Internal sanity checks for btrfs debug builds. */
static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
{
+ const u32 blocksize = fs_info->sectorsize;
+
if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
return;
+
+ if (!IS_ALIGNED(em->start, blocksize) ||
+ !IS_ALIGNED(em->len, blocksize))
+ dump_extent_map(fs_info, "unaligned start offset or length members", em);
+
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
if (em->disk_num_bytes == 0)
dump_extent_map(fs_info, "zero disk_num_bytes", em);
@@ -334,6 +341,11 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
dump_extent_map(fs_info,
"ram_bytes mismatch with disk_num_bytes for non-compressed em",
em);
+ if (!IS_ALIGNED(em->disk_bytenr, blocksize) ||
+ !IS_ALIGNED(em->disk_num_bytes, blocksize) ||
+ !IS_ALIGNED(em->offset, blocksize) ||
+ !IS_ALIGNED(em->ram_bytes, blocksize))
+ dump_extent_map(fs_info, "unaligned members", em);
} else if (em->offset) {
dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 14e5257f0f04..7bd715442f3e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -8,7 +8,6 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
-#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
@@ -769,7 +768,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
struct btrfs_ordered_sum *sums = bbio->sums;
struct bvec_iter iter = *src;
@@ -781,8 +779,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src)
u32 offset = 0;
int index = 0;
- shash->tfm = fs_info->csum_shash;
-
btrfs_bio_for_each_block(paddr, bio, &iter, step) {
paddrs[(offset / step) % nr_steps] = paddr;
offset += step;
@@ -1138,7 +1134,7 @@ again:
}
ret = PTR_ERR(item);
if (ret != -EFBIG && ret != -ENOENT)
- goto out;
+ return ret;
if (ret == -EFBIG) {
u32 item_size;
@@ -1154,7 +1150,7 @@ again:
/* We didn't find a csum item, insert one. */
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
- goto out;
+ return ret;
found_next = 1;
goto insert;
}
@@ -1182,7 +1178,7 @@ again:
csum_size, 1);
path->search_for_extension = false;
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
if (path->slots[0] == 0)
@@ -1238,14 +1234,14 @@ extend_csum:
btrfs_header_nritems(path->nodes[0])) {
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
- goto out;
+ return ret;
found_next = 1;
goto insert;
}
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
- goto out;
+ return ret;
tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
if (tmp <= INT_MAX)
@@ -1286,7 +1282,7 @@ insert:
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -1311,8 +1307,8 @@ found:
cond_resched();
goto again;
}
-out:
- return ret;
+
+ return 0;
}
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aca2b541e72d..acaa3dbd2b7b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -566,7 +566,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_nr = 0;
int del_slot = 0;
int recow;
- int ret = 0;
+ int ret;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
@@ -581,7 +581,7 @@ again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -590,20 +590,20 @@ again:
if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
if (unlikely(key.offset > start || extent_end < end)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -633,7 +633,7 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- goto out;
+ return 0;
}
}
@@ -661,7 +661,7 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- goto out;
+ return 0;
}
}
@@ -677,7 +677,7 @@ again:
}
if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
leaf = path->nodes[0];
@@ -705,7 +705,7 @@ again:
ret = btrfs_inc_extent_ref(trans, &ref);
if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
if (split == start) {
@@ -714,7 +714,7 @@ again:
if (unlikely(start != key.offset)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
path->slots[0]--;
extent_end = end;
@@ -745,7 +745,7 @@ again:
ret = btrfs_free_extent(trans, &ref);
if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
}
other_start = 0;
@@ -763,7 +763,7 @@ again:
ret = btrfs_free_extent(trans, &ref);
if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
}
if (del_nr == 0) {
@@ -784,11 +784,11 @@ again:
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
}
-out:
- return ret;
+
+ return 0;
}
/*
@@ -860,7 +860,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_
fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
fgf_set_order(write_bytes);
struct folio *folio;
- int ret = 0;
+ int ret;
again:
folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
@@ -877,10 +877,8 @@ again:
if (ret) {
/* The folio is already unlocked. */
folio_put(folio);
- if (!nowait && ret == -EAGAIN) {
- ret = 0;
+ if (!nowait && ret == -EAGAIN)
goto again;
- }
return ret;
}
*folio_ret = folio;
@@ -1275,8 +1273,7 @@ again:
btrfs_delalloc_release_extents(inode, reserved_len);
release_space(inode, *data_reserved, reserved_start, reserved_len,
only_release_metadata);
- ret = extents_locked;
- return ret;
+ return extents_locked;
}
copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
@@ -1441,7 +1438,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
- if (unlikely(btrfs_is_shutdown(inode->root->fs_info)))
+ if (btrfs_is_shutdown(inode->root->fs_info))
return -EIO;
/*
* If the fs flips readonly due to some impossible error, although we
@@ -2046,7 +2043,7 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
struct file *filp = desc->file;
struct address_space *mapping = filp->f_mapping;
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))))
+ if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))
return -EIO;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
@@ -2199,10 +2196,11 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
/* Hole or vacuum extent(only exists in no-hole mode) */
if (em->disk_bytenr == EXTENT_MAP_HOLE) {
+ const u64 em_end = btrfs_extent_map_end(em);
+
ret = 1;
- *len = em->start + em->len > *start + *len ?
- 0 : *start + *len - em->start - em->len;
- *start = em->start + em->len;
+ *len = (em_end > *start + *len) ? 0 : (*start + *len - em_end);
+ *start = em_end;
}
btrfs_free_extent_map(em);
return ret;
@@ -2951,7 +2949,7 @@ static int btrfs_zero_range(struct inode *inode,
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
- const u64 em_end = em->start + em->len;
+ const u64 em_end = btrfs_extent_map_end(em);
if (em_end >= offset + len) {
/*
@@ -3117,7 +3115,7 @@ static long btrfs_fallocate(struct file *file, int mode,
int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ if (btrfs_is_shutdown(inode_to_fs_info(inode)))
return -EIO;
/* Do not allow fallocate in ZONED mode */
@@ -3811,7 +3809,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ if (btrfs_is_shutdown(inode_to_fs_info(inode)))
return -EIO;
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
@@ -3826,7 +3824,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))))
+ if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))
return -EIO;
if (iocb->ki_flags & IOCB_DIRECT) {
@@ -3843,7 +3841,7 @@ static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))))
+ if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))
return -EIO;
return filemap_splice_read(in, ppos, pipe, len, flags);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f0f72850fab2..cc075a460a22 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,6 +29,7 @@
#include "file-item.h"
#include "file.h"
#include "super.h"
+#include "relocation.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -1079,7 +1080,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_trim_range *trim_entry;
/* Get the cluster for this block_group if it exists */
- if (block_group && !list_empty(&block_group->cluster_list)) {
+ if (!list_empty(&block_group->cluster_list)) {
cluster = list_first_entry(&block_group->cluster_list,
struct btrfs_free_cluster, block_group_list);
}
@@ -1161,7 +1162,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (ret < 0) {
btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, NULL);
- goto fail;
+ return ret;
}
leaf = path->nodes[0];
if (ret > 0) {
@@ -1175,7 +1176,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
inode->i_size - 1, EXTENT_DELALLOC,
NULL);
btrfs_release_path(path);
- goto fail;
+ return -ENOENT;
}
}
@@ -1188,9 +1189,6 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return 0;
-
-fail:
- return -1;
}
static noinline_for_stack int write_pinned_extent_entries(
@@ -1200,12 +1198,10 @@ static noinline_for_stack int write_pinned_extent_entries(
int *entries)
{
u64 start, extent_start, extent_end, len;
+ const u64 block_group_end = btrfs_block_group_end(block_group);
struct extent_io_tree *unpin = NULL;
int ret;
- if (!block_group)
- return 0;
-
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
@@ -1217,19 +1213,18 @@ static noinline_for_stack int write_pinned_extent_entries(
start = block_group->start;
- while (start < block_group->start + block_group->length) {
+ while (start < block_group_end) {
if (!btrfs_find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL))
return 0;
/* This pinned extent is out of our range */
- if (extent_start >= block_group->start + block_group->length)
+ if (extent_start >= block_group_end)
return 0;
extent_start = max(extent_start, start);
- extent_end = min(block_group->start + block_group->length,
- extent_end + 1);
+ extent_end = min(block_group_end, extent_end + 1);
len = extent_end - extent_start;
*entries += 1;
@@ -1374,9 +1369,9 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
static int __btrfs_write_out_cache(struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
- struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans)
{
+ struct btrfs_io_ctl *io_ctl = &block_group->io_ctl;
struct extent_state *cached_state = NULL;
LIST_HEAD(bitmap_list);
int entries = 0;
@@ -1393,7 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
if (ret)
return ret;
- if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
down_write(&block_group->data_rwsem);
spin_lock(&block_group->lock);
if (block_group->delalloc_bytes) {
@@ -1465,7 +1460,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
goto out_nospc;
}
- if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
up_write(&block_group->data_rwsem);
/*
* Release the pages and unlock the extent, we will flush
@@ -1500,7 +1495,7 @@ out_nospc:
cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
out_unlock:
- if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
up_write(&block_group->data_rwsem);
out:
@@ -1536,8 +1531,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(inode, ctl, block_group,
- &block_group->io_ctl, trans);
+ ret = __btrfs_write_out_cache(inode, ctl, block_group, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
@@ -2020,7 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
int ret;
if (!ctl->free_space_offset.rb_node)
- goto out;
+ return NULL;
again:
if (use_bytes_index) {
node = rb_first_cached(&ctl->free_space_bytes);
@@ -2028,7 +2022,7 @@ again:
entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
0, 1);
if (!entry)
- goto out;
+ return NULL;
node = &entry->offset_index;
}
@@ -2112,7 +2106,7 @@ again:
*bytes = entry->bytes - align_off;
return entry;
}
-out:
+
return NULL;
}
@@ -2756,6 +2750,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return 0;
+
if (btrfs_is_zoned(block_group->fs_info))
return __btrfs_add_free_space_zoned(block_group, bytenr, size,
true);
@@ -2894,7 +2891,7 @@ again:
old_end - (offset + bytes),
info->trim_state);
WARN_ON(ret);
- goto out;
+ return ret;
}
}
@@ -2906,7 +2903,7 @@ again:
out_lock:
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
-out:
+
return ret;
}
@@ -3063,6 +3060,12 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
struct rb_node *node;
bool ret = true;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &block_group->runtime_flags) &&
+ block_group->identity_remap_count == 0) {
+ return true;
+ }
+
spin_lock(&ctl->tree_lock);
node = rb_first(&ctl->free_space_offset);
@@ -3674,7 +3677,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
}
spin_unlock(&space_info->lock);
- ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
+ ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false);
if (!ret) {
*total_trimmed += trimmed;
trim_state = BTRFS_TRIM_STATE_TRIMMED;
@@ -3831,6 +3834,50 @@ out_unlock:
return ret;
}
+void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ int ret = 0;
+ u64 bytes, trimmed;
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
+ u64 end = btrfs_block_group_end(bg);
+
+ if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) {
+ bg->discard_cursor = end;
+
+ if (bg->used == 0) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&bg->bg_list)) {
+ list_del_init(&bg->bg_list);
+ btrfs_put_block_group(bg);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ btrfs_mark_bg_unused(bg);
+ }
+
+ return;
+ }
+
+ bytes = end - bg->discard_cursor;
+
+ if (max_discard_size &&
+ bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER))
+ bytes = max_discard_size;
+
+ ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false);
+ if (ret)
+ return;
+
+ bg->discard_cursor += trimmed;
+
+ if (bg->discard_cursor < end)
+ return;
+
+ btrfs_complete_bg_remapping(bg);
+}
+
/*
* If we break out of trimming a bitmap prematurely, we should reset the
* trimming bit. In a rather contrived case, it's possible to race here so
@@ -3956,7 +4003,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
if (async && *total_trimmed) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
- goto out;
+ return ret;
}
bytes = min(bytes, end - start);
@@ -4017,7 +4064,6 @@ next:
if (offset >= end)
block_group->discard_cursor = end;
-out:
return ret;
}
@@ -4110,20 +4156,20 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *block_group;
struct rb_node *node;
- int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1");
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
+ int ret;
+
block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
if (ret)
- goto out;
+ return ret;
node = rb_next(node);
}
-out:
- return ret;
+ return 0;
}
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 9f1dbfdee8ca..33fc3b245648 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
+void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg);
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 1ad2ad384b9e..ecddfca92b2b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
-static struct btrfs_root *btrfs_free_space_root(
- struct btrfs_block_group *block_group)
+struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group)
{
struct btrfs_key key = {
.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
@@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
return 0;
}
-EXPORT_FOR_TESTS
struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
@@ -220,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
return 0;
start = block_group->start;
- end = block_group->start + block_group->length;
+ end = btrfs_block_group_end(block_group);
key.objectid = end - 1;
key.type = (u8)-1;
@@ -360,7 +358,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
return 0;
start = block_group->start;
- end = block_group->start + block_group->length;
+ end = btrfs_block_group_end(block_group);
key.objectid = end - 1;
key.type = (u8)-1;
@@ -667,7 +665,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* Read the bit for the block immediately after the extent of space if
* that block is within the block group.
*/
- if (end < block_group->start + block_group->length) {
+ if (end < btrfs_block_group_end(block_group)) {
/* The next block may be in the next bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (end >= key.objectid + key.offset) {
@@ -940,7 +938,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
right:
/* Search for a neighbor on the right. */
- if (end == block_group->start + block_group->length)
+ if (end == btrfs_block_group_end(block_group))
goto insert;
key.objectid = end;
key.type = (u8)-1;
@@ -1106,7 +1104,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
* highest, block group).
*/
start = block_group->start;
- end = block_group->start + block_group->length;
+ end = btrfs_block_group_end(block_group);
while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -1396,9 +1394,9 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
* can use multiple transactions, every time btrfs_end_transaction() is
* called at btrfs_rebuild_free_space_tree() we finish the creation of
* new block groups by calling btrfs_create_pending_block_groups(), and
- * that in turn calls us, through add_block_group_free_space(), to add
- * a free space info item and a free space extent item for the block
- * group.
+ * that in turn calls us, through btrfs_add_block_group_free_space(),
+ * to add a free space info item and a free space extent item for the
+ * block group.
*
* Then later btrfs_rebuild_free_space_tree() may find such new block
* groups and processes them with populate_free_space_tree(), which can
@@ -1479,7 +1477,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
start = block_group->start;
- end = block_group->start + block_group->length;
+ end = btrfs_block_group_end(block_group);
key.objectid = end - 1;
key.type = (u8)-1;
@@ -1525,33 +1523,28 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- ret = 0;
-
- return ret;
+ return 0;
}
static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group *block_group;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
- u64 end, offset;
+ const u64 end = btrfs_block_group_end(block_group);
+ u64 offset;
u64 total_found = 0;
u32 extent_count = 0;
int ret;
- block_group = caching_ctl->block_group;
- fs_info = block_group->fs_info;
root = btrfs_free_space_root(block_group);
- end = block_group->start + block_group->length;
-
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
@@ -1617,21 +1610,17 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group *block_group;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- u64 end;
+ const u64 end = btrfs_block_group_end(block_group);
u64 total_found = 0;
u32 extent_count = 0;
int ret;
- block_group = caching_ctl->block_group;
- fs_info = block_group->fs_info;
root = btrfs_free_space_root(block_group);
- end = block_group->start + block_group->length;
-
while (1) {
u64 space_added;
@@ -1712,3 +1701,106 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
else
return load_free_space_extents(caching_ctl, path, extent_count);
}
+
+static int delete_orphan_free_space_entries(struct btrfs_root *fst_root,
+ struct btrfs_path *path,
+ u64 first_bg_bytenr)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_start_transaction(fst_root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ while (true) {
+ struct btrfs_key key = { 0 };
+ int i;
+
+ ret = btrfs_search_slot(trans, fst_root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+ ASSERT(ret > 0);
+ ret = 0;
+ for (i = 0; i < btrfs_header_nritems(path->nodes[0]); i++) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid >= first_bg_bytenr) {
+ /*
+ * Only break the for() loop and continue to
+ * delete items.
+ */
+ break;
+ }
+ }
+ /* No items to delete, finished. */
+ if (i == 0)
+ break;
+
+ ret = btrfs_del_items(trans, fst_root, path, 0, i);
+ if (ret < 0)
+ break;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ if (ret == 0)
+ btrfs_info(fst_root->fs_info, "deleted orphan free space tree entries");
+ return ret;
+}
+
+/* Remove any free space entry before the first block group. */
+int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info)
+{
+ BTRFS_PATH_AUTO_RELEASE(path);
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *root;
+ struct btrfs_block_group *bg;
+ u64 first_bg_bytenr;
+ int ret;
+
+ /*
+ * Extent tree v2 has multiple global roots based on the block group.
+ * This means we cannot easily grab the global free space tree and locate
+ * orphan items. Furthermore this is still experimental, all users
+ * should use the latest btrfs-progs anyway.
+ */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+ root = btrfs_global_root(fs_info, &key);
+ if (!root)
+ return 0;
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+ bg = btrfs_lookup_first_block_group(fs_info, 0);
+ if (unlikely(!bg)) {
+ btrfs_err(fs_info, "no block group found");
+ return -EUCLEAN;
+ }
+ first_bg_bytenr = bg->start;
+ btrfs_put_block_group(bg);
+
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ return ret;
+ /* There should not be an all-zero key in fst. */
+ ASSERT(ret > 0);
+
+ /* Empty free space tree. */
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ return 0;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid >= first_bg_bytenr)
+ return 0;
+ btrfs_release_path(&path);
+ return delete_orphan_free_space_entries(root, &path, first_bg_bytenr);
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 3d9a5d4477fc..709730e36888 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -35,12 +35,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
-
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info);
struct btrfs_free_space_info *
btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow);
+struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index feb0a2faa837..14d83565cdee 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/crc32.h>
#include "messages.h"
#include "fs.h"
#include "accessors.h"
@@ -8,13 +9,11 @@
static const struct btrfs_csums {
u16 size;
const char name[10];
- const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
- [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
- .driver = "blake2b-256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b" },
};
/* This exists for btrfs-progs usages. */
@@ -37,21 +36,94 @@ const char *btrfs_super_csum_name(u16 csum_type)
return btrfs_csums[csum_type].name;
}
-/*
- * Return driver name if defined, otherwise the name that's also a valid driver
- * name.
- */
-const char *btrfs_super_csum_driver(u16 csum_type)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
- /* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver[0] ?
- btrfs_csums[csum_type].driver :
- btrfs_csums[csum_type].name;
+ return ARRAY_SIZE(btrfs_csums);
}
-size_t __attribute_const__ btrfs_get_num_csums(void)
+void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out)
{
- return ARRAY_SIZE(btrfs_csums);
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ put_unaligned_le32(~crc32c(~0, data, len), out);
+ break;
+ case BTRFS_CSUM_TYPE_XXHASH:
+ put_unaligned_le64(xxh64(data, len, 0), out);
+ break;
+ case BTRFS_CSUM_TYPE_SHA256:
+ sha256(data, len, out);
+ break;
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ blake2b(NULL, 0, data, len, out, 32);
+ break;
+ default:
+ /* Checksum type is validated at mount time. */
+ BUG();
+ }
+}
+
+void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type)
+{
+ ctx->csum_type = csum_type;
+ switch (ctx->csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ ctx->crc32 = ~0;
+ break;
+ case BTRFS_CSUM_TYPE_XXHASH:
+ xxh64_reset(&ctx->xxh64, 0);
+ break;
+ case BTRFS_CSUM_TYPE_SHA256:
+ sha256_init(&ctx->sha256);
+ break;
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ blake2b_init(&ctx->blake2b, 32);
+ break;
+ default:
+ /* Checksum type is validated at mount time. */
+ BUG();
+ }
+}
+
+void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len)
+{
+ switch (ctx->csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ ctx->crc32 = crc32c(ctx->crc32, data, len);
+ break;
+ case BTRFS_CSUM_TYPE_XXHASH:
+ xxh64_update(&ctx->xxh64, data, len);
+ break;
+ case BTRFS_CSUM_TYPE_SHA256:
+ sha256_update(&ctx->sha256, data, len);
+ break;
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ blake2b_update(&ctx->blake2b, data, len);
+ break;
+ default:
+ /* Checksum type is validated at mount time. */
+ BUG();
+ }
+}
+
+void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out)
+{
+ switch (ctx->csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ put_unaligned_le32(~ctx->crc32, out);
+ break;
+ case BTRFS_CSUM_TYPE_XXHASH:
+ put_unaligned_le64(xxh64_digest(&ctx->xxh64), out);
+ break;
+ case BTRFS_CSUM_TYPE_SHA256:
+ sha256_final(&ctx->sha256, out);
+ break;
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ blake2b_final(&ctx->blake2b, out);
+ break;
+ default:
+ /* Checksum type is validated at mount time. */
+ BUG();
+ }
}
/*
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 8ffbc40ebe45..3de3b517810e 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_FS_H
#define BTRFS_FS_H
+#include <crypto/blake2b.h>
+#include <crypto/sha2.h>
#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/time64.h>
@@ -24,6 +26,7 @@
#include <linux/wait_bit.h>
#include <linux/sched.h>
#include <linux/rbtree.h>
+#include <linux/xxhash.h>
#include <uapi/linux/btrfs.h>
#include <uapi/linux/btrfs_tree.h>
#include "extent-io-tree.h"
@@ -35,14 +38,12 @@ struct inode;
struct super_block;
struct kobject;
struct reloc_control;
-struct crypto_shash;
struct ulist;
struct btrfs_device;
struct btrfs_block_group;
struct btrfs_root;
struct btrfs_fs_devices;
struct btrfs_transaction;
-struct btrfs_delayed_root;
struct btrfs_balance_control;
struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
@@ -64,6 +65,12 @@ struct btrfs_space_info;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+/*
+ * Maximum length to trim in a single iteration to avoid holding device list
+ * mutex for too long.
+ */
+#define BTRFS_MAX_TRIM_LENGTH SZ_2G
+
#define BTRFS_OLDEST_GENERATION 0ULL
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -313,7 +320,8 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
- BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \
+ BTRFS_FEATURE_INCOMPAT_REMAP_TREE)
#else
@@ -461,6 +469,21 @@ struct btrfs_commit_stats {
u64 critical_section_start_time;
};
+struct btrfs_delayed_root {
+ spinlock_t lock;
+ int nodes; /* for delayed nodes */
+ struct list_head node_list;
+ /*
+ * Used for delayed nodes which are waiting to be dealt with by the
+ * worker. If the delayed node is inserted into the work queue, we
+ * drop it from this list.
+ */
+ struct list_head prepare_list;
+ atomic_t items; /* for delayed items */
+ atomic_t items_seq; /* for delayed items */
+ wait_queue_head_t wait;
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -473,6 +496,7 @@ struct btrfs_fs_info {
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
struct btrfs_root *stripe_root;
+ struct btrfs_root *remap_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -507,6 +531,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* Block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* Block reservation for remap tree. */
+ struct btrfs_block_rsv remap_block_rsv;
/* Block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
@@ -581,6 +607,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
+ struct mutex remap_mutex;
/*
* This is taken to make sure we don't set block groups ro after the
@@ -810,7 +837,7 @@ struct btrfs_fs_info {
/* Filesystem state */
unsigned long fs_state;
- struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_root delayed_root;
/* Entries are eb->start >> nodesize_bits */
struct xarray buffer_tree;
@@ -834,10 +861,11 @@ struct btrfs_fs_info {
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
- /* Protects the lists unused_bgs and reclaim_bgs. */
+ /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
spinlock_t unused_bgs_lock;
/* Protected by unused_bgs_lock. */
struct list_head unused_bgs;
+ struct list_head fully_remapped_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
struct mutex reclaim_bgs_lock;
@@ -850,9 +878,10 @@ struct btrfs_fs_info {
u32 sectorsize_bits;
u32 block_min_order;
u32 block_max_order;
+ u32 stripesize;
u32 csum_size;
u32 csums_per_leaf;
- u32 stripesize;
+ u32 csum_type;
/*
* Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
@@ -864,8 +893,6 @@ struct btrfs_fs_info {
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
- struct crypto_shash *csum_shash;
-
/* Type of exclusive operation running, protected by super_lock */
enum btrfs_exclusive_operation exclusive_operation;
@@ -1057,8 +1084,20 @@ int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
-const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
+struct btrfs_csum_ctx {
+ u16 csum_type;
+ union {
+ u32 crc32;
+ struct xxh64_state xxh64;
+ struct sha256_ctx sha256;
+ struct blake2b_ctx blake2b;
+ };
+};
+void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out);
+void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type);
+void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len);
+void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out);
static inline bool btrfs_is_empty_uuid(const u8 *uuid)
{
@@ -1105,15 +1144,17 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_fs_closing(const struct btrfs_fs_info *fs_info)
+{
+ return unlikely(test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags));
+}
+
+static inline bool btrfs_fs_closing_done(const struct btrfs_fs_info *fs_info)
{
- /* Do it this way so we only ever do one test_bit in the normal case. */
- if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
- if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
- return 2;
- return 1;
- }
- return 0;
+ if (btrfs_fs_closing(fs_info) && test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
+ return true;
+
+ return false;
}
/*
@@ -1141,9 +1182,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
-static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_shutdown(const struct btrfs_fs_info *fs_info)
{
- return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state);
+ return unlikely(test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state));
}
static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b73e1dd97208..a864f8c99729 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -371,14 +371,13 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid)
{
struct btrfs_key key;
- int ret;
+
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(struct btrfs_inode_item));
- return ret;
+ return btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
}
int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ab356b50119c..82df115bd0c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3,7 +3,6 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
-#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
@@ -219,7 +218,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
int mirror_num)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_path path = { 0 };
+ BTRFS_PATH_AUTO_RELEASE(path);
struct btrfs_key found_key = { 0 };
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
@@ -257,7 +256,6 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
if (ret < 0) {
btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
logical, ret);
- btrfs_release_path(&path);
return;
}
eb = path.nodes[0];
@@ -287,11 +285,14 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
(ref_level ? "node" : "leaf"),
ref_level, ref_root);
}
- btrfs_release_path(&path);
} else {
struct btrfs_backref_walk_ctx ctx = { 0 };
struct data_reloc_warn reloc_warn = { 0 };
+ /*
+ * Do not hold the path as later iterate_extent_inodes() call
+ * can be time consuming.
+ */
btrfs_release_path(&path);
ctx.bytenr = found_key.objectid;
@@ -507,7 +508,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (ret)
- goto fail;
+ return ret;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -546,7 +547,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inode_set_file_extent_range(inode, 0,
ALIGN(size, root->fs_info->sectorsize));
if (ret)
- goto fail;
+ return ret;
/*
* We're an inline extent, so nobody can extend the file past i_size
@@ -562,8 +563,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
}
inode->disk_i_size = i_size;
-fail:
- return ret;
+ return 0;
}
static bool can_cow_file_range_inline(struct btrfs_inode *inode,
@@ -690,8 +690,8 @@ out:
/*
* Don't forget to free the reserved space, as for inlined extent
* it won't count as data extent, free them directly here.
- * And at reserve time, it's always aligned to page size, so
- * just free one page here.
+ * And at reserve time, it's always aligned to sector size, so
+ * just free one sector here.
*
* If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need
* to keep the data reservation.
@@ -756,10 +756,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
struct async_extent {
u64 start;
u64 ram_size;
- u64 compressed_size;
- struct folio **folios;
- unsigned long nr_folios;
- int compress_type;
+ struct compressed_bio *cb;
struct list_head list;
};
@@ -780,24 +777,18 @@ struct async_cow {
struct async_chunk chunks[];
};
-static noinline int add_async_extent(struct async_chunk *cow,
- u64 start, u64 ram_size,
- u64 compressed_size,
- struct folio **folios,
- unsigned long nr_folios,
- int compress_type)
+static int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size,
+ struct compressed_bio *cb)
{
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
if (!async_extent)
return -ENOMEM;
+ ASSERT(ram_size < U32_MAX);
async_extent->start = start;
async_extent->ram_size = ram_size;
- async_extent->compressed_size = compressed_size;
- async_extent->folios = folios;
- async_extent->nr_folios = nr_folios;
- async_extent->compress_type = compress_type;
+ async_extent->cb = cb;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
@@ -816,6 +807,13 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0;
}
+ /*
+ * If the delalloc range is only one fs block and can not be inlined,
+ * do not even bother trying compression, as there will be no space saving
+ * and will always fallback to regular write later.
+ */
+ if (start != 0 && end + 1 - start <= fs_info->sectorsize)
+ return 0;
/* Defrag ioctl takes precedence over mount options and properties. */
if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
return 0;
@@ -864,6 +862,61 @@ static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start,
return ret;
}
+static struct folio *compressed_bio_last_folio(struct compressed_bio *cb)
+{
+ struct bio *bio = &cb->bbio.bio;
+ struct bio_vec *bvec;
+ phys_addr_t paddr;
+
+ /*
+ * Make sure all folios have the same min_folio_size.
+ *
+ * Otherwise we cannot simply use offset_in_folio(folio, bi_size) to
+ * calculate the end of the last folio.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ ASSERT(folio_size(fi.folio) == min_folio_size);
+ }
+
+ /* The bio must not be empty. */
+ ASSERT(bio->bi_vcnt);
+
+ bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1;
+ return page_folio(phys_to_page(paddr));
+}
+
+static void zero_last_folio(struct compressed_bio *cb)
+{
+ struct bio *bio = &cb->bbio.bio;
+ struct folio *last_folio = compressed_bio_last_folio(cb);
+ const u32 bio_size = bio->bi_iter.bi_size;
+ const u32 foffset = offset_in_folio(last_folio, bio_size);
+
+ folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset);
+}
+
+static void round_up_last_block(struct compressed_bio *cb, u32 blocksize)
+{
+ struct bio *bio = &cb->bbio.bio;
+ struct folio *last_folio = compressed_bio_last_folio(cb);
+ const u32 bio_size = bio->bi_iter.bi_size;
+ const u32 foffset = offset_in_folio(last_folio, bio_size);
+ bool ret;
+
+ if (IS_ALIGNED(bio_size, blocksize))
+ return;
+
+ ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset);
+ /* The remaining part should be merged thus never fail. */
+ ASSERT(ret);
+}
+
/*
* Work queue call back to started compression on a file and pages.
*
@@ -884,24 +937,22 @@ static void compress_file_range(struct btrfs_work *work)
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
- const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ struct compressed_bio *cb = NULL;
const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size;
+ u32 cur_len;
int ret = 0;
- struct folio **folios = NULL;
- unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int loff;
- int i;
int compress_type = fs_info->compress_type;
int compress_level = fs_info->compress_level;
- if (unlikely(btrfs_is_shutdown(fs_info)))
+ if (btrfs_is_shutdown(fs_info))
goto cleanup_and_bail_uncompressed;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -916,7 +967,7 @@ static void compress_file_range(struct btrfs_work *work)
/*
* All the folios should have been locked thus no failure.
*
- * And even if some folios are missing, btrfs_compress_folios()
+ * And even if some folios are missing, btrfs_compress_bio()
* would handle them correctly, so here just do an ASSERT() check for
* early logic errors.
*/
@@ -936,9 +987,10 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- folios = NULL;
- nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
- nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
+ total_in = 0;
+ cur_len = min(end + 1 - start, BTRFS_MAX_UNCOMPRESSED);
+ ret = 0;
+ cb = NULL;
/*
* we don't want to send crud past the end of i_size through
@@ -953,21 +1005,6 @@ again:
if (actual_end <= start)
goto cleanup_and_bail_uncompressed;
- total_compressed = actual_end - start;
-
- /*
- * Skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it doesn't save disk space at all.
- */
- if (total_compressed <= blocksize &&
- (start > 0 || end + 1 < inode->disk_i_size))
- goto cleanup_and_bail_uncompressed;
-
- total_compressed = min_t(unsigned long, total_compressed,
- BTRFS_MAX_UNCOMPRESSED);
- total_in = 0;
- ret = 0;
-
/*
* We do compression for mount -o compress and when the inode has not
* been flagged as NOCOMPRESS. This flag can change at any time if we
@@ -976,15 +1013,6 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
- if (!folios) {
- /*
- * Memory allocation failure is not a fatal error, we can fall
- * back to uncompressed code.
- */
- goto cleanup_and_bail_uncompressed;
- }
-
if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
compress_type = inode->defrag_compress;
compress_level = inode->defrag_compress_level;
@@ -993,11 +1021,15 @@ again:
}
/* Compression level is applied here. */
- ret = btrfs_compress_folios(compress_type, compress_level,
- inode, start, folios, &nr_folios, &total_in,
- &total_compressed);
- if (ret)
+ cb = btrfs_compress_bio(inode, start, cur_len, compress_type,
+ compress_level, async_chunk->write_flags);
+ if (IS_ERR(cb)) {
+ cb = NULL;
goto mark_incompressible;
+ }
+
+ total_compressed = cb->bbio.bio.bi_iter.bi_size;
+ total_in = cur_len;
/*
* Zero the tail end of the last folio, as we might be sending it down
@@ -1005,7 +1037,7 @@ again:
*/
loff = (total_compressed & (min_folio_size - 1));
if (loff)
- folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
+ zero_last_folio(cb);
/*
* Try to create an inline extent.
@@ -1021,11 +1053,13 @@ again:
BTRFS_COMPRESS_NONE, NULL, false);
else
ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
- compress_type, folios[0], false);
+ compress_type,
+ bio_first_folio_all(&cb->bbio.bio), false);
if (ret <= 0) {
+ cleanup_compressed_bio(cb);
if (ret < 0)
mapping_set_error(mapping, -EIO);
- goto free_pages;
+ return;
}
/*
@@ -1033,6 +1067,7 @@ again:
* block size boundary so the allocator does sane things.
*/
total_compressed = ALIGN(total_compressed, blocksize);
+ round_up_last_block(cb, blocksize);
/*
* One last check to make sure the compression is really a win, compare
@@ -1043,12 +1078,12 @@ again:
if (total_compressed + blocksize > total_in)
goto mark_incompressible;
+
/*
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
- nr_folios, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, cb);
BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
@@ -1061,33 +1096,10 @@ mark_incompressible:
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
- ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
- BTRFS_COMPRESS_NONE);
+ ret = add_async_extent(async_chunk, start, end - start + 1, NULL);
BUG_ON(ret);
-free_pages:
- if (folios) {
- for (i = 0; i < nr_folios; i++) {
- WARN_ON(folios[i]->mapping);
- btrfs_free_compr_folio(folios[i]);
- }
- kfree(folios);
- }
-}
-
-static void free_async_extent_pages(struct async_extent *async_extent)
-{
- int i;
-
- if (!async_extent->folios)
- return;
-
- for (i = 0; i < async_extent->nr_folios; i++) {
- WARN_ON(async_extent->folios[i]->mapping);
- btrfs_free_compr_folio(async_extent->folios[i]);
- }
- kfree(async_extent->folios);
- async_extent->nr_folios = 0;
- async_extent->folios = NULL;
+ if (cb)
+ cleanup_compressed_bio(cb);
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
@@ -1134,7 +1146,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
- bool free_pages = false;
+ u32 compressed_size;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1154,17 +1166,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
locked_folio = async_chunk->locked_folio;
}
- if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
- ASSERT(!async_extent->folios);
- ASSERT(async_extent->nr_folios == 0);
+ if (!async_extent->cb) {
submit_uncompressed_range(inode, async_extent, locked_folio);
- free_pages = true;
goto done;
}
+ compressed_size = async_extent->cb->bbio.bio.bi_iter.bi_size;
ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
+ compressed_size, compressed_size,
0, *alloc_hint, &ins, true, true);
if (ret) {
/*
@@ -1174,7 +1183,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
- free_pages = true;
+ cleanup_compressed_bio(async_extent->cb);
+ async_extent->cb = NULL;
goto done;
}
@@ -1186,7 +1196,9 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
file_extent.ram_bytes = async_extent->ram_size;
file_extent.num_bytes = async_extent->ram_size;
file_extent.offset = 0;
- file_extent.compression = async_extent->compress_type;
+ file_extent.compression = async_extent->cb->compress_type;
+
+ async_extent->cb->bbio.bio.bi_iter.bi_sector = ins.objectid >> SECTOR_SHIFT;
em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
@@ -1202,22 +1214,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
ret = PTR_ERR(ordered);
goto out_free_reserve;
}
+ async_extent->cb->bbio.ordered = ordered;
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
- btrfs_submit_compressed_write(ordered,
- async_extent->folios, /* compressed_folios */
- async_extent->nr_folios,
- async_chunk->write_flags, true);
+ btrfs_submit_bbio(&async_extent->cb->bbio, 0);
+ async_extent->cb = NULL;
+
*alloc_hint = ins.objectid + ins.offset;
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
- if (free_pages)
- free_async_extent_pages(async_extent);
kfree(async_extent);
return;
@@ -1232,7 +1242,8 @@ out_free_reserve:
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
- free_async_extent_pages(async_extent);
+ if (async_extent->cb)
+ cleanup_compressed_bio(async_extent->cb);
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
@@ -1275,6 +1286,133 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
}
/*
+ * Handle COW for one range.
+ *
+ * @ins: The key representing the allocated range.
+ * @file_offset: The file offset of the COW range
+ * @num_bytes: The expected length of the COW range
+ * The actually allocated length can be smaller than it.
+ * @min_alloc_size: The minimal extent size.
+ * @alloc_hint: The hint for the extent allocator.
+ * @ret_alloc_size: The COW range handled by this function.
+ *
+ * Return 0 if everything is fine and @ret_alloc_size is updated. The
+ * range is still locked, and caller should unlock the range after everything
+ * is done or for error handling.
+ *
+ * Return <0 for error and @ret_alloc_size is updated to where the extra
+ * cleanup should happen. The range [file_offset, file_offset + ret_alloc_size) will be
+ * cleaned up by this function.
+ */
+static int cow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct btrfs_key *ins, struct extent_state **cached,
+ u64 file_offset, u32 num_bytes, u32 min_alloc_size,
+ u64 alloc_hint, u32 *ret_alloc_size)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em;
+ u32 cur_len = 0;
+ u64 cur_end;
+ int ret;
+
+ ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size,
+ 0, alloc_hint, ins, true, true);
+ if (ret < 0) {
+ *ret_alloc_size = cur_len;
+ return ret;
+ }
+
+ cur_len = ins->offset;
+ cur_end = file_offset + cur_len - 1;
+
+ file_extent.disk_bytenr = ins->objectid;
+ file_extent.disk_num_bytes = ins->offset;
+ file_extent.num_bytes = ins->offset;
+ file_extent.ram_bytes = ins->offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+
+ /*
+ * Locked range will be released either during error clean up (inside
+ * this function or by the caller for previously successful ranges) or
+ * after the whole range is finished.
+ */
+ btrfs_lock_extent(&inode->io_tree, file_offset, cur_end, cached);
+ em = btrfs_create_io_em(inode, file_offset, &file_extent, BTRFS_ORDERED_REGULAR);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto free_reserved;
+ }
+ btrfs_free_extent_map(em);
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_offset, &file_extent,
+ 1U << BTRFS_ORDERED_REGULAR);
+ if (IS_ERR(ordered)) {
+ btrfs_drop_extent_map_range(inode, file_offset, cur_end, false);
+ ret = PTR_ERR(ordered);
+ goto free_reserved;
+ }
+
+ if (btrfs_is_data_reloc_root(root)) {
+ ret = btrfs_reloc_clone_csums(ordered);
+
+ /*
+ * Only drop cache here, and process as normal.
+ *
+ * We must not allow extent_clear_unlock_delalloc() at
+ * free_reserved label to free meta of this ordered extent, as
+ * its meta should be freed by btrfs_finish_ordered_io().
+ *
+ * So we must continue until @start is increased to
+ * skip current ordered extent.
+ */
+ if (ret)
+ btrfs_drop_extent_map_range(inode, file_offset,
+ cur_end, false);
+ }
+ btrfs_put_ordered_extent(ordered);
+ btrfs_dec_block_group_reservations(fs_info, ins->objectid);
+ /*
+ * Error handling for btrfs_reloc_clone_csums().
+ *
+ * Treat the range as finished, thus only clear EXTENT_LOCKED | EXTENT_DELALLOC.
+ * The accounting will be done by ordered extents.
+ */
+ if (unlikely(ret < 0)) {
+ btrfs_cleanup_ordered_extents(inode, file_offset, cur_len);
+ extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
+ }
+ *ret_alloc_size = cur_len;
+ return ret;
+
+free_reserved:
+ extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL);
+ btrfs_dec_block_group_reservations(fs_info, ins->objectid);
+ btrfs_free_reserved_extent(fs_info, ins->objectid, ins->offset, true);
+ mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
+ *ret_alloc_size = cur_len;
+ /*
+ * We should not return -EAGAIN where it's a special return code for
+ * zoned to catch btrfs_reserve_extent().
+ */
+ ASSERT(ret != -EAGAIN);
+ return ret;
+}
+
+/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
* allocate extents on disk for the range, and create ordered data structs
@@ -1310,16 +1448,15 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
- u64 cur_alloc_size = 0;
- u64 min_alloc_size;
- u64 blocksize = fs_info->sectorsize;
+ u32 min_alloc_size;
+ u32 blocksize = fs_info->sectorsize;
+ u32 cur_alloc_size = 0;
struct btrfs_key ins;
- struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
int ret = 0;
- if (unlikely(btrfs_is_shutdown(fs_info))) {
+ if (btrfs_is_shutdown(fs_info)) {
ret = -EIO;
goto out_unlock;
}
@@ -1383,16 +1520,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
min_alloc_size = fs_info->sectorsize;
while (num_bytes > 0) {
- struct btrfs_ordered_extent *ordered;
- struct btrfs_file_extent file_extent;
+ ret = cow_one_range(inode, locked_folio, &ins, &cached, start,
+ num_bytes, min_alloc_size, alloc_hint, &cur_alloc_size);
- ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
- min_alloc_size, 0, alloc_hint,
- &ins, true, true);
if (ret == -EAGAIN) {
/*
- * btrfs_reserve_extent only returns -EAGAIN for zoned
- * file systems, which is an indication that there are
+ * cow_one_range() only returns -EAGAIN for zoned
+ * file systems (from btrfs_reserve_extent()), which
+ * is an indication that there are
* no active zones to allocate from at the moment.
*
* If this is the first loop iteration, wait for at
@@ -1421,79 +1556,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
if (ret < 0)
goto out_unlock;
- cur_alloc_size = ins.offset;
-
- file_extent.disk_bytenr = ins.objectid;
- file_extent.disk_num_bytes = ins.offset;
- file_extent.num_bytes = ins.offset;
- file_extent.ram_bytes = ins.offset;
- file_extent.offset = 0;
- file_extent.compression = BTRFS_COMPRESS_NONE;
- /*
- * Locked range will be released either during error clean up or
- * after the whole range is finished.
- */
- btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
- &cached);
-
- em = btrfs_create_io_em(inode, start, &file_extent,
- BTRFS_ORDERED_REGULAR);
- if (IS_ERR(em)) {
- btrfs_unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
- ret = PTR_ERR(em);
- goto out_reserve;
- }
- btrfs_free_extent_map(em);
-
- ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1U << BTRFS_ORDERED_REGULAR);
- if (IS_ERR(ordered)) {
- btrfs_unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
- ret = PTR_ERR(ordered);
- goto out_drop_extent_cache;
- }
-
- if (btrfs_is_data_reloc_root(root)) {
- ret = btrfs_reloc_clone_csums(ordered);
-
- /*
- * Only drop cache here, and process as normal.
- *
- * We must not allow extent_clear_unlock_delalloc()
- * at out_unlock label to free meta of this ordered
- * extent, as its meta should be freed by
- * btrfs_finish_ordered_io().
- *
- * So we must continue until @start is increased to
- * skip current ordered extent.
- */
- if (ret)
- btrfs_drop_extent_map_range(inode, start,
- start + cur_alloc_size - 1,
- false);
- }
- btrfs_put_ordered_extent(ordered);
+ /* We should not allocate an extent larger than requested. */
+ ASSERT(cur_alloc_size <= num_bytes);
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
-
- if (num_bytes < cur_alloc_size)
- num_bytes = 0;
- else
- num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
cur_alloc_size = 0;
-
- /*
- * btrfs_reloc_clone_csums() error, since start is increased
- * extent_clear_unlock_delalloc() at out_unlock label won't
- * free metadata of current ordered extent, we're OK to exit.
- */
- if (ret)
- goto out_unlock;
}
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
@@ -1502,11 +1572,6 @@ done:
*done_offset = end;
return ret;
-out_drop_extent_cache:
- btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
-out_reserve:
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock:
/*
* Now, we have three regions to clean up:
@@ -1543,24 +1608,9 @@ out_unlock:
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
- * For the range (2). If we reserved an extent for our delalloc range
- * (or a subrange) and failed to create the respective ordered extent,
- * then it means that when we reserved the extent we decremented the
- * extent's size from the data space_info's bytes_may_use counter and
- * incremented the space_info's bytes_reserved counter by the same
- * amount. We must make sure extent_clear_unlock_delalloc() does not try
- * to decrement again the data space_info's bytes_may_use counter,
- * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
- */
- if (cur_alloc_size) {
- extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size - 1,
- locked_folio, &cached, clear_bits,
- page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- }
-
- /*
+ * For the range (2) the error handling is done by cow_one_range() itself.
+ * Nothing needs to be done.
+ *
* For the range (3). We never touched the region. In addition to the
* clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
* space_info's bytes_may_use counter, reserved in
@@ -1575,7 +1625,7 @@ out_unlock:
end - start - cur_alloc_size + 1, NULL);
}
btrfs_err(fs_info,
-"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%u: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), orig_start, end + 1 - orig_start,
start, cur_alloc_size, ret);
@@ -2072,7 +2122,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
- if (unlikely(btrfs_is_shutdown(fs_info))) {
+ if (btrfs_is_shutdown(fs_info)) {
ret = -EIO;
goto error;
}
@@ -2372,7 +2422,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
u64 start, u64 end, struct writeback_control *wbc)
{
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
- int ret;
/*
* The range must cover part of the @locked_folio, or a return of 1
@@ -2381,10 +2430,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
ASSERT(!(end <= folio_pos(locked_folio) ||
start >= folio_next_pos(locked_folio)));
- if (should_nocow(inode, start, end)) {
- ret = run_delalloc_nocow(inode, locked_folio, start, end);
- return ret;
- }
+ if (should_nocow(inode, start, end))
+ return run_delalloc_nocow(inode, locked_folio, start, end);
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
@@ -2392,11 +2439,9 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
return 1;
if (zoned)
- ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
- true);
+ return run_delalloc_cow(inode, locked_folio, start, end, wbc, true);
else
- ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
- return ret;
+ return cow_file_range(inode, locked_folio, start, end, NULL, 0);
}
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
@@ -3007,7 +3052,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
drop_args.extent_item_size = sizeof(*stack_fi);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
- goto out;
+ return ret;
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
@@ -3017,7 +3062,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
if (ret)
- goto out;
+ return ret;
}
leaf = path->nodes[0];
btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
@@ -3052,13 +3097,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
- goto out;
+ return ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
- file_pos - offset,
- qgroup_reserved, &ins);
-out:
- return ret;
+ return btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos - offset,
+ qgroup_reserved, &ins);
}
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
@@ -3226,19 +3269,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
logical_len);
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
- if (!ret) {
- clear_reserved_extent = false;
- btrfs_release_delalloc_bytes(fs_info,
- ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
- }
- if (unlikely(ret < 0)) {
- btrfs_abort_transaction(trans, ret);
- goto out;
+ clear_reserved_extent = false;
+ btrfs_release_delalloc_bytes(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
}
ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
@@ -3336,7 +3381,7 @@ out:
btrfs_discard_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes,
- NULL);
+ NULL, true);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes, true);
@@ -3418,20 +3463,19 @@ void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
const u32 blocksize = fs_info->sectorsize;
const u32 step = min(blocksize, PAGE_SIZE);
const u32 nr_steps = blocksize / step;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct btrfs_csum_ctx csum;
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ btrfs_csum_init(&csum, fs_info->csum_type);
for (int i = 0; i < nr_steps; i++) {
const phys_addr_t paddr = paddrs[i];
void *kaddr;
ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE);
kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
- crypto_shash_update(shash, kaddr, step);
+ btrfs_csum_update(&csum, kaddr, step);
kunmap_local(kaddr);
}
- crypto_shash_final(shash, dest);
+ btrfs_csum_final(&csum, dest);
}
/*
@@ -7137,7 +7181,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
read_unlock(&em_tree->lock);
if (em) {
- if (em->start > start || em->start + em->len <= start)
+ if (em->start > start || btrfs_extent_map_end(em) <= start)
btrfs_free_extent_map(em);
else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
btrfs_free_extent_map(em);
@@ -9790,12 +9834,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
+ struct compressed_bio *cb = NULL;
int compression;
size_t orig_count;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_folios, i;
- struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -9884,39 +9928,46 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
- if (!folios)
- return -ENOMEM;
- for (i = 0; i < nr_folios; i++) {
- size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+
+ cb = btrfs_alloc_compressed_write(inode, start, num_bytes);
+ for (int i = 0; i * min_folio_size < disk_num_bytes; i++) {
+ struct folio *folio;
+ size_t bytes = min(min_folio_size, iov_iter_count(from));
char *kaddr;
- folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
- if (!folios[i]) {
+ folio = btrfs_alloc_compr_folio(fs_info);
+ if (!folio) {
ret = -ENOMEM;
- goto out_folios;
+ goto out_cb;
}
- kaddr = kmap_local_folio(folios[i], 0);
- if (copy_from_iter(kaddr, bytes, from) != bytes) {
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(folio, 0);
+ ret = copy_from_iter(kaddr, bytes, from);
+ kunmap_local(kaddr);
+ if (ret != bytes) {
+ folio_put(folio);
ret = -EFAULT;
- goto out_folios;
+ goto out_cb;
+ }
+ if (bytes < min_folio_size)
+ folio_zero_range(folio, bytes, min_folio_size - bytes);
+ ret = bio_add_folio(&cb->bbio.bio, folio, folio_size(folio), 0);
+ if (unlikely(!ret)) {
+ folio_put(folio);
+ ret = -EINVAL;
+ goto out_cb;
}
- if (bytes < PAGE_SIZE)
- memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap_local(kaddr);
}
+ ASSERT(cb->bbio.bio.bi_iter.bi_size == disk_num_bytes);
for (;;) {
ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
- goto out_folios;
+ goto out_cb;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_folios;
+ goto out_cb;
btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
@@ -9948,7 +9999,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
ret = __cow_file_range_inline(inode, encoded->len,
- orig_count, compression, folios[0],
+ orig_count, compression,
+ bio_first_folio_all(&cb->bbio.bio),
true);
if (ret <= 0) {
if (ret == 0)
@@ -9993,7 +10045,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
+ btrfs_submit_compressed_write(ordered, cb);
ret = orig_count;
goto out;
@@ -10015,12 +10067,9 @@ out_free_data_space:
btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
btrfs_unlock_extent(io_tree, start, end, &cached_state);
-out_folios:
- for (i = 0; i < nr_folios; i++) {
- if (folios[i])
- folio_put(folios[i]);
- }
- kvfree(folios);
+out_cb:
+ if (cb)
+ cleanup_compressed_bio(cb);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d1ab03691606..a6cc2d3b414c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1107,7 +1107,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- int ret = 0;
+ int ret;
struct qstr qname = QSTR_INIT(name, strlen(name));
if (!S_ISDIR(file_inode(file)->i_mode))
@@ -1115,7 +1115,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
if (strchr(name, '/')) {
ret = -EINVAL;
@@ -1167,7 +1167,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
}
out_drop_write:
mnt_drop_write_file(file);
-out:
return ret;
}
@@ -1283,14 +1282,14 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
struct btrfs_trans_handle *trans;
u64 root_flags;
u64 flags;
- int ret = 0;
+ int ret;
if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
@@ -1359,7 +1358,6 @@ out_drop_sem:
up_write(&fs_info->subvol_sem);
out_drop_write:
mnt_drop_write_file(file);
-out:
return ret;
}
@@ -1425,10 +1423,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
continue;
if (sizeof(sh) + item_len > *buf_size) {
- if (*num_found) {
- ret = 1;
- goto out;
- }
+ if (*num_found)
+ return 1;
/*
* return one empty item back for v1, which does not
@@ -1440,10 +1436,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
ret = -EOVERFLOW;
}
- if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
- ret = 1;
- goto out;
- }
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size)
+ return 1;
sh.objectid = key->objectid;
sh.type = key->type;
@@ -1457,10 +1451,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
* problem. Otherwise we'll fault and then copy the buffer in
* properly this next time through
*/
- if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = 0;
- goto out;
- }
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh)))
+ return 0;
*sk_offset += sizeof(sh);
@@ -1472,22 +1464,20 @@ static noinline int copy_to_sk(struct btrfs_path *path,
*/
if (read_extent_buffer_to_user_nofault(leaf, up,
item_off, item_len)) {
- ret = 0;
*sk_offset -= sizeof(sh);
- goto out;
+ return 0;
}
*sk_offset += item_len;
}
(*num_found)++;
- if (ret) /* -EOVERFLOW from above */
- goto out;
+ /* -EOVERFLOW from above. */
+ if (ret)
+ return ret;
- if (*num_found >= sk->nr_items) {
- ret = 1;
- goto out;
- }
+ if (*num_found >= sk->nr_items)
+ return 1;
}
advance_key:
ret = 0;
@@ -1507,7 +1497,7 @@ advance_key:
key->objectid++;
} else
ret = 1;
-out:
+
/*
* 0: all items from this leaf copied, continue with next
* 1: * more items can be copied, but unused buffer is too small
@@ -4931,7 +4921,7 @@ out_acct:
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))))
+ if (btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))
return -EIO;
switch (cmd->cmd_op) {
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0035851d72b0..e3df5ca0b552 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
+ { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap") },
{ .id = 0, DEFINE_NAME("tree") },
};
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 4758f66da449..8e20497afffe 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -123,126 +123,188 @@ static inline size_t read_compress_length(const char *buf)
}
/*
+ * Write data into @out_folio and queue it into @out_bio.
+ *
+ * Return 0 if everything is fine and @total_out will be increased.
+ * Return <0 for error.
+ *
+ * The @out_folio can be NULL after a full folio is queued.
+ * Thus the caller should check and allocate a new folio when needed.
+ */
+static int write_and_queue_folio(struct bio *out_bio, struct folio **out_folio,
+ u32 *total_out, u32 write_len)
+{
+ const u32 fsize = folio_size(*out_folio);
+ const u32 foffset = offset_in_folio(*out_folio, *total_out);
+
+ ASSERT(out_folio && *out_folio);
+ /* Should not cross folio boundary. */
+ ASSERT(foffset + write_len <= fsize);
+
+ /* We can not use bio_add_folio_nofail() which doesn't do any merge. */
+ if (!bio_add_folio(out_bio, *out_folio, write_len, foffset)) {
+ /*
+ * We have allocated a bio that has BTRFS_MAX_COMPRESSED_PAGES
+ * vecs, and all ranges inside the same folio should have been
+ * merged. If bio_add_folio() still failed, that means we have
+ * reached the bvec limits.
+ *
+ * This should only happen at the beginning of a folio, and
+ * caller is responsible for releasing the folio, since it's
+ * not yet queued into the bio.
+ */
+ ASSERT(IS_ALIGNED(*total_out, fsize));
+ return -E2BIG;
+ }
+
+ *total_out += write_len;
+ /*
+ * The full folio has been filled and queued, reset @out_folio to NULL,
+ * so that error handling is fully handled by the bio.
+ */
+ if (IS_ALIGNED(*total_out, fsize))
+ *out_folio = NULL;
+ return 0;
+}
+
+/*
+ * Copy compressed data to bio.
+ *
+ * @out_bio: The bio that will contain all the compressed data.
+ * @compressed_data: The compressed data of this segment.
+ * @compressed_size: The size of the compressed data.
+ * @out_folio: The current output folio, will be updated if a new
+ * folio is allocated.
+ * @total_out: The total bytes of current output.
+ * @max_out: The maximum size of the compressed data.
+ *
* Will do:
*
* - Write a segment header into the destination
* - Copy the compressed buffer into the destination
* - Make sure we have enough space in the last sector to fit a segment header
* If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ * - If a full folio is filled, it will be queued into @out_bio, and @out_folio
+ * will be updated.
*
* Will allocate new pages when needed.
*/
-static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info,
- char *compressed_data,
- size_t compressed_size,
- struct folio **out_folios,
- unsigned long max_nr_folio,
- u32 *cur_out)
+static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info,
+ struct bio *out_bio,
+ const char *compressed_data,
+ size_t compressed_size,
+ struct folio **out_folio,
+ u32 *total_out, u32 max_out)
{
const u32 sectorsize = fs_info->sectorsize;
- const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 sectorsize_bits = fs_info->sectorsize_bits;
+ const u32 fsize = btrfs_min_folio_size(fs_info);
+ const u32 old_size = out_bio->bi_iter.bi_size;
+ u32 copy_start;
u32 sector_bytes_left;
- u32 orig_out;
- struct folio *cur_folio;
char *kaddr;
+ int ret;
- if ((*cur_out >> min_folio_shift) >= max_nr_folio)
- return -E2BIG;
+ ASSERT(out_folio);
+
+ /* There should be at least a lzo header queued. */
+ ASSERT(old_size);
+ ASSERT(old_size == *total_out);
/*
* We never allow a segment header crossing sector boundary, previous
* run should ensure we have enough space left inside the sector.
*/
- ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+ ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits);
- cur_folio = out_folios[*cur_out >> min_folio_shift];
- /* Allocate a new page */
- if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio(fs_info);
- if (!cur_folio)
+ if (!*out_folio) {
+ *out_folio = btrfs_alloc_compr_folio(fs_info);
+ if (!*out_folio)
return -ENOMEM;
- out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out));
+ /* Write the segment header first. */
+ kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out));
write_compress_length(kaddr, compressed_size);
- *cur_out += LZO_LEN;
-
- orig_out = *cur_out;
+ kunmap_local(kaddr);
+ ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN);
+ if (ret < 0)
+ return ret;
- /* Copy compressed data */
- while (*cur_out - orig_out < compressed_size) {
- u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
- orig_out + compressed_size - *cur_out);
+ copy_start = *total_out;
- kunmap_local(kaddr);
+ /* Copy compressed data. */
+ while (*total_out - copy_start < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *total_out % sectorsize,
+ copy_start + compressed_size - *total_out);
+ u32 foffset = *total_out & (fsize - 1);
- if ((*cur_out >> min_folio_shift) >= max_nr_folio)
+ /* With the range copied, we're larger than the original range. */
+ if (((*total_out + copy_len) >> sectorsize_bits) >=
+ max_out >> sectorsize_bits)
return -E2BIG;
- cur_folio = out_folios[*cur_out >> min_folio_shift];
- /* Allocate a new page */
- if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio(fs_info);
- if (!cur_folio)
+ if (!*out_folio) {
+ *out_folio = btrfs_alloc_compr_folio(fs_info);
+ if (!*out_folio)
return -ENOMEM;
- out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, 0);
- memcpy(kaddr + offset_in_folio(cur_folio, *cur_out),
- compressed_data + *cur_out - orig_out, copy_len);
-
- *cur_out += copy_len;
+ kaddr = kmap_local_folio(*out_folio, foffset);
+ memcpy(kaddr, compressed_data + *total_out - copy_start, copy_len);
+ kunmap_local(kaddr);
+ ret = write_and_queue_folio(out_bio, out_folio, total_out, copy_len);
+ if (ret < 0)
+ return ret;
}
/*
* Check if we can fit the next segment header into the remaining space
* of the sector.
*/
- sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ sector_bytes_left = round_up(*total_out, sectorsize) - *total_out;
if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
- goto out;
+ return 0;
- /* The remaining size is not enough, pad it with zeros */
- memset(kaddr + offset_in_page(*cur_out), 0,
- sector_bytes_left);
- *cur_out += sector_bytes_left;
+ ASSERT(*out_folio);
-out:
- kunmap_local(kaddr);
- return 0;
+ /* The remaining size is not enough, pad it with zeros */
+ folio_zero_range(*out_folio, offset_in_folio(*out_folio, *total_out), sector_bytes_left);
+ return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left);
}
-int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out)
+int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_inode *inode = cb->bbio.inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct bio *bio = &cb->bbio.bio;
+ const u64 start = cb->start;
+ const u32 len = cb->len;
const u32 sectorsize = fs_info->sectorsize;
const u32 min_folio_size = btrfs_min_folio_size(fs_info);
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct folio *folio_in = NULL;
+ struct folio *folio_out = NULL;
char *sizes_ptr;
- const unsigned long max_nr_folio = *out_folios;
int ret = 0;
- /* Points to the file offset of input data */
+ /* Points to the file offset of input data. */
u64 cur_in = start;
- /* Points to the current output byte */
- u32 cur_out = 0;
- u32 len = *total_out;
+ /* Points to the current output byte. */
+ u32 total_out = 0;
- ASSERT(max_nr_folio > 0);
- *out_folios = 0;
- *total_out = 0;
- *total_in = 0;
+ ASSERT(bio->bi_iter.bi_size == 0);
+ ASSERT(len);
+
+ folio_out = btrfs_alloc_compr_folio(fs_info);
+ if (!folio_out)
+ return -ENOMEM;
+
+ /* Queue a segment header first. */
+ ret = write_and_queue_folio(bio, &folio_out, &total_out, LZO_LEN);
+ /* The first header should not fail. */
+ ASSERT(ret == 0);
- /*
- * Skip the header for now, we will later come back and write the total
- * compressed size
- */
- cur_out += LZO_LEN;
while (cur_in < start + len) {
char *data_in;
const u32 sectorsize_mask = sectorsize - 1;
@@ -250,19 +312,18 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u32 in_len;
size_t out_len;
- /* Get the input page first */
+ /* Get the input page first. */
if (!folio_in) {
ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in);
if (ret < 0)
goto out;
}
- /* Compress at most one sector of data each time */
+ /* Compress at most one sector of data each time. */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in));
- ret = lzo1x_1_compress(data_in, in_len,
- workspace->cbuf, &out_len,
+ ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
if (unlikely(ret < 0)) {
@@ -271,9 +332,8 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
goto out;
}
- ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len,
- folios, max_nr_folio,
- &cur_out);
+ ret = copy_compressed_data_to_bio(fs_info, bio, workspace->cbuf, out_len,
+ &folio_out, &total_out, len);
if (ret < 0)
goto out;
@@ -283,50 +343,80 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
* Check if we're making it bigger after two sectors. And if
* it is so, give up.
*/
- if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
+ if (cur_in - start > sectorsize * 2 && cur_in - start < total_out) {
ret = -E2BIG;
goto out;
}
- /* Check if we have reached folio boundary. */
+ /* Check if we have reached input folio boundary. */
if (IS_ALIGNED(cur_in, min_folio_size)) {
folio_put(folio_in);
folio_in = NULL;
}
}
+ /*
+ * The last folio is already queued. Bio is responsible for freeing
+ * those folios now.
+ */
+ folio_out = NULL;
/* Store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_folio(folios[0], 0);
- write_compress_length(sizes_ptr, cur_out);
+ sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0);
+ write_compress_length(sizes_ptr, total_out);
kunmap_local(sizes_ptr);
-
- ret = 0;
- *total_out = cur_out;
- *total_in = cur_in - start;
out:
+ /*
+ * We can only free the folio that has no part queued into the bio.
+ *
+ * As any folio that is already queued into bio will be released by
+ * the endio function of bio.
+ */
+ if (folio_out && IS_ALIGNED(total_out, min_folio_size)) {
+ btrfs_free_compr_folio(folio_out);
+ folio_out = NULL;
+ }
if (folio_in)
folio_put(folio_in);
- *out_folios = DIV_ROUND_UP(cur_out, min_folio_size);
return ret;
}
+/*
+ * Return the folio of the compressed bio that covers byte offset @cur_in.
+ *
+ * @cur_folio_index caches which folio of the bio @fi currently points at.
+ * When @cur_in has crossed into the next folio, advance the iterator; the
+ * caller is expected to walk @cur_in forward monotonically, so we can only
+ * ever need to advance by exactly one folio at a time.
+ */
+static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi,
+				       u32 *cur_folio_index, u32 cur_in)
+{
+	struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+
+	ASSERT(cur_folio_index);
+
+	/* Need to switch to the next folio. */
+	if (cur_in >> min_folio_shift != *cur_folio_index) {
+		/* We can only do the switch one folio a time. */
+		ASSERT(cur_in >> min_folio_shift == *cur_folio_index + 1);
+
+		bio_next_folio(fi, &cb->bbio.bio);
+		(*cur_folio_index)++;
+	}
+	return fi->folio;
+}
+
/*
* Copy the compressed segment payload into @dest.
*
* For the payload there will be no padding, just need to do page switching.
*/
static void copy_compressed_segment(struct compressed_bio *cb,
+ struct folio_iter *fi, u32 *cur_folio_index,
char *dest, u32 len, u32 *cur_in)
{
- struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
- const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift];
- u32 copy_len = min_t(u32, orig_in + len - *cur_in,
- folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
+ struct folio *cur_folio = get_current_folio(cb, fi, cur_folio_index, *cur_in);
+ u32 copy_len;
+ ASSERT(cur_folio);
+ copy_len = min_t(u32, orig_in + len - *cur_in,
+ folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
ASSERT(copy_len);
memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
@@ -341,7 +431,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ struct folio_iter fi;
char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
@@ -350,8 +440,15 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
u32 cur_in = 0;
/* Bytes decompressed so far */
u32 cur_out = 0;
-
- kaddr = kmap_local_folio(cb->compressed_folios[0], 0);
+ /* The current folio index number inside the bio. */
+ u32 cur_folio_index = 0;
+
+ bio_first_folio(&fi, &cb->bbio.bio, 0);
+ /* There must be a compressed folio and matches the sectorsize. */
+ if (unlikely(!fi.folio))
+ return -EINVAL;
+ ASSERT(folio_size(fi.folio) == sectorsize);
+ kaddr = kmap_local_folio(fi.folio, 0);
len_in = read_compress_length(kaddr);
kunmap_local(kaddr);
cur_in += LZO_LEN;
@@ -388,7 +485,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_folio = cb->compressed_folios[cur_in >> min_folio_shift];
+ cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in);
ASSERT(cur_folio);
kaddr = kmap_local_folio(cur_folio, 0);
seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in));
@@ -410,7 +507,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
/* Copy the compressed segment payload into workspace */
- copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+ copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf,
+ seg_len, &cur_in);
/* Decompress the data */
ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
@@ -456,7 +554,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
size_t in_len;
size_t out_len;
size_t max_segment_len = workspace_buf_length(fs_info);
- int ret = 0;
+ int ret;
if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2))
return -EUCLEAN;
@@ -467,10 +565,8 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
- if (unlikely(in_len != srclen - LZO_LEN * 2)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(in_len != srclen - LZO_LEN * 2))
+ return -EUCLEAN;
data_in += LZO_LEN;
out_len = sectorsize;
@@ -482,19 +578,18 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
"lzo decompression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
folio_pos(dest_folio));
- ret = -EIO;
- goto out;
+ return -EIO;
}
ASSERT(out_len <= sectorsize);
memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len);
/* Early end, considered as an error. */
if (unlikely(out_len < destlen)) {
- ret = -EIO;
folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len);
+ return -EIO;
}
-out:
- return ret;
+
+ return 0;
}
const struct btrfs_compress_levels btrfs_lzo_compress = {
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 2f853de44473..6190777924bf 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -211,33 +211,19 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+__printf(3, 4) __cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...)
{
- char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
- int kern_level;
- const char *type = logtypes[4];
- struct ratelimit_state *ratelimit = &printk_limits[4];
+ const char *type = logtypes[level];
+ struct ratelimit_state *ratelimit = &printk_limits[level];
#ifdef CONFIG_PRINTK_INDEX
printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt);
#endif
va_start(args, fmt);
-
- while ((kern_level = printk_get_level(fmt)) != 0) {
- size_t size = printk_skip_level(fmt) - fmt;
-
- if (kern_level >= '0' && kern_level <= '7') {
- memcpy(lvl, fmt, size);
- lvl[size] = '\0';
- type = logtypes[kern_level - '0'];
- ratelimit = &printk_limits[kern_level - '0'];
- }
- fmt += size;
- }
-
vaf.fmt = fmt;
vaf.va = &args;
@@ -247,10 +233,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
- _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ _printk(KERN_SOH "%dBTRFS %s (device %s%s): %pV\n", level, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
- _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ _printk(KERN_SOH "%dBTRFS %s: %pV\n", level, type, &vaf);
}
}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index d8c0bd17dcda..943e53980945 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -23,69 +23,74 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#ifdef CONFIG_PRINTK
-#define btrfs_printk(fs_info, fmt, args...) \
- _btrfs_printk(fs_info, fmt, ##args)
-
-__printf(2, 3)
-__cold
-void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+__printf(3, 4) __cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...);
#else
-#define btrfs_printk(fs_info, fmt, args...) \
+#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \
 	btrfs_no_printk(fs_info, fmt, ##args)
+
+#define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \
+ btrfs_no_printk(fs_info, fmt, ##args)
+
#endif
/*
* Print a message with filesystem info, enclosed in RCU protection.
*/
#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args)
#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args)
#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+ btrfs_printk_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args)
#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args)
/*
* Wrappers that use a ratelimited printk
*/
#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
+ fs_info, LOGLEVEL_DEBUG, fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
+ fs_info, LOGLEVEL_DEBUG, fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_printk_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+	btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args)
#else
/* When printk() is no_printk(), expand to no-op. */
#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
#endif
-#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
+#ifdef CONFIG_PRINTK
+
+#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ _btrfs_printk(fs_info, level, fmt, ##args); \
+ rcu_read_unlock(); \
} while (0)
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+#define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
@@ -93,10 +98,12 @@ do { \
\
rcu_read_lock(); \
if (__ratelimit(&_rs)) \
- btrfs_printk(fs_info, fmt, ##args); \
+ _btrfs_printk(fs_info, level, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
+#endif
+
#ifdef CONFIG_BTRFS_ASSERT
__printf(1, 2)
@@ -113,7 +120,6 @@ static inline void verify_assert_printk_format(const char *fmt, ...) {
*/
#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__
-#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000
/*
* Assertion with optional printk() format.
*
@@ -152,22 +158,6 @@ do { \
} while(0)
#else
-
-/* For GCC < 8.x only the simple output. */
-
-#define ASSERT(cond, args...) \
-do { \
- verify_assert_printk_format("check the format string" args); \
- if (!likely(cond)) { \
- pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
- #cond, (long)(cond), __FILE__, __LINE__); \
- BUG(); \
- } \
-} while(0)
-
-#endif
-
-#else
/* Compile check the @cond expression but don't generate any code. */
#define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond)
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 206587820fec..f53c313ab6e4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -346,6 +346,42 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid
}
#endif
+/*
+ * For simple quotas, audit that a parent qgroup's usage counters equal the
+ * sums of the corresponding counters of all its member qgroups.
+ *
+ * Emits a WARN with both sets of numbers on a mismatch.  Returns true if a
+ * mismatch was detected, false otherwise (including when not in simple quota
+ * mode or when @parent is a level-0 qgroup, which has no members to sum).
+ *
+ * NOTE(review): walks parent->members, so this presumably relies on the
+ * caller holding fs_info->qgroup_lock (true for the relation-update callers).
+ */
+static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *parent)
+{
+	u64 excl_sum = 0;
+	u64 rfer_sum = 0;
+	u64 excl_cmpr_sum = 0;
+	u64 rfer_cmpr_sum = 0;
+	struct btrfs_qgroup_list *glist;
+	int nr_members = 0;
+	bool mismatch;
+
+	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+		return false;
+	if (btrfs_qgroup_level(parent->qgroupid) == 0)
+		return false;
+
+	/* Eligible parent qgroup: squota mode and level > 0. */
+	list_for_each_entry(glist, &parent->members, next_member) {
+		excl_sum += glist->member->excl;
+		rfer_sum += glist->member->rfer;
+		excl_cmpr_sum += glist->member->excl_cmpr;
+		rfer_cmpr_sum += glist->member->rfer_cmpr;
+		nr_members++;
+	}
+	/* Compare each parent counter against the sum of the same counter. */
+	mismatch = (parent->excl != excl_sum || parent->rfer != rfer_sum ||
+		    parent->excl_cmpr != excl_cmpr_sum ||
+		    parent->rfer_cmpr != rfer_cmpr_sum);
+
+	WARN(mismatch,
+	     "parent squota qgroup %hu/%llu has mismatched usage from its %d members. "
+	     "%llu %llu %llu %llu vs %llu %llu %llu %llu\n",
+	     btrfs_qgroup_level(parent->qgroupid),
+	     btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl,
+	     parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum,
+	     rfer_sum, excl_cmpr_sum, rfer_cmpr_sum);
+	return mismatch;
+}
+
__printf(2, 3)
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
@@ -658,7 +694,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
- int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
@@ -671,8 +706,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = dst;
- ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
- return ret;
+ return btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
}
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -797,9 +831,7 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
if (ret > 0)
return -ENOENT;
- ret = btrfs_del_item(trans, quota_root, path);
-
- return ret;
+ return btrfs_del_item(trans, quota_root, path);
}
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
@@ -1562,6 +1594,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
goto out;
}
ret = quick_update_accounting(fs_info, src, dst, 1);
+ squota_check_parent_usage(fs_info, parent);
spin_unlock(&fs_info->qgroup_lock);
out:
kfree(prealloc);
@@ -1580,10 +1613,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
int ret = 0;
int ret2;
- if (!fs_info->quota_root) {
- ret = -ENOTCONN;
- goto out;
- }
+ if (!fs_info->quota_root)
+ return -ENOTCONN;
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst);
@@ -1605,10 +1636,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
delete_item:
ret = del_qgroup_relation_item(trans, src, dst);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
ret2 = del_qgroup_relation_item(trans, dst, src);
if (ret2 < 0 && ret2 != -ENOENT)
- goto out;
+ return ret2;
/* At least one deletion succeeded, return 0 */
if (!ret || !ret2)
@@ -1618,9 +1649,11 @@ delete_item:
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
ret = quick_update_accounting(fs_info, src, dst, -1);
+ ASSERT(parent);
+ squota_check_parent_usage(fs_info, parent);
spin_unlock(&fs_info->qgroup_lock);
}
-out:
+
return ret;
}
@@ -1679,6 +1712,36 @@ out:
return ret;
}
+/*
+ * A higher-level (non-subvolume) qgroup may be deleted only when it no
+ * longer has any member qgroups.
+ */
+static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup)
+{
+	/* Only meaningful for qgroups above level 0. */
+	ASSERT(btrfs_qgroup_level(qgroup->qgroupid));
+	return list_empty(&qgroup->members);
+}
+
+/*
+ * Return true if we can delete the squota qgroup and false otherwise.
+ *
+ * Rules for whether we can delete:
+ *
+ * A subvolume qgroup can be removed iff the subvolume is fully deleted, which
+ * is iff there is 0 usage in the qgroup.
+ *
+ * A higher level qgroup can be removed iff it has no members.
+ * Note: We audit its usage to warn on inconsistencies without blocking deletion.
+ */
+static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
+{
+	ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
+
+	/* Level-0 (subvolume) qgroup: deletable only once fully unused. */
+	if (btrfs_qgroup_level(qgroup->qgroupid) == 0)
+		return !(qgroup->rfer || qgroup->excl ||
+			 qgroup->rfer_cmpr || qgroup->excl_cmpr);
+
+	/* Higher level: audit usage (warn only), deletable when memberless. */
+	squota_check_parent_usage(fs_info, qgroup);
+	return can_delete_parent_qgroup(qgroup);
+}
+
/*
* Return 0 if we can not delete the qgroup (not empty or has children etc).
* Return >0 if we can delete the qgroup.
@@ -1689,23 +1752,13 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup
struct btrfs_key key;
BTRFS_PATH_AUTO_FREE(path);
- /*
- * Squota would never be inconsistent, but there can still be case
- * where a dropped subvolume still has qgroup numbers, and squota
- * relies on such qgroup for future accounting.
- *
- * So for squota, do not allow dropping any non-zero qgroup.
- */
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
- (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
- return 0;
+ /* Since squotas cannot be inconsistent, they have special rules for deletion. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return can_delete_squota_qgroup(fs_info, qgroup);
/* For higher level qgroup, we can only delete it if it has no child. */
- if (btrfs_qgroup_level(qgroup->qgroupid)) {
- if (!list_empty(&qgroup->members))
- return 0;
- return 1;
- }
+ if (btrfs_qgroup_level(qgroup->qgroupid))
+ return can_delete_parent_qgroup(qgroup);
/*
* For level-0 qgroups, we can only delete it if it has no subvolume
@@ -2433,13 +2486,11 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
/* This node is old, no need to trace */
if (child_gen < last_snapshot)
- goto out;
+ return ret;
eb = btrfs_read_node_slot(eb, parent_slot);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- }
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
dst_path->nodes[cur_level] = eb;
dst_path->slots[cur_level] = 0;
@@ -2484,7 +2535,7 @@ cleanup:
dst_path->slots[cur_level] = 0;
dst_path->locks[cur_level] = 0;
}
-out:
+
return ret;
}
@@ -2596,10 +2647,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return ret;
}
- if (root_level == 0) {
- ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
- return ret;
- }
+ if (root_level == 0)
+ return btrfs_qgroup_trace_leaf_items(trans, root_eb);
path = btrfs_alloc_path();
if (!path)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 58dc3e5057ce..314cb95ba846 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -754,8 +754,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
/*
* We may have copied an inline extent into a page of the destination
- * range, so wait for writeback to complete before invalidating pages
- * from the page cache. This is a rare case.
+ * range. So flush delalloc and wait for ordered extent completion.
+ * This is to ensure the invalidation below does not fail, as if for
+ * example it finds a dirty folio, our folio release callback
+ * (btrfs_release_folio()) returns false, which makes the invalidation
+ * return an -EBUSY error. We can't ignore such failures since they
+ * could come from some range other than the copied inline extent's
+ * destination range and we have no way to know that.
*/
ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
if (ret < 0)
@@ -873,7 +878,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
bool same_inode = dst_inode == src_inode;
int ret;
- if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))))
+ if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))
return -EIO;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5bfefc3e9c06..fcd0a2ba3554 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -37,6 +37,7 @@
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
+#include "free-space-tree.h"
/*
* Relocation overview
@@ -3254,7 +3255,6 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
struct btrfs_key key;
bool found = false;
int i;
- int ret;
if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
return 0;
@@ -3278,8 +3278,8 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
}
if (!found)
return -ENOENT;
- ret = delete_block_group_cache(block_group, NULL, space_cache_ino);
- return ret;
+
+ return delete_block_group_cache(block_group, NULL, space_cache_ino);
}
/*
@@ -3616,7 +3616,7 @@ restart:
btrfs_btree_balance_dirty(fs_info);
}
- if (!err) {
+ if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
ret = relocate_file_extent_cluster(rc);
if (ret < 0)
err = ret;
@@ -3860,6 +3860,1433 @@ static const char *stage_to_string(enum reloc_stage stage)
return "unknown";
}
+/*
+ * Insert @num_entries empty items with the given @entries keys into the
+ * remap tree, batching at most one leaf's worth of items per insertion.
+ *
+ * Returns 0 on success or a negative errno; stops at the first failed batch.
+ */
+static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path,
+				  struct btrfs_key *entries, unsigned int num_entries)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_item_batch batch;
+	u32 *data_sizes;
+	u32 max_items;
+
+	/* Upper bound of items a single leaf can hold. */
+	max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
+
+	/* All inserted items are empty, so every data size stays zero. */
+	data_sizes = kcalloc(min_t(u32, num_entries, max_items), sizeof(u32), GFP_NOFS);
+	if (!data_sizes)
+		return -ENOMEM;
+
+	while (true) {
+		batch.keys = entries;
+		batch.data_sizes = data_sizes;
+		batch.total_data_size = 0;
+		batch.nr = min_t(u32, num_entries, max_items);
+
+		ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch);
+		btrfs_release_path(path);
+		/* Do not keep inserting (and overwrite ret) after a failure. */
+		if (ret)
+			break;
+
+		if (num_entries <= max_items)
+			break;
+
+		num_entries -= max_items;
+		entries += max_items;
+	}
+
+	kfree(data_sizes);
+
+	return ret;
+}
+
+/* A contiguous run of allocated space, as a [start, end) byte range. */
+struct space_run {
+	u64 start;
+	u64 end;
+};
+
+/*
+ * Translate the set bits of @bitmap (@size bits, each representing
+ * @block_size bytes starting at byte address @address) into byte ranges
+ * appended to @space_runs / @num_space_runs.
+ *
+ * A run starting exactly where the previous recorded run ends is merged
+ * into it rather than opening a new entry; within one bitmap runs are
+ * separated by clear bits, so this merge matters when the function is
+ * called repeatedly over consecutive bitmap chunks.
+ *
+ * NOTE(review): assumes @space_runs has room for the worst case (every
+ * other bit set) — confirm against callers.
+ */
+static void parse_bitmap(u64 block_size, const unsigned long *bitmap,
+			 unsigned long size, u64 address, struct space_run *space_runs,
+			 unsigned int *num_space_runs)
+{
+	unsigned long pos, end;
+	u64 run_start, run_length;
+
+	/* No bits set at all: nothing to record. */
+	pos = find_first_bit(bitmap, size);
+	if (pos == size)
+		return;
+
+	while (true) {
+		/* [pos, end) is a maximal run of set bits. */
+		end = find_next_zero_bit(bitmap, size, pos);
+
+		run_start = address + (pos * block_size);
+		run_length = (end - pos) * block_size;
+
+		if (*num_space_runs != 0 &&
+		    space_runs[*num_space_runs - 1].end == run_start) {
+			/* Adjacent to the previously recorded run: extend it. */
+			space_runs[*num_space_runs - 1].end += run_length;
+		} else {
+			space_runs[*num_space_runs].start = run_start;
+			space_runs[*num_space_runs].end = run_start + run_length;
+
+			(*num_space_runs)++;
+		}
+
+		if (end == size)
+			break;
+
+		/* The bit at @end is clear, so starting at end + 1 skips nothing. */
+		pos = find_next_bit(bitmap, size, end + 1);
+		if (pos == size)
+			break;
+	}
+}
+
+/*
+ * Apply @diff (may be negative) to @bg->remap_bytes and make sure the block
+ * group is on the transaction's dirty list so the change is written back.
+ *
+ * A block group whose used and remap_bytes both reach zero is handed to the
+ * unused block group machinery via btrfs_mark_bg_unused().
+ */
+static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
+					   struct btrfs_block_group *bg, s64 diff)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	bool bg_already_dirty = true;
+	bool mark_unused = false;
+
+	spin_lock(&bg->lock);
+	bg->remap_bytes += diff;
+	if (bg->used == 0 && bg->remap_bytes == 0)
+		mark_unused = true;
+	spin_unlock(&bg->lock);
+
+	/* Done after dropping bg->lock; decided under the lock above. */
+	if (mark_unused)
+		btrfs_mark_bg_unused(bg);
+
+	/* Dirty the block group once, holding a reference while listed. */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (list_empty(&bg->dirty_list)) {
+		list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
+		bg_already_dirty = false;
+		btrfs_get_block_group(bg);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+	/* Modified block groups are accounted for in the delayed_refs_rsv. */
+	if (!bg_already_dirty)
+		btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+}
+
+/* Private structure for I/O from copy_remapped_data(). */
+struct reloc_io_private {
+	/* Signalled when pending_refs drops to zero. */
+	struct completion done;
+	/* One ref held by the submitter plus one per in-flight bio. */
+	refcount_t pending_refs;
+	/* Nonzero if any bio failed (last error observed wins). */
+	blk_status_t status;
+};
+
+/* Endio handler for the synchronous copies done by copy_remapped_data(). */
+static void reloc_endio(struct btrfs_bio *bbio)
+{
+	struct reloc_io_private *priv = bbio->private;
+	blk_status_t status = bbio->bio.bi_status;
+
+	/* Record the failure; any one of possibly several errors suffices. */
+	if (status)
+		WRITE_ONCE(priv->status, status);
+
+	if (refcount_dec_and_test(&priv->pending_refs))
+		complete(&priv->done);
+
+	/* Must be last: dropping the bio frees @bbio. */
+	bio_put(&bbio->bio);
+}
+
+/*
+ * Synchronously read or write (per @op) @length bytes at logical address
+ * @addr, using the pre-allocated @pages as the data buffer.
+ *
+ * If a single bio cannot hold all the pages, further bios are submitted
+ * against the same @priv completion.  Returns 0 on success or a negative
+ * errno if any bio failed.
+ */
+static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
+				 struct reloc_io_private *priv,
+				 struct page **pages, u64 addr, u64 length,
+				 blk_opf_t op)
+{
+	struct btrfs_bio *bbio;
+	int i;
+
+	init_completion(&priv->done);
+	/* The submitter's own reference; dropped below before waiting. */
+	refcount_set(&priv->pending_refs, 1);
+	priv->status = 0;
+
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode),
+			       addr, reloc_endio, priv);
+	bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT);
+	bbio->is_remap = true;
+
+	i = 0;
+	do {
+		size_t bytes = min_t(u64, length, PAGE_SIZE);
+
+		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
+			/*
+			 * Bio is full: submit it and start a new one at the
+			 * current @addr, then retry the same page (counters
+			 * deliberately not advanced here).
+			 */
+			refcount_inc(&priv->pending_refs);
+			btrfs_submit_bbio(bbio, 0);
+
+			bbio = btrfs_bio_alloc(BIO_MAX_VECS, op,
+					       BTRFS_I(fs_info->btree_inode),
+					       addr, reloc_endio, priv);
+			bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT);
+			bbio->is_remap = true;
+			continue;
+		}
+
+		i++;
+		addr += bytes;
+		length -= bytes;
+	} while (length);
+
+	refcount_inc(&priv->pending_refs);
+	btrfs_submit_bbio(bbio, 0);
+
+	/* Drop our own ref; wait only if bios are still outstanding. */
+	if (!refcount_dec_and_test(&priv->pending_refs))
+		wait_for_completion_io(&priv->done);
+
+	return blk_status_to_errno(READ_ONCE(priv->status));
+}
+
+/*
+ * Copy @length bytes of data from logical address @old_addr to @new_addr,
+ * bouncing through a temporary page array in chunks of at most 1MiB so
+ * memory usage stays bounded regardless of @length.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
+			      u64 new_addr, u64 length)
+{
+	int ret;
+	u64 copy_len = min_t(u64, length, SZ_1M);
+	struct page **pages;
+	struct reloc_io_private priv;
+	/* Only one chunk of up to @copy_len bytes is in flight at a time. */
+	unsigned int nr_pages = DIV_ROUND_UP(copy_len, PAGE_SIZE);
+
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+
+	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	if (ret) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	/* Copy 1MB at a time, to avoid using too much memory. */
+	do {
+		u64 to_copy = min_t(u64, length, copy_len);
+
+		/* Limit to one bio. */
+		to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT);
+
+		ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr,
+					    to_copy, REQ_OP_READ);
+		if (ret)
+			goto end;
+
+		ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr,
+					    to_copy, REQ_OP_WRITE);
+		if (ret)
+			goto end;
+
+		if (to_copy == length)
+			break;
+
+		old_addr += to_copy;
+		new_addr += to_copy;
+		length -= to_copy;
+	} while (true);
+
+	ret = 0;
+end:
+	for (int i = 0; i < nr_pages; i++) {
+		if (pages[i])
+			__free_page(pages[i]);
+	}
+	kfree(pages);
+
+	return ret;
+}
+
+/*
+ * Insert one remap-tree item of @type keyed on (@key_addr, @type, @length)
+ * whose payload records @mapped_addr.  Shared by the forward and backref
+ * variants below, which only differ in key type and address direction.
+ */
+static int insert_remap_tree_item(struct btrfs_trans_handle *trans,
+				  struct btrfs_path *path, u8 type,
+				  u64 key_addr, u64 length, u64 mapped_addr)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_remap_item remap = { 0 };
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = key_addr;
+	key.type = type;
+	key.offset = length;
+
+	ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
+				      &key, sizeof(struct btrfs_remap_item));
+	if (ret)
+		return ret;
+
+	leaf = path->nodes[0];
+	btrfs_set_stack_remap_address(&remap, mapped_addr);
+	write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]),
+			    sizeof(struct btrfs_remap_item));
+
+	btrfs_release_path(path);
+
+	return 0;
+}
+
+/* Forward mapping: keyed on @old_addr, payload is @new_addr. */
+static int add_remap_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_path *path, u64 new_addr, u64 length,
+			  u64 old_addr)
+{
+	return insert_remap_tree_item(trans, path, BTRFS_REMAP_KEY,
+				      old_addr, length, new_addr);
+}
+
+/* Backref mapping: keyed on @new_addr, payload is @old_addr. */
+static int add_remap_backref_item(struct btrfs_trans_handle *trans,
+				  struct btrfs_path *path, u64 new_addr,
+				  u64 length, u64 old_addr)
+{
+	return insert_remap_tree_item(trans, path, BTRFS_REMAP_BACKREF_KEY,
+				      new_addr, length, old_addr);
+}
+
+/*
+ * Move one existing remap target out of @bg: reserve a replacement logical
+ * range, copy the remapped data there, then repoint the remap item and its
+ * backref at the new location.  A short reservation is handled by splitting
+ * the remap and leaving the tail for the next pass; the caller
+ * (move_existing_remaps()) loops until @bg holds no remapped bytes.
+ *
+ * @new_addr is the current (remapped) location inside @bg, @old_addr the
+ * original logical address the remap item is keyed on.
+ */
+static int move_existing_remap(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg, u64 new_addr,
+ u64 length, u64 old_addr)
+{
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ struct btrfs_remap_item *remap_ptr;
+ struct btrfs_remap_item remap = { 0 };
+ struct btrfs_key key, ins;
+ u64 dest_addr, dest_length, min_size;
+ struct btrfs_block_group *dest_bg;
+ int ret;
+ const bool is_data = (bg->flags & BTRFS_BLOCK_GROUP_DATA);
+ struct btrfs_space_info *sinfo = bg->space_info;
+ bool mutex_taken = false;
+ bool bg_needs_free_space;
+
+ /* Account the pending allocation before reserving the extent. */
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, length);
+ spin_unlock(&sinfo->lock);
+
+ if (is_data)
+ min_size = fs_info->sectorsize;
+ else
+ min_size = fs_info->nodesize;
+
+ ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
+ 0, 0, &ins, is_data, false);
+ if (unlikely(ret)) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, -length);
+ spin_unlock(&sinfo->lock);
+ return ret;
+ }
+
+ dest_addr = ins.objectid;
+ dest_length = ins.offset;
+
+ /* Metadata destinations must stay nodesize aligned; trim the excess. */
+ if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
+ u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
+
+ btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
+ dest_length - new_length, 0);
+
+ dest_length = new_length;
+ }
+
+ trans = btrfs_join_transaction(fs_info->remap_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto end;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+ mutex_taken = true;
+
+ /* Find old remap entry. */
+ key.objectid = old_addr;
+ key.type = BTRFS_REMAP_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
+ if (ret == 1) {
+ /*
+ * Not a problem if the remap entry wasn't found: that means
+ * that another transaction has deallocated the data.
+ * move_existing_remaps() loops until the BG contains no
+ * remaps, so we can just return 0 in this case.
+ */
+ btrfs_release_path(path);
+ ret = 0;
+ goto end;
+ } else if (unlikely(ret)) {
+ goto end;
+ }
+
+ ret = copy_remapped_data(fs_info, new_addr, dest_addr, dest_length);
+ if (unlikely(ret))
+ goto end;
+
+ /* Change data of old remap entry. */
+ leaf = path->nodes[0];
+ remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
+ btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ /* Short allocation: shrink the item to the part we actually moved. */
+ if (dest_length != length) {
+ key.offset = dest_length;
+ btrfs_set_item_key_safe(trans, path, &key);
+ }
+
+ btrfs_release_path(path);
+
+ if (dest_length != length) {
+ /* Add remap item for remainder. */
+ ret = add_remap_item(trans, path, new_addr + dest_length,
+ length - dest_length, old_addr + dest_length);
+ if (unlikely(ret))
+ goto end;
+ }
+
+ /* Change or remove old backref. */
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
+ if (unlikely(ret)) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ leaf = path->nodes[0];
+
+ if (dest_length == length) {
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+ if (unlikely(ret)) {
+ btrfs_release_path(path);
+ goto end;
+ }
+ } else {
+ /* Keep the backref for the not-yet-moved tail of the range. */
+ key.objectid += dest_length;
+ key.offset -= dest_length;
+ btrfs_set_item_key_safe(trans, path, &key);
+ btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
+
+ write_extent_buffer(leaf, &remap,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_remap_item));
+ }
+
+ btrfs_release_path(path);
+
+ /* Add new backref. */
+ ret = add_remap_backref_item(trans, path, dest_addr, dest_length, old_addr);
+ if (unlikely(ret))
+ goto end;
+
+ adjust_block_group_remap_bytes(trans, bg, -dest_length);
+
+ ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
+ if (unlikely(ret))
+ goto end;
+
+ dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
+
+ adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
+
+ mutex_lock(&dest_bg->free_space_lock);
+ bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &dest_bg->runtime_flags);
+ mutex_unlock(&dest_bg->free_space_lock);
+
+ /*
+ * Hold our reference across the last use of dest_bg; dropping it
+ * before btrfs_add_block_group_free_space() could leave us with a
+ * stale pointer.
+ */
+ if (bg_needs_free_space) {
+ ret = btrfs_add_block_group_free_space(trans, dest_bg);
+ if (unlikely(ret)) {
+ btrfs_put_block_group(dest_bg);
+ goto end;
+ }
+ }
+
+ btrfs_put_block_group(dest_bg);
+
+ ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
+ if (unlikely(ret)) {
+ /* Best effort undo of the earlier free space addition. */
+ btrfs_remove_from_free_space_tree(trans, new_addr, dest_length);
+ goto end;
+ }
+
+ ret = 0;
+
+end:
+ if (mutex_taken)
+ mutex_unlock(&fs_info->remap_mutex);
+
+ btrfs_dec_block_group_reservations(fs_info, dest_addr);
+
+ if (unlikely(ret)) {
+ btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
+
+ if (trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ } else {
+ dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
+ btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
+ btrfs_put_block_group(dest_bg);
+
+ ret = btrfs_commit_transaction(trans);
+ }
+
+ return ret;
+}
+
+/*
+ * Evacuate every remap whose data currently lives inside @bg, by walking
+ * the BTRFS_REMAP_BACKREF_KEY items keyed on addresses within the block
+ * group and moving each target elsewhere via move_existing_remap().
+ * Loops until bg->remap_bytes drops to zero.
+ */
+static int move_existing_remaps(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *bg,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_remap_item *remap;
+ u64 old_addr;
+
+ /* Look for backrefs in remap tree. */
+ while (bg->remap_bytes > 0) {
+ bool found = false;
+
+ key.objectid = bg->start;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The slot the search lands on is not guaranteed to hold a
+ * backref item, so scan forward, re-reading the key after
+ * every advance instead of trusting a stale one.
+ */
+ while (true) {
+ leaf = path->nodes[0];
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(fs_info->remap_root, path);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+
+ if (ret) {
+ /* Ran off the end of the tree. */
+ btrfs_release_path(path);
+ break;
+ }
+
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.type == BTRFS_REMAP_BACKREF_KEY) {
+ found = true;
+ break;
+ }
+
+ path->slots[0]++;
+ }
+
+ if (!found)
+ break;
+
+ remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
+ old_addr = btrfs_remap_address(leaf, remap);
+
+ btrfs_release_path(path);
+
+ /* key was (re)read from the item we are about to move. */
+ ret = move_existing_remap(fs_info, path, bg, key.objectid,
+ key.offset, old_addr);
+ if (ret)
+ return ret;
+ }
+
+ ASSERT(bg->remap_bytes == 0);
+
+ return 0;
+}
+
+/*
+ * Populate the remap tree with identity-remap items for @bg.
+ *
+ * The block group's free space tree entries are scanned and coalesced into
+ * contiguous "space runs"; a BTRFS_IDENTITY_REMAP_KEY item is then created
+ * for every region that is NOT free, i.e. the currently allocated ranges.
+ * The number of identity items created is stored in
+ * bg->identity_remap_count.
+ *
+ * Returns 0 on success, negative errno on failure.
+ */
+static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_free_space_info *fsi;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_root *space_root;
+ u32 extent_count;
+ struct space_run *space_runs = NULL;
+ unsigned int num_space_runs = 0;
+ struct btrfs_key *entries = NULL;
+ unsigned int max_entries, num_entries;
+ int ret;
+
+ mutex_lock(&bg->free_space_lock);
+
+ /*
+ * Make sure the free space tree entries for this block group exist
+ * before we scan them; btrfs_add_block_group_free_space() must be
+ * called without free_space_lock held, hence the unlock/relock.
+ */
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
+ mutex_unlock(&bg->free_space_lock);
+
+ ret = btrfs_add_block_group_free_space(trans, bg);
+ if (ret)
+ return ret;
+
+ mutex_lock(&bg->free_space_lock);
+ }
+
+ fsi = btrfs_search_free_space_info(trans, bg, path, 0);
+ if (IS_ERR(fsi)) {
+ mutex_unlock(&bg->free_space_lock);
+ return PTR_ERR(fsi);
+ }
+
+ /* Upper bound on the number of free space runs we can encounter. */
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
+
+ btrfs_release_path(path);
+
+ space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
+ if (!space_runs) {
+ mutex_unlock(&bg->free_space_lock);
+ return -ENOMEM;
+ }
+
+ key.objectid = bg->start;
+ key.type = 0;
+ key.offset = 0;
+
+ space_root = btrfs_free_space_root(bg);
+
+ ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
+ if (ret < 0) {
+ mutex_unlock(&bg->free_space_lock);
+ goto out;
+ }
+
+ ret = 0;
+
+ /* Walk all free space items of this block group, coalescing runs. */
+ while (true) {
+ leaf = path->nodes[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid >= bg->start + bg->length)
+ break;
+
+ if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ /* Merge with the previous run when adjacent. */
+ if (num_space_runs != 0 &&
+ space_runs[num_space_runs - 1].end == found_key.objectid) {
+ space_runs[num_space_runs - 1].end =
+ found_key.objectid + found_key.offset;
+ } else {
+ ASSERT(num_space_runs < extent_count);
+
+ space_runs[num_space_runs].start = found_key.objectid;
+ space_runs[num_space_runs].end =
+ found_key.objectid + found_key.offset;
+
+ num_space_runs++;
+ }
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ void *bitmap;
+ unsigned long offset;
+ u32 data_size;
+
+ offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ data_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (data_size != 0) {
+ /* Copy out the bitmap and expand it into runs. */
+ bitmap = kmalloc(data_size, GFP_NOFS);
+ if (!bitmap) {
+ mutex_unlock(&bg->free_space_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ read_extent_buffer(leaf, bitmap, offset, data_size);
+
+ parse_bitmap(fs_info->sectorsize, bitmap,
+ data_size * BITS_PER_BYTE,
+ found_key.objectid, space_runs,
+ &num_space_runs);
+
+ ASSERT(num_space_runs <= extent_count);
+
+ kfree(bitmap);
+ }
+ }
+
+ path->slots[0]++;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(space_root, path);
+ if (ret != 0) {
+ /* ret == 1 means end of tree: not an error. */
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+ }
+
+ btrfs_release_path(path);
+
+ mutex_unlock(&bg->free_space_lock);
+
+ /*
+ * Worst case one identity entry per gap between runs plus one before
+ * the first run and one after the last.
+ */
+ max_entries = extent_count + 2;
+ entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
+ if (!entries) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ num_entries = 0;
+
+ if (num_space_runs == 0) {
+ /* No free space at all: one identity entry for the whole BG. */
+ entries[num_entries].objectid = bg->start;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset = bg->length;
+ num_entries++;
+ } else {
+ /* Allocated space before the first free run. */
+ if (space_runs[0].start > bg->start) {
+ entries[num_entries].objectid = bg->start;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset = space_runs[0].start - bg->start;
+ num_entries++;
+ }
+
+ /* Allocated gaps between consecutive free runs. */
+ for (unsigned int i = 1; i < num_space_runs; i++) {
+ entries[num_entries].objectid = space_runs[i - 1].end;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset =
+ space_runs[i].start - space_runs[i - 1].end;
+ num_entries++;
+ }
+
+ /* Allocated space after the last free run. */
+ if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
+ entries[num_entries].objectid =
+ space_runs[num_space_runs - 1].end;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset =
+ bg->start + bg->length - space_runs[num_space_runs - 1].end;
+ num_entries++;
+ }
+
+ /* Entirely free block group: nothing to insert. */
+ if (num_entries == 0)
+ goto out;
+ }
+
+ bg->identity_remap_count = num_entries;
+
+ ret = add_remap_tree_entries(trans, path, entries, num_entries);
+
+out:
+ kfree(entries);
+ kfree(space_runs);
+
+ return ret;
+}
+
+/*
+ * Find the first identity remap item keyed at or after @last_start that
+ * still lies before @bg_end.  On success *start and *length are set from
+ * the item's key and 0 is returned; -ENOENT means the block group has no
+ * further identity remaps.  @path is always released on return.
+ */
+static int find_next_identity_remap(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 bg_end,
+ u64 last_start, u64 *start, u64 *length)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct btrfs_root *remap_root = trans->fs_info->remap_root;
+ struct extent_buffer *leaf;
+
+ key.objectid = last_start;
+ key.type = BTRFS_IDENTITY_REMAP_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ /* Skip over non-identity items until we leave the block group. */
+ while (true) {
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(remap_root, path);
+
+ if (ret != 0) {
+ /* End of tree: no identity remap left. */
+ if (ret == 1)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid >= bg_end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
+ *start = found_key.objectid;
+ *length = found_key.offset;
+ ret = 0;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * Truncate the on-disk chunk item for @chunk_map so it describes zero
+ * stripes, keeping only the fixed-size chunk header.  Used once every
+ * identity remap of the chunk is gone and the device extents have been
+ * released.  Returns 0 on success, -ENOENT if the chunk item is missing,
+ * or another negative errno.
+ */
+static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *chunk_map,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+ int ret;
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_map->start;
+
+ /* Chunk tree modifications need their own metadata reservation. */
+ btrfs_reserve_chunk_metadata(trans, false);
+
+ ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ btrfs_trans_release_chunk_metadata(trans);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
+ btrfs_set_chunk_num_stripes(leaf, chunk, 0);
+ btrfs_set_chunk_sub_stripes(leaf, chunk, 0);
+
+ /* Shrink the item to the header, dropping the stripe array. */
+ btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe), 1);
+
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ btrfs_release_path(path);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return 0;
+}
+
+/*
+ * Called once the last identity remap of a remapped block group has gone
+ * away: release the underlying device extents, update the device items,
+ * clear the chunk's allocated state and truncate its stripe array, so the
+ * physical space can be reused while the remapped logical addresses remain
+ * valid.  Commits the transaction before returning.
+ */
+int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ unsigned int num_items;
+ BTRFS_PATH_AUTO_FREE(path);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * One item for each entry we're removing in the dev extents tree, and
+ * another for each device. DUP chunks are all on one device,
+ * everything else has one device per stripe.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DUP)
+ num_items = chunk_map->num_stripes + 1;
+ else
+ num_items = 2 * chunk_map->num_stripes;
+
+ trans = btrfs_start_transaction_fallback_global_rsv(fs_info->tree_root, num_items);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_remove_dev_extents(trans, chunk_map);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ /* Device item updates are serialized by chunk_mutex. */
+ mutex_lock(&trans->fs_info->chunk_mutex);
+ for (unsigned int i = 0; i < chunk_map->num_stripes; i++) {
+ ret = btrfs_update_device(trans, chunk_map->stripes[i].dev);
+ if (unlikely(ret)) {
+ mutex_unlock(&trans->fs_info->chunk_mutex);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+ mutex_unlock(&trans->fs_info->chunk_mutex);
+
+ write_lock(&trans->fs_info->mapping_tree_lock);
+ btrfs_chunk_map_device_clear_bits(chunk_map, CHUNK_ALLOCATED);
+ write_unlock(&trans->fs_info->mapping_tree_lock);
+
+ btrfs_remove_bg_from_sinfo(bg);
+
+ spin_lock(&bg->lock);
+ clear_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags);
+ spin_unlock(&bg->lock);
+
+ ret = remove_chunk_stripes(trans, chunk_map, path);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Adjust bg->identity_remap_count by @delta and mark the block group dirty
+ * so the new count gets persisted with the transaction.  When the count
+ * drops to zero the block group is flagged as fully remapped.
+ */
+static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg, int delta)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool bg_already_dirty = true;
+ bool mark_fully_remapped = false;
+
+ /* Catch underflow of the identity remap count. */
+ WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
+
+ spin_lock(&bg->lock);
+
+ bg->identity_remap_count += delta;
+
+ if (bg->identity_remap_count == 0 &&
+ !test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags)) {
+ set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags);
+ mark_fully_remapped = true;
+ }
+
+ spin_unlock(&bg->lock);
+
+ /* Queue the block group for writeback if it is not already dirty. */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+
+ if (mark_fully_remapped)
+ btrfs_mark_bg_fully_remapped(bg, trans);
+}
+
+/*
+ * Record that [old_addr, old_addr + length) has moved to @new_addr: carve
+ * the range out of the identity remap item that covers it (shrinking it,
+ * deleting it, and/or re-inserting an item for the remainder), then insert
+ * the remap item and its backref.  The identity remap count of @src_bg is
+ * adjusted by the net number of identity items added/removed.
+ *
+ * Returns 0 on success, -ENOENT if no identity remap covers @old_addr, or
+ * another negative errno.
+ */
+static int add_remap_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *src_bg, u64 old_addr,
+ u64 new_addr, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key, new_key;
+ int ret;
+ int identity_count_delta = 0;
+
+ /* Search past old_addr, then step back to the item covering it. */
+ key.objectid = old_addr;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto end;
+
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* The preceding item must be an identity remap spanning old_addr. */
+ if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
+ key.objectid > old_addr ||
+ key.objectid + key.offset <= old_addr) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ /* Shorten or delete identity mapping entry. */
+ if (key.objectid == old_addr) {
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+ if (ret)
+ goto end;
+
+ identity_count_delta--;
+ } else {
+ new_key.objectid = key.objectid;
+ new_key.type = BTRFS_IDENTITY_REMAP_KEY;
+ new_key.offset = old_addr - key.objectid;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ }
+
+ btrfs_release_path(path);
+
+ /* Create new remap entry. */
+ ret = add_remap_item(trans, path, new_addr, length, old_addr);
+ if (ret)
+ goto end;
+
+ /* Add entry for remainder of identity mapping, if necessary. */
+ if (key.objectid + key.offset != old_addr + length) {
+ new_key.objectid = old_addr + length;
+ new_key.type = BTRFS_IDENTITY_REMAP_KEY;
+ new_key.offset = key.objectid + key.offset - old_addr - length;
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &new_key, 0);
+ if (ret)
+ goto end;
+
+ btrfs_release_path(path);
+
+ identity_count_delta++;
+ }
+
+ /* Add backref. */
+ ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
+ if (ret)
+ goto end;
+
+ if (identity_count_delta != 0)
+ adjust_identity_remap_count(trans, src_bg, identity_count_delta);
+
+end:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * Set BTRFS_BLOCK_GROUP_REMAPPED on both the in-memory chunk map and the
+ * on-disk chunk item for the chunk starting at @start.  Returns -ENOENT if
+ * either cannot be found.
+ */
+static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 start)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_chunk_map *chunk_map;
+ struct btrfs_key key;
+ u64 type;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+
+ read_lock(&fs_info->mapping_tree_lock);
+
+ chunk_map = btrfs_find_chunk_map_nolock(fs_info, start, 1);
+ if (!chunk_map) {
+ read_unlock(&fs_info->mapping_tree_lock);
+ return -ENOENT;
+ }
+
+ /*
+ * NOTE(review): chunk_map->type is written under only the read side
+ * of mapping_tree_lock; the caller in this file holds
+ * fs_info->remap_mutex — confirm no other writer races on ->type.
+ */
+ chunk_map->type |= BTRFS_BLOCK_GROUP_REMAPPED;
+ type = chunk_map->type;
+
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = start;
+
+ ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
+ if (ret == 1) {
+ /* Chunk item should exist; treat "not found" as an error. */
+ ret = -ENOENT;
+ goto end;
+ } else if (ret < 0)
+ goto end;
+
+ leaf = path->nodes[0];
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
+ btrfs_set_chunk_type(leaf, chunk, type);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ ret = 0;
+end:
+ /* Drop the reference taken by btrfs_find_chunk_map_nolock(). */
+ btrfs_free_chunk_map(chunk_map);
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * Perform one transaction's worth of remap-based relocation for @src_bg:
+ * find the next identity remap at or after *last_start, reserve a new
+ * logical range, copy the data there, and replace (part of) the identity
+ * remap with a real remap entry pointing at the new location.
+ *
+ * Returns 0 to continue (with *last_start advanced), 1 when no identity
+ * remaps remain in the block group, or a negative errno on failure.
+ */
+static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *src_bg,
+ struct btrfs_path *path, u64 *last_start)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *extent_root;
+ struct btrfs_key ins;
+ struct btrfs_block_group *dest_bg = NULL;
+ u64 start = 0, remap_length = 0;
+ u64 length, new_addr, min_size;
+ int ret;
+ const bool is_data = (src_bg->flags & BTRFS_BLOCK_GROUP_DATA);
+ bool no_more = false;
+ bool made_reservation = false, bg_needs_free_space;
+ struct btrfs_space_info *sinfo = src_bg->space_info;
+
+ extent_root = btrfs_extent_root(fs_info, src_bg->start);
+
+ trans = btrfs_start_transaction(extent_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
+ *last_start, &start, &remap_length);
+ if (ret == -ENOENT) {
+ /* Block group fully remapped; finish up via the next: path. */
+ no_more = true;
+ goto next;
+ } else if (ret) {
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ /* Try to reserve enough space for block. */
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
+ spin_unlock(&sinfo->lock);
+
+ if (is_data)
+ min_size = fs_info->sectorsize;
+ else
+ min_size = fs_info->nodesize;
+
+ /*
+ * We're using btrfs_reserve_extent() to allocate a contiguous
+ * logical address range, but this will become a remap item rather than
+ * an extent in the extent tree.
+ *
+ * Short allocations are fine: it means that we chop off the beginning
+ * of the identity remap that we're processing, and will tackle the
+ * rest of it the next time round.
+ */
+ ret = btrfs_reserve_extent(fs_info->fs_root, remap_length, remap_length,
+ min_size, 0, 0, &ins, is_data, false);
+ if (ret) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
+ spin_unlock(&sinfo->lock);
+
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ made_reservation = true;
+
+ new_addr = ins.objectid;
+ length = ins.offset;
+
+ /* Metadata targets must stay nodesize aligned; trim the excess. */
+ if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
+ u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
+
+ btrfs_free_reserved_extent(fs_info, new_addr + new_length,
+ length - new_length, 0);
+
+ length = new_length;
+ }
+
+ dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
+
+ mutex_lock(&dest_bg->free_space_lock);
+ bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &dest_bg->runtime_flags);
+ mutex_unlock(&dest_bg->free_space_lock);
+
+ if (bg_needs_free_space) {
+ ret = btrfs_add_block_group_free_space(trans, dest_bg);
+ if (ret)
+ goto fail;
+ }
+
+ ret = copy_remapped_data(fs_info, start, new_addr, length);
+ if (ret)
+ goto fail;
+
+ ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
+ if (ret)
+ goto fail;
+
+ ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
+ if (ret) {
+ /* Undo the free space removal before bailing out. */
+ btrfs_add_to_free_space_tree(trans, new_addr, length);
+ goto fail;
+ }
+
+ adjust_block_group_remap_bytes(trans, dest_bg, length);
+ btrfs_free_reserved_bytes(dest_bg, length, 0);
+
+ /* Remapped bytes in the source BG are accounted as read-only. */
+ spin_lock(&sinfo->lock);
+ sinfo->bytes_readonly += length;
+ spin_unlock(&sinfo->lock);
+
+next:
+ if (dest_bg)
+ btrfs_put_block_group(dest_bg);
+
+ if (made_reservation)
+ btrfs_dec_block_group_reservations(fs_info, new_addr);
+
+ mutex_unlock(&fs_info->remap_mutex);
+
+ if (src_bg->identity_remap_count == 0) {
+ bool mark_fully_remapped = false;
+
+ spin_lock(&src_bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags)) {
+ mark_fully_remapped = true;
+ set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags);
+ }
+ spin_unlock(&src_bg->lock);
+
+ if (mark_fully_remapped)
+ btrfs_mark_bg_fully_remapped(src_bg, trans);
+ }
+
+ ret = btrfs_end_transaction(trans);
+ if (ret)
+ return ret;
+
+ if (no_more)
+ return 1;
+
+ *last_start = start;
+
+ return 0;
+
+fail:
+ if (dest_bg)
+ btrfs_put_block_group(dest_bg);
+
+ btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
+
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+
+ return ret;
+}
+
+/*
+ * Drive remap-based relocation of @bg: run one transaction at a time via
+ * do_remap_reloc_trans() until it reports completion (1) or an error.
+ */
+static int do_remap_reloc(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ u64 cursor = bg->start;
+
+ for (;;) {
+ int ret = do_remap_reloc_trans(fs_info, bg, path, &cursor);
+
+ if (ret == 1)
+ return 0;
+ if (ret)
+ return ret;
+ }
+}
+
+/*
+ * Translate *logical through the remap tree.  *length is clamped so the
+ * range does not cross the end of the covering remap entry; for a real
+ * (non-identity) remap item, *logical is rewritten to the corresponding
+ * offset within the target address.  Returns -ENOENT if neither a remap
+ * nor an identity remap entry covers *logical.
+ */
+int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_remap_item *remap;
+ BTRFS_PATH_AUTO_FREE(path);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Search past *logical, then step back to the covering item. */
+ key.objectid = *logical;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ if (path->slots[0] == 0)
+ return -ENOENT;
+
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.type != BTRFS_REMAP_KEY &&
+ found_key.type != BTRFS_IDENTITY_REMAP_KEY) {
+ return -ENOENT;
+ }
+
+ /* Item must actually span *logical. */
+ if (found_key.objectid > *logical ||
+ found_key.objectid + found_key.offset <= *logical) {
+ return -ENOENT;
+ }
+
+ /* Clamp the length so it stays within this remap entry. */
+ if (*logical + *length > found_key.objectid + found_key.offset)
+ *length = found_key.objectid + found_key.offset - *logical;
+
+ /* Identity remaps translate to themselves. */
+ if (found_key.type == BTRFS_IDENTITY_REMAP_KEY)
+ return 0;
+
+ remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
+ *logical += btrfs_remap_address(leaf, remap) - found_key.objectid;
+
+ return 0;
+}
+
+/*
+ * Switch @bg over to remap-based relocation: create identity remap items
+ * covering its allocated space, set BTRFS_BLOCK_GROUP_REMAPPED on the block
+ * group and its chunk item, and drop the block group's free space tree
+ * entries (free space is now implied by the remap tree).
+ */
+static int start_block_group_remapping(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_trans_handle *trans;
+ bool bg_already_dirty = true;
+ int ret, ret2;
+
+ ret = btrfs_cache_block_group(bg, true);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(fs_info->remap_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /* We need to run delayed refs, to make sure FST is up to date. */
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ /* Someone else may have started the remapping already. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
+ ret = 0;
+ goto end;
+ }
+
+ ret = create_remap_tree_entries(trans, path, bg);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ spin_lock(&bg->lock);
+ bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED;
+ spin_unlock(&bg->lock);
+
+ /* Queue the block group for writeback if not already dirty. */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+
+ ret = mark_chunk_remapped(trans, path, bg->start);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ ret = btrfs_remove_block_group_free_space(trans, bg);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ btrfs_remove_free_space_cache(bg);
+
+end:
+ mutex_unlock(&fs_info->remap_mutex);
+
+ ret2 = btrfs_end_transaction(trans);
+ if (!ret)
+ ret = ret2;
+
+ return ret;
+}
+
+/*
+ * Classic (non remap-tree) relocation loop, factored out of
+ * btrfs_relocate_block_group(): repeatedly run relocate_block_group()
+ * until no further extents are found, flushing ordered data between
+ * passes when needed.
+ */
+static int do_nonremap_reloc(struct btrfs_fs_info *fs_info, bool verbose,
+ struct reloc_control *rc)
+{
+ int ret;
+
+ while (1) {
+ enum reloc_stage finishes_stage;
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = relocate_block_group(rc);
+ mutex_unlock(&fs_info->cleaner_mutex);
+
+ finishes_stage = rc->stage;
+ /*
+ * We may have gotten ENOSPC after we already dirtied some
+ * extents. If writeout happens while we're relocating a
+ * different block group we could end up hitting the
+ * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
+ * btrfs_reloc_cow_block. Make sure we write everything out
+ * properly so we don't trip over this problem, and then break
+ * out of the loop if we hit an error.
+ */
+ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+ int wb_ret;
+
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode),
+ 0, (u64)-1);
+ if (wb_ret && ret == 0)
+ ret = wb_ret;
+ invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1);
+ rc->stage = UPDATE_DATA_PTRS;
+ }
+
+ if (ret < 0)
+ return ret;
+
+ if (rc->extents_found == 0)
+ break;
+
+ if (verbose)
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found, stage_to_string(finishes_stage));
+ }
+
+ /* Nothing should remain pinned, reserved or in use at this point. */
+ WARN_ON(rc->block_group->pinned > 0);
+ WARN_ON(rc->block_group->reserved > 0);
+ WARN_ON(rc->block_group->used > 0);
+
+ return 0;
+}
+
/*
* function to relocate all extents in a block group.
*/
@@ -3870,7 +5297,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
struct reloc_control *rc;
struct inode *inode;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
int ret;
bool bg_is_ro = false;
@@ -3932,7 +5359,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
}
inode = lookup_free_space_inode(rc->block_group, path);
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (!IS_ERR(inode))
ret = delete_block_group_cache(rc->block_group, inode, 0);
@@ -3942,11 +5369,13 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
if (ret && ret != -ENOENT)
goto out;
- rc->data_inode = create_reloc_inode(rc->block_group);
- if (IS_ERR(rc->data_inode)) {
- ret = PTR_ERR(rc->data_inode);
- rc->data_inode = NULL;
- goto out;
+ if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ rc->data_inode = create_reloc_inode(rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ ret = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ goto out;
+ }
}
if (verbose)
@@ -3959,54 +5388,31 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
ret = btrfs_zone_finish(rc->block_group);
WARN_ON(ret && ret != -EAGAIN);
- while (1) {
- enum reloc_stage finishes_stage;
-
- mutex_lock(&fs_info->cleaner_mutex);
- ret = relocate_block_group(rc);
- mutex_unlock(&fs_info->cleaner_mutex);
-
- finishes_stage = rc->stage;
- /*
- * We may have gotten ENOSPC after we already dirtied some
- * extents. If writeout happens while we're relocating a
- * different block group we could end up hitting the
- * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
- * btrfs_reloc_cow_block. Make sure we write everything out
- * properly so we don't trip over this problem, and then break
- * out of the loop if we hit an error.
- */
- if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- int wb_ret;
-
- wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
- (u64)-1);
- if (wb_ret && ret == 0)
- ret = wb_ret;
- invalidate_mapping_pages(rc->data_inode->i_mapping,
- 0, -1);
- rc->stage = UPDATE_DATA_PTRS;
+ if (should_relocate_using_remap_tree(bg)) {
+ if (bg->remap_bytes != 0) {
+ ret = move_existing_remaps(fs_info, bg, path);
+ if (ret)
+ goto out;
}
-
- if (ret < 0)
+ ret = start_block_group_remapping(fs_info, path, bg);
+ if (ret)
goto out;
- if (rc->extents_found == 0)
- break;
+ ret = do_remap_reloc(fs_info, path, rc->block_group);
+ if (ret)
+ goto out;
- if (verbose)
- btrfs_info(fs_info, "found %llu extents, stage: %s",
- rc->extents_found,
- stage_to_string(finishes_stage));
+ btrfs_delete_unused_bgs(fs_info);
+ } else {
+ ret = do_nonremap_reloc(fs_info, verbose, rc);
}
- WARN_ON(rc->block_group->pinned > 0);
- WARN_ON(rc->block_group->reserved > 0);
- WARN_ON(rc->block_group->used > 0);
out:
if (ret && bg_is_ro)
btrfs_dec_block_group_ro(rc->block_group);
- iput(rc->data_inode);
+ if (!btrfs_fs_incompat(fs_info, REMAP_TREE))
+ iput(rc->data_inode);
+ btrfs_free_path(path);
reloc_chunk_end(fs_info);
out_put_bg:
btrfs_put_block_group(bg);
@@ -4200,7 +5606,7 @@ out:
btrfs_free_path(path);
- if (ret == 0) {
+ if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
@@ -4414,3 +5820,260 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
logical = fs_info->reloc_ctl->block_group->start;
return logical;
}
+
+/*
+ * Insert a remap record for [old_addr, old_addr + length) into the remap
+ * tree.
+ *
+ * If @old_addr equals @new_addr a bare BTRFS_IDENTITY_REMAP_KEY item is
+ * inserted with no payload (the range maps to itself).  Otherwise a
+ * BTRFS_REMAP_KEY item carrying @new_addr is inserted at @old_addr, plus a
+ * BTRFS_REMAP_BACKREF_KEY item at @new_addr carrying @old_addr, so the
+ * mapping can be looked up from either end.
+ *
+ * Note that the length of the range is stored in key.offset, not in the
+ * item body.
+ *
+ * @trans:    running transaction
+ * @path:     caller-supplied scratch path; released before returning on
+ *            success (NOTE(review): on btrfs_insert_empty_item() failure the
+ *            path is returned still holding the leaf -- presumably the
+ *            caller releases/frees it, confirm)
+ * @old_addr: source logical address of the range
+ * @length:   size of the range in bytes
+ * @new_addr: destination logical address of the range
+ *
+ * Returns 0 on success or a negative errno from the item insertions.
+ * NOTE(review): if inserting the backref fails, the forward remap item has
+ * already been inserted -- presumably the caller aborts the transaction on
+ * error, confirm.
+ */
+static int insert_remap_item(struct btrfs_trans_handle *trans, struct btrfs_path *path,
+			     u64 old_addr, u64 length, u64 new_addr)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_key key;
+	struct btrfs_remap_item remap = { 0 };
+
+	if (old_addr == new_addr) {
+		/* Add new identity remap item. */
+		key.objectid = old_addr;
+		key.type = BTRFS_IDENTITY_REMAP_KEY;
+		key.offset = length;
+
+		/* Identity remaps carry no payload, item size 0. */
+		ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
+					      &key, 0);
+		if (ret)
+			return ret;
+	} else {
+		/* Add new remap item. */
+		key.objectid = old_addr;
+		key.type = BTRFS_REMAP_KEY;
+		key.offset = length;
+
+		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+					      path, &key, sizeof(struct btrfs_remap_item));
+		if (ret)
+			return ret;
+
+		/* Forward item points at the destination address. */
+		btrfs_set_stack_remap_address(&remap, new_addr);
+
+		write_extent_buffer(path->nodes[0], &remap,
+				    btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
+				    sizeof(struct btrfs_remap_item));
+
+		/* Drop the leaf before searching again for the backref slot. */
+		btrfs_release_path(path);
+
+		/* Add new backref item. */
+		key.objectid = new_addr;
+		key.type = BTRFS_REMAP_BACKREF_KEY;
+		key.offset = length;
+
+		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+					      path, &key,
+					      sizeof(struct btrfs_remap_item));
+		if (ret)
+			return ret;
+
+		/* Backref item points back at the source address. */
+		btrfs_set_stack_remap_address(&remap, old_addr);
+
+		write_extent_buffer(path->nodes[0], &remap,
+				    btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
+				    sizeof(struct btrfs_remap_item));
+	}
+
+	btrfs_release_path(path);
+
+	return 0;
+}
+
+/*
+ * Punch a hole in the remap item or identity remap item pointed to by path,
+ * for the range [hole_start, hole_start + hole_length).
+ *
+ * The existing item (and, for a non-identity remap, its backref item) is
+ * deleted, then up to two new items are re-inserted for the parts of the
+ * original range lying before and after the hole.  For a non-identity remap
+ * the freed part of the destination range is returned to the free space
+ * tree and the destination block group's remap_bytes is decreased.
+ *
+ * @trans:       running transaction
+ * @path:        path positioned at the remap item; released before returning
+ * @bg:          source block group, used to adjust its identity remap count
+ * @hole_start:  first byte of the hole, must lie inside the item's range
+ * @hole_length: size of the hole in bytes (may extend past the item)
+ *
+ * Returns the number of bytes of overlap between the hole and this item
+ * (> 0; the caller advances by this much to handle holes spanning several
+ * items), or a negative errno.  NOTE(review): the u64 overlap is returned
+ * through an int -- fine while chunk sizes stay below INT_MAX, confirm.
+ */
+static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
+					struct btrfs_path *path,
+					struct btrfs_block_group *bg,
+					u64 hole_start, u64 hole_length)
+{
+	int ret;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct extent_buffer *leaf = path->nodes[0];
+	struct btrfs_key key;
+	u64 hole_end, new_addr, remap_start, remap_length, remap_end;
+	u64 overlap_length;
+	bool is_identity_remap;
+	int identity_count_delta = 0;
+
+	hole_end = hole_start + hole_length;
+
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	is_identity_remap = (key.type == BTRFS_IDENTITY_REMAP_KEY);
+
+	/* key.offset carries the length of the remapped range. */
+	remap_start = key.objectid;
+	remap_length = key.offset;
+	remap_end = remap_start + remap_length;
+
+	if (is_identity_remap) {
+		/* Identity remaps map onto themselves; no payload to read. */
+		new_addr = remap_start;
+	} else {
+		struct btrfs_remap_item *remap_ptr;
+
+		remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
+		new_addr = btrfs_remap_address(leaf, remap_ptr);
+	}
+
+	/* Delete old item. */
+	ret = btrfs_del_item(trans, fs_info->remap_root, path);
+	btrfs_release_path(path);
+	if (ret)
+		return ret;
+
+	if (is_identity_remap) {
+		identity_count_delta = -1;
+	} else {
+		/* Remove backref. */
+		key.objectid = new_addr;
+		key.type = BTRFS_REMAP_BACKREF_KEY;
+		key.offset = remap_length;
+
+		ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
+		if (ret) {
+			/* ret == 1 means not found: a missing backref is corruption. */
+			if (ret == 1) {
+				btrfs_release_path(path);
+				ret = -ENOENT;
+			}
+			return ret;
+		}
+
+		ret = btrfs_del_item(trans, fs_info->remap_root, path);
+
+		btrfs_release_path(path);
+
+		if (ret)
+			return ret;
+	}
+
+	/* If hole_start > remap_start, re-add the start of the remap item. */
+	if (hole_start > remap_start) {
+		ret = insert_remap_item(trans, path, remap_start,
+					hole_start - remap_start, new_addr);
+		if (ret)
+			return ret;
+
+		if (is_identity_remap)
+			identity_count_delta++;
+	}
+
+	/* If hole_end < remap_end, re-add the end of the remap item. */
+	if (hole_end < remap_end) {
+		/* Tail keeps the same source->destination offset. */
+		ret = insert_remap_item(trans, path, hole_end,
+					remap_end - hole_end,
+					hole_end - remap_start + new_addr);
+		if (ret)
+			return ret;
+
+		if (is_identity_remap)
+			identity_count_delta++;
+	}
+
+	/* Net change: -1 for the deleted item, +1 per re-added piece. */
+	if (identity_count_delta != 0)
+		adjust_identity_remap_count(trans, bg, identity_count_delta);
+
+	overlap_length = min_t(u64, hole_end, remap_end) -
+		max_t(u64, hole_start, remap_start);
+
+	if (!is_identity_remap) {
+		struct btrfs_block_group *dest_bg;
+
+		/*
+		 * NOTE(review): btrfs_lookup_block_group() can return NULL;
+		 * presumably a destination bg always exists for a live remap,
+		 * confirm before relying on it.
+		 */
+		dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
+		adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length);
+		btrfs_put_block_group(dest_bg);
+		ret = btrfs_add_to_free_space_tree(trans,
+				hole_start - remap_start + new_addr,
+				overlap_length);
+		if (ret)
+			return ret;
+	}
+
+	/* Positive return: bytes of the hole consumed by this item. */
+	ret = overlap_length;
+
+	return ret;
+}
+
+/*
+ * Return 1 if remove_range_from_remap_tree() has been called successfully,
+ * 0 if block group wasn't remapped, and a negative number on error.
+ *
+ * Remove [bytenr, bytenr + num_bytes) from the remap tree when an extent in
+ * a remapped block group is freed.  A no-op (returning 0) when the
+ * REMAP_TREE feature is off, no block group contains @bytenr, or the block
+ * group is not marked BTRFS_BLOCK_GROUP_REMAPPED.  The range may span
+ * several remap items; each iteration of the loop punches the hole out of
+ * one item and advances by the overlap it reported.
+ *
+ * @trans:     running transaction
+ * @path:      caller-supplied scratch path; released before returning
+ * @bytenr:    start of the freed range (logical address)
+ * @num_bytes: size of the freed range
+ *
+ * All remap tree manipulation happens under fs_info->remap_mutex.
+ */
+int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
+					struct btrfs_path *path,
+					u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	struct btrfs_block_group *bg;
+	int ret, length;
+
+	if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
+	      BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
+		return 0;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg)
+		return 0;
+
+	mutex_lock(&fs_info->remap_mutex);
+
+	if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
+		mutex_unlock(&fs_info->remap_mutex);
+		btrfs_put_block_group(bg);
+		return 0;
+	}
+
+	do {
+		/*
+		 * Search for the largest possible key at bytenr, then step
+		 * back one slot to land on the remap item whose range
+		 * contains bytenr (its key.objectid is <= bytenr).
+		 */
+		key.objectid = bytenr;
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
+		if (ret < 0)
+			goto end;
+
+		leaf = path->nodes[0];
+		if (path->slots[0] == 0) {
+			/* Nothing before the search position: no covering item. */
+			ret = -ENOENT;
+			goto end;
+		}
+
+		path->slots[0]--;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
+		    found_key.type != BTRFS_REMAP_KEY) {
+			ret = -ENOENT;
+			goto end;
+		}
+
+		/* key.offset is the item's length; bytenr must fall inside. */
+		if (bytenr < found_key.objectid ||
+		    bytenr >= found_key.objectid + found_key.offset) {
+			ret = -ENOENT;
+			goto end;
+		}
+
+		length = remove_range_from_remap_tree(trans, path, bg, bytenr, num_bytes);
+		if (length < 0) {
+			ret = length;
+			goto end;
+		}
+
+		/* Advance past the part of the range this item covered. */
+		bytenr += length;
+		num_bytes -= length;
+	} while (num_bytes > 0);
+
+	ret = 1;
+
+end:
+	mutex_unlock(&fs_info->remap_mutex);
+
+	btrfs_put_block_group(bg);
+	btrfs_release_path(path);
+
+	return ret;
+}
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 5c36b3f84b57..d647823b5d13 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -12,6 +12,17 @@ struct btrfs_trans_handle;
struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
+/*
+ * Decide whether a block group should be relocated via the remap tree
+ * instead of the classic copy-based relocation.
+ *
+ * Requires the REMAP_TREE incompat feature; system chunks and the
+ * metadata-remap chunks themselves are excluded and always relocated the
+ * classic way.
+ */
+static inline bool should_relocate_using_remap_tree(const struct btrfs_block_group *bg)
+{
+	if (!btrfs_fs_incompat(bg->fs_info, REMAP_TREE))
+		return false;
+
+	if (bg->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP))
+		return false;
+
+	return true;
+}
+
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
bool verbose);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
@@ -31,5 +42,11 @@ int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
+int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length);
+int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes);
+int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map,
+ struct btrfs_block_group *bg);
#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a7e297ab0a7..37a4173c0a0b 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -217,8 +217,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root;
- int err = 0;
- int ret;
path = btrfs_alloc_path();
if (!path)
@@ -230,20 +228,19 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
while (1) {
u64 root_objectid;
+ int ret;
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
- if (ret < 0) {
- err = ret;
- break;
- }
+ if (ret < 0)
+ return ret;
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(tree_root, path);
if (ret < 0)
- err = ret;
- if (ret != 0)
- break;
+ return ret;
+ else if (ret > 0)
+ return 0;
leaf = path->nodes[0];
}
@@ -252,34 +249,32 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
- break;
+ return 0;
root_objectid = key.offset;
key.offset++;
root = btrfs_get_fs_root(fs_info, root_objectid, false);
- err = PTR_ERR_OR_ZERO(root);
- if (err && err != -ENOENT) {
+ ret = PTR_ERR_OR_ZERO(root);
+ if (ret && ret != -ENOENT) {
break;
- } else if (err == -ENOENT) {
+ } else if (ret == -ENOENT) {
struct btrfs_trans_handle *trans;
- btrfs_release_path(path);
-
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- btrfs_handle_fs_error(fs_info, err,
- "Failed to start trans to delete orphan item");
- break;
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "failed to join transaction to delete orphan item: %d",
+ ret);
+ return ret;
}
- err = btrfs_del_orphan_item(trans, tree_root,
- root_objectid);
+ ret = btrfs_del_orphan_item(trans, tree_root, root_objectid);
btrfs_end_transaction(trans);
- if (err) {
- btrfs_handle_fs_error(fs_info, err,
- "Failed to delete root orphan item");
- break;
+ if (ret) {
+ btrfs_err(fs_info,
+ "failed to delete root orphan item: %d", ret);
+ return ret;
}
continue;
}
@@ -307,7 +302,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
btrfs_put_root(root);
}
- return err;
+ return 0;
}
/* drop the root item for 'key' from the tree root */
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a40ee41f42c6..2a64e2d50ced 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -6,7 +6,6 @@
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
-#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
@@ -718,7 +717,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
struct btrfs_header *header = first_kaddr;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct btrfs_csum_ctx csum;
u8 on_disk_csum[BTRFS_CSUM_SIZE];
u8 calculated_csum[BTRFS_CSUM_SIZE];
@@ -760,17 +759,16 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
}
/* Now check tree block csum. */
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
- crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
- fs_info->sectorsize - BTRFS_CSUM_SIZE);
+ btrfs_csum_init(&csum, fs_info->csum_type);
+ btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE,
+ fs_info->sectorsize - BTRFS_CSUM_SIZE);
for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
- crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
- fs_info->sectorsize);
+ btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i),
+ fs_info->sectorsize);
}
- crypto_shash_final(shash, calculated_csum);
+ btrfs_csum_final(&csum, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
@@ -1690,15 +1688,15 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
scrub_stripe_reset_bitmaps(stripe);
/* The range must be inside the bg. */
- ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length,
+ ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg),
"bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
- bg->start, logical_start, logical_end, bg->start + bg->length);
+ bg->start, logical_start, logical_end, btrfs_block_group_end(bg));
ret = find_first_extent_item(extent_root, extent_path, logical_start,
logical_len);
/* Either error or not found. */
if (ret)
- goto out;
+ return ret;
get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
&extent_gen);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
@@ -1731,7 +1729,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
ret = find_first_extent_item(extent_root, extent_path, cur_logical,
stripe_end - cur_logical + 1);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
ret = 0;
break;
@@ -1765,7 +1763,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
stripe->logical, stripe_end,
stripe->csums, &csum_bitmap);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0)
ret = 0;
@@ -1775,7 +1773,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
}
}
set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
-out:
+
return ret;
}
@@ -2173,8 +2171,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
u64 full_stripe_start)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_path extent_path = { 0 };
- struct btrfs_path csum_path = { 0 };
+ BTRFS_PATH_AUTO_RELEASE(extent_path);
+ BTRFS_PATH_AUTO_RELEASE(csum_path);
struct scrub_stripe *stripe;
bool all_empty = true;
const int data_stripes = nr_data_stripes(map);
@@ -2226,7 +2224,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
full_stripe_start + btrfs_stripe_nr_to_offset(i),
BTRFS_STRIPE_LEN, stripe);
if (ret < 0)
- goto out;
+ return ret;
/*
* No extent in this data stripe, need to manually mark them
* initialized to make later read submission happy.
@@ -2248,10 +2246,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
break;
}
}
- if (all_empty) {
- ret = 0;
- goto out;
- }
+ if (all_empty)
+ return 0;
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
@@ -2292,20 +2288,15 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
- ret = -EIO;
- goto out;
+ return ret;
}
bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
stripe->nr_sectors);
}
/* Now we can check and regenerate the P/Q stripe. */
- ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
- &extent_bitmap);
-out:
- btrfs_release_path(&extent_path);
- btrfs_release_path(&csum_path);
- return ret;
+ return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
+ &extent_bitmap);
}
/*
@@ -2328,7 +2319,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
int ret = 0;
/* The range must be inside the bg */
- ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+ ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg));
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
@@ -2420,12 +2411,13 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
const u64 logical_increment = simple_stripe_full_stripe_len(map);
const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
const u64 orig_physical = map->stripes[stripe_index].physical;
+ const u64 end = btrfs_block_group_end(bg);
const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
u64 cur_logical = orig_logical;
u64 cur_physical = orig_physical;
int ret = 0;
- while (cur_logical < bg->start + bg->length) {
+ while (cur_logical < end) {
/*
* Inside each stripe, RAID0 is just SINGLE, and RAID10 is
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d8127a7120c2..3dcfdba018b5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6449,11 +6449,9 @@ static int process_extent(struct send_ctx *sctx,
if (sctx->parent_root && !sctx->cur_inode_new) {
ret = is_extent_unchanged(sctx, path, key);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
+ return ret;
+ if (ret)
goto out_hole;
- }
} else {
struct btrfs_file_extent_item *ei;
u8 type;
@@ -6469,31 +6467,25 @@ static int process_extent(struct send_ctx *sctx,
* we have enough commands queued up to justify rev'ing
* the send spec.
*/
- if (type == BTRFS_FILE_EXTENT_PREALLOC) {
- ret = 0;
- goto out;
- }
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ return 0;
/* Have a hole, just skip it. */
- if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
- ret = 0;
- goto out;
- }
+ if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0)
+ return 0;
}
}
ret = find_extent_clone(sctx, path, key->objectid, key->offset,
sctx->cur_inode_size, &found_clone);
if (ret != -ENOENT && ret < 0)
- goto out;
+ return ret;
ret = send_write_or_clone(sctx, path, key, found_clone);
if (ret)
- goto out;
+ return ret;
out_hole:
- ret = maybe_send_hole(sctx, path, key);
-out:
- return ret;
+ return maybe_send_hole(sctx, path, key);
}
static int process_all_extents(struct send_ctx *sctx)
@@ -6535,23 +6527,24 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
int *pending_move,
int *refs_processed)
{
- int ret = 0;
+ int ret;
if (sctx->cur_ino == 0)
- goto out;
+ return 0;
+
if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
- goto out;
+ return 0;
+
if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
- goto out;
+ return 0;
ret = process_recorded_refs(sctx, pending_move);
if (ret < 0)
- goto out;
+ return ret;
*refs_processed = 1;
-out:
- return ret;
+ return 0;
}
static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
@@ -6768,7 +6761,7 @@ static void close_current_inode(struct send_ctx *sctx)
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
- int ret = 0;
+ int ret;
struct btrfs_key *key = sctx->cmp_key;
struct btrfs_inode_item *left_ii = NULL;
struct btrfs_inode_item *right_ii = NULL;
@@ -6860,7 +6853,7 @@ static int changed_inode(struct send_ctx *sctx,
if (result == BTRFS_COMPARE_TREE_NEW) {
if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
sctx->ignore_cur_inode = true;
- goto out;
+ return 0;
}
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = true;
@@ -6888,7 +6881,7 @@ static int changed_inode(struct send_ctx *sctx,
old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
if (new_nlinks == 0 && old_nlinks == 0) {
sctx->ignore_cur_inode = true;
- goto out;
+ return 0;
} else if (new_nlinks == 0 || old_nlinks == 0) {
sctx->cur_inode_new_gen = 1;
}
@@ -6914,7 +6907,7 @@ static int changed_inode(struct send_ctx *sctx,
ret = process_all_refs(sctx,
BTRFS_COMPARE_TREE_DELETED);
if (ret < 0)
- goto out;
+ return ret;
}
/*
@@ -6935,11 +6928,11 @@ static int changed_inode(struct send_ctx *sctx,
left_ii);
ret = send_create_inode_if_needed(sctx);
if (ret < 0)
- goto out;
+ return ret;
ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
if (ret < 0)
- goto out;
+ return ret;
/*
* Advance send_progress now as we did not get
* into process_recorded_refs_if_needed in the
@@ -6953,10 +6946,10 @@ static int changed_inode(struct send_ctx *sctx,
*/
ret = process_all_extents(sctx);
if (ret < 0)
- goto out;
+ return ret;
ret = process_all_new_xattrs(sctx);
if (ret < 0)
- goto out;
+ return ret;
}
} else {
sctx->cur_inode_gen = left_gen;
@@ -6970,8 +6963,7 @@ static int changed_inode(struct send_ctx *sctx,
}
}
-out:
- return ret;
+ return 0;
}
/*
@@ -7104,20 +7096,20 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
u32 item_size;
u32 cur_offset = 0;
int ref_name_len;
- int ret = 0;
/* Easy case, just check this one dirid */
if (key->type == BTRFS_INODE_REF_KEY) {
dirid = key->offset;
- ret = dir_changed(sctx, dirid);
- goto out;
+ return dir_changed(sctx, dirid);
}
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
+ int ret;
+
extref = (struct btrfs_inode_extref *)(ptr +
cur_offset);
dirid = btrfs_inode_extref_parent(leaf, extref);
@@ -7127,11 +7119,10 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
continue;
ret = dir_changed(sctx, dirid);
if (ret)
- break;
+ return ret;
last_dirid = dirid;
}
-out:
- return ret;
+ return 0;
}
/*
@@ -7212,12 +7203,12 @@ static int changed_cb(struct btrfs_path *left_path,
ret = finish_inode_if_needed(sctx, 0);
if (ret < 0)
- goto out;
+ return ret;
/* Ignore non-FS objects */
if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
key->objectid == BTRFS_FREE_SPACE_OBJECTID)
- goto out;
+ return 0;
if (key->type == BTRFS_INODE_ITEM_KEY) {
ret = changed_inode(sctx, result);
@@ -7234,7 +7225,6 @@ static int changed_cb(struct btrfs_path *left_path,
ret = changed_verity(sctx, result);
}
-out:
return ret;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3f08e450f796..bb5aac7ee9d2 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DATA)
return BTRFS_MAX_DATA_CHUNK_SIZE;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP))
return SZ_32M;
/* Handle BTRFS_BLOCK_GROUP_METADATA */
@@ -329,7 +329,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
struct btrfs_super_block *disk_super;
u64 features;
u64 flags;
- int mixed = 0;
+ bool mixed = false;
int ret;
disk_super = fs_info->super_copy;
@@ -338,26 +338,35 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
features = btrfs_super_incompat_flags(disk_super);
if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
- mixed = 1;
+ mixed = true;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
ret = create_space_info(fs_info, flags);
if (ret)
- goto out;
+ return ret;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
+ if (ret)
+ return ret;
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
ret = create_space_info(fs_info, flags);
if (ret)
- goto out;
+ return ret;
flags = BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+ flags = BTRFS_BLOCK_GROUP_METADATA_REMAP;
+ ret = create_space_info(fs_info, flags);
}
-out:
+
return ret;
}
@@ -370,8 +379,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
factor = btrfs_bg_type_to_factor(block_group->flags);
spin_lock(&space_info->lock);
- space_info->total_bytes += block_group->length;
- space_info->disk_total += block_group->length * factor;
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) ||
+ block_group->identity_remap_count != 0) {
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ }
+
space_info->bytes_used += block_group->used;
space_info->disk_used += block_group->used * factor;
space_info->bytes_readonly += block_group->bytes_super;
@@ -606,27 +620,12 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
-static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
-{
- switch (space_info->flags) {
- case BTRFS_BLOCK_GROUP_SYSTEM:
- return "SYSTEM";
- case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
- return "DATA+METADATA";
- case BTRFS_BLOCK_GROUP_DATA:
- return "DATA";
- case BTRFS_BLOCK_GROUP_METADATA:
- return "METADATA";
- default:
- return "UNKNOWN";
- }
-}
-
static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
{
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, remap_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
@@ -634,7 +633,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
static void __btrfs_dump_space_info(const struct btrfs_space_info *info)
{
const struct btrfs_fs_info *fs_info = info->fs_info;
- const char *flag_str = space_info_flag_to_str(info);
+ const char *flag_str = btrfs_space_info_type_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
@@ -672,8 +671,7 @@ again:
u64 avail;
spin_lock(&cache->lock);
- avail = cache->length - cache->used - cache->pinned -
- cache->reserved - cache->bytes_super - cache->zone_unusable;
+ avail = btrfs_block_group_available_space(cache);
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
cache->start, cache->length, cache->used, cache->pinned,
@@ -2099,11 +2097,11 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
+static bool do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
- bool try_again = true;
+ bool will_reclaim = false;
bool urgent;
spin_lock(&space_info->lock);
@@ -2121,7 +2119,7 @@ again:
spin_lock(&bg->lock);
thresh = mult_perc(bg->length, thresh_pct);
if (bg->used < thresh && bg->reclaim_mark) {
- try_again = false;
+ will_reclaim = true;
reclaim = true;
}
bg->reclaim_mark++;
@@ -2138,12 +2136,13 @@ again:
* If we have any staler groups, we don't touch the fresher ones, but if we
* really need a block group, do take a fresh one.
*/
- if (try_again && urgent) {
- try_again = false;
+ if (!will_reclaim && urgent) {
+ urgent = false;
goto again;
}
up_read(&space_info->groups_sem);
+ return will_reclaim;
}
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
@@ -2153,7 +2152,8 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6
lockdep_assert_held(&space_info->lock);
space_info->reclaimable_bytes += bytes;
- if (space_info->reclaimable_bytes >= chunk_sz)
+ if (space_info->reclaimable_bytes > 0 &&
+ space_info->reclaimable_bytes >= chunk_sz)
btrfs_set_periodic_reclaim_ready(space_info, true);
}
@@ -2180,7 +2180,6 @@ static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
spin_lock(&space_info->lock);
ret = space_info->periodic_reclaim_ready;
- btrfs_set_periodic_reclaim_ready(space_info, false);
spin_unlock(&space_info->lock);
return ret;
@@ -2194,8 +2193,10 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
list_for_each_entry(space_info, &fs_info->space_info, list) {
if (!btrfs_should_periodic_reclaim(space_info))
continue;
- for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
- do_reclaim_sweep(space_info, raid);
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
+ if (do_reclaim_sweep(space_info, raid))
+ btrfs_set_periodic_reclaim_ready(space_info, false);
+ }
}
}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 446c0614ad4a..0703f24b23f7 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -307,4 +307,20 @@ int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
+/*
+ * Human-readable name of a space_info's type, used in messages and state
+ * dumps (see __btrfs_dump_space_info()).
+ */
+static inline const char *btrfs_space_info_type_str(const struct btrfs_space_info *space_info)
+{
+	switch (space_info->flags) {
+	case BTRFS_BLOCK_GROUP_SYSTEM:
+		return "SYSTEM";
+	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+		return "DATA+METADATA";
+	case BTRFS_BLOCK_GROUP_DATA:
+		return "DATA";
+	case BTRFS_BLOCK_GROUP_METADATA:
+		return "METADATA";
+	case BTRFS_BLOCK_GROUP_METADATA_REMAP:
+		/* Space info created when the REMAP_TREE feature is enabled. */
+		return "METADATA_REMAP";
+	default:
+		return "UNKNOWN";
+	}
+}
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index af56fdbba65d..d64d303b6edc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2483,6 +2483,18 @@ static void btrfs_shutdown(struct super_block *sb)
}
#endif
+/*
+ * Super operation backing the mountstats interface: only zoned filesystems
+ * currently export extra statistics, everything else shows nothing.
+ *
+ * Always returns 0; btrfs_show_zoned_stats() writes directly into @seq.
+ */
+static int btrfs_show_stats(struct seq_file *seq, struct dentry *root)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+
+	if (btrfs_is_zoned(fs_info))
+		btrfs_show_zoned_stats(fs_info, seq);
+
+	return 0;
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2498,6 +2510,7 @@ static const struct super_operations btrfs_super_ops = {
.unfreeze_fs = btrfs_unfreeze,
.nr_cached_objects = btrfs_nr_cached_objects,
.free_cached_objects = btrfs_free_cached_objects,
+ .show_stats = btrfs_show_stats,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
.remove_bdev = btrfs_remove_bdev,
.shutdown = btrfs_shutdown,
@@ -2700,7 +2713,3 @@ module_exit(exit_btrfs_fs)
MODULE_DESCRIPTION("B-Tree File System (BTRFS)");
MODULE_LICENSE("GPL");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: xxhash64");
-MODULE_SOFTDEP("pre: sha256");
-MODULE_SOFTDEP("pre: blake2b-256");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4b3c2acac51a..27bfb7b55ec4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -11,7 +11,6 @@
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/string_choices.h>
-#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "discard.h"
@@ -300,6 +299,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
/* Remove once support for raid stripe tree is feature complete. */
BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
+/* Remove once support for remap tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -332,6 +333,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
+ BTRFS_FEAT_ATTR_PTR(remap_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -1253,10 +1255,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ const char *csum_name = btrfs_super_csum_name(csum_type);
- return sysfs_emit(buf, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s-lib)\n", csum_name, csum_name);
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -1540,47 +1541,6 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
-static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
- struct kobj_attribute *a, char *buf)
-{
- struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
-
- switch (READ_ONCE(fs_devices->offload_csum_mode)) {
- case BTRFS_OFFLOAD_CSUM_AUTO:
- return sysfs_emit(buf, "auto\n");
- case BTRFS_OFFLOAD_CSUM_FORCE_ON:
- return sysfs_emit(buf, "1\n");
- case BTRFS_OFFLOAD_CSUM_FORCE_OFF:
- return sysfs_emit(buf, "0\n");
- default:
- WARN_ON(1);
- return -EINVAL;
- }
-}
-
-static ssize_t btrfs_offload_csum_store(struct kobject *kobj,
- struct kobj_attribute *a, const char *buf,
- size_t len)
-{
- struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
- int ret;
- bool val;
-
- ret = kstrtobool(buf, &val);
- if (ret == 0)
- WRITE_ONCE(fs_devices->offload_csum_mode,
- val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF);
- else if (ret == -EINVAL && sysfs_streq(buf, "auto"))
- WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO);
- else
- return -EINVAL;
-
- return len;
-}
-BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store);
-#endif
-
/*
* Per-filesystem information and stats.
*
@@ -1600,9 +1560,6 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
- BTRFS_ATTR_PTR(, offload_csum),
-#endif
NULL,
};
@@ -1972,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info)
case BTRFS_BLOCK_GROUP_SYSTEM:
ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
return "system";
+ case BTRFS_BLOCK_GROUP_METADATA_REMAP:
+ return "metadata-remap";
default:
WARN_ON(1);
return "invalid-combination";
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index b576897d71cc..7f13c05d3736 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -301,6 +301,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_delayed_refs(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_chunk_allocation(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 4307bdaa6749..b03d85a6e5ef 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -7,8 +7,10 @@
#define BTRFS_TESTS_H
#include <linux/types.h>
+#include <linux/cleanup.h>
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
int btrfs_run_sanity_tests(void);
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
@@ -45,13 +47,18 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);
+int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
+DEFINE_FREE(btrfs_free_dummy_fs_info, struct btrfs_fs_info *,
+ btrfs_free_dummy_fs_info(_T))
void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
+DEFINE_FREE(btrfs_free_dummy_block_group, struct btrfs_block_group *,
+ btrfs_free_dummy_block_group(_T));
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tests/chunk-allocation-tests.c b/fs/btrfs/tests/chunk-allocation-tests.c
new file mode 100644
index 000000000000..9beb0602fc8c
--- /dev/null
+++ b/fs/btrfs/tests/chunk-allocation-tests.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026 Meta. All rights reserved.
+ */
+
+#include <linux/sizes.h>
+#include "btrfs-tests.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../extent-io-tree.h"
+
+/*
+ * Tests for chunk allocator pending extent internals.
+ * btrfs_find_hole_in_pending_extents() and btrfs_first_pending_extent() form
+ * the core of searching the chunk allocation pending extent bitmap and have
+ * well-defined semantics, so unit testing them helps ensure correctness.
+ */
+
+/*
+ * Describes the inputs to the system and expected results
+ * when testing btrfs_find_hole_in_pending_extents().
+ */
+struct pending_extent_test_case {
+ const char *name;
+ /* Input range to search. */
+ u64 hole_start;
+ u64 hole_len;
+ /* The size of hole we are searching for. */
+ u64 min_hole_size;
+ /*
+ * Pending extents to set up (up to 2, creating up to 3 holes).
+ * An entry with len == 0 is skipped.
+ */
+ struct {
+ u64 start;
+ u64 len;
+ } pending_extents[2];
+ /* Expected outputs. */
+ bool expected_found;
+ u64 expected_start;
+ u64 expected_len;
+};
+
+static const struct pending_extent_test_case find_hole_tests[] = {
+ {
+ .name = "no pending extents",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = { },
+ .expected_found = true,
+ .expected_start = 0,
+ .expected_len = 10ULL * SZ_1G,
+ },
+ {
+ .name = "pending extent at start of range",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = 0, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = SZ_1G,
+ .expected_len = 9ULL * SZ_1G,
+ },
+ {
+ .name = "pending extent overlapping start of range",
+ .hole_start = SZ_1G,
+ .hole_len = 9ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = 0, .len = SZ_2G },
+ },
+ .expected_found = true,
+ .expected_start = SZ_2G,
+ .expected_len = 8ULL * SZ_1G,
+ },
+ {
+ .name = "two holes; first hole is exactly big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = 0,
+ .expected_len = SZ_1G,
+ },
+ {
+ .name = "two holes; first hole is big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = SZ_2G, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = 0,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "two holes; second hole is big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_2G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = SZ_2G,
+ .expected_len = 8ULL * SZ_1G,
+ },
+ {
+ .name = "three holes; first hole big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_2G,
+ .pending_extents = {
+ { .start = SZ_2G, .len = SZ_1G },
+ { .start = 4ULL * SZ_1G, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = 0,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "three holes; second hole big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_2G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ { .start = 5ULL * SZ_1G, .len = SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = SZ_2G,
+ .expected_len = 3ULL * SZ_1G,
+ },
+ {
+ .name = "three holes; third hole big enough",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_2G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G },
+ },
+ .expected_found = true,
+ .expected_start = 8ULL * SZ_1G,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "three holes; all holes too small",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_2G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ { .start = 3ULL * SZ_1G, .len = 6ULL * SZ_1G },
+ },
+ .expected_found = false,
+ .expected_start = 0,
+ .expected_len = SZ_1G,
+ },
+ {
+ .name = "three holes; all holes too small; first biggest",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = 3ULL * SZ_1G,
+ .pending_extents = {
+ { .start = SZ_2G, .len = SZ_1G },
+ { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G },
+ },
+ .expected_found = false,
+ .expected_start = 0,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "three holes; all holes too small; second biggest",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = 3ULL * SZ_1G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G },
+ },
+ .expected_found = false,
+ .expected_start = SZ_2G,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "three holes; all holes too small; third biggest",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = 3ULL * SZ_1G,
+ .pending_extents = {
+ { .start = SZ_1G, .len = SZ_1G },
+ { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G },
+ },
+ .expected_found = false,
+ .expected_start = 8ULL * SZ_1G,
+ .expected_len = SZ_2G,
+ },
+ {
+ .name = "hole entirely allocated by pending",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = 0, .len = 10ULL * SZ_1G },
+ },
+ .expected_found = false,
+ .expected_start = 10ULL * SZ_1G,
+ .expected_len = 0,
+ },
+ {
+ .name = "pending extent at end of range",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .min_hole_size = SZ_1G,
+ .pending_extents = {
+ { .start = 9ULL * SZ_1G, .len = SZ_2G },
+ },
+ .expected_found = true,
+ .expected_start = 0,
+ .expected_len = 9ULL * SZ_1G,
+ },
+ {
+ .name = "zero length input",
+ .hole_start = SZ_1G,
+ .hole_len = 0,
+ .min_hole_size = SZ_1G,
+ .pending_extents = { },
+ .expected_found = false,
+ .expected_start = SZ_1G,
+ .expected_len = 0,
+ },
+};
+
+static int test_find_hole_in_pending(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ test_msg("running find_hole_in_pending_extents tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ device = btrfs_alloc_dummy_device(fs_info);
+ if (IS_ERR(device)) {
+ test_err("failed to allocate dummy device");
+ ret = PTR_ERR(device);
+ goto out_free_fs_info;
+ }
+ device->fs_info = fs_info;
+
+ for (int i = 0; i < ARRAY_SIZE(find_hole_tests); i++) {
+ const struct pending_extent_test_case *test_case = &find_hole_tests[i];
+ u64 hole_start = test_case->hole_start;
+ u64 hole_len = test_case->hole_len;
+ bool found;
+
+ for (int j = 0; j < ARRAY_SIZE(test_case->pending_extents); j++) {
+ u64 start = test_case->pending_extents[j].start;
+ u64 len = test_case->pending_extents[j].len;
+
+ if (!len)
+ continue;
+ btrfs_set_extent_bit(&device->alloc_state,
+ start, start + len - 1,
+ CHUNK_ALLOCATED, NULL);
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ found = btrfs_find_hole_in_pending_extents(device, &hole_start, &hole_len,
+ test_case->min_hole_size);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (found != test_case->expected_found) {
+ test_err("%s: expected found=%d, got found=%d",
+ test_case->name, test_case->expected_found, found);
+ ret = -EINVAL;
+ goto out_clear_pending_extents;
+ }
+ if (hole_start != test_case->expected_start ||
+ hole_len != test_case->expected_len) {
+ test_err("%s: expected [%llu, %llu), got [%llu, %llu)",
+ test_case->name, test_case->expected_start,
+ test_case->expected_start +
+ test_case->expected_len,
+ hole_start, hole_start + hole_len);
+ ret = -EINVAL;
+ goto out_clear_pending_extents;
+ }
+out_clear_pending_extents:
+ btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1,
+ CHUNK_ALLOCATED, NULL);
+ if (ret)
+ break;
+ }
+
+out_free_fs_info:
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+/*
+ * Describes the inputs to the system and expected results
+ * when testing btrfs_first_pending_extent().
+ */
+struct first_pending_test_case {
+ const char *name;
+ /* The range to look for a pending extent in. */
+ u64 hole_start;
+ u64 hole_len;
+ /* The pending extent to look for. */
+ struct {
+ u64 start;
+ u64 len;
+ } pending_extent;
+ /* Expected outputs. */
+ bool expected_found;
+ u64 expected_pending_start;
+ u64 expected_pending_end;
+};
+
+static const struct first_pending_test_case first_pending_tests[] = {
+ {
+ .name = "no pending extent",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .pending_extent = { 0, 0 },
+ .expected_found = false,
+ },
+ {
+ .name = "pending extent at search start",
+ .hole_start = SZ_1G,
+ .hole_len = 9ULL * SZ_1G,
+ .pending_extent = { SZ_1G, SZ_1G },
+ .expected_found = true,
+ .expected_pending_start = SZ_1G,
+ .expected_pending_end = SZ_2G - 1,
+ },
+ {
+ .name = "pending extent overlapping search start",
+ .hole_start = SZ_1G,
+ .hole_len = 9ULL * SZ_1G,
+ .pending_extent = { 0, SZ_2G },
+ .expected_found = true,
+ .expected_pending_start = 0,
+ .expected_pending_end = SZ_2G - 1,
+ },
+ {
+ .name = "pending extent inside search range",
+ .hole_start = 0,
+ .hole_len = 10ULL * SZ_1G,
+ .pending_extent = { SZ_2G, SZ_1G },
+ .expected_found = true,
+ .expected_pending_start = SZ_2G,
+ .expected_pending_end = 3ULL * SZ_1G - 1,
+ },
+ {
+ .name = "pending extent outside search range",
+ .hole_start = 0,
+ .hole_len = SZ_1G,
+ .pending_extent = { SZ_2G, SZ_1G },
+ .expected_found = false,
+ },
+ {
+ .name = "pending extent overlapping end of search range",
+ .hole_start = 0,
+ .hole_len = SZ_2G,
+ .pending_extent = { SZ_1G, SZ_2G },
+ .expected_found = true,
+ .expected_pending_start = SZ_1G,
+ .expected_pending_end = 3ULL * SZ_1G - 1,
+ },
+};
+
+static int test_first_pending_extent(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ test_msg("running first_pending_extent tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ device = btrfs_alloc_dummy_device(fs_info);
+ if (IS_ERR(device)) {
+ test_err("failed to allocate dummy device");
+ ret = PTR_ERR(device);
+ goto out_free_fs_info;
+ }
+
+ device->fs_info = fs_info;
+
+ for (int i = 0; i < ARRAY_SIZE(first_pending_tests); i++) {
+ const struct first_pending_test_case *test_case = &first_pending_tests[i];
+ u64 start = test_case->pending_extent.start;
+ u64 len = test_case->pending_extent.len;
+ u64 pending_start, pending_end;
+ bool found;
+
+ if (len) {
+ btrfs_set_extent_bit(&device->alloc_state,
+ start, start + len - 1,
+ CHUNK_ALLOCATED, NULL);
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ found = btrfs_first_pending_extent(device, test_case->hole_start,
+ test_case->hole_len,
+ &pending_start, &pending_end);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (found != test_case->expected_found) {
+ test_err("%s: expected found=%d, got found=%d",
+ test_case->name, test_case->expected_found, found);
+ ret = -EINVAL;
+ goto out_clear_pending_extents;
+ }
+ if (!found)
+ goto out_clear_pending_extents;
+
+ if (pending_start != test_case->expected_pending_start ||
+ pending_end != test_case->expected_pending_end) {
+ test_err("%s: expected pending [%llu, %llu], got [%llu, %llu]",
+ test_case->name,
+ test_case->expected_pending_start,
+ test_case->expected_pending_end,
+ pending_start, pending_end);
+ ret = -EINVAL;
+ goto out_clear_pending_extents;
+ }
+
+out_clear_pending_extents:
+ btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1,
+ CHUNK_ALLOCATED, NULL);
+ if (ret)
+ break;
+ }
+
+out_free_fs_info:
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize)
+{
+ int ret;
+
+ test_msg("running chunk allocation tests");
+
+ ret = test_first_pending_extent(sectorsize, nodesize);
+ if (ret)
+ return ret;
+
+ ret = test_find_hole_in_pending(sectorsize, nodesize);
+ if (ret)
+ return ret;
+
+ return 0;
+}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index aabf825e8d7b..811f36d41101 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -173,9 +173,12 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
return -ENOMEM;
}
- /* Add [0, 1K) */
+ /*
+ * Add [0, 1K) as an inline extent; the extent map length must
+ * be one block.
+ */
em->start = 0;
- em->len = SZ_1K;
+ em->len = SZ_4K;
em->disk_bytenr = EXTENT_MAP_INLINE;
em->disk_num_bytes = 0;
em->ram_bytes = SZ_1K;
@@ -219,7 +222,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 1K) */
em->start = 0;
- em->len = SZ_1K;
+ em->len = SZ_4K;
em->disk_bytenr = EXTENT_MAP_INLINE;
em->disk_num_bytes = 0;
em->ram_bytes = SZ_1K;
@@ -235,7 +238,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_4K ||
em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
@@ -1131,8 +1134,11 @@ int btrfs_test_extent_map(void)
/*
* Note: the fs_info is not set up completely, we only need
* fs_info::fsid for the tracepoint.
+ *
+ * And all the hard-coded numbers are based on 4K blocksize,
+ * thus we have to use 4K as sectorsize no matter the page size.
*/
- fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
+ fs_info = btrfs_alloc_dummy_fs_info(SZ_4K, SZ_4K);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index c8822edd32e2..8dee057f41fd 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -49,7 +49,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
if (path->slots[0] != 0)
goto invalid;
- end = cache->start + cache->length;
+ end = btrfs_block_group_end(cache);
i = 0;
while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -216,7 +216,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
int ret;
ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
- cache->start + cache->length - alignment,
+ btrfs_block_group_end(cache) - alignment,
alignment);
if (ret) {
test_err("could not remove free space");
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a4c2b7748b95..b04fbcaf0a1d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -81,17 +81,20 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
- * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
- * [inline][hole but no extent][ hole ][ regular ][regular1 split]
+ * The numbers use a 4K fs block size as an example; the real test scales
+ * all the extent maps (except the inlined one) according to the block size.
*
- * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
- * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
+ * [ 0 - 6 ][ 6 - 4K ][ 4K - 8K ][ 8K - 12K ]
+ * [ inline ][ implied hole ][ regular ][ regular1 split ]
*
- * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635]
- * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1]
+ * [ 12K - 16K ][ 16K - 24K ][ 24K - 28K ][ 28K - 32K ][ 32K - 36K ]
+ * [ hole ][ regular1 split ][ prealloc ][ prealloc1 ][ prealloc1 written ]
*
- * [69635-73731][ 73731 - 86019 ][86019-90115]
- * [ regular ][ hole but no extent][ regular ]
+ * [ 36K - 44K ][ 44K - 52K ][ 52K - 56K ][ 56K - 60K ][ 60K - 68K ]
+ * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1 ]
+ *
+ * [ 68K - 72K ][ 72K - 84K ][ 84K - 88K ]
+ * [ regular ][ hole but no extent ][ regular ]
*/
static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
{
@@ -100,6 +103,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
u64 offset = 0;
/*
+ * Start 0, length 6, inlined.
+ *
* Tree-checker has strict limits on inline extents that they can only
* exist at file offset 0, thus we can only have one inline file extent
* at most.
@@ -109,20 +114,18 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
slot++;
offset = sectorsize;
- /* Now another hole */
- insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
- slot);
+ /* Start 1 * blocksize, length 1 * blocksize, regular. */
+ insert_extent(root, offset, sectorsize, sectorsize, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4;
- /* Now for a regular extent */
- insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
- disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot);
- slot++;
- disk_bytenr += sectorsize;
- offset += sectorsize - 1;
+ /* We don't want the regular em merged with the next one. */
+ disk_bytenr += 2 * sectorsize;
+ offset += sectorsize;
/*
+ * Start 2 * blocksize, length 1 * blocksize, regular.
+ *
* Now for 3 extents that were split from a hole punch so we test
* offsets properly.
*/
@@ -130,10 +133,14 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
offset += sectorsize;
+
+ /* Start 3 * blocksize, length 1 * blocksize, regular, explicit hole. */
insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
offset += sectorsize;
+
+ /* Start 4 * blocksize, length 2 * blocksize, regular. */
insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
@@ -141,7 +148,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
offset += 2 * sectorsize;
disk_bytenr += 4 * sectorsize;
- /* Now for a unwritten prealloc extent */
+ /* Start 6 * blocksize, length 1 * blocksize, preallocated. */
insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
@@ -154,6 +161,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
disk_bytenr += 2 * sectorsize;
/*
+ * Start 7 * blocksize, length 1 * blocksize, prealloc.
+ *
* Now for a partially written prealloc extent, basically the same as
* the hole punch example above. Ram_bytes never changes when you mark
* extents written btw.
@@ -162,11 +171,15 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
offset += sectorsize;
+
+ /* Start 8 * blocksize, length 1 * blocksize, regular. */
insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize,
disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0,
slot);
slot++;
offset += sectorsize;
+
+ /* Start 9 * blocksize, length 2 * blocksize, prealloc. */
insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
@@ -174,7 +187,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
offset += 2 * sectorsize;
disk_bytenr += 4 * sectorsize;
- /* Now a normal compressed extent */
+ /* Start 11 * blocksize, length 2 * blocksize, regular. */
insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0,
disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG,
BTRFS_COMPRESS_ZLIB, slot);
@@ -183,17 +196,21 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
/* No merges */
disk_bytenr += 2 * sectorsize;
- /* Now a split compressed extent */
+ /* Start 13 * blocksize, length 1 * blocksize, regular. */
insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
sectorsize, BTRFS_FILE_EXTENT_REG,
BTRFS_COMPRESS_ZLIB, slot);
slot++;
offset += sectorsize;
+
+ /* Start 14 * blocksize, length 1 * blocksize, regular. */
insert_extent(root, offset, sectorsize, sectorsize, 0,
disk_bytenr + sectorsize, sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
offset += sectorsize;
+
+ /* Start 15 * blocksize, length 2 * blocksize, regular. */
insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
2 * sectorsize, disk_bytenr, sectorsize,
BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
@@ -201,12 +218,19 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
offset += 2 * sectorsize;
disk_bytenr += 2 * sectorsize;
- /* Now extents that have a hole but no hole extent */
+ /* Start 17 * blocksize, length 1 * blocksize, regular. */
insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
offset += 4 * sectorsize;
disk_bytenr += sectorsize;
+
+ /*
+ * Start 18 * blocksize, length 3 * blocksize, implied hole (aka no
+ * file extent item).
+ *
+ * Start 21 * blocksize, length 1 * blocksize, regular.
+ */
insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
@@ -313,29 +337,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* unless we have a page for it to write into. Maybe we should change
* this?
*/
- offset = em->start + em->len;
- btrfs_free_extent_map(em);
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
- if (IS_ERR(em)) {
- test_err("got an error when we shouldn't have");
- goto out;
- }
- if (em->disk_bytenr != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->disk_bytenr);
- goto out;
- }
- if (em->start != offset || em->len != 4) {
- test_err(
- "unexpected extent wanted start %llu len 4, got start %llu len %llu",
- offset, em->start, em->len);
- goto out;
- }
- if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %u", em->flags);
- goto out;
- }
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* Regular extent */
@@ -348,10 +350,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
- if (em->start != offset || em->len != sectorsize - 1) {
+ if (em->start != offset || em->len != sectorsize) {
test_err(
- "unexpected extent wanted start %llu len 4095, got start %llu len %llu",
- offset, em->start, em->len);
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -362,7 +364,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* The next 3 are split extents */
@@ -391,7 +393,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
}
disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -413,7 +415,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -446,7 +448,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* Prealloc extent */
@@ -474,7 +476,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* The next 3 are a half written prealloc extent */
@@ -504,7 +506,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
}
disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -536,7 +538,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -569,7 +571,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* Now for the compressed extent */
@@ -602,7 +604,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* Split compressed extent */
@@ -637,7 +639,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
}
disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -663,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -697,7 +699,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
/* A hole between regular extents but no hole extent */
@@ -724,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
@@ -756,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- offset = em->start + em->len;
+ offset = btrfs_extent_map_end(em);
btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bd03f465e2d3..0b2498749b1e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -950,7 +950,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
if (transid) {
if (transid <= btrfs_get_last_trans_committed(fs_info))
- goto out;
+ return 0;
/* find specified transaction */
spin_lock(&fs_info->trans_lock);
@@ -975,7 +975,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
if (!cur_trans) {
if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
- goto out;
+ return ret;
}
} else {
/* find newest transaction that is committing | committed */
@@ -991,14 +991,15 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
}
}
spin_unlock(&fs_info->trans_lock);
+ /* Nothing committing or committed. */
if (!cur_trans)
- goto out; /* nothing committing|committed */
+ return ret;
}
wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
-out:
+
return ret;
}
@@ -1515,7 +1516,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
btrfs_free_log(trans, root);
ret2 = btrfs_update_reloc_root(trans, root);
- if (ret2)
+ if (unlikely(ret2))
return ret2;
/* see comments in should_cow_block() */
@@ -1532,7 +1533,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
- if (ret2)
+ if (unlikely(ret2))
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
}
@@ -1621,9 +1622,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
goto out;
switch_commit_roots(trans);
ret = btrfs_write_and_wait_transaction(trans);
- if (ret)
- btrfs_handle_fs_error(fs_info, ret,
- "Error while writing out transaction for qgroup");
+ if (unlikely(ret))
+ btrfs_err(fs_info,
+"error while writing out transaction during qgroup snapshot accounting: %d", ret);
out:
/*
@@ -1687,11 +1688,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
- if (pending->error)
+ if (unlikely(pending->error))
goto free_pending;
pending->error = btrfs_get_free_objectid(tree_root, &objectid);
- if (pending->error)
+ if (unlikely(pending->error))
goto free_fname;
/*
@@ -1707,7 +1708,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
&pending->block_rsv,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
- if (pending->error)
+ if (unlikely(pending->error))
goto clear_skip_qgroup;
}
@@ -1719,7 +1720,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trans->bytes_reserved, 1);
parent_root = parent_inode->root;
ret = record_root_in_trans(trans, parent_root, 0);
- if (ret)
+ if (unlikely(ret))
goto fail;
cur_time = current_time(&parent_inode->vfs_inode);
@@ -1736,7 +1737,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
btrfs_ino(parent_inode),
&fname.disk_name, 0);
- if (dir_item != NULL && !IS_ERR(dir_item)) {
+ if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) {
pending->error = -EEXIST;
goto dir_item_existed;
} else if (IS_ERR(dir_item)) {
@@ -1873,7 +1874,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
btrfs_root_id(parent_root), pending->inherit);
- if (ret < 0)
+ if (unlikely(ret < 0))
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
@@ -1939,7 +1940,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(pending, next, head, list) {
list_del(&pending->list);
ret = create_pending_snapshot(trans, pending);
- if (ret)
+ if (unlikely(ret))
break;
}
return ret;
@@ -1967,6 +1968,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ root_item = &fs_info->remap_root->root_item;
+ super->remap_root = root_item->bytenr;
+ super->remap_root_generation = root_item->generation;
+ super->remap_root_level = root_item->level;
+ }
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
@@ -2258,7 +2266,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
- if (ret)
+ if (unlikely(ret))
goto lockdep_trans_commit_start_release;
}
}
@@ -2308,7 +2316,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
- if (ret)
+ if (unlikely(ret))
goto lockdep_release;
spin_lock(&fs_info->trans_lock);
}
@@ -2338,11 +2346,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
- if (ret)
+ if (unlikely(ret))
goto lockdep_release;
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (unlikely(ret))
goto lockdep_release;
/*
@@ -2357,7 +2365,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
goto cleanup_transaction;
}
@@ -2429,7 +2437,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
- if (ret)
+ if (unlikely(ret))
goto unlock_reloc;
/*
@@ -2443,11 +2451,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (unlikely(ret))
goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, U64_MAX);
- if (ret)
+ if (unlikely(ret))
goto unlock_reloc;
/*
@@ -2459,7 +2467,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
WARN_ON(cur_trans != trans->transaction);
ret = commit_fs_roots(trans);
- if (ret)
+ if (unlikely(ret))
goto unlock_reloc;
/* commit_fs_roots gets rid of all the tree log roots, it is now
@@ -2472,11 +2480,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
- if (ret < 0)
+ if (unlikely(ret < 0))
goto unlock_reloc;
ret = commit_cowonly_roots(trans);
- if (ret)
+ if (unlikely(ret))
goto unlock_reloc;
/*
@@ -2500,13 +2508,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_set_root_node(&fs_info->block_group_root->root_item,
- fs_info->block_group_root->node);
- list_add_tail(&fs_info->block_group_root->dirty_list,
- &cur_trans->switch_commits);
- }
-
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
@@ -2550,9 +2551,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up_process(fs_info->cleaner_kthread);
ret = btrfs_write_and_wait_transaction(trans);
- if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Error while writing out transaction");
+ if (unlikely(ret)) {
+ btrfs_err(fs_info, "error while writing out transaction: %d", ret);
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2563,7 +2563,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* to go about their business
*/
mutex_unlock(&fs_info->tree_log_mutex);
- if (ret)
+ if (unlikely(ret))
goto scrub_continue;
update_commit_stats(fs_info);
@@ -2576,7 +2576,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
ret = btrfs_finish_extent_commit(trans);
- if (ret)
+ if (unlikely(ret))
goto scrub_continue;
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c21c21adf61e..452394b34d01 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
u64 chunk_objectid;
u64 flags;
u64 type;
+ size_t exp_size;
/*
* Here we don't really care about alignment since extent allocator can
@@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (unlikely(item_size != sizeof(bgi))) {
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ exp_size = sizeof(struct btrfs_block_group_item_v2);
+ else
+ exp_size = sizeof(struct btrfs_block_group_item);
+
+ if (unlikely(item_size != exp_size)) {
block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
- item_size, sizeof(bgi));
+ item_size, exp_size);
return -EUCLEAN;
}
@@ -748,17 +754,26 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
+ if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP &&
+ !btrfs_fs_incompat(fs_info, REMAP_TREE))) {
+ block_group_err(leaf, slot,
+"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag",
+ flags);
+ return -EUCLEAN;
+ }
+
type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
type != BTRFS_BLOCK_GROUP_METADATA &&
type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != BTRFS_BLOCK_GROUP_METADATA_REMAP &&
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA))) {
block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
- BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP,
BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
return -EUCLEAN;
}
@@ -807,6 +822,32 @@ static void chunk_err(const struct btrfs_fs_info *fs_info,
va_end(args);
}
+static bool valid_stripe_count(u64 profile, u16 num_stripes, u16 sub_stripes)
+{
+ switch (profile) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ return true;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ return sub_stripes == btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID5].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID6:
+ return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID6].devs_min;
+ case BTRFS_BLOCK_GROUP_DUP:
+ return num_stripes == btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes;
+ case 0: /* SINGLE */
+ return num_stripes == btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes;
+ default:
+ BUG();
+ }
+}
+
/*
* The common chunk check which could also work on super block sys chunk array.
*
@@ -830,6 +871,7 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
u64 features;
u32 chunk_sector_size;
bool mixed = false;
+ bool remapped;
int raid_index;
int nparity;
int ncopies;
@@ -852,13 +894,14 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
raid_index = btrfs_bg_flags_to_raid_index(type);
ncopies = btrfs_raid_array[raid_index].ncopies;
nparity = btrfs_raid_array[raid_index].nparity;
+ remapped = (type & BTRFS_BLOCK_GROUP_REMAPPED);
- if (unlikely(!num_stripes)) {
+ if (unlikely(!remapped && !num_stripes)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
- if (unlikely(num_stripes < ncopies)) {
+ if (unlikely(num_stripes != 0 && num_stripes < ncopies)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes < ncopies, have %u < %d",
num_stripes, ncopies);
@@ -913,12 +956,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
length, btrfs_stripe_nr_to_offset(U32_MAX));
return -EUCLEAN;
}
- if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
+ if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) {
chunk_err(fs_info, leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
- ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
+ type & ~BTRFS_BLOCK_GROUP_VALID);
return -EUCLEAN;
}
@@ -958,22 +999,9 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
}
}
- if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
- sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 &&
- num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 &&
- num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_DUP &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
+ if (!remapped &&
+ !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+ num_stripes, sub_stripes)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -997,11 +1025,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
struct btrfs_fs_info *fs_info = leaf->fs_info;
int num_stripes;
- if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) {
chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size(leaf, slot),
- sizeof(struct btrfs_chunk),
+ offsetof(struct btrfs_chunk, stripe),
BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index eb201f4ec3c7..833e2fd989eb 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -57,6 +57,11 @@ enum btrfs_tree_block_status {
BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
};
+
+#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \
+ BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_BLOCK_GROUP_REMAPPED)
+
/*
* Exported simply for btrfs-progs which wants to have the
* btrfs_tree_block_status return codes.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6cffcf0c3e7a..e1bd03ebfd98 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5160,7 +5160,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ctx->logged_before) {
drop_args.path = path;
drop_args.start = em->start;
- drop_args.end = em->start + em->len;
+ drop_args.end = btrfs_extent_map_end(em);
drop_args.replace_extent = true;
drop_args.extent_item_size = sizeof(fi);
ret = btrfs_drop_extents(trans, log, inode, &drop_args);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index e3a1310fa7d5..f24c14b9bb2f 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -207,15 +207,11 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
/* 1 - for the uuid item */
trans = btrfs_start_transaction(uuid_root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_uuid_tree_remove(trans, uuid, type, subid);
btrfs_end_transaction(trans);
-
-out:
return ret;
}
@@ -235,14 +231,14 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
if (type != BTRFS_UUID_KEY_SUBVOL &&
type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
- goto out;
+ return 0;
subvol_root = btrfs_get_fs_root(fs_info, subvolid, true);
if (IS_ERR(subvol_root)) {
ret = PTR_ERR(subvol_root);
if (ret == -ENOENT)
- ret = 1;
- goto out;
+ return 1;
+ return ret;
}
switch (type) {
@@ -257,7 +253,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
break;
}
btrfs_put_root(subvol_root);
-out:
+
return ret;
}
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index a2ac3fb68bc8..06cbd6f00a78 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -525,23 +525,21 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
(const char *)&item, sizeof(item));
if (ret)
- goto out;
+ return ret;
/* Write out the descriptor itself */
ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
desc, desc_size);
if (ret)
- goto out;
+ return ret;
/*
* 1 for updating the inode flag
* 1 for deleting the orphan
*/
trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
@@ -554,8 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
btrfs_set_fs_compat_ro(root->fs_info, VERITY);
end_trans:
btrfs_end_transaction(trans);
-out:
- return ret;
+	return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8a08412f3529..f281d113519b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -231,6 +231,10 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+ /* Block groups containing the remap tree. */
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap");
+ /* Block group that has been remapped. */
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
@@ -1169,7 +1173,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
* any transaction and set the error state, guaranteeing no commits of
* unsafe super blocks.
*/
- device->last_flush_error = 0;
+ clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
/* Verify the device is back in a pristine state */
WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
@@ -1505,30 +1509,158 @@ error_bdev_put:
}
/*
- * Try to find a chunk that intersects [start, start + len] range and when one
- * such is found, record the end of it in *start
+ * Find the first pending extent intersecting a range.
+ *
+ * @device: the device to search
+ * @start: start of the range to check
+ * @len: length of the range to check
+ * @pending_start: output pointer for the start of the found pending extent
+ * @pending_end: output pointer for the end of the found pending extent (inclusive)
+ *
+ * Search for a pending chunk allocation that intersects the half-open range
+ * [start, start + len).
+ *
+ * Return: true if a pending extent was found, false otherwise.
+ * If the return value is true, store the first pending extent in
+ * [*pending_start, *pending_end]. Otherwise, the two output variables
+ * may still be modified, to something outside the range and should not
+ * be used.
*/
-static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
- u64 len)
+bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len,
+ u64 *pending_start, u64 *pending_end)
{
- u64 physical_start, physical_end;
-
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (btrfs_find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
+ if (btrfs_find_first_extent_bit(&device->alloc_state, start,
+ pending_start, pending_end,
CHUNK_ALLOCATED, NULL)) {
- if (in_range(physical_start, *start, len) ||
- in_range(*start, physical_start,
- physical_end + 1 - physical_start)) {
- *start = physical_end + 1;
+ if (in_range(*pending_start, start, len) ||
+ in_range(start, *pending_start, *pending_end + 1 - *pending_start)) {
return true;
}
}
return false;
}
+/*
+ * Find the first real hole accounting for pending extents.
+ *
+ * @device: the device containing the candidate hole
+ * @start: input/output pointer for the hole start position
+ * @len: input/output pointer for the hole length
+ * @min_hole_size: the size of hole we are looking for
+ *
+ * Given a potential hole specified by [*start, *start + *len), check for pending
+ * chunk allocations within that range. If pending extents are found, the hole is
+ * adjusted to represent the first true free space that is large enough when
+ * accounting for pending chunks.
+ *
+ * Note that this function must handle various cases involving non consecutive
+ * pending extents.
+ *
+ * Returns: true if a suitable hole was found and false otherwise.
+ * If the return value is true, then *start and *len are set to represent the hole.
+ * If the return value is false, then *start is set to the largest hole we
+ * found and *len is set to its length.
+ * If there are no holes at all, then *start is set to the end of the range and
+ * *len is set to 0.
+ */
+bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, u64 *start,
+ u64 *len, u64 min_hole_size)
+{
+ u64 pending_start, pending_end;
+ u64 end;
+ u64 max_hole_start = 0;
+ u64 max_hole_len = 0;
+
+ lockdep_assert_held(&device->fs_info->chunk_mutex);
+
+ if (*len == 0)
+ return false;
+
+ end = *start + *len - 1;
+
+ /*
+ * Loop until we either see a large enough hole or check every pending
+ * extent overlapping the candidate hole.
+ * At every hole that we observe, record it if it is the new max.
+ * At the end of the iteration, set the output variables to the max hole.
+ */
+ while (true) {
+ if (btrfs_first_pending_extent(device, *start, *len, &pending_start, &pending_end)) {
+ /*
+ * Case 1: the pending extent overlaps the start of
+ * candidate hole. That means the true hole is after the
+ * pending extent, but we need to find the next pending
+ * extent to properly size the hole. In the next loop,
+ * we will reduce to case 2 or 3.
+ * e.g.,
+ *
+ * |----pending A----| real hole |----pending B----|
+ * | candidate hole |
+ * *start end
+ */
+ if (pending_start <= *start) {
+ *start = pending_end + 1;
+ goto next;
+ }
+ /*
+ * Case 2: The pending extent starts after *start (and overlaps
+ * [*start, end), so the first hole just goes up to the start
+ * of the pending extent.
+ * e.g.,
+ *
+ * | real hole |----pending A----|
+ * | candidate hole |
+ * *start end
+ */
+ *len = pending_start - *start;
+ if (*len > max_hole_len) {
+ max_hole_start = *start;
+ max_hole_len = *len;
+ }
+ if (*len >= min_hole_size)
+ break;
+ /*
+ * If the hole wasn't big enough, then we advance past
+ * the pending extent and keep looking.
+ */
+ *start = pending_end + 1;
+ goto next;
+ } else {
+ /*
+ * Case 3: There is no pending extent overlapping the
+ * range [*start, *start + *len - 1], so the only remaining
+ * hole is the remaining range.
+ * e.g.,
+ *
+ * | candidate hole |
+ * | real hole |
+ * *start end
+ */
+
+ if (*len > max_hole_len) {
+ max_hole_start = *start;
+ max_hole_len = *len;
+ }
+ break;
+ }
+next:
+ if (*start > end)
+ break;
+ *len = end - *start + 1;
+ }
+ if (max_hole_len) {
+ *start = max_hole_start;
+ *len = max_hole_len;
+ } else {
+ *start = end + 1;
+ *len = 0;
+ }
+ return max_hole_len >= min_hole_size;
+}
+
static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
@@ -1593,59 +1725,57 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
}
/*
- * Check if specified hole is suitable for allocation.
+ * Validate and adjust a hole for chunk allocation
+ *
+ * @device: the device containing the candidate hole
+ * @hole_start: input/output pointer for the hole start position
+ * @hole_size: input/output pointer for the hole size
+ * @num_bytes: minimum allocation size required
+ *
+ * Check if the specified hole is suitable for allocation and adjust it if
+ * necessary. The hole may be modified to skip over pending chunk allocations
+ * and to satisfy stricter zoned requirements on zoned filesystems.
*
- * @device: the device which we have the hole
- * @hole_start: starting position of the hole
- * @hole_size: the size of the hole
- * @num_bytes: the size of the free space that we need
+ * For regular (non-zoned) allocation, if the hole after adjustment is smaller
+ * than @num_bytes, the search continues past additional pending extents until
+ * either a sufficiently large hole is found or no more pending extents exist.
*
- * This function may modify @hole_start and @hole_size to reflect the suitable
- * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
+ * Return: true if a suitable hole was found and false otherwise.
+ * If the return value is true, then *hole_start and *hole_size are set to
+ * represent the hole we found.
+ * If the return value is false, then *hole_start is set to the largest
+ * hole we found and *hole_size is set to its length.
+ * If there are no holes at all, then *hole_start is set to the end of the range
+ * and *hole_size is set to 0.
*/
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
u64 *hole_size, u64 num_bytes)
{
- bool changed = false;
- u64 hole_end = *hole_start + *hole_size;
+ bool found = false;
+ const u64 hole_end = *hole_start + *hole_size - 1;
- for (;;) {
- /*
- * Check before we set max_hole_start, otherwise we could end up
- * sending back this offset anyway.
- */
- if (contains_pending_extent(device, hole_start, *hole_size)) {
- if (hole_end >= *hole_start)
- *hole_size = hole_end - *hole_start;
- else
- *hole_size = 0;
- changed = true;
- }
+ ASSERT(*hole_size > 0);
- switch (device->fs_devices->chunk_alloc_policy) {
- default:
- btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
- fallthrough;
- case BTRFS_CHUNK_ALLOC_REGULAR:
- /* No extra check */
- break;
- case BTRFS_CHUNK_ALLOC_ZONED:
- if (dev_extent_hole_check_zoned(device, hole_start,
- hole_size, num_bytes)) {
- changed = true;
- /*
- * The changed hole can contain pending extent.
- * Loop again to check that.
- */
- continue;
- }
- break;
- }
+again:
+ *hole_size = hole_end - *hole_start + 1;
+ found = btrfs_find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes);
+ if (!found)
+ return found;
+ ASSERT(*hole_size >= num_bytes);
+ switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return found;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes))
+ goto again;
break;
}
- return changed;
+ return found;
}
/*
@@ -1704,7 +1834,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
ret = -ENOMEM;
goto out;
}
-again:
+
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
@@ -1791,11 +1921,7 @@ next:
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
- if (dev_extent_hole_check(device, &search_start, &hole_size,
- num_bytes)) {
- btrfs_release_path(path);
- goto again;
- }
+ dev_extent_hole_check(device, &search_start, &hole_size, num_bytes);
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -2316,9 +2442,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
free_fs_devices(cur_devices);
}
- ret = btrfs_commit_transaction(trans);
-
- return ret;
+ return btrfs_commit_transaction(trans);
error_undo:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -2923,8 +3047,7 @@ error:
return ret;
}
-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device)
+int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device)
{
int ret;
BTRFS_PATH_AUTO_FREE(path);
@@ -3222,25 +3345,12 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
return btrfs_free_chunk(trans, chunk_offset);
}
-int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_chunk_map *map;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 dev_extent_len = 0;
int i, ret = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-
- map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(map)) {
- /*
- * This is a logic error, but we don't want to just rely on the
- * user having built with ASSERT enabled, so if ASSERT doesn't
- * do anything we still error out.
- */
- DEBUG_WARN("errr %ld reading chunk map at offset %llu",
- PTR_ERR(map), chunk_offset);
- return PTR_ERR(map);
- }
/*
* First delete the device extent items from the devices btree.
@@ -3261,7 +3371,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
if (device->bytes_used > 0) {
@@ -3281,6 +3391,26 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
mutex_unlock(&fs_devices->device_list_mutex);
+ return 0;
+}
+
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_chunk_map *map;
+ int ret;
+
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
+		DEBUG_WARN("error %ld reading chunk map at offset %llu",
+ PTR_ERR(map), chunk_offset);
+ return PTR_ERR(map);
+ }
+
+ ret = btrfs_remove_dev_extents(trans, map);
+ if (ret)
+ goto out;
+
/*
* We acquire fs_info->chunk_mutex for 2 reasons:
*
@@ -3376,11 +3506,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
+ /* On error, btrfs_remove_block_group() aborts the transaction. */
ret = btrfs_remove_block_group(trans, map);
- if (unlikely(ret)) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ if (unlikely(ret))
+ ASSERT(BTRFS_FS_ERROR(fs_info) != 0);
out:
if (trans->removing_chunk) {
@@ -3392,15 +3521,50 @@ out:
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- bool verbose)
+static int btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *bg)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
- struct btrfs_block_group *block_group;
u64 length;
int ret;
+	const u64 start = bg->start;
+	length = bg->length;
+	btrfs_discard_cancel_work(&fs_info->discard_ctl, bg);
+	btrfs_put_block_group(bg);
+	/*
+	 * On a zoned file system, discard the whole block group, this will
+	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
+	 * resetting the zone fails, don't treat it as a fatal problem from the
+	 * filesystem's point of view.
+	 */
+	if (btrfs_is_zoned(fs_info)) {
+		ret = btrfs_discard_extent(fs_info, start, length, NULL, true);
+		if (ret)
+			btrfs_info(fs_info, "failed to reset zone %llu after relocation",
+				   start);
+	}
+
+	trans = btrfs_start_trans_remove_block_group(root->fs_info, start);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_handle_fs_error(root->fs_info, ret, NULL);
+		return ret;
+	}
+
+	/* Step two, delete the device extents and the chunk tree entries. */
+	ret = btrfs_remove_chunk(trans, start);
+	btrfs_end_transaction(trans);
+
+	return ret;
+}
+
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool verbose)
+{
+ struct btrfs_block_group *block_group;
+ int ret;
+
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
btrfs_err(fs_info,
"relocate: not supported on extent tree v2 yet");
@@ -3438,38 +3602,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
return -ENOENT;
- btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- length = block_group->length;
- btrfs_put_block_group(block_group);
- /*
- * On a zoned file system, discard the whole block group, this will
- * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
- * resetting the zone fails, don't treat it as a fatal problem from the
- * filesystem's point of view.
- */
- if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
- if (ret)
- btrfs_info(fs_info,
- "failed to reset zone %llu after relocation",
- chunk_offset);
- }
-
- trans = btrfs_start_trans_remove_block_group(root->fs_info,
- chunk_offset);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_handle_fs_error(root->fs_info, ret, NULL);
- return ret;
+ if (should_relocate_using_remap_tree(block_group)) {
+ /* If we're relocating using the remap tree we're now done. */
+ btrfs_put_block_group(block_group);
+ ret = 0;
+ } else {
+ ret = btrfs_relocate_chunk_finish(fs_info, block_group);
}
- /*
- * step two, delete the device extents and the
- * chunk tree entries
- */
- ret = btrfs_remove_chunk(trans, chunk_offset);
- btrfs_end_transaction(trans);
return ret;
}
@@ -3646,7 +3787,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
- int ret, err;
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -3681,9 +3822,11 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_flags(leaf, item, bctl->flags);
out:
btrfs_free_path(path);
- err = btrfs_commit_transaction(trans);
- if (err && !ret)
- ret = err;
+ if (ret == 0)
+ ret = btrfs_commit_transaction(trans);
+ else
+ btrfs_end_transaction(trans);
+
return ret;
}
@@ -3693,7 +3836,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
- int ret, err;
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -3720,9 +3863,11 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
- err = btrfs_commit_transaction(trans);
- if (err && !ret)
- ret = err;
+ if (ret == 0)
+ ret = btrfs_commit_transaction(trans);
+ else
+ btrfs_end_transaction(trans);
+
return ret;
}
@@ -3966,6 +4111,12 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+ /* Treat METADATA_REMAP chunks as METADATA. */
+ if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) {
+ chunk_type &= ~BTRFS_BLOCK_GROUP_METADATA_REMAP;
+ chunk_type |= BTRFS_BLOCK_GROUP_METADATA;
+ }
+
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
@@ -4047,6 +4198,107 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
return true;
}
+struct remap_chunk_info {
+ struct list_head list;
+ u64 offset;
+ struct btrfs_block_group *bg;
+ bool made_ro;
+};
+
+static int cow_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key = { 0 };
+ int ret;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ ret = btrfs_next_leaf(fs_info->remap_root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ btrfs_release_path(path);
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
+static int balance_remap_chunks(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct list_head *chunks)
+{
+ struct remap_chunk_info *rci, *tmp;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ list_for_each_entry_safe(rci, tmp, chunks, list) {
+ rci->bg = btrfs_lookup_block_group(fs_info, rci->offset);
+ if (!rci->bg) {
+ list_del(&rci->list);
+ kfree(rci);
+ continue;
+ }
+
+ ret = btrfs_inc_block_group_ro(rci->bg, false);
+ if (ret)
+ goto end;
+
+ rci->made_ro = true;
+ }
+
+ if (list_empty(chunks))
+ return 0;
+
+ trans = btrfs_start_transaction(fs_info->remap_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto end;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+ ret = cow_remap_tree(trans, path);
+ mutex_unlock(&fs_info->remap_mutex);
+
+ btrfs_release_path(path);
+ btrfs_commit_transaction(trans);
+
+end:
+ while (!list_empty(chunks)) {
+ bool is_unused;
+
+ rci = list_first_entry(chunks, struct remap_chunk_info, list);
+
+ spin_lock(&rci->bg->lock);
+ is_unused = !btrfs_is_block_group_used(rci->bg);
+ spin_unlock(&rci->bg->lock);
+
+ if (is_unused)
+ btrfs_mark_bg_unused(rci->bg);
+
+ if (rci->made_ro)
+ btrfs_dec_block_group_ro(rci->bg);
+
+ btrfs_put_block_group(rci->bg);
+
+ list_del(&rci->list);
+ kfree(rci);
+ }
+
+ return ret;
+}
+
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4069,6 +4321,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
+ struct remap_chunk_info *rci;
+ unsigned int num_remap_chunks = 0;
+ LIST_HEAD(remap_chunks);
path = btrfs_alloc_path();
if (!path) {
@@ -4135,6 +4390,14 @@ again:
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
+ /* Check if chunk has already been fully relocated. */
+ if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED &&
+ btrfs_chunk_num_stripes(leaf, chunk) == 0) {
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto loop;
+ }
+
if (!counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.considered++;
@@ -4159,7 +4422,8 @@ again:
count_data++;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
count_sys++;
- else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_METADATA_REMAP))
count_meta++;
goto loop;
@@ -4179,6 +4443,29 @@ again:
goto loop;
}
+ /*
+ * Balancing METADATA_REMAP chunks takes place separately - add
+ * the details to a list so it can be processed later.
+ */
+ if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+ rci = kmalloc(sizeof(struct remap_chunk_info), GFP_NOFS);
+ if (!rci) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ rci->offset = found_key.offset;
+ rci->bg = NULL;
+ rci->made_ro = false;
+ list_add_tail(&rci->list, &remap_chunks);
+
+ num_remap_chunks++;
+
+ goto loop;
+ }
+
if (!chunk_reserved) {
/*
* We may be relocating the only data chunk we have,
@@ -4218,11 +4505,24 @@ loop:
key.offset = found_key.offset - 1;
}
+ btrfs_release_path(path);
+
if (counting) {
- btrfs_release_path(path);
counting = false;
goto again;
}
+
+ if (!list_empty(&remap_chunks)) {
+ ret = balance_remap_chunks(fs_info, path, &remap_chunks);
+ if (ret == -ENOSPC)
+ enospc_errors++;
+
+ if (!ret) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed += num_remap_chunks;
+ spin_unlock(&fs_info->balance_lock);
+ }
+ }
error:
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
@@ -4844,6 +5144,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 diff;
u64 start;
u64 free_diff = 0;
+ u64 pending_start, pending_end;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4889,7 +5190,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
* in-memory chunks are synced to disk so that the loop below sees them
* and relocates them accordingly.
*/
- if (contains_pending_extent(device, &start, diff)) {
+ if (btrfs_first_pending_extent(device, start, diff, &pending_start, &pending_end)) {
mutex_unlock(&fs_info->chunk_mutex);
ret = btrfs_commit_transaction(trans);
if (ret)
@@ -5410,7 +5711,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
}
}
-static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_io_stripe *stripe = &map->stripes[i];
@@ -5427,7 +5728,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
write_lock(&fs_info->mapping_tree_lock);
rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
RB_CLEAR_NODE(&map->rb_node);
- chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
write_unlock(&fs_info->mapping_tree_lock);
/* Once for the tree reference. */
@@ -5463,7 +5764,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
return -EEXIST;
}
chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
- chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
write_unlock(&fs_info->mapping_tree_lock);
return 0;
@@ -5819,7 +6120,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
RB_CLEAR_NODE(&map->rb_node);
- chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
/* Once for the tree ref. */
btrfs_free_chunk_map(map);
cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
@@ -6066,7 +6367,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
*/
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- u32 *num_stripes)
+ u32 *num_stripes, bool do_remap)
{
struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
@@ -6090,6 +6391,24 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
if (IS_ERR(map))
return ERR_CAST(map);
+ if (do_remap && (map->type & BTRFS_BLOCK_GROUP_REMAPPED)) {
+ u64 new_logical = logical;
+
+ ret = btrfs_translate_remap(fs_info, &new_logical, &length);
+ if (ret)
+ goto out_free_map;
+
+ if (new_logical != logical) {
+ btrfs_free_chunk_map(map);
+
+ map = btrfs_get_chunk_map(fs_info, new_logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
+
+ logical = new_logical;
+ }
+ }
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
@@ -6577,6 +6896,24 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (IS_ERR(map))
return PTR_ERR(map);
+ if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
+ u64 new_logical = logical;
+
+ ret = btrfs_translate_remap(fs_info, &new_logical, length);
+ if (ret)
+ return ret;
+
+ if (new_logical != logical) {
+ btrfs_free_chunk_map(map);
+
+ map = btrfs_get_chunk_map(fs_info, new_logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ logical = new_logical;
+ }
+ }
+
num_copies = btrfs_chunk_map_num_copies(map);
if (io_geom.mirror_num > num_copies)
return -EINVAL;
@@ -7041,7 +7378,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- map->stripe_size = btrfs_calc_stripe_length(map);
+
+ if (num_stripes > 0)
+ map->stripe_size = btrfs_calc_stripe_length(map);
+ else
+ map->stripe_size = 0;
+
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7167,7 +7509,6 @@ static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
- int ret;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
@@ -7267,8 +7608,8 @@ static int read_one_dev(struct extent_buffer *leaf,
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
}
- ret = 0;
- return ret;
+
+ return 0;
}
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
@@ -7357,10 +7698,9 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!map) {
- ret = false;
- goto out;
- }
+ if (!map)
+ return false;
+
while (map) {
int missing = 0;
int max_tolerated;
@@ -7374,7 +7714,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!dev || !dev->bdev ||
test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
- dev->last_flush_error)
+ test_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &dev->dev_state))
missing++;
else if (failing_dev && failing_dev == dev)
missing++;
@@ -7385,15 +7725,14 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
map->start, missing, max_tolerated);
btrfs_free_chunk_map(map);
- ret = false;
- goto out;
+ return false;
}
next_start = map->start + map->chunk_len;
btrfs_free_chunk_map(map);
map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
-out:
+
return ret;
}
@@ -8025,7 +8364,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
+ path->reada = READA_FORWARD_ALWAYS;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f20abeb16bce..8288d79372a5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -30,6 +30,7 @@ struct btrfs_block_group;
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_zoned_device_info;
+struct btrfs_space_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -58,7 +59,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
*/
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
/* ilog2() can handle both constants and variables */
#define BTRFS_BG_FLAG_TO_INDEX(profile) \
@@ -80,6 +80,15 @@ enum btrfs_raid_types {
BTRFS_NR_RAID_TYPES
};
+static_assert(BTRFS_RAID_RAID0 == 1);
+static_assert(BTRFS_RAID_RAID1 == 2);
+static_assert(BTRFS_RAID_DUP == 3);
+static_assert(BTRFS_RAID_RAID10 == 4);
+static_assert(BTRFS_RAID_RAID5 == 5);
+static_assert(BTRFS_RAID_RAID6 == 6);
+static_assert(BTRFS_RAID_RAID1C3 == 7);
+static_assert(BTRFS_RAID_RAID1C4 == 8);
+
/*
* Use sequence counter to get consistent device stat data on
* 32-bit processors.
@@ -99,6 +108,7 @@ enum btrfs_raid_types {
#define BTRFS_DEV_STATE_REPLACE_TGT (3)
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
#define BTRFS_DEV_STATE_NO_READA (5)
+#define BTRFS_DEV_STATE_FLUSH_FAILED (6)
/* Set when the device item is found in chunk tree, used to catch unexpected registered device. */
#define BTRFS_DEV_STATE_ITEM_FOUND (7)
@@ -125,13 +135,7 @@ struct btrfs_device {
struct btrfs_zoned_device_info *zone_info;
- /*
- * Device's major-minor number. Must be set even if the device is not
- * opened (bdev == NULL), unless the device is missing.
- */
- dev_t devt;
unsigned long dev_state;
- blk_status_t last_flush_error;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
seqcount_t data_seqcount;
@@ -195,6 +199,12 @@ struct btrfs_device {
atomic_t dev_stats_ccnt;
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
+
struct extent_io_tree alloc_state;
struct completion kobj_unregister;
@@ -321,25 +331,6 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
-/*
- * Checksum mode - offload it to workqueues or do it synchronously in
- * btrfs_submit_chunk().
- */
-enum btrfs_offload_csum_mode {
- /*
- * Choose offloading checksum or do it synchronously automatically.
- * Do it synchronously if the checksum is fast, or offload to workqueues
- * otherwise.
- */
- BTRFS_OFFLOAD_CSUM_AUTO,
- /* Always offload checksum to workqueues. */
- BTRFS_OFFLOAD_CSUM_FORCE_ON,
- /* Never offload checksum to workqueues. */
- BTRFS_OFFLOAD_CSUM_FORCE_OFF,
-};
-#endif
-
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
@@ -466,9 +457,6 @@ struct btrfs_fs_devices {
/* Device to be used for reading in case of RAID1. */
u64 read_devid;
-
- /* Checksum mode - offload it or do it synchronously. */
- enum btrfs_offload_csum_mode offload_csum_mode;
#endif
};
@@ -646,6 +634,7 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
kfree(map);
}
}
+DEFINE_FREE(btrfs_free_chunk_map, struct btrfs_chunk_map *, btrfs_free_chunk_map(_T))
struct btrfs_balance_control {
struct btrfs_balance_args data;
@@ -727,7 +716,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
u32 length, int mirror_num);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- u32 *num_stripes);
+ u32 *num_stripes, bool do_remap);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -789,6 +778,7 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
+int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -901,6 +891,13 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
+int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device);
+void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits);
+
+bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len,
+ u64 *pending_start, u64 *pending_end);
+bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device,
+ u64 *start, u64 *len, u64 min_hole_size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 10ed48d4a846..0a8fcee16428 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -145,31 +145,24 @@ static int copy_data_into_buffer(struct address_space *mapping,
return 0;
}
-int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out)
+int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_inode *inode = cb->bbio.inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct address_space *mapping = inode->vfs_inode.i_mapping;
- const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ struct bio *bio = &cb->bbio.bio;
+ u64 start = cb->start;
+ u32 len = cb->len;
const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret;
char *data_in = NULL;
char *cfolio_out;
- int nr_folios = 0;
struct folio *in_folio = NULL;
struct folio *out_folio = NULL;
- unsigned long len = *total_out;
- unsigned long nr_dest_folios = *out_folios;
- const unsigned long max_out = nr_dest_folios << min_folio_shift;
const u32 blocksize = fs_info->sectorsize;
const u64 orig_end = start + len;
- *out_folios = 0;
- *total_out = 0;
- *total_in = 0;
-
ret = zlib_deflateInit(&workspace->strm, workspace->level);
if (unlikely(ret != Z_OK)) {
btrfs_err(fs_info,
@@ -188,8 +181,6 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
goto out;
}
cfolio_out = folio_address(out_folio);
- folios[0] = out_folio;
- nr_folios = 1;
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
@@ -198,8 +189,8 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
while (workspace->strm.total_in < len) {
/*
- * Get next input pages and copy the contents to
- * the workspace buffer if required.
+ * Get next input pages and copy the contents to the workspace
+ * buffer if required.
*/
if (workspace->strm.avail_in == 0) {
unsigned long bytes_left = len - workspace->strm.total_in;
@@ -250,40 +241,39 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
goto out;
}
- /* we're making it bigger, give up */
+ /* We're making it bigger, give up. */
if (workspace->strm.total_in > blocksize * 2 &&
- workspace->strm.total_in <
- workspace->strm.total_out) {
+ workspace->strm.total_in < workspace->strm.total_out) {
ret = -E2BIG;
goto out;
}
- /* we need another page for writing out. Test this
- * before the total_in so we will pull in a new page for
- * the stream end if required
- */
+ if (workspace->strm.total_out >= len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ /* Queue the full folio and allocate a new one. */
if (workspace->strm.avail_out == 0) {
- if (nr_folios == nr_dest_folios) {
+ if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
ret = -E2BIG;
goto out;
}
+
out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
cfolio_out = folio_address(out_folio);
- folios[nr_folios] = out_folio;
- nr_folios++;
workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
- /* we're all done */
+ /* We're all done. */
if (workspace->strm.total_in >= len)
break;
- if (workspace->strm.total_out > max_out)
- break;
}
+
workspace->strm.avail_in = 0;
+
/*
* Call deflate with Z_FINISH flush parameter providing more output
* space but no more input data, until it returns with Z_STREAM_END.
@@ -297,23 +287,39 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
ret = -EIO;
goto out;
} else if (workspace->strm.avail_out == 0) {
- /* Get another folio for the stream end. */
- if (nr_folios == nr_dest_folios) {
+ if (workspace->strm.total_out >= len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
ret = -E2BIG;
goto out;
}
+ /* Get another folio for the stream end. */
out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
cfolio_out = folio_address(out_folio);
- folios[nr_folios] = out_folio;
- nr_folios++;
workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
}
+ /* Queue the remaining part of the folio. */
+ if (workspace->strm.total_out > bio->bi_iter.bi_size) {
+ u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out);
+
+ if (!bio_add_folio(bio, out_folio, cur_len, 0)) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /* The last folio hasn't been utilized. */
+ btrfs_free_compr_folio(out_folio);
+ }
+ out_folio = NULL;
+ ASSERT(bio->bi_iter.bi_size == workspace->strm.total_out);
zlib_deflateEnd(&workspace->strm);
if (workspace->strm.total_out >= workspace->strm.total_in) {
@@ -322,10 +328,9 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
}
ret = 0;
- *total_out = workspace->strm.total_out;
- *total_in = workspace->strm.total_in;
out:
- *out_folios = nr_folios;
+ if (out_folio)
+ btrfs_free_compr_folio(out_folio);
if (data_in) {
kunmap_local(data_in);
folio_put(in_folio);
@@ -338,18 +343,23 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct folio_iter fi;
const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret = 0, ret2;
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
- unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
- struct folio **folios_in = cb->compressed_folios;
- data_in = kmap_local_folio(folios_in[folio_in_index], 0);
+ bio_first_folio(&fi, &cb->bbio.bio, 0);
+
+ /* We must have at least one folio here, and it must have the correct size. */
+ if (unlikely(!fi.folio))
+ return -EINVAL;
+ ASSERT(folio_size(fi.folio) == min_folio_size);
+
+ data_in = kmap_local_folio(fi.folio, 0);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
workspace->strm.total_in = 0;
@@ -404,12 +414,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap_local(data_in);
- folio_in_index++;
- if (folio_in_index >= total_folios_in) {
+ bio_next_folio(&fi, &cb->bbio.bio);
+ if (!fi.folio) {
data_in = NULL;
break;
}
- data_in = kmap_local_folio(folios_in[folio_in_index], 0);
+ ASSERT(folio_size(fi.folio) == min_folio_size);
+ data_in = kmap_local_folio(fi.folio, 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, min_folio_size);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 359a98e6de85..ad8621587fd2 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1231,6 +1231,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
+ const u64 bg_end = btrfs_block_group_end(cache);
int ret;
u64 length;
@@ -1253,7 +1254,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
if (!path)
return -ENOMEM;
- key.objectid = cache->start + cache->length;
+ key.objectid = bg_end;
key.type = 0;
key.offset = 0;
@@ -1282,7 +1283,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
length = fs_info->nodesize;
if (unlikely(!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length))) {
+ found_key.objectid + length <= bg_end))) {
return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
@@ -1437,18 +1438,32 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
- btrfs_err(bg->fs_info,
+ btrfs_err(fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
- btrfs_err(bg->fs_info,
+ btrfs_err(fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
return -EIO;
}
+ /*
+ * When the last extent is removed, last_alloc can be smaller than the other write
+ * pointer. In that case, last_alloc should be moved to the corresponding write
+ * pointer position.
+ */
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+ if (last_alloc <= zone_info[i].alloc_offset) {
+ last_alloc = zone_info[i].alloc_offset;
+ break;
+ }
+ }
+
if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
zone_info[0].alloc_offset = last_alloc;
@@ -1456,7 +1471,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
zone_info[1].alloc_offset = last_alloc;
if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
- btrfs_err(bg->fs_info,
+ btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
@@ -1490,6 +1505,21 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
/* In case a device is missing we have a cap of 0, so don't use it. */
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+ /*
+ * When the last extent is removed, last_alloc can be smaller than the other write
+ * pointer. In that case, last_alloc should be moved to the corresponding write
+ * pointer position.
+ */
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+ if (last_alloc <= zone_info[i].alloc_offset) {
+ last_alloc = zone_info[i].alloc_offset;
+ break;
+ }
+ }
+
for (i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
@@ -1531,7 +1561,9 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
{
struct btrfs_fs_info *fs_info = bg->fs_info;
u64 stripe_nr = 0, stripe_offset = 0;
+ u64 prev_offset = 0;
u32 stripe_index = 0;
+ bool has_partial = false, has_conventional = false;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1539,6 +1571,35 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ /*
+ * When the last extent is removed, last_alloc can be smaller than the other write
+ * pointer. In that case, last_alloc should be moved to the corresponding write
+ * pointer position.
+ */
+ for (int i = 0; i < map->num_stripes; i++) {
+ u64 alloc;
+
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ stripe_nr = zone_info[i].alloc_offset >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK;
+ if (stripe_offset == 0 && stripe_nr > 0) {
+ stripe_nr--;
+ stripe_offset = BTRFS_STRIPE_LEN;
+ }
+ alloc = ((stripe_nr * map->num_stripes + i) << BTRFS_STRIPE_LEN_SHIFT) +
+ stripe_offset;
+ last_alloc = max(last_alloc, alloc);
+
+ /* Partially written stripe found. It should be last. */
+ if (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK)
+ break;
+ }
+ stripe_nr = 0;
+ stripe_offset = 0;
+
if (last_alloc) {
u32 factor = map->num_stripes;
@@ -1552,7 +1613,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
continue;
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
-
+ has_conventional = true;
zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > i)
@@ -1561,6 +1622,28 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
zone_info[i].alloc_offset += stripe_offset;
}
+ /* Verification */
+ if (i != 0) {
+ if (unlikely(prev_offset < zone_info[i].alloc_offset)) {
+ btrfs_err(fs_info,
+ "zoned: stripe position disorder found in block group %llu",
+ bg->start);
+ return -EIO;
+ }
+
+ if (unlikely(has_partial &&
+ (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK))) {
+ btrfs_err(fs_info,
+ "zoned: multiple partial written stripe found in block group %llu",
+ bg->start);
+ return -EIO;
+ }
+ }
+ prev_offset = zone_info[i].alloc_offset;
+
+ if ((zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) != 0)
+ has_partial = true;
+
if (test_bit(0, active) != test_bit(i, active)) {
if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
@@ -1572,6 +1655,19 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
bg->alloc_offset += zone_info[i].alloc_offset;
}
+ /* Check if all devices stay in the same stripe row. */
+ if (unlikely(zone_info[0].alloc_offset -
+ zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", bg->start);
+ return -EIO;
+ }
+
+ if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) {
+ btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu",
+ bg->alloc_offset, last_alloc);
+ return -EIO;
+ }
+
return 0;
}
@@ -1582,8 +1678,11 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 AUTO_KFREE(raid0_allocs);
u64 stripe_nr = 0, stripe_offset = 0;
u32 stripe_index = 0;
+ bool has_partial = false, has_conventional = false;
+ u64 prev_offset = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1591,6 +1690,60 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs),
+ GFP_NOFS);
+ if (!raid0_allocs)
+ return -ENOMEM;
+
+ /*
+ * When the last extent is removed, last_alloc can be smaller than the other write
+ * pointer. In that case, last_alloc should be moved to the corresponding write
+ * pointer position.
+ */
+ for (int i = 0; i < map->num_stripes; i += map->sub_stripes) {
+ u64 alloc = zone_info[i].alloc_offset;
+
+ for (int j = 1; j < map->sub_stripes; j++) {
+ int idx = i + j;
+
+ if (zone_info[idx].alloc_offset == WP_MISSING_DEV ||
+ zone_info[idx].alloc_offset == WP_CONVENTIONAL)
+ continue;
+ if (alloc == WP_MISSING_DEV || alloc == WP_CONVENTIONAL) {
+ alloc = zone_info[idx].alloc_offset;
+ } else if (unlikely(zone_info[idx].alloc_offset != alloc)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer mismatch found in block group %llu",
+ bg->start);
+ return -EIO;
+ }
+ }
+
+ raid0_allocs[i / map->sub_stripes] = alloc;
+ if (alloc == WP_CONVENTIONAL)
+ continue;
+ if (unlikely(alloc == WP_MISSING_DEV)) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer of block group %llu due to missing device",
+ bg->start);
+ return -EIO;
+ }
+
+ stripe_nr = alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = alloc & BTRFS_STRIPE_LEN_MASK;
+ if (stripe_offset == 0 && stripe_nr > 0) {
+ stripe_nr--;
+ stripe_offset = BTRFS_STRIPE_LEN;
+ }
+
+ alloc = ((stripe_nr * (map->num_stripes / map->sub_stripes) +
+ (i / map->sub_stripes)) <<
+ BTRFS_STRIPE_LEN_SHIFT) + stripe_offset;
+ last_alloc = max(last_alloc, alloc);
+ }
+ stripe_nr = 0;
+ stripe_offset = 0;
+
if (last_alloc) {
u32 factor = map->num_stripes / map->sub_stripes;
@@ -1600,24 +1753,51 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV)
- continue;
+ int idx = i / map->sub_stripes;
- if (test_bit(0, active) != test_bit(i, active)) {
- if (unlikely(!btrfs_zone_activate(bg)))
- return -EIO;
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ if (raid0_allocs[idx] == WP_CONVENTIONAL) {
+ has_conventional = true;
+ raid0_allocs[idx] = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > idx)
+ raid0_allocs[idx] += BTRFS_STRIPE_LEN;
+ else if (stripe_index == idx)
+ raid0_allocs[idx] += stripe_offset;
}
- if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
- zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+ if ((i % map->sub_stripes) == 0) {
+ /* Verification */
+ if (i != 0) {
+ if (unlikely(prev_offset < raid0_allocs[idx])) {
+ btrfs_err(fs_info,
+ "zoned: stripe position disorder found in block group %llu",
+ bg->start);
+ return -EIO;
+ }
- if (stripe_index > (i / map->sub_stripes))
- zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
- else if (stripe_index == (i / map->sub_stripes))
- zone_info[i].alloc_offset += stripe_offset;
+ if (unlikely(has_partial &&
+ (raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK))) {
+ btrfs_err(fs_info,
+ "zoned: multiple partial written stripe found in block group %llu",
+ bg->start);
+ return -EIO;
+ }
+ }
+ prev_offset = raid0_allocs[idx];
+
+ if ((raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK) != 0)
+ has_partial = true;
+ }
+
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = raid0_allocs[idx];
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (unlikely(!btrfs_zone_activate(bg)))
+ return -EIO;
+ } else if (test_bit(0, active)) {
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
if ((i % map->sub_stripes) == 0) {
@@ -1626,9 +1806,79 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
}
+ /* Check if all devices stay in the same stripe row. */
+ if (unlikely(zone_info[0].alloc_offset -
+ zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu",
+ bg->start);
+ return -EIO;
+ }
+
+ if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) {
+ btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu",
+ bg->alloc_offset, last_alloc);
+ return -EIO;
+ }
+
return 0;
}
+EXPORT_FOR_TESTS
+int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg,
+ struct btrfs_chunk_map *map,
+ struct zone_info *zone_info,
+ unsigned long *active, u64 last_alloc)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 profile;
+ int ret;
+
+ profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ switch (profile) {
+ case 0: /* single */
+ ret = btrfs_load_block_group_single(bg, &zone_info[0], active);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ ret = btrfs_load_block_group_dup(bg, map, zone_info, active, last_alloc);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(bg, map, zone_info, active, last_alloc);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(bg, map, zone_info, active, last_alloc);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(bg, map, zone_info, active, last_alloc);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ default:
+ btrfs_err(fs_info, "zoned: profile %s not yet supported",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
+ profile != BTRFS_BLOCK_GROUP_RAID10) {
+ /*
+ * Detected broken write pointer. Make this block group
+ * unallocatable by setting the allocation pointer at the end of
+ * allocatable region. Relocating this block group will fix the
+ * mismatch.
+ *
+ * Currently, we cannot handle RAID0 or RAID10 case like this
+ * because we don't have a proper zone_capacity value. But,
+ * reading from this block group won't work anyway by a missing
+ * stripe.
+ */
+ bg->alloc_offset = bg->zone_capacity;
+ }
+
+ return ret;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1641,7 +1891,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
- u64 profile;
if (!btrfs_is_zoned(fs_info))
return 0;
@@ -1701,53 +1950,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
}
- profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
- switch (profile) {
- case 0: /* single */
- ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
- break;
- case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
- last_alloc);
- break;
- case BTRFS_BLOCK_GROUP_RAID1:
- case BTRFS_BLOCK_GROUP_RAID1C3:
- case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info,
- active, last_alloc);
- break;
- case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info,
- active, last_alloc);
- break;
- case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info,
- active, last_alloc);
- break;
- case BTRFS_BLOCK_GROUP_RAID5:
- case BTRFS_BLOCK_GROUP_RAID6:
- default:
- btrfs_err(fs_info, "zoned: profile %s not yet supported",
- btrfs_bg_type_to_raid_name(map->type));
- ret = -EINVAL;
- goto out;
- }
-
- if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
- profile != BTRFS_BLOCK_GROUP_RAID10) {
- /*
- * Detected broken write pointer. Make this block group
- * unallocatable by setting the allocation pointer at the end of
- * allocatable region. Relocating this block group will fix the
- * mismatch.
- *
- * Currently, we cannot handle RAID0 or RAID10 case like this
- * because we don't have a proper zone_capacity value. But,
- * reading from this block group won't work anyway by a missing
- * stripe.
- */
- cache->alloc_offset = cache->zone_capacity;
- }
+ ret = btrfs_load_block_group_by_raid_type(cache, map, zone_info, active, last_alloc);
out:
/* Reject non SINGLE data profiles without RST */
@@ -2028,7 +2231,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group) {
if (block_group->start > eb->start ||
- block_group->start + block_group->length <= eb->start) {
+ btrfs_block_group_end(block_group) <= eb->start) {
btrfs_put_block_group(block_group);
block_group = NULL;
ctx->zoned_bg = NULL;
@@ -2248,7 +2451,7 @@ out_unlock:
static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- const u64 end = block_group->start + block_group->length;
+ const u64 end = btrfs_block_group_end(block_group);
struct extent_buffer *eb;
unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
@@ -2984,3 +3187,58 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num
return 0;
}
+
+void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq)
+{
+ struct btrfs_block_group *bg;
+ u64 data_reloc_bg;
+ u64 treelog_bg;
+
+ seq_puts(seq, "\n zoned statistics:\n");
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ seq_printf(seq, "\tactive block-groups: %zu\n",
+ list_count_nodes(&fs_info->zone_active_bgs));
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ seq_printf(seq, "\t reclaimable: %zu\n",
+ list_count_nodes(&fs_info->reclaim_bgs));
+ seq_printf(seq, "\t unused: %zu\n", list_count_nodes(&fs_info->unused_bgs));
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ seq_printf(seq, "\t need reclaim: %s\n",
+ str_true_false(btrfs_zoned_should_reclaim(fs_info)));
+
+ data_reloc_bg = data_race(fs_info->data_reloc_bg);
+ if (data_reloc_bg)
+ seq_printf(seq, "\tdata relocation block-group: %llu\n",
+ data_reloc_bg);
+ treelog_bg = data_race(fs_info->treelog_bg);
+ if (treelog_bg)
+ seq_printf(seq, "\ttree-log block-group: %llu\n", treelog_bg);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ seq_puts(seq, "\tactive zones:\n");
+ list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
+ u64 start;
+ u64 alloc_offset;
+ u64 used;
+ u64 reserved;
+ u64 zone_unusable;
+ const char *typestr = btrfs_space_info_type_str(bg->space_info);
+
+ spin_lock(&bg->lock);
+ start = bg->start;
+ alloc_offset = bg->alloc_offset;
+ used = bg->used;
+ reserved = bg->reserved;
+ zone_unusable = bg->zone_unusable;
+ spin_unlock(&bg->lock);
+
+ seq_printf(seq,
+ "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu (%s)\n",
+ start, alloc_offset, used, reserved, zone_unusable, typestr);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 5cefdeb08b7b..8e21a836f858 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -10,6 +10,7 @@
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/seq_file.h>
#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
@@ -96,6 +97,17 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
+void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct zone_info;
+
+int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg,
+ struct btrfs_chunk_map *map,
+ struct zone_info *zone_info,
+ unsigned long *active, u64 last_alloc);
+#endif
+
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -275,6 +287,11 @@ static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space
return 0;
}
+static inline void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info,
+					  struct seq_file *seq)
+{
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index c9cddcfa337b..32fd7f5454d3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -396,36 +396,31 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
- u64 start, struct folio **folios, unsigned long *out_folios,
- unsigned long *total_in, unsigned long *total_out)
+int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_inode *inode = cb->bbio.inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct bio *bio = &cb->bbio.bio;
zstd_cstream *stream;
int ret = 0;
- int nr_folios = 0;
- struct folio *in_folio = NULL; /* The current folio to read. */
- struct folio *out_folio = NULL; /* The current folio to write to. */
+ /* The current folio to read. */
+ struct folio *in_folio = NULL;
+ /* The current folio to write to. */
+ struct folio *out_folio = NULL;
unsigned long tot_in = 0;
unsigned long tot_out = 0;
- unsigned long len = *total_out;
- const unsigned long nr_dest_folios = *out_folios;
- const u64 orig_end = start + len;
+ const u64 start = cb->start;
+ const u32 len = cb->len;
+ const u64 end = start + len;
const u32 blocksize = fs_info->sectorsize;
const u32 min_folio_size = btrfs_min_folio_size(fs_info);
- unsigned long max_out = nr_dest_folios * min_folio_size;
- unsigned int cur_len;
workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
- *out_folios = 0;
- *total_out = 0;
- *total_in = 0;
- /* Initialize the stream */
- stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
- workspace->size);
+ /* Initialize the stream. */
+ stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size);
if (unlikely(!stream)) {
btrfs_err(fs_info,
"zstd compression init level %d failed, root %llu inode %llu offset %llu",
@@ -435,99 +430,95 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
goto out;
}
- /* map in the first page of input data */
+ /* Map in the first page of input data. */
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
- workspace->in_buf.size = cur_len;
+ workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start);
- /* Allocate and map in the output buffer */
+ /* Allocate and map in the output buffer. */
out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
+ workspace->out_buf.size = min_folio_size;
while (1) {
size_t ret2;
- ret2 = zstd_compress_stream(stream, &workspace->out_buf,
- &workspace->in_buf);
+ ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf);
if (unlikely(zstd_is_error(ret2))) {
btrfs_warn(fs_info,
"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
- start);
+ start + tot_in);
ret = -EIO;
goto out;
}
- /* Check to see if we are making it bigger */
+ /* Check to see if we are making it bigger. */
if (tot_in + workspace->in_buf.pos > blocksize * 2 &&
- tot_in + workspace->in_buf.pos <
- tot_out + workspace->out_buf.pos) {
+ tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
goto out;
}
- /* We've reached the end of our output range */
- if (workspace->out_buf.pos >= max_out) {
- tot_out += workspace->out_buf.pos;
- ret = -E2BIG;
- goto out;
- }
-
- /* Check if we need more output space */
- if (workspace->out_buf.pos == workspace->out_buf.size) {
+ /* Check if we need more output space. */
+ if (workspace->out_buf.pos >= workspace->out_buf.size) {
tot_out += min_folio_size;
- max_out -= min_folio_size;
- if (nr_folios == nr_dest_folios) {
+ if (tot_out >= len) {
ret = -E2BIG;
goto out;
}
+ /* Queue the current folio into the bio. */
+ if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
+ ret = -E2BIG;
+ goto out;
+ }
+
out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
+ workspace->out_buf.size = min_folio_size;
}
- /* We've reached the end of the input */
- if (workspace->in_buf.pos >= len) {
+ /* We've reached the end of the input. */
+ if (tot_in + workspace->in_buf.pos >= len) {
tot_in += workspace->in_buf.pos;
break;
}
- /* Check if we need more input */
- if (workspace->in_buf.pos == workspace->in_buf.size) {
+ /* Check if we need more input. */
+ if (workspace->in_buf.pos >= workspace->in_buf.size) {
+ u64 cur;
+
tot_in += workspace->in_buf.size;
+ cur = start + tot_in;
+
kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL;
folio_put(in_folio);
- start += cur_len;
- len -= cur_len;
- ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+
+ ret = btrfs_compress_filemap_get_folio(mapping, cur, &in_folio);
if (ret < 0)
goto out;
- cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
workspace->in_buf.src = kmap_local_folio(in_folio,
- offset_in_folio(in_folio, start));
+ offset_in_folio(in_folio, cur));
workspace->in_buf.pos = 0;
- workspace->in_buf.size = cur_len;
+ workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, cur);
}
}
+
while (1) {
size_t ret2;
@@ -537,23 +528,30 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
- start);
+ start + tot_in);
ret = -EIO;
goto out;
}
+ /* Queue the remaining part of the output folio into bio. */
if (ret2 == 0) {
tot_out += workspace->out_buf.pos;
+ if (tot_out >= len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ if (!bio_add_folio(bio, out_folio, workspace->out_buf.pos, 0)) {
+ ret = -E2BIG;
+ goto out;
+ }
+ out_folio = NULL;
break;
}
- if (workspace->out_buf.pos >= max_out) {
- tot_out += workspace->out_buf.pos;
+ tot_out += min_folio_size;
+ if (tot_out >= len) {
ret = -E2BIG;
goto out;
}
-
- tot_out += min_folio_size;
- max_out -= min_folio_size;
- if (nr_folios == nr_dest_folios) {
+ if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
ret = -E2BIG;
goto out;
}
@@ -562,10 +560,9 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
ret = -ENOMEM;
goto out;
}
- folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
+ workspace->out_buf.size = min_folio_size;
}
if (tot_out >= tot_in) {
@@ -574,10 +571,10 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
}
ret = 0;
- *total_in = tot_in;
- *total_out = tot_out;
+ ASSERT(tot_out == bio->bi_iter.bi_size);
out:
- *out_folios = nr_folios;
+ if (out_folio)
+ btrfs_free_compr_folio(out_folio);
if (workspace->in_buf.src) {
kunmap_local(workspace->in_buf.src);
folio_put(in_folio);
@@ -589,7 +586,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct folio **folios_in = cb->compressed_folios;
+ struct folio_iter fi;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
@@ -600,6 +597,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
unsigned long total_out = 0;
+ bio_first_folio(&fi, &cb->bbio.bio, 0);
+ if (unlikely(!fi.folio))
+ return -EINVAL;
+ ASSERT(folio_size(fi.folio) == blocksize);
+
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (unlikely(!stream)) {
@@ -612,7 +614,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
+ workspace->in_buf.src = kmap_local_folio(fi.folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
@@ -660,8 +662,9 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
srclen -= min_folio_size;
- workspace->in_buf.src =
- kmap_local_folio(folios_in[folio_in_index], 0);
+ bio_next_folio(&fi, &cb->bbio.bio);
+ ASSERT(fi.folio);
+ workspace->in_buf.src = kmap_local_folio(fi.folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
}
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e8fd92789423..9165154a274d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args {
#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
#define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14)
#define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16)
+#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17)
struct btrfs_ioctl_feature_flags {
__u64 compat_flags;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc29d273845d..f7843e6bb978 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -76,6 +76,9 @@
/* Tracks RAID stripes in block groups. */
#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+/* Holds details of remapped addresses after relocation. */
+#define BTRFS_REMAP_TREE_OBJECTID 13ULL
+
/* device stats in the device tree */
#define BTRFS_DEV_STATS_OBJECTID 0ULL
@@ -282,6 +285,10 @@
#define BTRFS_RAID_STRIPE_KEY 230
+#define BTRFS_IDENTITY_REMAP_KEY 234
+#define BTRFS_REMAP_KEY 235
+#define BTRFS_REMAP_BACKREF_KEY 236
+
/*
* Records the overall state of the qgroups.
* There's only one instance of this key present,
@@ -714,9 +721,12 @@ struct btrfs_super_block {
__u8 metadata_uuid[BTRFS_FSID_SIZE];
__u64 nr_global_roots;
+ __le64 remap_root;
+ __le64 remap_root_generation;
+ __u8 remap_root_level;
/* Future expansion */
- __le64 reserved[27];
+ __u8 reserved[199];
__u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -1161,12 +1171,15 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
+#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
+#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12)
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
- BTRFS_BLOCK_GROUP_METADATA)
+ BTRFS_BLOCK_GROUP_METADATA | \
+ BTRFS_BLOCK_GROUP_METADATA_REMAP)
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
@@ -1219,6 +1232,14 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_block_group_item_v2 {
+ __le64 used;
+ __le64 chunk_objectid;
+ __le64 flags;
+ __le64 remap_bytes;
+ __le32 identity_remap_count;
+} __attribute__ ((__packed__));
+
struct btrfs_free_space_info {
__le32 extent_count;
__le32 flags;
@@ -1323,4 +1344,13 @@ struct btrfs_verity_descriptor_item {
__u8 encryption;
} __attribute__ ((__packed__));
+/*
+ * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives
+ * the address that the start of the range will get remapped to. This
+ * structure is also shared by BTRFS_REMAP_BACKREF_KEY.
+ */
+struct btrfs_remap_item {
+ __le64 address;
+} __attribute__ ((__packed__));
+
#endif /* _BTRFS_CTREE_H_ */