Diffstat (limited to 'fs/btrfs')
75 files changed, 3024 insertions, 2514 deletions
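A recurring theme in the hunks below (acl.c, backref.h, block-group.c) is replacing manual kfree()/goto error paths with scope-based cleanup from linux/cleanup.h. The AUTO_KFREE() macro itself lives in fs/btrfs/misc.h and its body is not visible in this diff; the following is only a sketch of the pattern, with the macro body assumed from its usage and example_set_xattr() purely illustrative:

#include <linux/cleanup.h>
#include <linux/slab.h>

/*
 * Assumed reconstruction of AUTO_KFREE() based on its use below as
 * "char AUTO_KFREE(value);" -- the real definition is in misc.h.
 * __free(kfree) attaches a cleanup handler (the kfree free-class is
 * provided by DEFINE_FREE(kfree, ...) in linux/slab.h) that runs
 * kfree() when the variable goes out of scope.
 */
#define AUTO_KFREE(name)	*name __free(kfree) = NULL

static int example_set_xattr(size_t size)
{
	char AUTO_KFREE(value);		/* char *value __free(kfree) = NULL; */

	value = kmalloc(size, GFP_KERNEL);
	if (!value)
		return -ENOMEM;		/* no goto/kfree label needed */
	/* ... fill and consume value ... */
	return 0;			/* kfree(value) runs automatically here */
}

This is what lets __btrfs_set_acl() below drop its "out:" label, and the same facility backs BTRFS_PATH_AUTO_FREE() in block-group.c, guard(super_write)() in btrfs_reclaim_bgs_work(), and the DEFINE_FREE(inode_fs_paths, ...) helper added in backref.h.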
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 99b3ced12805..78721412951c 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" #include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e0ba00d64ea0..c336e2ab7f8a 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -14,12 +14,13 @@ #include "ctree.h" #include "xattr.h" #include "acl.h" +#include "misc.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { int size; const char *name; - char *value = NULL; + char AUTO_KFREE(value); struct posix_acl *acl; if (rcu) @@ -49,7 +50,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) acl = NULL; else acl = ERR_PTR(size); - kfree(value); return acl; } @@ -59,7 +59,7 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, { int ret, size = 0; const char *name; - char *value = NULL; + char AUTO_KFREE(value); switch (type) { case ACL_TYPE_ACCESS: @@ -85,28 +85,23 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); - if (!value) { - ret = -ENOMEM; - goto out; - } + if (!value) + return -ENOMEM; ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) - goto out; + return ret; } if (trans) ret = btrfs_setxattr(trans, inode, name, value, size, 0); else ret = btrfs_setxattr_trans(inode, name, value, size, 0); + if (ret < 0) + return ret; -out: - kfree(value); - - if (!ret) - set_cached_acl(inode, type, acl); - - return ret; + set_cached_acl(inode, type, acl); + return 0; } int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2ab550a1e715..78da47a3d00e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -666,10 +666,9 @@ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); btrfs_debug(ctx->fs_info, - "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", - ref->root_id, level, ref->count, ret, - ref->key_for_search.objectid, ref->key_for_search.type, - ref->key_for_search.offset); +"search slot in root %llu (level %d, ref count %d) returned %d for key " BTRFS_KEY_FMT, + ref->root_id, level, ref->count, ret, + BTRFS_KEY_FMT_VALUE(&ref->key_for_search)); if (ret < 0) goto out; @@ -1409,12 +1408,12 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, if (!path) return -ENOMEM; if (!ctx->trans) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } if (ctx->time_seq == BTRFS_SEQ_LAST) - path->skip_locking = 1; + path->skip_locking = true; again: head = NULL; @@ -1561,7 +1560,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, !path->skip_locking); if (ret) goto out; @@ -2786,7 +2785,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) * allocates space to return multiple file system paths for an inode. * total_bytes to allocate are passed, note that space usable for actual path * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. 
+ * the returned pointer must be freed with __free_inode_fs_paths() in the end. */ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path) @@ -2811,14 +2810,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, return ifp; } -void free_ipath(struct inode_fs_paths *ipath) -{ - if (!ipath) - return; - kvfree(ipath->fspath); - kfree(ipath); -} - struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; @@ -2834,8 +2825,8 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf } /* Current backref iterator only supports iteration in commit root */ - ret->path->search_commit_root = 1; - ret->path->skip_locking = 1; + ret->path->search_commit_root = true; + ret->path->skip_locking = true; ret->fs_info = fs_info; return ret; @@ -3308,8 +3299,8 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, level = cur->level + 1; /* Search the tree to find parent blocks referring to the block */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; path->lowest_level = level; ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); path->lowest_level = 0; @@ -3323,9 +3314,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, eb = path->nodes[level]; if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, -"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", +"couldn't find block (%llu) (level %d) in tree (%llu) with key " BTRFS_KEY_FMT, cur->bytenr, level - 1, btrfs_root_id(root), - tree_key->objectid, tree_key->type, tree_key->offset); + BTRFS_KEY_FMT_VALUE(tree_key)); btrfs_put_root(root); ret = -ENOENT; goto out; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 25d51c246070..1d009b0f4c69 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -241,7 +241,12 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); -void free_ipath(struct inode_fs_paths *ipath); + +DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *, + if (_T) { + kvfree(_T->fspath); + kfree(_T); + }) int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 21df48e6c4fa..fa1d321a2fb8 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -41,13 +41,17 @@ static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { + /* @inode parameter is mandatory. */ + ASSERT(inode); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->fs_info = fs_info; + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + bbio->file_offset = file_offset; atomic_set(&bbio->pending_ios, 1); WRITE_ONCE(bbio->status, BLK_STS_OK); } @@ -60,7 +64,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, * a mempool. 
*/ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { struct btrfs_bio *bbio; @@ -68,7 +72,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, end_io, private); + btrfs_bio_init(bbio, inode, file_offset, end_io, private); return bbio; } @@ -85,13 +89,13 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return ERR_CAST(bio); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); - bbio->inode = orig_bbio->inode; - bbio->file_offset = orig_bbio->file_offset; + btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio); orig_bbio->file_offset += map_length; if (bbio_has_ordered_extent(bbio)) { refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; + bbio->orig_logical = orig_bbio->orig_logical; + orig_bbio->orig_logical += map_length; } bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); @@ -100,6 +104,12 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + /* Make sure we're already in task context. */ + ASSERT(in_task()); + + if (bbio->async_csum) + wait_for_completion(&bbio->csum_done); + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -163,11 +173,30 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct btrfs_failed_bio *fbio = repair_bbio->private; struct btrfs_inode *inode = repair_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + /* + * We cannot move the saved_iter forward, as it will be used + * by repair_bbio again later. + */ + struct bvec_iter saved_iter = repair_bbio->saved_iter; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT; + const u32 nr_steps = repair_bbio->saved_iter.bi_size / step; int mirror = repair_bbio->mirror_num; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + phys_addr_t paddr; + unsigned int slot = 0; + + /* Repair bbio should be exactly one block sized.
*/ + ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); + + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { + ASSERT(slot < nr_steps); + paddrs[slot] = paddr; + slot++; + } if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -186,8 +215,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, mirror = prev_repair_mirror(fbio, mirror); btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, - repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bvec_phys(bv), mirror); + logical, paddrs, step, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -204,21 +232,25 @@ done: */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - phys_addr_t paddr, + phys_addr_t paddrs[], struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; - const u32 foff = offset_in_folio(folio, paddr); - const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; + /* + * For bs > ps cases, the saved_iter can be partially moved forward. + * In that case we should round it down to the block boundary. + */ + const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + sectorsize); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; - ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -238,15 +270,22 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, atomic_inc(&fbio->repair_count); - repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); - repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); + repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + int ret; + + ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE); + + ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + ASSERT(ret == step); + } repair_bbio = btrfs_bio(repair_bio); - btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); - repair_bbio->inode = failed_bbio->inode; - repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, + NULL, fbio); mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); @@ -258,10 +297,13 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = 
bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; u32 offset = 0; @@ -280,13 +322,19 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. */ bbio->bio.bi_status = BLK_STS_OK; - btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { - if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) - fbio = repair_one_sector(bbio, offset, paddr, fbio); - offset += sectorsize; + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, sectorsize)) { + if (status || + !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs)) + fbio = repair_one_sector(bbio, offset - sectorsize, + paddrs, fbio); + } } if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); if (fbio) btrfs_repair_done(fbio); @@ -317,36 +365,35 @@ static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_i return fs_info->endio_workers; } -static void btrfs_end_bio_work(struct work_struct *work) +static void simple_end_io_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; - /* Metadata reads are checked and repaired by the submitter. */ - if (is_data_bbio(bbio)) - btrfs_check_read_bio(bbio, bbio->bio.bi_private); - else - btrfs_bio_end_io(bbio, bbio->bio.bi_status); + if (bio_op(bio) == REQ_OP_READ) { + /* Metadata reads are checked and repaired by the submitter. */ + if (is_data_bbio(bbio)) + return btrfs_check_read_bio(bbio, bbio->bio.bi_private); + return btrfs_bio_end_io(bbio, bbio->bio.bi_status); + } + if (bio_is_zone_append(bio) && !bio->bi_status) + btrfs_record_physical_zoned(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_device *dev = bio->bi_private; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) btrfs_log_dev_io_error(bio, dev); - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - if (bio_is_zone_append(bio) && !bio->bi_status) - btrfs_record_physical_zoned(bbio); - btrfs_bio_end_io(bbio, bbio->bio.bi_status); - } + INIT_WORK(&bbio->end_io_work, simple_end_io_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } static void btrfs_raid56_end_io(struct bio *bio) @@ -354,6 +401,9 @@ static void btrfs_raid56_end_io(struct bio *bio) struct btrfs_io_context *bioc = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + /* RAID56 endio is always handled in workqueue. 
*/ + ASSERT(in_task()); + btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) @@ -364,11 +414,12 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_orig_write_end_io(struct bio *bio) +static void orig_write_end_io_work(struct work_struct *work) { + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); btrfs_bio_counter_dec(bioc->fs_info); @@ -393,8 +444,18 @@ static void btrfs_orig_write_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_clone_write_end_io(struct bio *bio) +static void btrfs_orig_write_end_io(struct bio *bio) { + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, orig_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + +static void clone_write_end_io_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; if (bio->bi_status) { @@ -409,6 +470,14 @@ static void btrfs_clone_write_end_io(struct bio *bio) bio_put(bio); } +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, clone_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { if (!dev || !dev->bdev || @@ -455,6 +524,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) { struct bio *orig_bio = bioc->orig_bio, *bio; + struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio); ASSERT(bio_op(orig_bio) != REQ_OP_READ); @@ -463,8 +533,11 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio = orig_bio; bio->bi_end_io = btrfs_orig_write_end_io; } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + /* We need to use endio_work to run end_io in task context. 
*/ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset); bio_inc_remaining(orig_bio); + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, + orig_bbio->file_offset, NULL, NULL); bio->bi_end_io = btrfs_clone_write_end_io; } @@ -509,7 +582,11 @@ static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); - return btrfs_csum_one_bio(bbio); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + return btrfs_csum_one_bio(bbio, true); +#else + return btrfs_csum_one_bio(bbio, false); +#endif } /* @@ -581,20 +658,25 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool should_async_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; bool auto_csum_mode = true; #ifdef CONFIG_BTRFS_EXPERIMENTAL - struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) - return false; - - auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) + return true; + /* + * Write bios calculate the checksum and submit the bio at the same + * time. Unless explicitly required, don't offload the serial csum + * calculation and bio submission to a workqueue. + */ + return false; #endif /* Submit synchronously if the checksum implementation is fast. */ - if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return false; /* @@ -605,7 +687,7 @@ static bool should_async_write(struct btrfs_bio *bbio) return false; /* Zoned devices require I/O to be submitted in order. */ - if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info)) return false; return true; @@ -620,7 +702,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -639,11 +721,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; unsigned int nr_segs; int sector_offset; - map_length = min(map_length, bbio->fs_info->max_zone_append_size); - sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + map_length = min(map_length, fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits, &nr_segs, map_length); if (sector_offset) { /* * bio_split_rw_at() could split at a size smaller than our * sectorsize and thus cause unaligned I/Os. Fix that by * always rounding down to the nearest boundary.
*/ - return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize); } return map_length; } @@ -659,7 +742,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -670,7 +753,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t status; int ret; - if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; else smap.rst_search_commit_root = false; @@ -684,6 +767,14 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto end_bbio; } + /* + * For fscrypt writes we will get the encrypted bio after we've remapped + * our bio to the physical disk location, so we need to save the + * original bytenr so we know what we're checksumming. + */ + if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) + bbio->orig_logical = logical; + map_length = min(map_length, length); if (use_append) map_length = btrfs_append_map_length(bbio, map_length); @@ -734,7 +825,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. */ - if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && @@ -782,25 +873,27 @@ end_bbio: static void assert_bbio_alignment(struct btrfs_bio *bbio) { #ifdef CONFIG_BTRFS_ASSERT - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio_vec bvec; struct bvec_iter iter; const u32 blocksize = fs_info->sectorsize; + const u32 alignment = min(blocksize, PAGE_SIZE); + const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + const u32 length = bbio->bio.bi_iter.bi_size; - /* Metadata has no extra bs > ps alignment requirement. */ - if (!is_data_bbio(bbio)) - return; + /* The logical and length should still be aligned to blocksize. */ + ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) && + length != 0, "root=%llu inode=%llu logical=%llu length=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), logical, length); bio_for_each_bvec(bvec, &bbio->bio, iter) - ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && - IS_ALIGNED(bvec.bv_len, blocksize), + ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) && + IS_ALIGNED(bvec.bv_len, alignment), "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", btrfs_root_id(bbio->inode->root), - btrfs_ino(bbio->inode), - bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, - bbio->bio.bi_iter.bi_size, iter.bi_idx, - bvec.bv_offset, - bvec.bv_len); + btrfs_ino(bbio->inode), logical, length, iter.bi_idx, + bvec.bv_offset, bvec.bv_len); #endif } @@ -824,18 +917,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * * The I/O is issued synchronously to block the repair read completion from * freeing the bio. 
+ * + * @ino: Offending inode number + * @fileoff: File offset inside the inode + * @length: Length of the repair write + * @logical: Logical address of the range + * @paddrs: Physical address array of the content + * @step: Length of each @paddrs entry + * @mirror_num: Mirror number to write to. Must not be zero */ -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num) +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num) { + const u32 nr_steps = DIV_ROUND_UP_POW2(length, step); struct btrfs_io_stripe smap = { 0 }; - struct bio_vec bvec; - struct bio bio; + struct bio *bio = NULL; int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + /* Basic alignment checks. */ + ASSERT(IS_ALIGNED(logical, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(length, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize)); + /* Either it's a single data or metadata block. */ + ASSERT(length <= BTRFS_MAX_BLOCKSIZE); + ASSERT(step <= length); + ASSERT(is_power_of_2(step)); + if (btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -855,24 +966,27 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_counter_dec; } - bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); - ret = submit_bio_wait(&bio); + bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i])); + /* We should have allocated enough slots to contain all the different pages. */ + ASSERT(ret == step); + } + ret = submit_bio_wait(bio); + bio_put(bio); if (ret) { /* try to remap that extent elsewhere? */ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; + goto out_counter_dec; } btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(smap.dev), + ino, fileoff, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); ret = 0; -out_bio_uninit: - bio_uninit(&bio); out_counter_dec: btrfs_bio_counter_dec(fs_info); return ret; @@ -885,16 +999,16 @@ out_counter_dec: */ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bbio->bio.bi_iter.bi_size; struct btrfs_io_stripe smap = { 0 }; int ret; - ASSERT(fs_info); ASSERT(mirror_num > 0); ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); - ASSERT(!bbio->inode); + ASSERT(!is_data_inode(bbio->inode)); + ASSERT(bbio->is_scrub); btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 00883aea55d7..1be74209f0b8 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -18,13 +18,6 @@ struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array.
This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* @@ -34,7 +27,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); struct btrfs_bio { /* * Inode and offset into it that this I/O operates on. - * Only set for data I/O. + * + * If the inode is a data one, csum verification and read-repair + * will be done automatically. + * If the inode is a metadata one, everything is handled by the caller. */ struct btrfs_inode *inode; u64 file_offset; @@ -56,11 +52,16 @@ struct btrfs_bio { * - pointer to the checksums for this bio * - original physical address from the allocator * (for zone append only) + * - original logical address, used for checksumming fscrypt bios */ struct { struct btrfs_ordered_extent *ordered; struct btrfs_ordered_sum *sums; + struct work_struct csum_work; + struct completion csum_done; + struct bvec_iter csum_saved_iter; u64 orig_physical; + u64 orig_logical; }; /* For metadata reads: parentness verification. */ @@ -76,14 +77,21 @@ struct btrfs_bio { atomic_t pending_ios; struct work_struct end_io_work; - /* File system that this I/O operates on. */ - struct btrfs_fs_info *fs_info; - /* Save the first error status of split bio. */ blk_status_t status; /* Use the commit root to look up csums (data read bio only). */ bool csum_search_commit_root; + + /* + * Since scrub will reuse btree inode, we need this flag to distinguish + * scrub bios. + */ + bool is_scrub; + + /* Whether the csum generation for data write is async. */ + bool async_csum; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -99,10 +107,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); @@ -111,7 +119,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5322ef2ae015..08b14449fabe 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -613,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET)); - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; search_offset = index * div_u64(block_group->length, max_index); @@ -744,8 +744,8 @@ 
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) * root to add free space. So we skip locking and search the commit * root, since its read-only */ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; key.objectid = last; @@ -1065,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; struct inode *inode; @@ -1305,7 +1305,6 @@ out: btrfs_put_block_group(block_group); if (remove_rsv) btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); - btrfs_free_path(path); return ret; } @@ -1403,8 +1402,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of * leeway to allow us to mark this block group as read only. */ - if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, - BTRFS_RESERVE_NO_FLUSH)) + if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH)) ret = 0; } @@ -1425,7 +1423,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); + btrfs_dump_space_info(cache->space_info, 0, false); } return ret; } @@ -1850,12 +1848,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_should_reclaim(fs_info)) return; - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - sb_end_write(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) return; - } /* * Long running balances can keep us blocked here for eternity, so @@ -1863,7 +1859,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); return; } @@ -1947,7 +1942,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* * Get out fast, in case we're read-only or unmounting the * filesystem. It is OK to drop block groups from the list even - * for the read-only case. As we did sb_start_write(), + * for the read-only case. As we did take the super write lock, * "mount -o remount,ro" won't happen and read-only filesystem * means it is forced read-only due to a fatal error. So, it * never gets back to read-write to let us reclaim again. @@ -2030,7 +2025,6 @@ end: list_splice_tail(&retry_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -3072,7 +3066,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + ret = btrfs_zoned_activate_one_bg(space_info, true); if (ret < 0) goto out; @@ -3803,7 +3797,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; @@ -3814,30 +3808,38 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - goto out; + goto out_error; } if (btrfs_block_group_should_use_size_class(cache)) { size_class = btrfs_calc_block_group_size_class(num_bytes); ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); if (ret) - goto out; + goto out_error; } + cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; + if (delalloc) + cache->delalloc_bytes += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); + spin_unlock(&cache->lock); + + space_info->bytes_reserved += num_bytes; btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; /* * Compression can use less space than we reserved, so wake tickets if * that happens. */ if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); -out: + btrfs_try_granting_tickets(space_info); + spin_unlock(&space_info->lock); + + return 0; + +out_error: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -3859,22 +3861,25 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; + bool bg_ro; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (cache->ro) + bg_ro = cache->ro; + cache->reserved -= num_bytes; + if (is_delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + + if (bg_ro) space_info->bytes_readonly += num_bytes; else if (btrfs_is_zoned(cache->fs_info)) space_info->bytes_zone_unusable += num_bytes; - cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (is_delalloc) - cache->delalloc_bytes -= num_bytes; - spin_unlock(&cache->lock); - - btrfs_try_granting_tickets(cache->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -4192,11 +4197,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, should_alloc = should_alloc_chunk(fs_info, space_info, force); if (space_info->full) { /* No more free physical space */ + spin_unlock(&space_info->lock); if (should_alloc) ret = -ENOSPC; else ret = 0; - spin_unlock(&space_info->lock); return ret; } else if (!should_alloc) { spin_unlock(&space_info->lock); @@ -4208,16 +4213,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, * recheck if we should continue with our allocation * attempt. 
*/ + spin_unlock(&space_info->lock); wait_for_alloc = true; force = CHUNK_ALLOC_NO_FORCE; - spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { /* Proceed with allocation */ - space_info->chunk_alloc = 1; - wait_for_alloc = false; + space_info->chunk_alloc = true; spin_unlock(&space_info->lock); + wait_for_alloc = false; } cond_resched(); @@ -4264,7 +4269,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) - space_info->full = 1; + space_info->full = true; else goto out; } else { @@ -4274,7 +4279,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: - space_info->chunk_alloc = 0; + space_info->chunk_alloc = false; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); @@ -4315,7 +4320,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, false); + btrfs_dump_space_info(info, 0, false); } if (left < bytes) { @@ -4340,7 +4345,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * We have a new chunk. We also need to activate it for * zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + ret = btrfs_zoned_activate_one_bg(info, true); if (ret < 0) return; @@ -4460,7 +4465,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) * indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); /* * If there was a failure to cleanup a log tree, very likely due to an @@ -4471,7 +4476,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); } WARN_ON(space_info->reclaim_size > 0); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9172104a5889..5f933455118c 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -345,7 +345,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 5ad6de738aee..96cf7a162987 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -218,8 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -259,8 +258,7 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, 
block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -387,7 +385,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; - btrfs_try_granting_tickets(fs_info, sinfo); + btrfs_try_granting_tickets(sinfo); } block_rsv->full = (block_rsv->reserved == block_rsv->size); @@ -530,8 +528,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - blocksize, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -552,7 +550,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index af373d50a901..73602ee8de3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -18,20 +18,20 @@ #include <linux/lockdep.h> #include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> +#include "ctree.h" #include "block-rsv.h" #include "extent_map.h" -#include "extent_io.h" #include "extent-io-tree.h" -#include "ordered-data.h" -#include "delayed-inode.h" -struct extent_state; struct posix_acl; struct iov_iter; struct writeback_control; struct btrfs_root; struct btrfs_fs_info; struct btrfs_trans_handle; +struct btrfs_bio; +struct btrfs_file_extent; +struct btrfs_delayed_node; /* * Since we search a directory based on f_pos (struct dir_context::pos) we have @@ -543,16 +543,14 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #endif } -/* Array of bytes with variable length, hexadecimal format 0x1234 */ -#define CSUM_FMT "0x%*phN" -#define CSUM_FMT_VALUE(size, bytes) size, bytes - -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest); +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest); +void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, + const phys_addr_t paddrs[], u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr); + u32 bio_offset, const phys_addr_t paddrs[]); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..7dda6cc68379 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -67,9 +67,7 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, GFP_NOFS, &btrfs_compressed_bioset)); - btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); - bbio->inode = inode; - bbio->file_offset = start; + btrfs_bio_init(bbio, inode, start, end_io, NULL); return to_compressed_bio(bbio); } @@ -194,15 +192,13 @@ 
static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_c static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) { - struct list_head remove; + LIST_HEAD(remove); struct list_head *tmp, *next; int freed; if (compr_pool.count == 0) return SHRINK_STOP; - INIT_LIST_HEAD(&remove); - /* For now, just simply drain the whole list. */ spin_lock(&compr_pool.lock); list_splice_init(&compr_pool.list, &remove); @@ -321,22 +317,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) /* the inode may be gone now */ } -static void btrfs_finish_compressed_write_work(struct work_struct *work) -{ - struct compressed_bio *cb = - container_of(work, struct compressed_bio, write_end_work); - - btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, - cb->bbio.bio.bi_status == BLK_STS_OK); - - if (cb->writeback) - end_compressed_writeback(cb); - /* Note, our inode could be gone now */ - - btrfs_free_compressed_folios(cb); - bio_put(&cb->bbio.bio); -} - /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -347,28 +327,33 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) + end_compressed_writeback(cb); + /* Note, our inode could be gone now. */ + btrfs_free_compressed_folios(cb); + bio_put(&cb->bbio.bio); } static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { - struct btrfs_fs_info *fs_info = cb->bbio.fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; + unsigned int findex = 0; while (offset < cb->compressed_len) { - struct folio *folio; + struct folio *folio = cb->compressed_folios[findex]; + u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); int ret; - u32 len = min_t(u32, cb->compressed_len - offset, - btrfs_min_folio_size(fs_info)); - folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit. */ ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; + findex++; } } @@ -402,7 +387,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; - INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; @@ -1100,7 +1084,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) /* * a less complex decompression routine. Our compressed data fits in a * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in + * dest_pgoff tells us the offset into the destination folio where we write the + * decompressed data. 
*/ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index eba188a9e3bb..e0228017e861 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -14,14 +14,12 @@ #include <linux/pagemap.h> #include "bio.h" #include "fs.h" -#include "messages.h" +#include "btrfs_inode.h" struct address_space; -struct page; struct inode; struct btrfs_inode; struct btrfs_ordered_extent; -struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -65,11 +63,8 @@ struct compressed_bio { /* Whether this is a write for writeback. */ bool writeback; - union { - /* For reads, this is the bio we are copying the data into */ - struct btrfs_bio *orig_bbio; - struct work_struct write_end_work; - }; + /* For reads, this is the bio we are copying the data into. */ + struct btrfs_bio *orig_bbio; /* Must be last. */ struct btrfs_bio bbio; @@ -77,7 +72,7 @@ struct compressed_bio { static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) { - return cb->bbio.fs_info; + return cb->bbio.inode->root->fs_info; } /* @range_end must be exclusive. */ @@ -85,8 +80,8 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 { /* @cur must be inside the folio. */ ASSERT(folio_pos(folio) <= cur); - ASSERT(cur < folio_end(folio)); - return min(range_end, folio_end(folio)) - cur; + ASSERT(cur < folio_next_pos(folio)); + return umin(range_end, folio_next_pos(folio)) - cur; } int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); @@ -100,7 +95,7 @@ int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inod u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, - unsigned long start_byte, size_t srclen, size_t destlen); + unsigned long dest_pgoff, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 561658aca018..a48b4befbee7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -862,6 +862,75 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, } /* + * Promote a child node to become the new tree root. + * + * @trans: Transaction handle + * @root: Tree root structure to update + * @path: Path holding nodes and locks + * @level: Level of the parent (old root) + * @parent: The parent (old root) with exactly one item + * + * This helper is called during rebalancing when the root node contains only + * a single item (nritems == 1). We can reduce the tree height by promoting + * that child to become the new root and freeing the old root node. The path + * locks and references are updated accordingly. + * + * Return: 0 on success, negative errno on failure. The transaction is aborted + * on critical errors. 
+ */ +static int promote_child_to_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int level, struct extent_buffer *parent) +{ + struct extent_buffer *child; + int ret; + + ASSERT(btrfs_header_nritems(parent) == 1); + + child = btrfs_read_node_slot(parent, 0); + if (IS_ERR(child)) + return PTR_ERR(child); + + btrfs_tree_lock(child); + ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + return ret; + } + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + return ret; + } + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + btrfs_clear_buffer_dirty(trans, parent); + btrfs_tree_unlock(parent); + /* Once for the path. */ + free_extent_buffer(parent); + + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); + /* Once for the root ptr. */ + free_extent_buffer_stale(parent); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + +/* * node level balancing, used to make sure nodes are in proper order for * item deletion. We balance from the top down, so we have to make sure * that a deletion won't leave an node completely empty later on. @@ -900,55 +969,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * by promoting the node below to a root */ if (!parent) { - struct extent_buffer *child; - if (btrfs_header_nritems(mid) != 1) return 0; - /* promote the child to a root */ - child = btrfs_read_node_slot(mid, 0); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - btrfs_tree_lock(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, - BTRFS_NESTING_COW); - if (ret) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - goto out; - } - - ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (unlikely(ret < 0)) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - btrfs_abort_transaction(trans, ret); - goto out; - } - rcu_assign_pointer(root->node, child); - - add_root_to_dirty_list(root); - btrfs_tree_unlock(child); - - path->locks[level] = 0; - path->nodes[level] = NULL; - btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); - - root_sub_used_bytes(root); - ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); - /* once for the root ptr */ - free_extent_buffer_stale(mid); - if (unlikely(ret < 0)) { - btrfs_abort_transaction(trans, ret); - goto out; - } - return 0; + return promote_child_to_root(trans, root, path, level, mid); } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) @@ -1101,11 +1125,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; + /* Left is now owned by path. 
*/ + left = NULL; if (mid) { btrfs_tree_unlock(mid); free_extent_buffer(mid); @@ -1125,8 +1150,7 @@ out: free_extent_buffer(right); } if (left) { - if (path->nodes[level] != left) - btrfs_tree_unlock(left); + btrfs_tree_unlock(left); free_extent_buffer(left); } return ret; @@ -1435,8 +1459,8 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } if (i >= lowest_unlock && i > skip_level) { - check_skip = false; btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + check_skip = false; path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -1709,9 +1733,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when - * p->search_commit_root = 1. + * p->search_commit_root is true. */ - ASSERT(p->skip_locking == 1); + ASSERT(p->skip_locking); goto out; } @@ -2599,12 +2623,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2613,12 +2636,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2677,10 +2699,9 @@ static bool check_sibling_keys(const struct extent_buffer *left, btrfs_crit(left->fs_info, "right extent buffer:"); btrfs_print_tree(right, false); btrfs_crit(left->fs_info, -"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", - left_last.objectid, left_last.type, - left_last.offset, right_first.objectid, - right_first.type, right_first.offset); +"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&left_last), + BTRFS_KEY_FMT_VALUE(&right_first)); return true; } return false; @@ -3217,10 +3238,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clear_buffer_dirty(trans, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(left); + free_extent_buffer(left); path->nodes[0] = right; path->slots[1] += 1; } else { @@ -3398,9 +3417,13 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) - WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, - right_nritems); + if (unlikely(push_items > right_nritems)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", + push_items, right_nritems); + goto out; + } if (push_items < right_nritems) { 
push_space = btrfs_item_offset(right, push_items - 1) - @@ -3433,8 +3456,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(right); + free_extent_buffer(right); path->nodes[0] = left; path->slots[1] -= 1; } else { @@ -3861,10 +3884,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - path->keep_locks = 1; - path->search_for_split = 1; + path->keep_locks = true; + path->search_for_split = true; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - path->search_for_split = 0; + path->search_for_split = false; if (ret > 0) ret = -EAGAIN; if (ret < 0) @@ -3891,11 +3914,11 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (ret) goto err; - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); return 0; err: - path->keep_locks = 0; + path->keep_locks = false; return ret; } @@ -4109,7 +4132,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(leaf) < data_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { btrfs_print_leaf(leaf); BUG(); } @@ -4139,7 +4162,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, memmove_leaf_data(leaf, data_end - data_size, data_end, old_data - data_end); - data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); @@ -4498,9 +4520,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* delete the leaf if we've emptied it */ if (nritems == 0) { - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { + if (leaf != root->node) { btrfs_clear_buffer_dirty(trans, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); if (ret < 0) @@ -4566,10 +4586,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; ret = btrfs_del_leaf(trans, root, path, leaf); + free_extent_buffer(leaf); if (ret < 0) return ret; - free_extent_buffer(leaf); - ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4613,11 +4632,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; - int keep_locks = path->keep_locks; + const bool keep_locks = path->keep_locks; ASSERT(!path->nowait); ASSERT(path->lowest_level == 0); - path->keep_locks = 1; + path->keep_locks = true; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -4707,7 +4726,7 @@ out: * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree * - * path->keep_locks should be set to 1 on the search made before + * path->keep_locks should be set to true on the search made before * calling this function. 
*/ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -4806,13 +4825,13 @@ again: next = NULL; btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); } else { if (path->need_commit_sem) { - path->need_commit_sem = 0; + path->need_commit_sem = false; need_commit_sem = true; if (path->nowait) { if (!down_read_trylock(&fs_info->commit_root_sem)) { @@ -4825,41 +4844,30 @@ again: } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } - path->keep_locks = 0; + path->keep_locks = false; if (ret < 0) goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* - * by releasing the path above we dropped all our locks. A balance - * could have added more items next to the key that used to be - * at the very end of the block. So, check again here and - * advance the path if there are now more items available. - */ - if (nritems > 0 && path->slots[0] < nritems - 1) { - if (ret == 0) - path->slots[0]++; - ret = 0; - goto done; - } - /* - * So the above check misses one case: - * - after releasing the path above, someone has removed the item that - * used to be at the very end of the block, and balance between leafs - * gets another one with bigger key.offset to replace it. + * By releasing the path above we dropped all our locks. A balance + * could have happened and * - * This one should be returned as well, or we can get leaf corruption - * later(esp. in __btrfs_drop_extents()). + * 1. added more items after the previous last item + * 2. deleted the previous last item * - * And a bit more explanation about this check, - * with ret > 0, the key isn't found, the path points to the slot - * where it should be inserted, so the path->slots[0] item must be the - * bigger one. + * So, check again here and advance the path if there are now more + * items available. */ - if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { - ret = 0; - goto done; + if (nritems > 0 && path->slots[0] <= nritems - 1) { + if (ret == 0 && path->slots[0] != nritems - 1) { + path->slots[0]++; + goto done; + } else if (ret > 0) { + ret = 0; + goto done; + } } while (level < BTRFS_MAX_LEVEL) { @@ -4964,7 +4972,7 @@ done: if (need_commit_sem) { int ret2; - path->need_commit_sem = 1; + path->need_commit_sem = true; ret2 = finish_need_commit_sem_search(path); up_read(&fs_info->commit_root_sem); if (ret2) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe70b593c7cd..692370fc07b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,9 +17,7 @@ #include <linux/refcount.h> #include <uapi/linux/btrfs_tree.h> #include "locking.h" -#include "fs.h" #include "accessors.h" -#include "extent-io-tree.h" struct extent_buffer; struct btrfs_block_rsv; @@ -67,21 +65,21 @@ struct btrfs_path { * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - unsigned int search_for_split:1; + bool search_for_split:1; /* Keep some upper locks as we walk down. */ - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int search_commit_root:1; - unsigned int need_commit_sem:1; - unsigned int skip_release_on_error:1; + bool keep_locks:1; + bool skip_locking:1; + bool search_commit_root:1; + bool need_commit_sem:1; + bool skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). 
*/ - unsigned int search_for_extension:1; + bool search_for_extension:1; /* Stop search if any locks need to be taken (for read) */ - unsigned int nowait:1; + bool nowait:1; }; #define BTRFS_PATH_AUTO_FREE(path_name) \ diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 7b277934f66f..b81e224d4a27 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "defrag.h" #include "file-item.h" #include "super.h" +#include "compression.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -254,10 +255,9 @@ again: range.extent_thresh = defrag->extent_thresh; file_ra_state_init(ra, inode->vfs_inode.i_mapping); - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); + scoped_guard(super_write, fs_info->sb) + ret = btrfs_defrag_file(inode, ra, &range, + defrag->transid, BTRFS_DEFRAG_BATCH); iput(&inode->vfs_inode); if (ret < 0) @@ -471,7 +471,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, memcpy(&key, &root->defrag_progress, sizeof(key)); } - path->keep_locks = 1; + path->keep_locks = true; ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) @@ -514,7 +514,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* * Now that we reallocated the node we can find the next key. Note that * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, + * without COWing, this is because even with path->keep_locks == true, * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a * node when path->slots[node_level - 1] does not point to the last * item or a slot beyond the last item (ctree.c:unlock_up()).
Therefore @@ -886,7 +886,7 @@ again: } lock_start = folio_pos(folio); - lock_end = folio_end(folio) - 1; + lock_end = folio_next_pos(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; @@ -1178,7 +1178,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, if (!folio) break; - if (start >= folio_end(folio) || start + len <= folio_pos(folio)) + if (start >= folio_next_pos(folio) || + start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); @@ -1219,7 +1220,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, folios[i] = NULL; goto free_folios; } - cur = folio_end(folios[i]); + cur = folio_next_pos(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 288e1776c02d..0970799d0aa4 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, meta_reserve, + flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 41e37f7f67cc..ce6e9f8812e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -668,7 +668,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_key first_key; const u32 first_data_size = first_item->data_len; int total_size; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); int ret; bool continuous_keys_only = false; @@ -740,10 +740,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ins_data = kmalloc_array(batch.nr, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); - if (!ins_data) { - ret = -ENOMEM; - goto out; - } + if (!ins_data) + return -ENOMEM; ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); batch.keys = ins_keys; @@ -759,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) - goto out; + return ret; list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; @@ -814,9 +812,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - kfree(ins_data); - return ret; + + return 0; } static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, @@ -2011,13 +2008,10 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) * It is very rare. 
*/ mutex_lock(&delayed_node->mutex); - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - goto release_node; - - set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count++; - atomic_inc(&fs_info->delayed_root->items); -release_node: + if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + delayed_node->count++; + atomic_inc(&fs_info->delayed_root->items); + } mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; @@ -2110,9 +2104,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i], &delayed_node_trackers[i]); - btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); } } } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0d949edc0caf..b09d4ec8c77d 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -219,6 +219,13 @@ static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) return; + /* + * Only print if there are leaked references. The caller is + * holding one reference, so if refs == 1 there is no leak. + */ + if (refcount_read(&node->refs) == 1) + return; + ref_tracker_dir_print(&node->ref_dir.dir, BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 481802efaa14..e8bc37453336 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -228,7 +228,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush); if (ret) return ret; @@ -798,9 +798,13 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } /* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. + * Helper function to actually insert a head node into the xarray. This does all + * the dirty work in terms of maintaining the correct overall modification + * count. + * + * The caller is responsible for calling kfree() on @qrecord. More specifically, + * if this function reports, via @qrecord_inserted_ret, that it did not + * insert it, then it's safe to call kfree() on it. * * Returns an error pointer in case of an error. */ @@ -814,7 +818,14 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); - bool qrecord_inserted = false; + + /* + * If 'qrecord_inserted_ret' is provided, the first thing we need to do + * is initialize it to false, in case we exit before trying to insert + * the record. + */ + if (qrecord_inserted_ret) + *qrecord_inserted_ret = false; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); @@ -833,6 +844,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { + /* + * Setting 'qrecord' but not 'qrecord_inserted_ret' will likely + * result in a memory leak.
+ */ + ASSERT(qrecord_inserted_ret != NULL); + int ret; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, @@ -840,12 +857,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); - /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); - kfree(qrecord); - } else { - qrecord_inserted = true; + } else if (qrecord_inserted_ret) { + *qrecord_inserted_ret = true; } } @@ -888,8 +903,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads++; delayed_refs->num_heads_ready++; } - if (qrecord_inserted_ret) - *qrecord_inserted_ret = qrecord_inserted; return head_ref; } @@ -1049,6 +1062,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); + + /* + * It's only safe to call kfree() on 'qrecord' if + * add_delayed_ref_head() has _not_ inserted it for + * tracing. Otherwise we need to handle this here. + */ + if (!qrecord_reserved || qrecord_inserted) + goto free_head_ref; goto free_record; } head_ref = new_head_ref; @@ -1071,6 +1092,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); + + kfree(record); return 0; free_record: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a4eaef60549e..b6c7da8e1bc8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -489,8 +489,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = src_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 69863e398e22..085a83ae9e62 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "accessors.h" #include "dir-item.h" +#include "delayed-inode.h" /* * insert a name into a directory, doing overflow properly if there is a hash @@ -111,7 +112,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; @@ -163,7 +164,6 @@ second_insert: ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: - btrfs_free_path(path); if (ret) return ret; if (ret2) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 802d4dbe5b38..07e19e88ba4b 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -10,6 +10,8 @@ #include "fs.h" #include "transaction.h" #include "volumes.h" +#include "bio.h" +#include "ordered-data.h" struct btrfs_dio_data { ssize_t submitted; @@ -184,7 +186,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); + 0, alloc_hint, &ins, true, true); if (ret == -EAGAIN) { ASSERT(btrfs_is_zoned(fs_info)); wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, @@ -385,7 +387,7 @@ static int btrfs_dio_iomap_begin(struct 
inode *inode, loff_t start, * to allocate a contiguous array for the checksums. */ if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS); lockstart = start; lockend = start + len - 1; @@ -713,10 +715,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset, btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0aa7e5d1b05f..89149fac804c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "delayed-inode.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -182,26 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 step = min(fs_info->nodesize, PAGE_SIZE); + const u32 nr_steps = eb->len / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_extent_folios(eb); i++) { + for (int i = 0; i < num_extent_pages(eb); i++) { struct folio *folio = eb->folios[i]; - u64 start = max_t(u64, eb->start, folio_pos(folio)); - u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + eb->folio_size); - u32 len = end - start; - phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + - offset_in_folio(folio, start); - - ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, - paddr, mirror_num); - if (ret) - break; + + /* No large folio support yet. */ + ASSERT(folio_order(folio) == 0); + ASSERT(i < nr_steps); + + /* + * For nodesize < page size, there is just one paddr, with some + * offset inside the page. + * + * For nodesize >= page size, it's one or more paddrs, and eb->start + * must be aligned to a page boundary. + */ + paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } + ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, + paddrs, step, mirror_num); return ret; } @@ -398,10 +406,10 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", +"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s", eb->start, eb->read_mirror, - CSUM_FMT_VALUE(csum_size, header_csum), - CSUM_FMT_VALUE(csum_size, result), + BTRFS_CSUM_FMT_VALUE(csum_size, header_csum), + BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ?
", ignored" : ""); if (unlikely(!ignore_csum)) { @@ -644,20 +652,10 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, if (!root) return NULL; - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; root->root_key.objectid = objectid; - root->node = NULL; - root->commit_root = NULL; - root->state = 0; RB_CLEAR_NODE(&root->rb_node); - btrfs_set_root_last_trans(root, 0); - root->free_objectid = 0; - root->nr_delalloc_inodes = 0; - root->nr_ordered_extents = 0; xa_init(&root->inodes); xa_init(&root->delayed_nodes); @@ -691,10 +689,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - btrfs_set_root_last_log_commit(root, 0); - root->anon_dev = 0; if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); @@ -1773,8 +1768,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) destroy_workqueue(fs_info->endio_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); - if (fs_info->compressed_write_workers) - destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -1986,8 +1979,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); - fs_info->compressed_write_workers = - alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2003,7 +1994,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -3255,12 +3245,6 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } - if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_err(fs_info, - "RAID56 is not supported for page size %lu with sectorsize %u", - PAGE_SIZE, fs_info->sectorsize); - return -EINVAL; - } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -4290,7 +4274,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* * When finishing a compressed write bio we schedule a work queue item - * to finish an ordered extent - btrfs_finish_compressed_write_work() + * to finish an ordered extent - end_bbio_compressed_write() * calls btrfs_finish_ordered_extent() which in turn calls * btrfs_queue_ordered_fn(), and that queues the ordered extent * completion either in the endio_write_workers work queue or in the @@ -4298,7 +4282,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * below, so before we flush them we must flush this queue for the * workers of compressed writes.
*/ - flush_workqueue(fs_info->compressed_write_workers); + flush_workqueue(fs_info->endio_workers); /* * After we parked the cleaner kthread, ordered extents may have diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 57920f2c6fe4..5320da83d0cf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -9,7 +9,8 @@ #include <linux/sizes.h> #include <linux/compiler_types.h> #include "ctree.h" -#include "fs.h" +#include "bio.h" +#include "ordered-data.h" struct block_device; struct super_block; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc4ca98c3780..e4cae34620d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" +#include "delayed-inode.h" #undef SCRAMBLE_DELAYED_REFS @@ -164,8 +165,8 @@ search_again: if (unlikely(num_refs == 0)) { ret = -EUCLEAN; btrfs_err(fs_info, - "unexpected zero reference count for extent item (%llu %u %llu)", - key.objectid, key.type, key.offset); + "unexpected zero reference count for extent item " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, ret); return ret; } @@ -597,8 +598,8 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { btrfs_err(trans->fs_info, - "unrecognized backref key (%llu %u %llu)", - key.objectid, key.type, key.offset); + "unrecognized backref key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, -EUCLEAN); return -EUCLEAN; } @@ -788,7 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); - path->search_for_extension = 1; + path->search_for_extension = true; } else extra_size = -1; @@ -954,7 +955,7 @@ again: if (!path->keep_locks) { btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; goto again; } @@ -975,11 +976,11 @@ out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: if (path->keep_locks) { - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); } if (insert) - path->search_for_extension = 0; + path->search_for_extension = false; return ret; } @@ -1764,7 +1765,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans)) { if (insert_reserved) { - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); free_head_ref_squota_rsv(trans->fs_info, href); } return 0; @@ -1783,7 +1784,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", @@ -1890,7 +1891,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes); if (head->is_data) { struct btrfs_root *csum_root; @@ -2591,34 +2592,34 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info) } static int pin_down_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group *cache, - u64 bytenr, u64 num_bytes, int reserved) + struct 
btrfs_block_group *bg, + u64 bytenr, u64 num_bytes, bool reserved) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + struct btrfs_space_info *space_info = bg->space_info; + const u64 reserved_bytes = (reserved ? num_bytes : 0); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + bg->pinned += num_bytes; + bg->reserved -= reserved_bytes; + spin_unlock(&bg->lock); + space_info->bytes_reserved -= reserved_bytes; + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); + spin_unlock(&space_info->lock); btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } -int btrfs_pin_extent(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int reserved) +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(trans, cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, true); btrfs_put_block_group(cache); return 0; @@ -2642,7 +2643,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, eb->start, eb->len, 0); + pin_down_extent(trans, cache, eb->start, eb->len, false); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, eb->start, eb->len); @@ -2747,13 +2748,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; - bool readonly; - int ret = 0; while (start <= end) { u64 len; + bool readonly; - readonly = false; if (!cache || start >= cache->start + cache->length) { if (cache) @@ -2762,8 +2761,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache = btrfs_lookup_block_group(fs_info, start); if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. 
*/ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } cluster = fetch_cluster_info(fs_info, @@ -2797,27 +2795,28 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); + readonly = cache->ro; cache->pinned -= len; + spin_unlock(&cache->lock); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; - if (cache->ro) { + + if (readonly) { space_info->bytes_readonly += len; - readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ btrfs_space_info_update_bytes_zone_unusable(space_info, len); - readonly = true; - } - spin_unlock(&cache->lock); - if (!readonly && return_free_space) + } else if (return_free_space) { btrfs_return_free_space(space_info, len); + } spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); -out: - return ret; + + return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -3086,7 +3085,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -3121,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (is_data) @@ -3166,15 +3165,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3223,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_slot = path->slots[0]; } @@ -3232,10 +3230,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); - goto out; + return ret; } else { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -3246,7 +3244,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); @@ -3260,8 +3258,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.objectid, key.type, key.offset, path->slots[0], owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3272,8 +3269,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } refs 
-= refs_to_drop; @@ -3289,8 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3300,7 +3295,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } } else { @@ -3320,17 +3315,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), refs_to_drop, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } if (iref) { if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, -"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", - key.objectid, key.type, - key.offset, path->slots[0]); - ret = -EUCLEAN; - goto out; +"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref", + BTRFS_KEY_FMT_VALUE(&key), + path->slots[0]); + return -EUCLEAN; } } else { /* @@ -3343,8 +3336,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3365,7 +3357,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3373,8 +3365,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); -out: - btrfs_free_path(path); return ret; } @@ -3483,7 +3473,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, bg = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, bg, buf->start, buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -3507,7 +3497,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) || btrfs_is_zoned(fs_info)) { - pin_down_extent(trans, bg, buf->start, buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -3537,7 +3527,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. 
*/ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { - btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3588,15 +3578,14 @@ enum btrfs_loop_type { }; static inline void -btrfs_lock_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, - int delalloc) + bool delalloc) { btrfs_get_block_group(cache); if (delalloc) @@ -3606,7 +3595,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, - int delalloc) + bool delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3643,8 +3632,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( } static inline void -btrfs_release_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) up_read(&cache->data_rwsem); @@ -4034,7 +4022,7 @@ static int do_allocation(struct btrfs_block_group *block_group, static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, - int delalloc) + bool delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: @@ -4690,7 +4678,7 @@ loop: int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc) + struct btrfs_key *ins, bool is_data, bool delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; @@ -4735,8 +4723,7 @@ again: "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) - btrfs_dump_space_info(fs_info, sinfo, - num_bytes, 1); + btrfs_dump_space_info(sinfo, num_bytes, 1); } } @@ -4776,7 +4763,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, return -ENOSPC; } - ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, true); btrfs_put_block_group(cache); return ret; } @@ -5022,7 +5009,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1, root_objectid); if (ret) - btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset); ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; @@ -5168,7 +5155,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, - empty_size, hint, &ins, 0, 0); + empty_size, hint, &ins, false, false); if (ret) goto out_unuse; @@ -6061,7 +6048,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); struct btrfs_key key; const u64 
rootid = btrfs_root_id(root); int ret = 0; @@ -6079,9 +6066,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { - btrfs_free_path(path); ret = -ENOMEM; - goto out; + goto out_free; } /* @@ -6291,7 +6277,6 @@ out_end_trans: btrfs_end_transaction_throttle(trans); out_free: - kfree(wc); btrfs_free_path(path); out: if (!ret && root_dropped) { @@ -6334,7 +6319,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; BTRFS_PATH_AUTO_FREE(path); - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); int level; int parent_level; int ret = 0; @@ -6373,18 +6358,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) - break; + return ret; ret = walk_up_tree(trans, root, path, wc, parent_level); if (ret) { - if (ret > 0) - ret = 0; + if (ret < 0) + return ret; break; } } - kfree(wc); - return ret; + return 0; } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index e970ac42a871..71bb8109c969 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -30,7 +30,6 @@ struct find_free_extent_ctl { u64 min_alloc_size; u64 empty_size; u64 flags; - int delalloc; /* Where to start the search inside the bg */ u64 search_start; @@ -40,6 +39,7 @@ struct find_free_extent_ctl { struct btrfs_free_cluster *last_ptr; bool use_cluster; + bool delalloc; bool have_caching_bg; bool orig_have_caching_bg; @@ -49,6 +49,16 @@ struct find_free_extent_ctl { /* Allocation is called for data relocation */ bool for_data_reloc; + /* + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. + */ + bool retry_uncached; + + /* Whether or not the allocator is currently following a hint. */ + bool hinted; + /* RAID index, converted from flags */ int index; @@ -57,13 +67,6 @@ struct find_free_extent_ctl { */ int loop; - /* - * Set to true if we're retrying the allocation on this block group - * after waiting for caching progress, this is so that we retry only - * once before moving on to another block group. 
- */ - bool retry_uncached; - /* If current block group is cached */ int cached; @@ -82,9 +85,6 @@ struct find_free_extent_ctl { /* Allocation policy */ enum btrfs_extent_allocation_policy policy; - /* Whether or not the allocator is currently following a hint */ - bool hinted; - /* Size class of block groups to prefer in early loops */ enum btrfs_block_group_size_class size_class; }; @@ -110,8 +110,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owner_root); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); @@ -138,7 +137,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); + struct btrfs_key *ins, bool is_data, bool delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..629fd5af4286 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -333,7 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_end(folio), end + 1) - range_start; + range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -374,8 +374,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - /* The sanity tests may not set a valid fs_info. */ - u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; + u64 max_bytes = fs_info->max_extent_size; u64 delalloc_start; u64 delalloc_end; bool found; @@ -387,7 +386,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_end(locked_folio) || + ASSERT(!(orig_start >= folio_next_pos(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -493,7 +492,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_end(folio)); + start + len <= folio_next_pos(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); @@ -518,7 +517,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le */ static void end_bbio_data_write(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; @@ -574,7 +573,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; @@ -739,12 +738,10 @@ static void alloc_new_bio(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, - bio_ctrl->end_io_func, NULL); + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode, + file_offset, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; - bbio->inode = inode; - bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->next_file_offset = file_offset; @@ -973,7 +970,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, { const u64 ra_pos = readahead_pos(ractl); const u64 ra_end = ra_pos + readahead_length(ractl); - const u64 em_end = em->start + em->ram_bytes; + const u64 em_end = em->start + em->len; /* No expansion for holes and inline extents. */ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) @@ -1201,7 +1198,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * finished our folio read and unlocked the folio. */ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; ret = true; @@ -1223,7 +1220,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * So we return true and update @next_ret to the OE/folio boundary. 
*/ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; /* @@ -1691,14 +1688,17 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, unsigned long range_bitmap = 0; bool submitted_io = false; int found_error = 0; + const u64 end = start + len; const u64 folio_start = folio_pos(folio); + const u64 folio_end = folio_start + folio_size(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; - ASSERT(start >= folio_start && - start + len <= folio_start + folio_size(folio)); + ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start); + ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu", + start, len, folio_start, folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); if (ret == -EAGAIN) { @@ -1714,7 +1714,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return ret; } - for (cur = start; cur < start + len; cur += fs_info->sectorsize) + for (cur = start; cur < end; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, blocks_per_folio); @@ -1725,8 +1725,24 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + folio_end - cur); + /* + * We have just run delalloc before getting here, so + * there must be an ordered extent. + */ + ASSERT(ordered != NULL); + spin_lock(&inode->ordered_tree_lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock(&inode->ordered_tree_lock); + btrfs_put_ordered_extent(ordered); + btrfs_mark_ordered_io_finished(inode, folio, cur, - start + len - cur, true); + end - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1735,8 +1751,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. 
*/ - btrfs_folio_clear_dirty(fs_info, folio, cur, - start + len - cur); + btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur); break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); @@ -1856,7 +1871,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; - if (ret < 0) + if (unlikely(ret < 0)) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), @@ -2206,16 +2221,15 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, end_bbio_meta_write, eb); + BTRFS_I(fs_info->btree_inode), eb->start, + end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; folio_lock(folio); @@ -2228,6 +2242,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_account_cgroup_owner(wbc, folio, range_len); folio_unlock(folio); } + /* + * If the fs is already in error status, do not submit any writeback + * but immediately finish it. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) { + btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); + return; + } btrfs_submit_bbio(bbio, 0); } @@ -2460,10 +2482,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); @@ -2619,7 +2638,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f continue; } - cur_end = min_t(u64, folio_end(folio) - 1, end); + cur_end = min_t(u64, folio_next_pos(folio) - 1, end); cur_len = cur_end + 1 - cur; ASSERT(folio_test_locked(folio)); @@ -3818,6 +3837,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_bio *bbio; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -3851,16 +3871,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, - REQ_OP_READ | REQ_META, eb->fs_info, - end_bbio_meta_read, eb); + REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode), + eb->start, end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 
range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; bio_add_folio_nofail(&bbio->bio, folio, range_len, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5fcbfe44218c..02ebb2f238af 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -12,7 +12,6 @@ #include <linux/rwsem.h> #include <linux/list.h> #include <linux/slab.h> -#include "compression.h" #include "messages.h" #include "ulist.h" #include "misc.h" diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d4b81ee4d97b..6f685f3c9327 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -8,8 +8,7 @@ #include <linux/rbtree.h> #include <linux/list.h> #include <linux/refcount.h> -#include "misc.h" -#include "compression.h" +#include "fs.h" struct btrfs_inode; struct btrfs_fs_info; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a42e6d54e7cd..14e5257f0f04 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "volumes.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -372,7 +373,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + bbio->csum = kvcalloc(nblocks, csum_size, GFP_NOFS); if (!bbio->csum) return -ENOMEM; } else { @@ -393,8 +394,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * between reading the free space cache and updating the csum tree. */ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } /* @@ -422,8 +423,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * from across transactions. 
*/ if (bbio->csum_search_commit_root) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; down_read(&fs_info->commit_root_sem); } @@ -438,7 +439,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (count < 0) { ret = count; if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); bbio->csum = NULL; break; } @@ -764,21 +765,55 @@ fail: return ret; } +static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + struct btrfs_ordered_sum *sums = bbio->sums; + struct bvec_iter iter = *src; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + u32 offset = 0; + int index = 0; + + shash->tfm = fs_info->csum_shash; + + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, blocksize)) { + btrfs_calculate_block_csum_pages(fs_info, paddrs, sums->sums + index); + index += fs_info->csum_size; + } + } +} + +static void csum_one_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, csum_work); + + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(bbio->async_csum == true); + csum_one_bio(bbio, &bbio->csum_saved_iter); + complete(&bbio->csum_done); +} + /* * Calculate checksums of the data contained inside a bio. */ -int btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - struct bvec_iter iter = bio->bi_iter; - phys_addr_t paddr; - const u32 blocksize = fs_info->sectorsize; - int index; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -789,21 +824,21 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) if (!sums) return -ENOMEM; + sums->logical = bbio->orig_logical; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - - sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - index = 0; - - shash->tfm = fs_info->csum_shash; - - btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { - btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); - index += fs_info->csum_size; - } - bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); + + if (!async) { + csum_one_bio(bbio, &bbio->bio.bi_iter); + return 0; + } + init_completion(&bbio->csum_done); + bbio->async_csum = true; + bbio->csum_saved_iter = bbio->bio.bi_iter; + INIT_WORK(&bbio->csum_work, csum_one_bio_work); + schedule_work(&bbio->csum_work); return 0; } @@ -1142,10 +1177,10 @@ again: } btrfs_release_path(path); - path->search_for_extension = 1; + path->search_for_extension = true; ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); - path->search_for_extension = 0; + path->search_for_extension = false; if (ret < 0) goto out; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 63216c43676d..5645c5e3abdb 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -7,7 +7,7 @@ #include 
<linux/list.h> #include <uapi/linux/btrfs_tree.h> #include "ctree.h" -#include "accessors.h" +#include "ordered-data.h" struct extent_map; struct btrfs_file_extent_item; @@ -64,7 +64,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async); int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7efd1f8a1912..7a501e73d880 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -75,7 +75,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos u64 num_bytes; u64 start_pos; u64 end_of_last_block; - u64 end_pos = pos + write_bytes; + const u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; @@ -86,10 +86,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos extra_bits |= EXTENT_NORESERVE; start_pos = round_down(pos, fs_info->sectorsize); - num_bytes = round_up(write_bytes + pos - start_pos, - fs_info->sectorsize); + num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos); end_of_last_block = start_pos + num_bytes - 1; @@ -799,7 +798,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; @@ -1254,8 +1253,8 @@ again: * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. 
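The reworked btrfs_dirty_folio() arithmetic above expands the dirtied byte range to sector boundaries before accounting. A standalone sketch of that rounding with invented example values and an assumed 4K block size:

#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x - (x % a); }
static uint64_t round_up_u64(uint64_t x, uint64_t a) { return round_down_u64(x + a - 1, a); }

int main(void)
{
	const uint64_t sectorsize = 4096;	/* assumed block size */
	const uint64_t pos = 6000, write_bytes = 100;
	const uint64_t end_pos = pos + write_bytes;
	const uint64_t start_pos = round_down_u64(pos, sectorsize);
	const uint64_t num_bytes = round_up_u64(end_pos - start_pos, sectorsize);

	/* A 100 byte write at offset 6000 dirties one full 4K block:
	 * start_pos=4096 num_bytes=4096. */
	printf("start_pos=%llu num_bytes=%llu\n",
	       (unsigned long long)start_pos, (unsigned long long)num_bytes);
	return 0;
}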
*/ - if (reserved_start + reserved_len > folio_end(folio)) { - const u64 last_block = folio_end(folio); + if (reserved_start + reserved_len > folio_next_pos(folio)) { + const u64 last_block = folio_next_pos(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, @@ -1441,6 +1440,8 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; + if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) + return -EIO; /* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation @@ -2043,6 +2044,8 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) + return -EIO; if (!mapping->a_ops->read_folio) return -ENOEXEC; @@ -2854,12 +2857,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + u64 range_start; + u64 range_end; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; + range_start = round_down(i_size_read(inode), root->fs_info->sectorsize); + range_end = round_up(end, root->fs_info->sectorsize); + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start, + range_end - range_start); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3102,6 +3115,9 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + /* Do not allow fallocate in ZONED mode */ if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; @@ -3793,6 +3809,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); @@ -3805,6 +3824,9 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) + return -EIO; + if (iocb->ki_flags & IOCB_DIRECT) { ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || @@ -3815,10 +3837,20 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return filemap_read(iocb, to, ret); } +static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, - .splice_read = filemap_splice_read, + .splice_read = btrfs_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap_prepare = btrfs_file_mmap_prepare, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ab873bd67192..f0f72850fab2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -968,8 +968,8 @@ int load_free_space_cache(struct 
btrfs_block_group *block_group) path = btrfs_alloc_path(); if (!path) return 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; /* * We must pass a path with search_commit_root set to btrfs_iget in @@ -3656,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; - int update = 0; + bool bg_ro; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; @@ -3664,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (!block_group->ro) { + bg_ro = block_group->ro; + if (!bg_ro) { block_group->reserved += reserved_bytes; + spin_unlock(&block_group->lock); space_info->bytes_reserved += reserved_bytes; - update = 1; + } else { + spin_unlock(&block_group->lock); } - spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); @@ -3690,14 +3692,16 @@ static int do_trimming(struct btrfs_block_group *block_group, list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); - if (update) { + if (!bg_ro) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; + bg_ro = block_group->ro; block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); + + space_info->bytes_reserved -= reserved_bytes; + if (bg_ro) + space_info->bytes_readonly += reserved_bytes; spin_unlock(&space_info->lock); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dad0b492a663..1ad2ad384b9e 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -165,11 +165,9 @@ static unsigned long *alloc_bitmap(u32 bitmap_size) /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse - * into the filesystem as the free space bitmap can be modified in the - * critical section of a transaction commit. - * - * TODO: push the memalloc_nofs_{save,restore}() to the caller where we - * know that recursion is unsafe. + * into the filesystem here. All callers hold a transaction handle + * open, so if a GFP_KERNEL allocation recurses into the filesystem + * and triggers a transaction commit, we would deadlock. 
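The comment above describes the usual workaround: kvmalloc() does not support GFP_NOFS directly, so the NOFS constraint is applied with the scoped memalloc_nofs_save()/memalloc_nofs_restore() API while still passing GFP_KERNEL. A minimal sketch of the pattern, kernel context assumed and the helper name invented:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Invented helper: allocate a zeroed bitmap without risking fs recursion. */
static void *alloc_bitmap_nofs(size_t size)
{
	unsigned int nofs_flag;
	void *bitmap;

	/*
	 * Every allocation between save and restore implicitly behaves as
	 * GFP_NOFS, so a GFP_KERNEL kvzalloc() cannot recurse into the fs.
	 */
	nofs_flag = memalloc_nofs_save();
	bitmap = kvzalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	return bitmap;
}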
*/ nofs_flag = memalloc_nofs_save(); ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); @@ -218,11 +216,8 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return -ENOMEM; start = block_group->start; end = block_group->start + block_group->length; @@ -361,11 +356,8 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return -ENOMEM; start = block_group->start; end = block_group->start + block_group->length; @@ -841,7 +833,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -851,7 +843,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -859,7 +851,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -869,8 +861,7 @@ btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1023,7 +1014,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -1033,7 +1024,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -1041,7 +1032,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -1051,8 +1042,7 @@ btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1106,14 +1096,15 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * If ret is 1 (no key found), it means this is an empty block group, * without any extents allocated from it and there's no block group * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree - * because we are using the block group tree feature, so block group - * items are stored in the block group tree. It also means there are no - * extents allocated for block groups with a start offset beyond this - * block group's end offset (this is the last, highest, block group).
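The BTRFS_PATH_AUTO_FREE() conversions in these hunks lean on the compiler's cleanup attribute, so early returns no longer need goto-based freeing. A runnable userspace analogue of the mechanism; every name here is invented for illustration:

#include <stdio.h>
#include <stdlib.h>

static void free_ptr(void *p)
{
	void **pp = p;

	free(*pp);	/* free(NULL) is a no-op, matching kfree() */
}

/* Declares "TYPE *name = NULL" that is freed when it goes out of scope. */
#define AUTO_FREE(name) *name __attribute__((cleanup(free_ptr))) = NULL

int main(void)
{
	char AUTO_FREE(buf);

	buf = malloc(64);
	if (!buf)
		return 1;	/* no explicit free() on any return path */
	snprintf(buf, 64, "hello");
	puts(buf);
	return 0;		/* free_ptr(&buf) runs here automatically */
}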
+ * because we are using the block group tree feature (so block group + * items are stored in the block group tree) or this is a new block + * group created in the current transaction and its block group item + * was not yet inserted in the extent tree (that happens in + * btrfs_create_pending_block_groups() -> insert_block_group_item()). + * It also means there are no extents allocated for block groups with a + * start offset beyond this block group's end offset (this is the last, + * highest, block group). */ - if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) - ASSERT(ret == 0); - start = block_group->start; end = block_group->start + block_group->length; while (ret == 0) { @@ -1465,7 +1456,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key, found_key; struct extent_buffer *leaf; u64 start, end; @@ -1484,7 +1475,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } start = block_group->start; @@ -1498,7 +1489,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -1529,14 +1520,13 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); } ret = 0; -out: - btrfs_free_path(path); + return ret; } @@ -1701,8 +1691,8 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) * Just like caching_thread() doesn't want to deadlock on the extent * tree, we don't want to deadlock on the free space tree. */ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; info = btrfs_search_free_space_info(NULL, block_group, path, 0); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 814bbc9417d2..0f7e1ef27891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -29,6 +29,7 @@ #include "extent-io-tree.h" #include "async-thread.h" #include "block-rsv.h" +#include "messages.h" struct inode; struct super_block; @@ -73,6 +74,13 @@ struct btrfs_space_info; #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define BTRFS_CSUM_FMT "0x%*phN" +#define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes + +#define BTRFS_KEY_FMT "(%llu %u %llu)" +#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset + /* * Number of metadata items necessary for an unlink operation: * @@ -124,6 +132,12 @@ enum { /* No more delayed iput can be queued. */ BTRFS_FS_STATE_NO_DELAYED_IPUT, + /* + * Emergency shutdown, a step further than an aborted transaction: + * all operations are rejected.
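The new BTRFS_KEY_FMT/BTRFS_KEY_FMT_VALUE pair defined above keeps key formatting consistent across messages. A short sketch of the intended usage; the surrounding function is hypothetical:

#include "fs.h"
#include "messages.h"

/* Hypothetical caller, only to show the argument pairing. */
static void report_bad_key(struct btrfs_fs_info *fs_info,
			   const struct btrfs_key *key)
{
	/* Expands to "unexpected key (%llu %u %llu)" plus the three key fields. */
	btrfs_warn(fs_info, "unexpected key " BTRFS_KEY_FMT,
		   BTRFS_KEY_FMT_VALUE(key));
}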
+ */ + BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, + BTRFS_FS_STATE_COUNT }; @@ -644,7 +658,6 @@ struct btrfs_fs_info { struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -1120,6 +1133,27 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); +} + +static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) +{ + /* + * Do not use btrfs_handle_fs_error() here, as that would also mark the + * fs read-only. Call sites such as the shutdown ioctl can shut the fs + * down while it is frozen, and the thaw path handles read-only and + * read-write filesystems differently. + * + * So only record the error here, without flipping the fs read-only. + */ + WRITE_ONCE(fs_info->fs_error, -EIO); + if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) + btrfs_crit(fs_info, "emergency shutdown"); +} + /* * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 1bd73b80f9fa..b73e1dd97208 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -312,7 +312,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->skip_release_on_error = 1; + path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -444,7 +444,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_truncate_control *control) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -730,6 +730,5 @@ out: if (!ret && control->last_size > new_size) control->last_size = new_size; - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..c4bee47829ed 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9,6 +9,7 @@ #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> @@ -71,6 +72,7 @@ #include "backref.h" #include "raid-stripe-tree.h" #include "fiemap.h" +#include "delayed-inode.h" #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) #define COW_FILE_RANGE_NO_INLINE (1UL << 1) @@ -130,7 +132,7 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; @@ -177,8 +179,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return ret; } ret = paths_from_inode(inum, ipath); - if (ret < 0) + if (ret < 0) { + btrfs_put_root(local_root); goto err; + } /* * We deliberately ignore the bit ipath might have been too small to @@ -193,7 +197,6 @@ static int
data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, } btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -201,7 +204,6 @@ err: "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, root, inum, offset, ret); - free_ipath(ipath); return ret; } @@ -233,21 +235,21 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); @@ -318,19 +320,19 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, /* Output without objectid, which is more meaningful */ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } @@ -409,7 +411,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, continue; } - index = folio_end(folio) >> PAGE_SHIFT; + index = folio_next_index(folio); /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle @@ -591,6 +593,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (size < i_size_read(&inode->vfs_inode)) return false; + /* Encrypted file cannot be inlined. 
*/ + if (IS_ENCRYPTED(&inode->vfs_inode)) + return false; + return true; } @@ -862,7 +868,7 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct folio **folios; + struct folio **folios = NULL; unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; @@ -871,6 +877,9 @@ static void compress_file_range(struct btrfs_work *work) int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; + if (unlikely(btrfs_is_shutdown(fs_info))) + goto cleanup_and_bail_uncompressed; + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* @@ -1132,7 +1141,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, - 0, *alloc_hint, &ins, 1, 1); + 0, *alloc_hint, &ins, true, true); if (ret) { /* * We can't reserve contiguous space for the compressed size. @@ -1286,6 +1295,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, unsigned long page_ops; int ret = 0; + if (unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto out_unlock; + } + if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; @@ -1350,7 +1364,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, - &ins, 1, 1); + &ins, true, true); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned @@ -2004,7 +2018,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + struct btrfs_path *path = NULL; u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() @@ -2034,6 +2048,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + if (unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto error; + } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -2336,7 +2354,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_next_pos(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); @@ -2743,7 +2762,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_end(folio) - 1; + u64 page_end = folio_next_pos(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -3330,36 +3349,67 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest) +/* + * Calculate the checksum of an fs block at physical memory address @paddr, + * and save the result to @dest. + * + * The folio containing @paddr must be large enough to contain a full fs block. 
+ */ +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest) { struct folio *folio = page_folio(phys_to_page(paddr)); const u32 blocksize = fs_info->sectorsize; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; - shash->tfm = fs_info->csum_shash; /* The full block must be inside the folio. */ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - if (folio_test_partial_kmap(folio)) { - size_t cur = paddr; + for (int i = 0; i < nr_steps; i++) { + u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT; - crypto_shash_init(shash); - while (cur < paddr + blocksize) { - void *kaddr; - size_t len = min(paddr + blocksize - cur, - PAGE_SIZE - offset_in_page(cur)); + /* + * For bs <= ps cases, we will only run the loop once, so the offset + * inside the page will only be added to paddrs[0]. + * + * For bs > ps cases, the block must be page aligned, thus offset + * inside the page will always be 0. + */ + paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr); + } + btrfs_calculate_block_csum_pages(fs_info, paddrs, dest); +} - kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); - crypto_shash_update(shash, kaddr, len); - kunmap_local(kaddr); - cur += len; - } - crypto_shash_final(shash, dest); - } else { - crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); +/* + * Calculate the checksum of an fs block backed by multiple noncontiguous pages + * at @paddrs[] and save the result to @dest. + * + * Each address in @paddrs must stay within a single page and cover one full + * step (min(blocksize, PAGE_SIZE) bytes) of the block. + */ +void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, + const phys_addr_t paddrs[], u8 *dest) +{ + const u32 blocksize = fs_info->sectorsize; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + for (int i = 0; i < nr_steps; i++) { + const phys_addr_t paddr = paddrs[i]; + void *kaddr; + + ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE); + kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); + crypto_shash_update(shash, kaddr, step); + kunmap_local(kaddr); + } + crypto_shash_final(shash, dest); } + /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. @@ -3369,19 +3419,20 @@ void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected) { - btrfs_calculate_block_csum(fs_info, paddr, csum); + btrfs_calculate_block_csum_folio(fs_info, paddr, csum); if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } /* - * Verify the checksum of a single data sector. + * Verify the checksum of a single data sector, which can be scattered + * across noncontiguous pages. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @bv: bio_vec to check + * @paddrs: physical addresses which back the fs block * * Check if the checksum on a data block is valid. When a checksum mismatch is * detected, report the error and fill the corrupted range with zero.
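The step/nr_steps arithmetic shared by the new csum helpers splits a block into min(blocksize, PAGE_SIZE) sized pieces, one per backing page once the block size exceeds the page size. A standalone userspace illustration of how the split behaves for the three block/page size relations:

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u	/* stand-in for PAGE_SIZE */

static void show(unsigned int blocksize)
{
	unsigned int step = blocksize < EXAMPLE_PAGE_SIZE ? blocksize : EXAMPLE_PAGE_SIZE;
	unsigned int nr_steps = blocksize / step;

	printf("bs=%5u: step=%4u nr_steps=%u (%s)\n", blocksize, step, nr_steps,
	       nr_steps == 1 ? "block within one page" : "one page per step");
}

int main(void)
{
	show(2048);	/* bs < ps: one sub-page step */
	show(4096);	/* bs == ps: one full-page step */
	show(16384);	/* bs > ps: four page-sized steps, paddrs[4] */
	return 0;
}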
@@ -3389,12 +3440,13 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr) + u32 bio_offset, const phys_addr_t paddrs[]) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 blocksize = fs_info->sectorsize; - struct folio *folio; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + blocksize - 1; u8 *csum_expected; @@ -3414,7 +3466,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) goto zeroit; return true; @@ -3423,9 +3476,8 @@ zeroit: bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - folio = page_folio(phys_to_page(paddr)); - ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); + for (int i = 0; i < nr_steps; i++) + memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step); return false; } @@ -3884,7 +3936,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) ASSERT(ret != -ENOMEM); return ret; } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING))); } return 0; @@ -4312,8 +4364,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, dir, index); + btrfs_del_inode_ref_in_log(trans, name, inode, dir); + btrfs_del_dir_entries_in_log(trans, name, dir, index); } /* @@ -4412,7 +4464,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; @@ -4505,7 +4557,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return ret; } @@ -4855,7 +4906,7 @@ again: */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = folio_end(folio); + zero_end = folio_next_pos(folio); folio_zero_range(folio, zero_start - folio_pos(folio), zero_end - zero_start); @@ -5038,7 +5089,7 @@ again: * not reach disk, it still affects our page caches. 
*/ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = min_t(u64, folio_end(folio) - 1, end); + zero_end = min_t(u64, folio_next_pos(folio) - 1, end); } else { zero_start = max_t(u64, block_start, start); zero_end = min_t(u64, block_end, end); @@ -5361,7 +5412,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; - ASSERT(inode->i_state & I_FREEING); + ASSERT(inode_state_read_once(inode) & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); @@ -5630,9 +5681,9 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, -"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")", __func__, fname.disk_name.name, btrfs_ino(dir), - location->objectid, location->type, location->offset); + BTRFS_KEY_FMT_VALUE(location)); } if (!ret) *type = btrfs_dir_ftype(path->nodes[0], di); @@ -5799,7 +5850,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); @@ -5823,7 +5874,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5837,6 +5888,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; unlock_new_inode(&inode->vfs_inode); return inode; } @@ -6289,8 +6342,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) } /* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. + * We need our own ->update_time so that we can return error on ENOSPC for + * updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { @@ -6788,8 +6841,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, } ret = btrfs_create_new_inode(trans, &new_inode_args); - if (!ret) + if (!ret) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; d_instantiate_new(dentry, inode); + } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6873,7 +6929,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inode_inc_iversion(inode); inode_set_ctime_current(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); @@ -7066,8 +7121,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * point the commit_root has everything we need. 
*/ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); @@ -7480,7 +7535,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; - int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this @@ -7577,11 +7632,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -7643,19 +7698,22 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, + .new_size = inode->vfs_inode.i_size, }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; - u64 mask = fs_info->sectorsize - 1; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize); + const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); + + /* Our inode is locked and the i_size can't be changed concurrently. */ + btrfs_assert_inode_locked(inode); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, - inode->vfs_inode.i_size & (~mask), - (u64)-1); + ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1); if (ret) return ret; } @@ -7719,19 +7777,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->vfs_inode.i_size; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); - control.new_size = new_size; btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_map_range(inode, - ALIGN(new_size, fs_info->sectorsize), - (u64)-1, false); + btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); @@ -8709,15 +8762,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8743,10 +8794,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + if (nr_to_write == NULL) { + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8754,9 +8805,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8782,29 +8835,17 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long *nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8816,13 +8857,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. 
- */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8831,9 +8865,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } @@ -9063,7 +9098,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, - min_size, 0, *alloc_hint, &ins, 1, 0); + min_size, 0, *alloc_hint, &ins, true, false); if (ret) break; @@ -9169,6 +9204,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } +/* + * NOTE: in case you are adding MAY_EXEC check for directories: + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to + * elide calls here. + */ static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { @@ -9394,7 +9434,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, struct page **pages, void *uring_ctx) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private *priv, sync_priv; struct completion sync_reads; unsigned long i = 0; @@ -9419,10 +9458,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, priv->status = 0; priv->uring_ctx = uring_ctx; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; do { size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); @@ -9431,10 +9469,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; continue; } @@ -9825,8 +9862,6 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } for (;;) { - struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) goto out_folios; @@ -9876,7 +9911,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, - disk_num_bytes, 0, 0, &ins, 1, 1); + disk_num_bytes, 0, 0, &ins, true, true); if (ret) goto out_delalloc_release; extent_reserved = true; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 185bef0df1c2..acb484546b1d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -503,7 +503,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -527,20 +527,18 @@ static noinline int 
create_subvol(struct mnt_idmap *idmap, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto out_root_item; + return ret; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) { - ret = -ENOSPC; - goto out_root_item; - } + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; ret = get_anon_bdev(&anon_dev); if (ret < 0) - goto out_root_item; + return ret; new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { @@ -692,8 +690,7 @@ out_inode: out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); -out_root_item: - kfree(root_item); + return ret; } @@ -904,14 +901,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent, struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); int ret; - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - return ret; - - dentry = lookup_one(idmap, qname, parent); - ret = PTR_ERR(dentry); + dentry = start_creating_killable(idmap, parent, qname); if (IS_ERR(dentry)) - goto out_unlock; + return PTR_ERR(dentry); ret = btrfs_may_create(idmap, dir, dentry); if (ret) @@ -940,9 +932,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent, out_up_read: up_read(&fs_info->subvol_sem); out_dput: - dput(dentry); -out_unlock: - btrfs_inode_unlock(BTRFS_I(dir), 0); + end_creating(dentry); return ret; } @@ -1606,7 +1596,7 @@ static noinline int search_ioctl(struct btrfs_root *root, { struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; int num_found = 0; unsigned long sk_offset = 0; @@ -1626,10 +1616,8 @@ static noinline int search_ioctl(struct btrfs_root *root, } else { /* Look up the root from the arguments. 
*/ root = btrfs_get_fs_root(info, sk->tree_id, true); - if (IS_ERR(root)) { - btrfs_free_path(path); + if (IS_ERR(root)) return PTR_ERR(root); - } } key.objectid = sk->min_objectid; @@ -1663,7 +1651,6 @@ static noinline int search_ioctl(struct btrfs_root *root, sk->nr_items = num_found; btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1746,7 +1733,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, int total_len = 0; struct btrfs_inode_ref *iref; struct extent_buffer *l; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (dirid == BTRFS_FIRST_FREE_OBJECTID) { name[0]='\0'; @@ -1807,7 +1794,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ret = 0; out: btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1824,8 +1810,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; - struct btrfs_path *path; - struct btrfs_key key, key2; + BTRFS_PATH_AUTO_FREE(path); + struct btrfs_key key; struct extent_buffer *leaf; char *ptr; int slot; @@ -1845,10 +1831,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } + if (IS_ERR(root)) + return PTR_ERR(root); key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -1880,24 +1864,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len); - /* Check the read+exec permission of this directory */ - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_ITEM_KEY); - if (ret < 0) { - goto out_put; - } else if (ret > 0) { - ret = -ENOENT; - goto out_put; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key2, slot); - if (key2.objectid != dirid) { - ret = -ENOENT; - goto out_put; - } - /* * We don't need the path anymore, so release it and * avoid deadlocks and lockdep warnings in case @@ -1905,18 +1871,17 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(key2.objectid, root); + temp_inode = btrfs_iget(key.offset, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; } + /* Check the read+exec permission of this directory. 
*/ ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); iput(&temp_inode->vfs_inode); - if (ret) { - ret = -EACCES; + if (ret) goto out_put; - } if (key.offset == upper_limit) break; @@ -1942,12 +1907,10 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; slot = path->slots[0]; @@ -1957,10 +1920,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); - if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { - ret = -EINVAL; - goto out; - } + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) + return -EINVAL; /* Copy subvolume's name */ item_off += sizeof(struct btrfs_root_ref); @@ -1970,8 +1931,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, out_put: btrfs_put_root(root); -out: - btrfs_free_path(path); + return ret; } @@ -2417,18 +2377,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto free_subvol_name; } - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - goto free_subvol_name; - dentry = lookup_one(idmap, &QSTR(subvol_name), parent); + dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name)); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); - goto out_unlock_dir; - } - - if (d_really_is_negative(dentry)) { - ret = -ENOENT; - goto out_dput; + goto out_end_removing; } inode = d_inode(dentry); @@ -2449,7 +2401,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) - goto out_dput; + goto out_end_removing; /* * Do not allow deletion if the parent dir is the same @@ -2460,21 +2412,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EINVAL; if (root == dest) - goto out_dput; + goto out_end_removing; ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (ret) - goto out_dput; + goto out_end_removing; } /* check if subvolume may be deleted by a user */ ret = btrfs_may_delete(idmap, dir, dentry, 1); if (ret) - goto out_dput; + goto out_end_removing; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; - goto out_dput; + goto out_end_removing; } btrfs_inode_lock(BTRFS_I(inode), 0); @@ -2483,10 +2435,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (!ret) d_delete_notify(dir, dentry); -out_dput: - dput(dentry); -out_unlock_dir: - btrfs_inode_unlock(BTRFS_I(dir), 0); +out_end_removing: + end_removing(dentry); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -2956,7 +2906,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; - struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info AUTO_KFREE(dest_orig); struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; static const u64 types[] = { @@ -3077,9 +3027,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, (arg + sizeof(struct btrfs_ioctl_space_args)); if 
(copy_to_user(user_dest, dest_orig, alloc_size)) - ret = -EFAULT; + return -EFAULT; - kfree(dest_orig); out: if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) ret = -EFAULT; @@ -3298,7 +3247,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_path *path; if (!capable(CAP_DAC_READ_SEARCH)) @@ -3346,7 +3295,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) out: btrfs_free_path(path); - free_ipath(ipath); kfree(ipa); return ret; @@ -3611,7 +3559,7 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_balance_args *bargs; + struct btrfs_ioctl_balance_args AUTO_KFREE(bargs); int ret = 0; if (!capable(CAP_SYS_ADMIN)) @@ -3633,8 +3581,6 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; - - kfree(bargs); out: mutex_unlock(&fs_info->balance_mutex); return ret; @@ -3740,7 +3686,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; - goto drop_write; + goto out; } } @@ -4228,7 +4174,7 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, u64 safe_set, u64 safe_clear) { const char *type = btrfs_feature_set_name(set); - char *names; + const char AUTO_KFREE(names); u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; @@ -4236,12 +4182,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, unsupported = set_mask & ~supported_flags; if (unsupported) { names = btrfs_printable_features(set, unsupported); - if (names) { + if (names) btrfs_warn(fs_info, "this kernel does not support the %s feature bit%s", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); @@ -4251,12 +4196,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = set_mask & ~safe_set; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't set the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); @@ -4266,12 +4210,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = clear_mask & ~safe_clear; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't clear the %s feature bit%s while mounted", names, strchr(names, ',') ? 
"s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); @@ -4418,10 +4361,6 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4513,7 +4452,6 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4527,11 +4465,6 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4649,8 +4582,9 @@ struct io_btrfs_cmd { struct btrfs_uring_priv *priv; }; -static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); @@ -4695,7 +4629,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, issue_flags); + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) @@ -4813,11 +4747,6 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4945,7 +4874,6 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct file *file = cmd->file; - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; ssize_t ret; @@ -4960,11 +4888,6 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { @@ -5077,6 +5000,9 @@ out_acct: int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) + return -EIO; + switch (cmd->cmd_op) { case BTRFS_IOC_ENCODED_READ: #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -5220,6 +5146,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg) +{ + int ret = 0; + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (u32 __user *)arg)) + return -EFAULT; + + if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST) + return -EINVAL; + + if (btrfs_is_shutdown(fs_info)) + return 0; + + switch (flags) { + case 
BTRFS_SHUTDOWN_FLAGS_LOGFLUSH: + case BTRFS_SHUTDOWN_FLAGS_DEFAULT: + ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + btrfs_force_shutdown(fs_info); + ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + break; + case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH: + btrfs_force_shutdown(fs_info); + break; + } + return ret; +} +#endif + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5375,6 +5338,10 @@ long btrfs_ioctl(struct file *file, unsigned int #endif case BTRFS_IOC_SUBVOL_SYNC_WAIT: return btrfs_ioctl_subvol_sync(fs_info, argp); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_IOC_SHUTDOWN: + return btrfs_ioctl_shutdown(fs_info, arg); +#endif } return -ENOTTY; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index a0cf8effe008..2f853de44473 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -24,6 +24,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', + [BTRFS_FS_STATE_EMERGENCY_SHUTDOWN] = 'E', }; static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4416c165644f..d8c0bd17dcda 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -168,7 +168,8 @@ do { \ #endif #else -#define ASSERT(cond, args...) (void)(cond) +/* Compile check the @cond expression but don't generate any code. */ +#define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 60f9b000d644..12c5a9d6564f 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -14,6 +14,13 @@ #include <linux/bio.h> /* + * Convenience macros to define a pointer with the __free(kfree) and + * __free(kvfree) cleanup attributes and initialized to NULL. + */ +#define AUTO_KFREE(name) *name __free(kfree) = NULL +#define AUTO_KVFREE(name) *name __free(kvfree) = NULL + +/* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ #define ENUM_BIT(name) \ @@ -209,9 +216,4 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } -static inline u64 folio_end(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2829f20d7bb5..5df02c707aee 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -237,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. 
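A note on the new shutdown ioctl above: BTRFS_SHUTDOWN_FLAGS_DEFAULT and BTRFS_SHUTDOWN_FLAGS_LOGFLUSH freeze and thaw the superblock so dirty state is flushed before the filesystem is marked shut down, while BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH flips the state immediately, in the spirit of XFS's GOINGDOWN ioctl. A minimal userspace sketch, assuming the experimental BTRFS_IOC_SHUTDOWN request number and the BTRFS_SHUTDOWN_FLAGS_* values end up exported through linux/btrfs.h:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Shut down the btrfs mounted at @mnt; returns 0, or -1 with errno set. */
static int btrfs_shutdown(const char *mnt, uint32_t flags)
{
	int fd = open(mnt, O_RDONLY | O_DIRECTORY);
	int ret;

	if (fd < 0)
		return -1;
	/* The kernel side reads a single u32 of flags via get_user(). */
	ret = ioctl(fd, BTRFS_IOC_SHUTDOWN, &flags);
	close(fd);
	return ret;
}

Once the filesystem is shut down, btrfs_is_shutdown() short-circuits later entry points (the io_uring command handler above now fails with -EIO, for example), and a repeated shutdown call returns 0 early.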
*/ refcount_inc(&entry->refs); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = tree_insert(&inode->ordered_tree, entry->file_offset, &entry->rb_node); if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -328,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, { struct btrfs_inode *inode = entry->inode; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) @@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_end(folio)); + ASSERT(file_offset + len <= folio_next_pos(folio)); /* * Ordered flag indicates whether we still have @@ -417,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, bool uptodate) { struct btrfs_inode *inode = ordered->inode; - unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); ret = can_finish_ordered_extent(ordered, folio, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); /* * If this is a COW write it means we created new extent maps for the @@ -481,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; u64 cur = file_offset; + const u64 end = file_offset + num_bytes; - trace_btrfs_writepage_end_io_hook(inode, file_offset, - file_offset + num_bytes - 1, - uptodate); + trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); - while (cur < file_offset + num_bytes) { + spin_lock(&inode->ordered_tree_lock); + while (cur < end) { u64 entry_end; - u64 end; - u32 len; + u64 this_end; + u64 len; node = ordered_tree_search(inode, cur); /* No ordered extents at all */ @@ -535,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, * | * cur */ - end = min(entry->file_offset + entry->num_bytes, - file_offset + num_bytes) - 1; - ASSERT(end + 1 - cur < U32_MAX); - len = end + 1 - cur; + this_end = min(entry_end, end); + len = this_end - cur; + ASSERT(len < U32_MAX); if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); } cur += len; } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -573,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; bool finished = false; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + 
spin_lock(&inode->ordered_tree_lock); if (cached && *cached) { entry = *cached; goto have_entry; @@ -613,7 +608,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return finished; } @@ -678,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - spin_lock_irq(&btrfs_inode->ordered_tree_lock); + spin_lock(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); @@ -686,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&btrfs_inode->ordered_tree_lock); + spin_unlock(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -971,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -986,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -999,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) { node = ordered_tree_search(inode, file_offset + len); @@ -1026,7 +1020,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1041,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, btrfs_assert_inode_locked(inode); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; @@ -1055,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -1068,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -1077,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1099,7 +1093,7 @@ struct btrfs_ordered_extent 
*btrfs_lookup_first_ordered_range( struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last @@ -1154,7 +1148,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1286,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* * Take the root's ordered_extent_lock to avoid a race with * btrfs_wait_ordered_extents() when updating the disk_bytenr and - * disk_num_bytes fields of the ordered extent below. And we disable - * IRQs because the inode's ordered_tree_lock is used in IRQ context - * elsewhere. + * disk_num_bytes fields of the ordered extent below. * * There's no concern about a previous caller of * btrfs_wait_ordered_extents() getting the trimmed ordered extent diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 62b993fae54f..f189bf09ce6a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -131,7 +131,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - pr_info("\t\ttree block key (%llu %u %llu) level %d\n", + pr_info("\t\ttree block key " BTRFS_KEY_FMT " level %d\n", btrfs_disk_key_objectid(&key), key.type, btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); @@ -277,9 +277,8 @@ static void print_dir_item(const struct extent_buffer *eb, int i) struct btrfs_key location; btrfs_dir_item_key_to_cpu(eb, di, &location); - pr_info("\t\tlocation key (%llu %u %llu) type %d\n", - location.objectid, location.type, location.offset, - btrfs_dir_ftype(eb, di)); + pr_info("\t\tlocation key " BTRFS_KEY_FMT " type %d\n", + BTRFS_KEY_FMT_VALUE(&location), btrfs_dir_ftype(eb, di)); pr_info("\t\ttransid %llu data_len %u name_len %u\n", btrfs_dir_transid(eb, di), data_len, name_len); di = (struct btrfs_dir_item *)((char *)di + len); @@ -421,7 +420,7 @@ static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) scnprintf(buf, buf_size, "UNTYPED"); else if (key_to_str[key->type]) - scnprintf(buf, buf_size, key_to_str[key->type]); + scnprintf(buf, buf_size, "%s", key_to_str[key->type]); else scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); } @@ -598,10 +597,9 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow) print_eb_refs_lock(c); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", - i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i), - btrfs_node_ptr_generation(c, i)); + pr_info("\tkey %d " BTRFS_KEY_FMT " block %llu gen %llu\n", + i, BTRFS_KEY_FMT_VALUE(&key), btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } if (!follow) return; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1175b8192cd7..9e2b53e90dcb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -660,7 +660,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -672,7 +672,6 @@ static int 
add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_free_path(path); return ret; } @@ -681,7 +680,7 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -694,24 +693,19 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; - ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, quota_root, path); } static int add_qgroup_item(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root, u64 qgroupid) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_qgroup_info_item *qgroup_info; struct btrfs_qgroup_limit_item *qgroup_limit; struct extent_buffer *leaf; @@ -737,7 +731,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_info = btrfs_item_ptr(leaf, path->slots[0], @@ -754,7 +748,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], @@ -765,17 +759,14 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -787,33 +778,27 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) key.offset = qgroupid; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); return ret; } @@ -821,7 +806,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_limit_item *qgroup_limit; @@ -841,7 +826,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -851,8 +836,7 @@ 
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); -out: - btrfs_free_path(path); + return ret; } @@ -861,7 +845,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_info_item *qgroup_info; @@ -884,7 +868,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -894,8 +878,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); -out: - btrfs_free_path(path); + return ret; } @@ -903,7 +886,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_status_item *ptr; @@ -923,7 +906,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -933,8 +916,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); -out: - btrfs_free_path(path); + return ret; } @@ -944,7 +926,7 @@ out: static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf = NULL; int ret; @@ -961,7 +943,7 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; nr = btrfs_header_nritems(leaf); if (!nr) break; @@ -974,14 +956,12 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_quota_enable(struct btrfs_fs_info *fs_info, @@ -1263,7 +1243,14 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed ownership of it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); + return ret; }
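The BTRFS_PATH_AUTO_FREE() conversions above apply the same scope-based cleanup idea as the AUTO_KFREE()/AUTO_KVFREE() helpers added to misc.h: the macro declares a NULL-initialized pointer carrying a __free() cleanup attribute, so every early return releases the resource and the goto-out/btrfs_free_path() pairs collapse into plain returns. A minimal sketch of the resulting shape (illustrative function, not taken from the patch; BTRFS_PATH_AUTO_FREE is assumed to wrap __free(btrfs_free_path) the same way AUTO_KFREE wraps __free(kfree)):

static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);	/* struct btrfs_path *path ... = NULL */
	char AUTO_KFREE(name);		/* char *name __free(kfree) = NULL */
	int ret;

	path = btrfs_alloc_path();
	name = kzalloc(BTRFS_NAME_LEN, GFP_KERNEL);
	if (!path || !name)
		return -ENOMEM;		/* both cleanups still run */

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0)
		return -ENOENT;		/* key not found */
	return ret;			/* path and name freed on every return */
}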
@@ -1539,8 +1526,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst ASSERT(prealloc); /* Check the level of src and dst first */ - if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) { + kfree(prealloc); return -EINVAL; + } mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { @@ -1693,7 +1682,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); - kfree(prealloc); + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed ownership of it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); return ret; } @@ -1705,8 +1699,7 @@ out: static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct btrfs_key key; - struct btrfs_path *path; - int ret; + BTRFS_PATH_AUTO_FREE(path); /* * Squota would never be inconsistent, but there can still be case @@ -1739,13 +1732,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup if (!path) return -ENOMEM; - ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); - btrfs_free_path(path); /* * The @ret from btrfs_find_root() exactly matches our definition for * the return value, thus can be returned directly. */ - return ret; + return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -2294,7 +2285,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, bool trace_leaf) { struct btrfs_key key; - struct btrfs_path *src_path; + BTRFS_PATH_AUTO_FREE(src_path); struct btrfs_fs_info *fs_info = trans->fs_info; u32 nodesize = fs_info->nodesize; int cur_level = root_level; @@ -2306,10 +2297,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, return -EINVAL; src_path = btrfs_alloc_path(); - if (!src_path) { - ret = -ENOMEM; - goto out; - } + if (!src_path) + return -ENOMEM; if (dst_level) btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); @@ -2335,10 +2324,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, parent_slot = src_path->slots[cur_level + 1]; eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); src_path->nodes[cur_level] = eb; @@ -2359,10 +2346,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, &src_key, src_path->slots[cur_level]); } /* Content mismatch, something went wrong */ - if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { - ret = -ENOENT; - goto out; - } + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) + return -ENOENT; cur_level--; } @@ -2373,21 +2358,20 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; /* Record leaf file extents */ if (dst_level == 0 && trace_leaf) { ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); } -out: - btrfs_free_path(src_path); + return ret; }
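The BTRFS_KEY_FMT/BTRFS_KEY_FMT_VALUE pair that replaces the open-coded "(%llu %u %llu)" format strings throughout this series (it shows up again in the qgroup rescan debug message below) is not defined in any hunk shown here; judging from the converted call sites it presumably reads roughly:

#define BTRFS_KEY_FMT			"(%llu %u %llu)"
#define BTRFS_KEY_FMT_VALUE(key)	(key)->objectid, (key)->type, (key)->offset

so that a call site becomes, for example:

btrfs_debug(fs_info, "current progress key " BTRFS_KEY_FMT ", search_slot ret %d",
	    BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret);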
@@ -2588,7 +2572,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, int level; u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); ASSERT(root_eb != NULL); @@ -2621,12 +2605,12 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) - goto out; + return ret; } if (root_level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); - goto out; + return ret; } path = btrfs_alloc_path(); @@ -2662,10 +2646,8 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -2676,14 +2658,14 @@ walk_down: ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize); if (ret) - goto out; + return ret; } if (level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[level]); if (ret) - goto out; + return ret; /* Nonzero return here means we completed our search */ ret = adjust_slots_upwards(path, root_level); @@ -2697,11 +2679,7 @@ walk_down: level--; } - ret = 0; -out: - btrfs_free_path(path); - - return ret; + return 0; } static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) @@ -3301,7 +3279,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; - struct btrfs_qgroup *prealloc; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; @@ -3542,7 +3520,14 @@ out: } if (free_inherit) kfree(inherit); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed ownership of it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong.
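+ *
+ * The contract being asserted is the usual consume-on-call pattern,
+ * sketched here for illustration (the exact add_qgroup_rb() signature
+ * is not visible in this hunk):
+ *
+ *	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
+ *	prealloc = NULL;
+ *
+ * i.e. the callee either links the preallocation into the rbtree or
+ * frees it itself, so the caller must forget its copy right away.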
+ */ + ASSERT(prealloc == NULL); + return ret; } @@ -3710,10 +3695,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, path, 1, 0); btrfs_debug(fs_info, - "current progress key (%llu %u %llu), search_slot ret %d", - fs_info->qgroup_rescan_progress.objectid, - fs_info->qgroup_rescan_progress.type, - fs_info->qgroup_rescan_progress.offset, ret); + "current progress key " BTRFS_KEY_FMT ", search_slot ret %d", + BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret); if (ret) { /* @@ -3815,8 +3798,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) * Rescan should only search for commit root, and any later difference * should be recorded by qgroup */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); @@ -4794,7 +4777,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; - struct btrfs_qgroup_swapped_block *block; + struct btrfs_qgroup_swapped_block AUTO_KFREE(block); struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool swapped = false; @@ -4851,7 +4834,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); free_out: - kfree(block); free_extent_buffer(reloc_eb); out: if (ret < 0) { diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cc6f6095cc9f..2987cb7c686e 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -19,7 +19,7 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, u64 newlen, u64 frontpad) { struct btrfs_root *stripe_root = trans->fs_info->stripe_root; - struct btrfs_stripe_extent *extent, *newitem; + struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem); struct extent_buffer *leaf; int slot; size_t item_size; @@ -53,14 +53,10 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, stripe_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); - -out: - kfree(newitem); - return ret; + return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) @@ -299,7 +295,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_key stripe_key; struct btrfs_root *stripe_root = fs_info->stripe_root; const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); - struct btrfs_stripe_extent *stripe_extent; + struct btrfs_stripe_extent AUTO_KFREE(stripe_extent); const size_t item_size = struct_size(stripe_extent, strides, num_stripes); int ret; @@ -336,8 +332,6 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); } - kfree(stripe_extent); - return ret; } @@ -394,8 +388,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, return -ENOMEM; if (stripe->rst_search_commit_root) { - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; } ret = btrfs_search_slot(NULL, 
stripe_root, &stripe_key, path, 0, 0); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0135dceb7baa..f38d8305e46d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,10 +66,10 @@ static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, dump_bioc(fs_info, rbio->bioc); btrfs_crit(fs_info, -"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx", rbio->flags, rbio->nr_sectors, rbio->nr_data, rbio->real_stripes, rbio->stripe_nsectors, - rbio->scrubp, rbio->dbitmap); + rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap); } #define ASSERT_RBIO(expr, rbio) \ @@ -134,18 +134,10 @@ struct btrfs_stripe_hash_table { }; /* - * A structure to present a sector inside a page, the length is fixed to - * sectorsize; + * An all-ones physical address may still map to a valid PFN, but our paddrs + * are always block size aligned, so a ~0 paddr can never be a valid one. */ -struct sector_ptr { - /* - * Blocks from the bio list can still be highmem. - * So here we use physical address to present a page and the offset inside it. - */ - phys_addr_t paddr; - bool has_paddr; - bool uptodate; -}; +#define INVALID_PADDR (~(phys_addr_t)0) static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); @@ -159,8 +151,9 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); - kfree(rbio->bio_sectors); - kfree(rbio->stripe_sectors); + kfree(rbio->bio_paddrs); + kfree(rbio->stripe_paddrs); + bitmap_free(rbio->stripe_uptodate_bitmap); kfree(rbio->finish_pointers); } @@ -235,12 +227,22 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } -static void memcpy_sectors(const struct sector_ptr *dst, - const struct sector_ptr *src, u32 blocksize) +static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) { - memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), - phys_to_page(src->paddr), offset_in_page(src->paddr), - blocksize); + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + + ASSERT(sector_nr < rbio->nr_sectors); + for (int i = 0; i < rbio->sector_nsteps; i++) { + unsigned int index = sector_nr * rbio->sector_nsteps + i; + phys_addr_t dst = rbio->stripe_paddrs[index]; + phys_addr_t src = rbio->bio_paddrs[index]; + + ASSERT(dst != INVALID_PADDR); + ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), step); + } } /* @@ -263,20 +265,19 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].has_paddr) { + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk.
*/ if (i < rbio->nr_data * rbio->stripe_nsectors) - ASSERT(rbio->stripe_sectors[i].uptodate); + ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap)); continue; } - memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], - rbio->bioc->fs_info->sectorsize); - rbio->stripe_sectors[i].uptodate = 1; + memcpy_from_bio_to_stripe(rbio, i); + set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -299,19 +300,48 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } -static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, - unsigned int page_nr) +/* Get the sector number of the first sector covered by @page_nr. */ +static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + u32 sector_nr; + + ASSERT(page_nr < rbio->nr_pages); + + sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; + ASSERT(sector_nr < rbio->nr_sectors); + return sector_nr; +} + +/* + * Get the number of sectors covered by @page_nr. + * + * For bs > ps cases, the result will always be 1. + * For bs <= ps cases, the result will be ps / bs. + */ +static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 nr_sectors; + + ASSERT(page_nr < rbio->nr_pages); + + nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits; + ASSERT(nr_sectors > 0); + return nr_sectors; +} + +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr); int i; ASSERT(page_nr < rbio->nr_pages); + ASSERT(sector_nr + nr_bits <= rbio->nr_sectors); - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; - i++) { - if (!rbio->stripe_sectors[i].uptodate) + for (i = sector_nr; i < sector_nr + nr_bits; i++) { + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) return false; } return true; @@ -324,46 +354,44 @@ static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); u32 offset; int i; - for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps; + i++, offset += step) { int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); if (!rbio->stripe_pages[page_index]) continue; - rbio->stripe_sectors[i].has_paddr = true; - rbio->stripe_sectors[i].paddr = - page_to_phys(rbio->stripe_pages[page_index]) + - offset_in_page(offset); + rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { - const u32 sectorsize = src->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; - int i; + const u32 sector_nr = page_nr_to_sector_nr(src, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(src, page_nr); + + ASSERT(page_nr < src->nr_pages); + ASSERT(sector_nr + nr_bits <= src->nr_sectors); if (dest->stripe_pages[page_nr])
__free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; - /* Also update the sector->uptodate bits. */ - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; i++) - dest->stripe_sectors[i].uptodate = true; + /* Also update the stripe_uptodate_bitmap bits. */ + bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits); } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { - const int sector_nr = (page_nr << PAGE_SHIFT) >> - rbio->bioc->fs_info->sectorsize_bits; + const int sector_nr = page_nr_to_sector_nr(rbio, page_nr); /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus @@ -677,39 +705,62 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +/* Return the sector index for @stripe_nr and @sector_nr. */ +static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { + unsigned int ret; + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - return stripe_nr * rbio->stripe_nsectors + sector_nr; + ret = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(ret < rbio->nr_sectors); + return ret; +} + +/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */ +static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned int step_nr) +{ + unsigned int ret; + + ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr); + + ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr; + ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps); + return ret; } -/* Return a sector from rbio->stripe_sectors, not from the bio list */ -static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr, + unsigned int step_nr) { - return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, - sector_nr)]; + return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)]; } -/* Grab a sector inside P stripe */ -static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { - return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); + return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr); } -/* Grab a sector inside Q stripe, return NULL if not RAID6 */ -static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) - return NULL; - return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); + return INVALID_PADDR; + return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); +} + +/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. 
*/ +static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr) +{ + return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; } /* @@ -944,7 +995,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) } /* - * Get a sector pointer specified by its @stripe_nr and @sector_nr. + * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) @@ -954,34 +1005,52 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) * * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback. + * + * Return NULL if bio_list_only is set but the specified sector has no + * corresponding bio. */ -static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) +static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) hmm
+ */ +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) +{ + phys_addr_t ret = INVALID_PADDR; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); + + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; + return ret; + } + } + return rbio->stripe_paddrs[index]; } /* @@ -997,10 +1066,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; + const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE); + const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; - /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ - ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * For bs <= ps cases, ps must be aligned to bs. + * For bs > ps cases, bs must be aligned to ps. + */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) || + IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. @@ -1019,19 +1094,22 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); - rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); - rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); - if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers || !rbio->error_bitmap) { + if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs || + !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } + for (int i = 0; i < num_sectors * sector_nsteps; i++) { + rbio->stripe_paddrs[i] = INVALID_PADDR; + rbio->bio_paddrs[i] = INVALID_PADDR; + } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); @@ -1046,6 +1124,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; + rbio->sector_nsteps = sector_nsteps; refcount_set(&rbio->refs, 1); atomic_set(&rbio->stripes_pending, 0); @@ -1090,8 +1169,8 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) * @faila and @failb will also be updated to the first and second stripe * number of the errors. 
*/ -static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, - int *faila, int *failb) +static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) { int stripe_nr; int found_errors = 0; @@ -1123,20 +1202,41 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, return found_errors; } +static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps, + unsigned int step) +{ + int added = 0; + int ret; + + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + if (ret != step) + goto revert; + added += ret; + } + return added; +revert: + /* + * We don't need to revert the bvec, as the bio will be submitted immediately, + * as long as the size is reduced the extra bvec will not be accessed. + */ + bio->bi_iter.bi_size -= added; + return 0; +} + /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. - * Return <0 for error. + * Return <0 for error, and no byte will be added to @rbio. */ -static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct sector_ptr *sector, - unsigned int stripe_nr, - unsigned int sector_nr, - enum req_op op) +static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t *paddrs, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); struct bio *last = bio_list->tail; int ret; struct bio *bio; @@ -1152,7 +1252,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->has_paddr); + ASSERT(paddrs != NULL); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1165,8 +1265,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio->error_bitmap); /* Check if we have reached tolerance early. 
*/ - found_errors = get_rbio_veritical_errors(rbio, sector_nr, - NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; @@ -1183,8 +1283,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, phys_to_page(sector->paddr), - sectorsize, offset_in_page(sector->paddr)); + ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step); if (ret == sectorsize) return 0; } @@ -1197,28 +1296,27 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, - offset_in_page(sector->paddr)); + ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step); + ASSERT(ret == sectorsize); bio_list_add(bio_list, bio); return 0; } static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT); struct bvec_iter iter = bio->bi_iter; phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { - unsigned int index = (offset >> sectorsize_bits); - struct sector_ptr *sector = &rbio->bio_sectors[index]; + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + unsigned int index = (offset >> step_bits); - sector->has_paddr = true; - sector->paddr = paddr; - offset += sectorsize; + rbio->bio_paddrs[index] = paddr; + offset += step; } } @@ -1296,56 +1394,64 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } -static inline void *kmap_local_sector(const struct sector_ptr *sector) +static inline void *kmap_local_paddr(phys_addr_t paddr) { /* The sector pointer must have a page mapped to it. */ - ASSERT(sector->has_paddr); + ASSERT(paddr != INVALID_PADDR); - return kmap_local_page(phys_to_page(sector->paddr)) + - offset_in_page(sector->paddr); + return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); } -/* Generate PQ for one vertical stripe. 
*/ -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr, + unsigned int step_nr) { void **pointers = rbio->finish_pointers; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct sector_ptr *sector; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } + for (stripe = 0; stripe < rbio->nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr)); if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr( + rbio_qstripe_paddr(rbio, sector_nr, step_nr)); assert_rbio(rbio); - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); } else { /* raid5 */ - memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + memcpy(pointers[rbio->nr_data], pointers[0], step); + run_xor(pointers + 1, rbio->nr_data - 1, step); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } +/* Generate PQ for one vertical stripe. 
*/ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6); + + for (int i = 0; i < rbio->sector_nsteps; i++) + generate_pq_vertical_step(rbio, sectornr, i); + + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); + if (has_qstripe) + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); +} + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1372,7 +1478,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1382,14 +1488,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe, sectornr, REQ_OP_WRITE); if (ret) goto error; @@ -1407,7 +1513,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1432,14 +1538,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) @@ -1487,21 +1593,17 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Up-to-date directly for - * stripe_pages[], thus we need to locate the sector. + * Return the index inside the rbio->stripe_sectors[] array. + * + * Return -1 if not found. 
*/ -static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - phys_addr_t paddr) +static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { - int i; - - for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; - - if (sector->has_paddr && sector->paddr == paddr) - return sector; + for (int i = 0; i < rbio->nr_sectors; i++) { + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr) + return i; } - return NULL; + return -1; } /* @@ -1510,17 +1612,23 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 blocksize = rbio->bioc->fs_info->sectorsize; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + u32 offset = 0; phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - btrfs_bio_for_each_block_all(paddr, bio, blocksize) { - struct sector_ptr *sector = find_stripe_sector(rbio, paddr); + btrfs_bio_for_each_block_all(paddr, bio, step) { + /* Hitting the first step of a sector. */ + if (IS_ALIGNED(offset, sectorsize)) { + int sector_nr = find_stripe_sector_nr(rbio, paddr); - ASSERT(sector); - if (sector) - sector->uptodate = 1; + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); + } + offset += step; } } @@ -1530,10 +1638,9 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) int i; for (i = 0; i < rbio->nr_sectors; i++) { - if (rbio->stripe_sectors[i].paddr == bvec_paddr) + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; - if (rbio->bio_sectors[i].has_paddr && - rbio->bio_sectors[i].paddr == bvec_paddr) + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1566,7 +1673,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = rbio->sector_nsteps; int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 offset = 0; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ @@ -1577,18 +1688,24 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + btrfs_bio_for_each_block_all(paddr, bio, step) { u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; - int ret; + u8 *expected_csum; + + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + /* Not yet covering the full fs block, continue to the next step. */ + if (!IS_ALIGNED(offset, fs_info->sectorsize)) + continue; /* No csum for this sector, skip to the next sector. 
*/ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_block_csum(fs_info, paddr, - csum_buf, expected_csum); - if (ret < 0) + expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0)) set_bit(total_sector_nr, rbio->error_bitmap); total_sector_nr++; } @@ -1785,10 +1902,9 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; + phys_addr_t *paddrs; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; @@ -1801,54 +1917,32 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); } csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); - return ret; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) + return -EIO; + return 0; } -/* - * Recover a vertical stripe specified by @sector_nr. - * @*pointers are the pre-allocated pointers by the caller, so we don't - * need to allocate/free the pointers again and again. - */ -static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static void recover_vertical_step(struct btrfs_raid_bio *rbio, + unsigned int sector_nr, + unsigned int step_nr, + int faila, int failb, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; - const u32 sectorsize = fs_info->sectorsize; - int found_errors; - int faila; - int failb; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); int stripe_nr; - int ret = 0; - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sector_nr, &rbio->dbitmap)) - return 0; - - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* - * No errors in the vertical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. - */ - if (!found_errors) - return 0; - - if (unlikely(found_errors > rbio->bioc->max_errors)) - return -EIO; + ASSERT(step_nr < rbio->sector_nsteps); + ASSERT(sector_nr < rbio->stripe_nsectors); /* * Setup our array of pointers with sectors from each stripe @@ -1857,16 +1951,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * pointer order. */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + phys_addr_t paddr; + /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. 
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); } - pointers[stripe_nr] = kmap_local_sector(sector); + pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -1912,10 +2008,10 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } if (failb == rbio->real_stripes - 2) { - raid6_datap_recov(rbio->real_stripes, sectorsize, + raid6_datap_recov(rbio->real_stripes, step, faila, pointers); } else { - raid6_2data_recov(rbio->real_stripes, sectorsize, + raid6_2data_recov(rbio->real_stripes, step, faila, failb, pointers); } } else { @@ -1925,7 +2021,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + memcpy(pointers[faila], pointers[rbio->nr_data], step); /* Rearrange the pointer array */ p = pointers[faila]; @@ -1935,40 +2031,66 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - + run_xor(pointers, rbio->nr_data - 1, step); } +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + int found_errors; + int faila; + int failb; + int ret = 0; + /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired in the vertical stripe, thus they are now - * uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. - * - * If possible, also check if the repaired sector matches its data - * checksum. + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. 
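
The RAID5 branch of recover_vertical_step() above is the classic parity reconstruction: copy the P stripe into the failed slot, then XOR in every surviving data stripe. Here is a self-contained sketch of that math on one step's worth of bytes; the sizes and names are invented for the demo, this is not the kernel helper.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define NR_DATA 3
#define STEP    8	/* one step worth of bytes, tiny for the demo */

/* Rebuild data stripe @faila from the surviving stripes plus parity. */
static void raid5_recover_step(unsigned char *ptrs[NR_DATA + 1], int faila)
{
	/* Copy parity into the failed slot, then XOR in the other stripes. */
	memcpy(ptrs[faila], ptrs[NR_DATA], STEP);
	for (int s = 0; s < NR_DATA; s++) {
		if (s == faila)
			continue;
		for (int b = 0; b < STEP; b++)
			ptrs[faila][b] ^= ptrs[s][b];
	}
}

int main(void)
{
	unsigned char d0[STEP] = "AAAAAAA", d1[STEP] = "BBBBBBB";
	unsigned char d2[STEP] = "CCCCCCC", p[STEP];
	unsigned char *ptrs[NR_DATA + 1] = { d0, d1, d2, p };

	for (int b = 0; b < STEP; b++)
		p[b] = d0[b] ^ d1[b] ^ d2[b];

	memset(d1, 0, STEP);		/* "lose" stripe 1 */
	raid5_recover_step(ptrs, 1);
	assert(memcmp(d1, "BBBBBBB", STEP) == 0);
	printf("recovered: %s\n", d1);
	return 0;
}

The kernel version does the same thing via memcpy() plus run_xor() on the remapped pointer array, one step at a time.
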
*/ + if (!found_errors) + return 0; + + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + + for (int i = 0; i < rbio->sector_nsteps; i++) + recover_vertical_step(rbio, sector_nr, i, faila, failb, + pointers, unmap_array); if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, faila, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, faila, sector_nr), + rbio->stripe_uptodate_bitmap); } if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, failb, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, failb, sector_nr), + rbio->stripe_uptodate_bitmap); } - -cleanup: - for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) - kunmap_local(unmap_array[stripe_nr]); return ret; } @@ -2043,7 +2165,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* * Skip the range which has error. It can be a range which is @@ -2060,8 +2182,8 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) continue; } - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); @@ -2106,7 +2228,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n int faila; int failb; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. */ if (!found_errors) @@ -2250,13 +2372,13 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; @@ -2310,14 +2432,15 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; + phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
 */
-		if (!sector->has_paddr || !sector->uptodate)
+		if (paddr == INVALID_PADDR ||
+		    !test_bit(i, rbio->stripe_uptodate_bitmap))
 			return true;
 	}
 	return false;
@@ -2398,7 +2521,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
 		int found_errors;
 
-		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
+		found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
 			ret = -EIO;
 			break;
@@ -2469,47 +2592,121 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
 	return rbio;
 }
 
+static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
+				   int sector_nr)
+{
+	const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
+	const u32 base = sector_nr * rbio->sector_nsteps;
+
+	for (int i = base; i < base + rbio->sector_nsteps; i++) {
+		const unsigned int page_index = (i * step) >> PAGE_SHIFT;
+		struct page *page;
+
+		if (rbio->stripe_pages[page_index])
+			continue;
+		page = alloc_page(GFP_NOFS);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[page_index] = page;
+	}
+	return 0;
+}
+
 /*
  * We just scrub the parity that we have correct data on the same horizontal,
  * so we needn't allocate all pages for all the stripes.
  */
 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 {
-	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 	int total_sector_nr;
 
 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
 	     total_sector_nr++) {
-		struct page *page;
 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
-		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
+		int ret;
 
 		if (!test_bit(sectornr, &rbio->dbitmap))
 			continue;
-		if (rbio->stripe_pages[index])
-			continue;
-		page = alloc_page(GFP_NOFS);
-		if (!page)
-			return -ENOMEM;
-		rbio->stripe_pages[index] = page;
+		ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
+		if (ret < 0)
+			return ret;
 	}
 	index_stripe_sectors(rbio);
 	return 0;
 }
 
+/* Return true if the content of the step matches the calculated one. */
+static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
+				   void *pointers[], unsigned int sector_nr,
+				   unsigned int step_nr)
+{
+	const unsigned int nr_data = rbio->nr_data;
+	const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
+	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+	void *parity;
+	bool ret = false;
+
+	ASSERT(step_nr < rbio->sector_nsteps);
+
+	/* First map one step from each data stripe. */
+	for (int stripe = 0; stripe < nr_data; stripe++)
+		pointers[stripe] = kmap_local_paddr(
+				sector_paddr_in_rbio(rbio, stripe, sector_nr,
+						     step_nr, 0));
+
+	if (has_qstripe) {
+		assert_rbio(rbio);
+		/* RAID6, call the library function to fill in our P/Q. */
+		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
+	} else {
+		/* RAID5. */
+		memcpy(pointers[nr_data], pointers[0], step);
+		run_xor(pointers + 1, nr_data - 1, step);
+	}
+
+	/* Check scrubbing parity and repair it. */
+	parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
+	if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
+		memcpy(parity, pointers[rbio->scrubp], step);
+	else
+		ret = true;
+	kunmap_local(parity);
+
+	for (int stripe = nr_data - 1; stripe >= 0; stripe--)
+		kunmap_local(pointers[stripe]);
+	return ret;
+}
+
+/*
+ * The @pointers array should have the P/Q parity already mapped.
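
verify_one_parity_step() above recomputes the parity for one step and only rewrites the stored copy when it differs, reporting whether a writeback is still needed. The same check-and-repair idea for the RAID5/XOR case in plain C; toy sizes, no kernel helpers involved.

#include <stdio.h>
#include <string.h>

#define NR_DATA 3
#define STEP    8

/* Returns 1 when the stored parity already matched (nothing to write back). */
static int check_and_repair_parity(unsigned char *data[NR_DATA],
				   unsigned char *parity)
{
	unsigned char want[STEP] = { 0 };

	for (int s = 0; s < NR_DATA; s++)
		for (int b = 0; b < STEP; b++)
			want[b] ^= data[s][b];

	if (memcmp(parity, want, STEP) == 0)
		return 1;
	memcpy(parity, want, STEP);     /* repair in place */
	return 0;
}

int main(void)
{
	unsigned char d0[STEP] = "1111111", d1[STEP] = "2222222";
	unsigned char d2[STEP] = "3333333", parity[STEP] = { 0 };
	unsigned char *data[NR_DATA] = { d0, d1, d2 };

	printf("first pass matched: %d\n", check_and_repair_parity(data, parity));
	printf("second pass matched: %d\n", check_and_repair_parity(data, parity));
	return 0;
}
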
+ */ +static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr) +{ + bool found_error = false; + + for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { + bool match; + + match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); + if (!match) + found_error = true; + } + if (!found_error) + bitmap_clear(&rbio->dbitmap, sector_nr, 1); +} + static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; - int stripe; int sectornr; bool has_qstripe; struct page *page; - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; + phys_addr_t p_paddr = INVALID_PADDR; + phys_addr_t q_paddr = INVALID_PADDR; struct bio_list bio_list; int is_replace = 0; int ret; @@ -2542,72 +2739,36 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; - p_sector.has_paddr = true; - p_sector.paddr = page_to_phys(page); - p_sector.uptodate = 1; + p_paddr = page_to_phys(page); page = NULL; + pointers[nr_data] = kmap_local_paddr(p_paddr); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ page = alloc_page(GFP_NOFS); if (!page) { - __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; return -ENOMEM; } - q_sector.has_paddr = true; - q_sector.paddr = page_to_phys(page); - q_sector.uptodate = 1; + q_paddr = page_to_phys(page); page = NULL; - pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); + pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_sector(&p_sector); - - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - void *parity; - - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } - if (has_qstripe) { - assert_rbio(rbio); - /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - - /* Check scrubbing parity and repair it */ - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_sector(sector); - if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) - memcpy(parity, pointers[rbio->scrubp], sectorsize); - else - /* Parity is right, needn't writeback */ - bitmap_clear(&rbio->dbitmap, sectornr, 1); - kunmap_local(parity); - - for (stripe = nr_data - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) + verify_one_parity_sector(rbio, pointers, sectornr); kunmap_local(pointers[nr_data]); - __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; - if (q_sector.has_paddr) { - __free_page(phys_to_page(q_sector.paddr)); - q_sector.has_paddr = false; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + if (q_paddr != INVALID_PADDR) { + 
__free_page(phys_to_page(q_paddr)); + q_paddr = INVALID_PADDR; } /* @@ -2616,10 +2777,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; @@ -2634,11 +2795,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->real_stripes, + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; @@ -2686,7 +2846,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) int failb; int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; @@ -2755,7 +2915,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) @@ -2763,22 +2923,23 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) /* * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a sector + * read them from the disk. If sector_paddr_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. */ - if (sector->uptodate) + if (test_bit(rbio_sector_index(rbio, stripe, sectornr), + rbio->stripe_uptodate_bitmap)) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); @@ -2819,7 +2980,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; @@ -2857,9 +3018,6 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, unsigned int foffset = 0; int ret; - /* We shouldn't hit RAID56 for bs > ps cases for now. 
 */
-	ASSERT(fs_info->sectorsize <= PAGE_SIZE);
-
 	/*
 	 * If we hit ENOMEM temporarily, but later at
 	 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
@@ -2893,8 +3051,7 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
 			foffset = 0;
 		}
 	}
-	for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits;
-	     sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits;
-	     sector_nr++)
-		rbio->stripe_sectors[sector_nr].uptodate = true;
+	bitmap_set(rbio->stripe_uptodate_bitmap,
+		   offset_in_full_stripe >> fs_info->sectorsize_bits,
+		   BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
 }
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 84c4d1d29c7a..1f463ecf7e41 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -16,7 +16,6 @@
 #include "volumes.h"
 
 struct page;
-struct sector_ptr;
 struct btrfs_fs_info;
 
 enum btrfs_rbio_ops {
@@ -25,6 +24,84 @@ enum btrfs_rbio_ops {
 	BTRFS_RBIO_PARITY_SCRUB,
 };
 
+/*
+ * Overview of btrfs_raid_bio.
+ *
+ * One btrfs_raid_bio represents a full stripe of RAID56, including both data
+ * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
+ *
+ * One btrfs_raid_bio can have one or more bios from the higher layer, covering
+ * part or all of the data stripes.
+ *
+ * [PAGES FROM HIGHER LAYER BIOS]
+ * Higher layer bios are in the btrfs_raid_bio::bio_list.
+ *
+ * Pages from the bio_list are represented like the following:
+ *
+ * bio_list:	|<- Bio 1 ->|  |<- Bio 2 ->| ...
+ * bio_paddrs:	[0] [1] [2]    [3] [4] [5]  ...
+ *
+ * If there is a bio covering a sector (one btrfs fs block), the corresponding
+ * entry in btrfs_raid_bio::bio_paddrs[] will hold the physical address
+ * (including the offset inside the page) of that sector's data in the bio.
+ *
+ * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
+ * be INVALID_PADDR.
+ *
+ * Each entry in bio_paddrs[] covers one step (i.e. min(sectorsize, PAGE_SIZE)).
+ *
+ * [PAGES FOR INTERNAL USAGE]
+ * Pages not covered by any bio or belonging to P/Q stripes are stored in
+ * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
+ *
+ * stripe_pages:  |<- Page 0 ->|<- Page 1 ->| ...
+ * stripe_paddrs: [0] [1] [2]   [3] [4]      ...
+ *
+ * The stripe_pages[] array stores all the pages covering the full stripe,
+ * including data and P/Q pages.
+ * stripe_pages[0] is the first page of the first data stripe.
+ * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
+ * data stripe.
+ *
+ * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
+ * (the bio covers all data stripes) there is no need to allocate pages for
+ * data stripes (we can grab them from bio_paddrs[]).
+ *
+ * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
+ * stripe_paddrs[i] will be INVALID_PADDR.
+ *
+ * Each entry in stripe_paddrs[] covers one step as well.
+ *
+ * [LOCATING A SECTOR]
+ * To locate a sector for IO, we need the following info:
+ *
+ * - stripe_nr
+ *   Starts from 0 (representing the first data stripe), ends at
+ *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
+ *
+ * - sector_nr
+ *   Starts from 0 (representing the first sector of the stripe), ends
+ *   at BTRFS_STRIPE_LEN / sectorsize - 1.
+ *
+ * - step_nr
+ *   A step is min(sectorsize, PAGE_SIZE).
+ *
+ *   Starts from 0 (representing the first step of the sector), ends
+ *   at @sector_nsteps - 1.
+ *
+ *   Most call sites do not need to bother with this parameter; it only
+ *   matters for bs > ps support and for vertical stripe related work
+ *   (e.g. RMW/recover).
+ *
+ * - from which array
+ *   Whether to grab from stripe_paddrs[] (the internal pages) or from
+ *   bio_paddrs[] (the higher layer bios).
+ *
+ * For IO, a physical address is returned, so that we can extract the page and
+ * the offset inside the page for IO.
+ * The special value INVALID_PADDR indicates that the physical address is
+ * invalid, normally meaning there is no page allocated for the specified
+ * sector.
+ */
 struct btrfs_raid_bio {
 	struct btrfs_io_context *bioc;
@@ -82,6 +159,14 @@ struct btrfs_raid_bio {
 	/* How many sectors there are for each stripe */
 	u8 stripe_nsectors;
 
+	/*
+	 * How many steps there are for one sector.
+	 *
+	 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
+	 * For bs <= ps cases, it's always 1.
+	 */
+	u8 sector_nsteps;
+
 	/* Stripe number that we're scrubbing */
 	u8 scrubp;
@@ -116,13 +201,13 @@ struct btrfs_raid_bio {
 	struct page **stripe_pages;
 
 	/* Pointers to the sectors in the bio_list, for faster lookup */
-	struct sector_ptr *bio_sectors;
+	phys_addr_t *bio_paddrs;
 
-	/*
-	 * For subpage support, we need to map each sector to above
-	 * stripe_pages.
-	 */
-	struct sector_ptr *stripe_sectors;
+	/* Pointers to the sectors in the stripe_pages[]. */
+	phys_addr_t *stripe_paddrs;
+
+	/* Each set bit means the corresponding sector in stripe_paddrs[] is uptodate. */
+	unsigned long *stripe_uptodate_bitmap;
 
 	/* Allocated with real_stripes-many pointers for finish_*() calls */
 	void **finish_pointers;
@@ -131,10 +216,6 @@ struct btrfs_raid_bio {
 	 * The bitmap recording where IO errors happened.
 	 * Each bit is corresponding to one sector in either bio_sectors[] or
 	 * stripe_sectors[] array.
-	 *
-	 * The reason we don't use another bit in sector_ptr is, we have two
-	 * arrays of sectors, and a lot of IO can use sectors in both arrays.
-	 * Thus making it much harder to iterate.
 	 */
 	unsigned long *error_bitmap;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index de4cb0f3fbd0..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -982,7 +982,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 	extent_root = btrfs_extent_root(fs_info, 0);
 	/* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS).
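
The [LOCATING A SECTOR] overview above never spells out the index arithmetic, although every accessor relies on it. Here is a small userspace sketch of the math under the documented layout; the constants and helper names are local to the demo, though sector_index() mirrors what rbio_sector_index() appears to compute in the hunks above.

#include <assert.h>
#include <stdio.h>

/* Toy geometry: 16K blocks on 4K pages, i.e. bs > ps, 4 steps per sector. */
#define STRIPE_LEN      (64 * 1024)
#define SECTORSIZE      (16 * 1024)
#define PAGE_SZ         (4 * 1024)
#define STEP            (SECTORSIZE < PAGE_SZ ? SECTORSIZE : PAGE_SZ)
#define SECTOR_NSTEPS   (SECTORSIZE / STEP)
#define STRIPE_NSECTORS (STRIPE_LEN / SECTORSIZE)

/* Sector index inside the whole full stripe (bit index for the bitmaps). */
static unsigned int sector_index(unsigned int stripe_nr, unsigned int sector_nr)
{
	return stripe_nr * STRIPE_NSECTORS + sector_nr;
}

/* Index into a paddrs-style array that stores one entry per step. */
static unsigned int paddr_index(unsigned int stripe_nr, unsigned int sector_nr,
				unsigned int step_nr)
{
	assert(step_nr < SECTOR_NSTEPS);
	return sector_index(stripe_nr, sector_nr) * SECTOR_NSTEPS + step_nr;
}

int main(void)
{
	/* Second data stripe, sector 1, step 2. */
	printf("sector index %u, paddr index %u\n",
	       sector_index(1, 1), paddr_index(1, 1, 2));
	return 0;
}

With bs <= ps, SECTOR_NSTEPS is 1 and the paddr index degenerates to the plain sector index, which is why most call sites can ignore step_nr entirely.
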
*/ - if (IS_ERR(extent_root)) { + if (!extent_root) { btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); return 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5465a5eae9b2..b5fe95baf92e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/blkdev.h> +#include <linux/fscrypt.h> #include <linux/iversion.h> #include "ctree.h" #include "fs.h" @@ -343,7 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; - char *buf = NULL; + char AUTO_KVFREE(buf); struct btrfs_key key; u32 nritems; int slot; @@ -358,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); + if (!path) return ret; - } path->reada = READA_FORWARD; /* Clone data */ @@ -611,7 +610,6 @@ process_slot: } out: - kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; @@ -792,6 +790,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } + /* Can only reflink encrypted files if both files are encrypted. */ + if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode)) + return -EINVAL; + /* Don't make the dst file partly checksummed */ if ((inode_in->flags & BTRFS_INODE_NODATASUM) != (inode_out->flags & BTRFS_INODE_NODATASUM)) { @@ -868,6 +870,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, bool same_inode = dst_inode == src_inode; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) + return -EIO; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8dd8de6b9fb8..5bfefc3e9c06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -511,7 +511,7 @@ static void __del_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; - struct mapping_node *node = NULL; + struct mapping_node AUTO_KFREE(node); struct reloc_control *rc = fs_info->reloc_ctl; bool put_ref = false; @@ -544,7 +544,6 @@ static void __del_reloc_root(struct btrfs_root *root) spin_unlock(&fs_info->trans_lock); if (put_ref) btrfs_put_root(root); - kfree(node); } /* @@ -586,10 +585,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_key root_key; int ret = 0; - bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); if (!root_item) @@ -615,17 +613,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); btrfs_err(fs_info, - "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", - objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); - ret = -EUCLEAN; - goto fail; + "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT, + objectid, BTRFS_KEY_FMT_VALUE(&cpu_key)); + return ERR_PTR(-EUCLEAN); } /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, 
root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); /* * Set the last_snapshot field to the generation of the commit @@ -648,14 +645,13 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); } /* * We have changed references at this point, we must abort the - * transaction if anything fails. + * transaction if anything fails (i.e. 'goto abort'). */ - must_abort = true; memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); @@ -675,9 +671,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); if (ret) - goto fail; - - kfree(root_item); + goto abort; reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); if (IS_ERR(reloc_root)) { @@ -687,11 +681,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; -fail: - kfree(root_item); + abort: - if (must_abort) - btrfs_abort_transaction(trans, ret); + btrfs_abort_transaction(trans, ret); return ERR_PTR(ret); } @@ -2947,7 +2939,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; u64 cur_file_offset = cluster->start - offset; - struct file_ra_state *ra; + struct file_ra_state AUTO_KFREE(ra); int cluster_nr = 0; int ret = 0; @@ -2960,13 +2952,13 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) ret = prealloc_file_extent_cluster(rc); if (ret) - goto out; + return ret; file_ra_state_init(ra, inode->i_mapping); ret = setup_relocation_extent_mapping(rc); if (ret) - goto out; + return ret; while (cur_file_offset < cluster->end - offset) { ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); @@ -2975,8 +2967,6 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); -out: - kfree(ra); return ret; } @@ -3175,8 +3165,8 @@ again: key.offset = blocksize; } - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) return ret; @@ -3368,8 +3358,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) @@ -3780,6 +3770,7 @@ out: /* * Mark start of chunk relocation that is cancellable. Check if the cancellation * has been requested meanwhile and don't start in that case. + * NOTE: if this returns an error, reloc_chunk_end() must not be called. * * Return: * 0 success @@ -3796,10 +3787,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); - /* - * On cancel, clear all requests but let the caller mark - * the end after cleanup operations. - */ + /* On cancel, clear all requests. 
*/ + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); atomic_set(&fs_info->reloc_cancel_req, 0); return -ECANCELED; } @@ -3808,9 +3797,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) /* * Mark end of chunk relocation that is cancellable and wake any waiters. + * NOTE: call only if a previous call to reloc_chunk_start() succeeded. */ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) { + ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)); /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); @@ -3881,8 +3872,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, struct inode *inode; struct btrfs_path *path; int ret; - int rw = 0; - int err = 0; + bool bg_is_ro = false; /* * This only gets set if we had a half-deleted snapshot on mount. We @@ -3924,24 +3914,20 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, } ret = reloc_chunk_start(fs_info); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_put_bg; - } rc->extent_root = extent_root; rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group, true); - if (ret) { - err = ret; + if (ret) goto out; - } - rw = 1; + bg_is_ro = true; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -3953,14 +3939,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, else ret = PTR_ERR(inode); - if (ret && ret != -ENOENT) { - err = ret; + if (ret && ret != -ENOENT) goto out; - } rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); + ret = PTR_ERR(rc->data_inode); rc->data_inode = NULL; goto out; } @@ -3981,8 +3965,6 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) - err = ret; finishes_stage = rc->stage; /* @@ -3995,16 +3977,18 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, * out of the loop if we hit an error. 
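
The two NOTEs above establish a strict pairing: reloc_chunk_end() may only run after a successful reloc_chunk_start(), which is why the canceled-start path now clears BTRFS_FS_RELOC_RUNNING itself and why reloc_chunk_end() gained the ASSERT. A tiny userspace model of that contract; the flag and counter merely stand in for the fs_info state, nothing here is kernel API.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool reloc_running;      /* stands in for BTRFS_FS_RELOC_RUNNING */
static int cancel_requested;    /* stands in for reloc_cancel_req */

/* Model of reloc_chunk_start(): on cancel, clean up and fail. */
static int chunk_start(void)
{
	reloc_running = true;
	if (cancel_requested > 0) {
		reloc_running = false;  /* caller must NOT call chunk_end() */
		cancel_requested = 0;
		return -1;
	}
	return 0;
}

/* Model of reloc_chunk_end(): only legal after a successful start. */
static void chunk_end(void)
{
	assert(reloc_running);
	reloc_running = false;
}

int main(void)
{
	if (chunk_start() == 0)
		chunk_end();            /* balanced pairing */

	cancel_requested = 1;
	if (chunk_start() < 0)
		printf("canceled on start; end() intentionally skipped\n");
	return 0;
}
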
*/ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, - (u64)-1); - if (ret) - err = ret; + int wb_ret; + + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, + (u64)-1); + if (wb_ret && ret == 0) + ret = wb_ret; invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } - if (err < 0) + if (ret < 0) goto out; if (rc->extents_found == 0) @@ -4020,14 +4004,14 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, WARN_ON(rc->block_group->reserved > 0); WARN_ON(rc->block_group->used > 0); out: - if (err && rw) + if (ret && bg_is_ro) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); + reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); - reloc_chunk_end(fs_info); free_reloc_control(rc); - return err; + return ret; } static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) @@ -4208,8 +4192,8 @@ out_clean: ret = ret2; out_unset: unset_reloc_control(rc); -out_end: reloc_chunk_end(fs_info); +out_end: free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d07eab70f759..6a7e297ab0a7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -147,8 +147,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (unlikely(ret > 0)) { btrfs_crit(fs_info, - "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, btrfs_root_id(root)); + "unable to find root key " BTRFS_KEY_FMT " in tree %llu", + BTRFS_KEY_FMT_VALUE(key), btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4691d0bdb2e8..a40ee41f42c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -463,10 +463,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - sctx->extent_path.search_commit_root = 1; - sctx->extent_path.skip_locking = 1; - sctx->csum_path.search_commit_root = 1; - sctx->csum_path.skip_locking = 1; + sctx->extent_path.search_commit_root = true; + sctx->extent_path.skip_locking = true; + sctx->csum_path.search_commit_root = true; + sctx->csum_path.skip_locking = true; for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; @@ -505,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->fs_info; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; @@ -569,7 +569,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -580,7 +579,6 @@ err: swarn->physical, root, inum, offset, ret); - free_ipath(ipath); return 0; } @@ -694,7 +692,7 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); return folio_address(folio) + offset_in_folio(folio, offset); } @@ -707,7 +705,7 @@ static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int secto /* stripe->folios[] is allocated by us and no highmem is allowed. */ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); /* And the range must be contained inside the folio. */ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); @@ -777,10 +775,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, -"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT, logical, stripe->mirror_num, - CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), - CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); return; } if (stripe->sectors[sector_nr].generation != @@ -929,10 +927,11 @@ static int calc_next_mirror(int mirror, int num_copies) static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, int sector_nr) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); int ret; - ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize, offset_in_page(kaddr)); /* * Caller should ensure the bbio has enough size. @@ -942,7 +941,21 @@ static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *st * to create the minimal amount of bio vectors, for fs block size < page * size cases. */ - ASSERT(ret == bbio->fs_info->sectorsize); + ASSERT(ret == fs_info->sectorsize); +} + +static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info, + unsigned int nr_vecs, blk_opf_t opf, + u64 logical, + btrfs_bio_end_io_t end_io, void *private) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode), + logical, end_io, private); + bbio->is_scrub = true; + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + return bbio; } static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, @@ -953,8 +966,9 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; - ASSERT(stripe->mirror_num >= 1); - ASSERT(atomic_read(&stripe->pending_io) == 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); + ASSERT(atomic_read(&stripe->pending_io) == 0, + "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io)); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { /* The current sector cannot be merged, submit the bio. 
*/ @@ -968,12 +982,10 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_repair_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + stripe->logical + (i << fs_info->sectorsize_bits), + scrub_repair_read_endio, stripe); scrub_bio_add_sector(bbio, stripe, i); } @@ -1019,7 +1031,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, int ret; /* For scrub, our mirror_num should always start at 1. */ - ASSERT(stripe->mirror_num >= 1); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, NULL, NULL); @@ -1159,7 +1171,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) int mirror; int i; - ASSERT(stripe->mirror_num > 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); @@ -1284,7 +1296,7 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); - for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); } @@ -1352,13 +1364,10 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, - fs_info, scrub_write_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (sector_nr << fs_info->sectorsize_bits)) >> - SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, + stripe->logical + (sector_nr << fs_info->sectorsize_bits), + scrub_write_endio, stripe); scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) @@ -1478,7 +1487,7 @@ static int compare_extent_item_range(struct btrfs_path *path, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY); + key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); if (key.type == BTRFS_METADATA_ITEM_KEY) len = fs_info->nodesize; else @@ -1583,7 +1592,7 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || - key.type == BTRFS_EXTENT_ITEM_KEY); + key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); *extent_start_ret = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) *size_ret = path->nodes[0]->fs_info->nodesize; @@ -1681,7 +1690,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. 
*/ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, + "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", + bg->start, logical_start, logical_end, bg->start + bg->length); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); @@ -1849,9 +1860,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) continue; } - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + logical, scrub_read_endio, stripe); } scrub_bio_add_sector(bbio, stripe, i); @@ -1888,10 +1898,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, - scrub_read_endio, stripe); - - bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, + stripe->logical, scrub_read_endio, stripe); /* Read the whole range inside the chunk boundary. */ for (unsigned int cur = 0; cur < nr_sectors; cur++) scrub_bio_add_sector(bbio, stripe, cur); @@ -2069,37 +2077,135 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * return 0; } +/* + * Return 0 if we should not cancel the scrub. + * Return <0 if we need to cancel the scrub, returned value will + * indicate the reason: + * - -ECANCELED - Being explicitly canceled through ioctl. + * - -EINTR - Being interrupted by signal or fs/process freezing. + */ +static int should_cancel_scrub(const struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) + return -ECANCELED; + + /* + * The user (e.g. fsfreeze command) or power management (PM) + * suspend/hibernate can freeze the fs. And PM suspend/hibernate will + * also freeze all user processes. + * + * A user process can only be frozen when it is in user space, thus we + * have to cancel the run so that the process can return to the user + * space. + * + * Furthermore we have to check both filesystem and process freezing, + * as PM can be configured to freeze the filesystems before processes. + * + * If we only check fs freezing, then suspend without fs freezing + * will timeout, as the process is still in kernel space. + * + * If we only check process freezing, then suspend with fs freezing + * will timeout, as the running scrub will prevent the fs from being frozen. 
+ */ + if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || + freezing(current) || signal_pending(current)) + return -EINTR; + return 0; +} + +static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_chunk_map *map, + u64 full_stripe_start, + unsigned long *extent_bitmap) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_io_context *bioc = NULL; + struct btrfs_raid_bio *rbio; + struct bio bio; + const int data_stripes = nr_data_stripes(map); + u64 length = btrfs_stripe_nr_to_offset(data_stripes); + int ret; + + bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); + bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio.bi_private = &io_done; + bio.bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL); + if (ret < 0) + goto out; + /* For RAID56 write there must be an @bioc allocated. */ + ASSERT(bioc); + rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + goto out; + } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_folios(rbio, stripe->folios, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio.bi_status); +out: + btrfs_bio_counter_dec(fs_info); + bio_uninit(&bio); + return ret; +} + static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, u64 full_stripe_start) { - DECLARE_COMPLETION_ONSTACK(io_done); struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_raid_bio *rbio; - struct btrfs_io_context *bioc = NULL; struct btrfs_path extent_path = { 0 }; struct btrfs_path csum_path = { 0 }; - struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); + ret = should_cancel_scrub(sctx); + if (ret < 0) + return ret; + + if (atomic_read(&fs_info->scrub_pause_req)) + scrub_blocked_if_needed(fs_info); + + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + return 0; + } + spin_unlock(&bg->lock); + /* * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ - extent_path.search_commit_root = 1; - extent_path.skip_locking = 1; - csum_path.search_commit_root = 1; - csum_path.skip_locking = 1; + extent_path.search_commit_root = true; + extent_path.skip_locking = true; + csum_path.search_commit_root = true; + csum_path.skip_locking = true; for (int i = 0; i < data_stripes; i++) { int stripe_index; @@ -2194,43 +2300,11 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, } /* Now we can check and regenerate the P/Q stripe. 
*/ - bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; - bio->bi_private = &io_done; - bio->bi_end_io = raid56_scrub_wait_endio; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL); - if (ret < 0) { - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - goto out; - } - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, - BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - btrfs_put_bioc(bioc); - if (!rbio) { - ret = -ENOMEM; - btrfs_bio_counter_dec(fs_info); - goto out; - } - /* Use the recovered stripes as cache to avoid read them from disk again. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - raid56_parity_cache_data_folios(rbio, stripe->folios, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); - } - raid56_parity_submit_scrub_rbio(rbio); - wait_for_completion_io(&io_done); - ret = blk_status_to_errno(bio->bi_status); - bio_put(bio); - btrfs_bio_counter_dec(fs_info); - + ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, + &extent_bitmap); +out: btrfs_release_path(&extent_path); btrfs_release_path(&csum_path); -out: return ret; } @@ -2261,18 +2335,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; - /* Canceled? */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; + ret = should_cancel_scrub(sctx); + if (ret < 0) break; - } - /* Paused? */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* Push queued extents */ + + if (atomic_read(&fs_info->scrub_pause_req)) scrub_blocked_if_needed(fs_info); - } - /* Block group removed? */ + spin_lock(&bg->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); @@ -2527,8 +2596,6 @@ out: } if (sctx->is_dev_replace && ret >= 0) { - int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, chunk_logical + offset, map->stripes[stripe_index].physical, @@ -2621,8 +2688,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, return -ENOMEM; path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = scrub_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; @@ -3037,6 +3104,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, unsigned int nofs_flag; bool need_commit = false; + /* Set the basic fallback @last_physical before we got a sctx. */ + if (progress) + progress->last_physical = start; + if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3055,6 +3126,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, sctx = scrub_setup_ctx(fs_info, is_dev_replace); if (IS_ERR(sctx)) return PTR_ERR(sctx); + sctx->stat.last_physical = start; ret = scrub_workers_get(fs_info); if (ret) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9230e5066fc6..2522faa97478 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -47,28 +47,30 @@ * It allows fast adding of path elements on the right side (normal path) and * fast adding to the left side (reversed path). A reversed path can also be * unreversed if needed. + * + * The definition of struct fs_path relies on -fms-extensions to allow + * including a tagged struct as an anonymous member. 
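
For readers who have not met -fms-extensions: it lets a previously defined tagged struct be embedded as an anonymous member, so its fields are accessed directly on the outer type, which is what the new fs_path definition below does with struct __fs_path. A standalone demo with toy types; build with gcc -fms-extensions (clang accepts the same flag).

/* Build with: gcc -fms-extensions demo.c */
#include <stdio.h>
#include <string.h>

struct __point {
	int x;
	int y;
};

/* With -fms-extensions a tagged struct can be embedded anonymously. */
struct point3 {
	struct __point;   /* members x and y land directly in struct point3 */
	int z;
};

int main(void)
{
	struct point3 p;

	memset(&p, 0, sizeof(p));
	p.x = 1;          /* no intermediate member name needed */
	p.z = 3;
	printf("%d %d %d, size %zu\n", p.x, p.y, p.z, sizeof(p));
	return 0;
}

The payoff in send.c is that sizeof(struct __fs_path) can size the trailing inline_buf exactly, instead of the old union-with-pad trick.
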
*/ +struct __fs_path { + char *start; + char *end; + + char *buf; + unsigned short buf_len:15; + unsigned short reversed:1; +}; +static_assert(sizeof(struct __fs_path) < 256); struct fs_path { - union { - struct { - char *start; - char *end; - - char *buf; - unsigned short buf_len:15; - unsigned short reversed:1; - char inline_buf[]; - }; - /* - * Average path length does not exceed 200 bytes, we'll have - * better packing in the slab and higher chance to satisfy - * an allocation later during send. - */ - char pad[256]; - }; + struct __fs_path; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * an allocation later during send. + */ + char inline_buf[256 - sizeof(struct __fs_path)]; }; #define FS_PATH_INLINE_SIZE \ - (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + sizeof_field(struct fs_path, inline_buf) /* reused for each extent */ @@ -178,7 +180,6 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; - struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -305,6 +306,8 @@ struct send_ctx { struct btrfs_lru_cache dir_created_cache; struct btrfs_lru_cache dir_utimes_cache; + + struct fs_path cur_inode_path; }; struct pending_dir_move { @@ -631,9 +634,9 @@ static struct btrfs_path *alloc_path_for_send(void) path = btrfs_alloc_path(); if (!path) return NULL; - path->search_commit_root = 1; - path->skip_locking = 1; - path->need_commit_sem = 1; + path->search_commit_root = true; + path->skip_locking = true; + path->need_commit_sem = true; return path; } @@ -1051,10 +1054,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } if (unlikely(start < p->buf)) { btrfs_err(root->fs_info, - "send: path ref buffer underflow for key (%llu %u %llu)", - found_key->objectid, - found_key->type, - found_key->offset); + "send: path ref buffer underflow for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(found_key)); ret = -EINVAL; goto out; } @@ -1134,12 +1135,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { - if (name_len > XATTR_NAME_MAX) { + if (unlikely(name_len > XATTR_NAME_MAX)) { ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > - BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + if (unlikely(name_len + data_len > + BTRFS_MAX_XATTR_SIZE(root->fs_info))) { ret = -E2BIG; goto out; } @@ -1147,7 +1148,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len > PATH_MAX) { + if (unlikely(name_len + data_len > PATH_MAX)) { ret = -ENAMETOOLONG; goto out; } @@ -2458,7 +2459,7 @@ static int send_subvol_begin(struct send_ctx *sctx) struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; - char *name = NULL; + char AUTO_KFREE(name); int namelen; path = btrfs_alloc_path(); @@ -2476,18 +2477,15 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, &key, path, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret) + return -ENOENT; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || key.objectid != btrfs_root_id(send_root)) { - ret = -ENOENT; - goto out; + return -ENOENT; } ref = btrfs_item_ptr(leaf, 
path->slots[0], struct btrfs_root_ref); namelen = btrfs_root_ref_name_len(leaf, ref); @@ -2497,11 +2495,11 @@ static int send_subvol_begin(struct send_ctx *sctx) if (parent_root) { ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) - goto out; + return ret; } else { ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); if (ret < 0) - goto out; + return ret; } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); @@ -2529,8 +2527,6 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = send_cmd(sctx); tlv_put_failure: -out: - kfree(name); return ret; } @@ -4077,7 +4073,7 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) */ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) { - char *name; + char AUTO_KFREE(name); int ret; name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); @@ -4087,17 +4083,58 @@ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) fs_path_reset(ref->full_path); ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); if (ret < 0) - goto out; + return ret; ret = fs_path_add(ref->full_path, name, ref->name_len); if (ret < 0) - goto out; + return ret; /* Update the reference's base name pointer. */ set_ref_path(ref, ref->full_path); -out: - kfree(name); - return ret; + + return 0; +} + +static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + return 0; +} + +static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_check_dir_ref_comp(entry, parent) < 0; +} + +static int record_check_dir_ref_in_tree(struct rb_root *root, + struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *tmp_ref; + int ret; + + if (rb_find(ref, root, rbtree_check_dir_ref_comp)) + return 0; + + ret = dup_ref(ref, list); + if (ret < 0) + return ret; + + tmp_ref = list_last_entry(list, struct recorded_ref, list); + rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less); + tmp_ref->root = root; + return 0; } static int rename_current_inode(struct send_ctx *sctx, @@ -4127,11 +4164,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct recorded_ref *cur; struct recorded_ref *cur2; LIST_HEAD(check_dirs); + struct rb_root rbtree_check_dirs = RB_ROOT; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - u64 last_dir_ino_rm = 0; bool did_overwrite = false; bool is_orphan = false; bool can_rename = true; @@ -4435,7 +4472,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } } - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4463,7 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } list_for_each_entry(cur, &sctx->deleted_refs, list) { - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4473,7 +4510,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * We have a moved dir. 
Add the old parent to check_dirs */ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { @@ -4507,7 +4544,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (is_current_inode_path(sctx, cur->full_path)) fs_path_reset(&sctx->cur_inode_path); } - ret = dup_ref(cur, &check_dirs); + ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs); if (ret < 0) goto out; } @@ -4550,8 +4587,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete && - cur->dir != last_dir_ino_rm) { + } else if (ret == inode_state_did_delete) { ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -4563,7 +4599,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; - last_dir_ino_rm = cur->dir; } } } @@ -4909,6 +4944,7 @@ struct find_xattr_ctx { int found_idx; char *found_data; int found_data_len; + bool copy_data; }; static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, @@ -4920,9 +4956,11 @@ static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); - if (!ctx->found_data) - return -ENOMEM; + if (ctx->copy_data) { + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); + if (!ctx->found_data) + return -ENOMEM; + } return 1; } return 0; @@ -4942,6 +4980,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_idx = -1; ctx.found_data = NULL; ctx.found_data_len = 0; + ctx.copy_data = (data != NULL); ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) @@ -4953,7 +4992,7 @@ static int find_xattr(struct btrfs_root *root, *data = ctx.found_data; *data_len = ctx.found_data_len; } else { - kfree(ctx.found_data); + ASSERT(ctx.found_data == NULL); } return ctx.found_idx; } @@ -4966,8 +5005,8 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, { int ret; struct send_ctx *sctx = ctx; - char *found_data = NULL; - int found_data_len = 0; + char AUTO_KFREE(found_data); + int found_data_len = 0; ret = find_xattr(sctx->parent_root, sctx->right_path, sctx->cmp_key, name, name_len, &found_data, @@ -4985,7 +5024,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, } } - kfree(found_data); return ret; } @@ -5096,7 +5134,7 @@ static int process_verity(struct send_ctx *sctx) if (ret < 0) goto iput; - if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) { ret = -EMSGSIZE; goto iput; } @@ -5140,14 +5178,14 @@ static int put_data_header(struct send_ctx *sctx, u32 len) * Since v2, the data attribute header doesn't include a length, * it is implicitly to the end of the command. 
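 *
 * A rough sketch of the two layouts handled below (in v2 the length is
 * implied by the end of the command):
 *
 *	v1: [__le16 tlv_type = BTRFS_SEND_A_DATA][__le16 tlv_len][data ...]
 *	v2: [__le16 tlv_type = BTRFS_SEND_A_DATA][data ... to command end]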
*/ - if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)) return -EOVERFLOW; put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); sctx->send_size += sizeof(__le16); } else { struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); @@ -5547,8 +5585,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * between the beginning of the command and the file data. */ data_offset = PAGE_ALIGN(sctx->send_size); - if (data_offset > sctx->send_max_size || - sctx->send_max_size - data_offset < disk_num_bytes) { + if (unlikely(data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes)) { ret = -EOVERFLOW; goto out; } @@ -5601,14 +5639,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* - * Do not go through encoded read for bs > ps cases. - * - * Encoded send is using vmallocated pages as buffer, which we can - * not ensure every folio is large enough to contain a block. - */ - if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && - (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); @@ -5722,7 +5753,7 @@ static int send_capabilities(struct send_ctx *sctx) struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; - char *buf = NULL; + char AUTO_KFREE(buf); int buf_len; int ret = 0; @@ -5734,28 +5765,23 @@ static int send_capabilities(struct send_ctx *sctx) XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); if (!di) { /* There is no xattr for this inode */ - goto out; + return 0; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); -out: - kfree(buf); return ret; } @@ -7232,8 +7258,8 @@ static int search_key_again(const struct send_ctx *sctx, if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, -"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", - key->objectid, key->type, key->offset, +"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d", + BTRFS_KEY_FMT_VALUE(key), (root == sctx->parent_root ? 
"parent" : "send"), btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); @@ -7601,10 +7627,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; + left_path->search_commit_root = true; + left_path->skip_locking = true; + right_path->search_commit_root = true; + right_path->skip_locking = true; /* * Strategy: Go to the first items of both trees. Then do diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 97452fb5d29b..6babbe333741 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -15,6 +15,7 @@ #include "accessors.h" #include "extent-tree.h" #include "zoned.h" +#include "delayed-inode.h" /* * HOW DOES SPACE RESERVATION WORK @@ -67,7 +68,7 @@ * Assume we are unable to simply make the reservation because we do not have * enough space * - * -> __reserve_bytes + * -> reserve_bytes * create a reserve_ticket with ->bytes set to our reservation, add it to * the tail of space_info->tickets, kick async flush thread * @@ -172,15 +173,14 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - s_info->bytes_zone_unusable + - (may_use_included ? s_info->bytes_may_use : 0); -} +struct reserve_ticket { + u64 bytes; + int error; + bool steal; + struct list_head list; + wait_queue_head_t wait; + spinlock_t lock; +}; /* * after adding space to the filesystem, we need to clear the full flags @@ -192,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct btrfs_space_info *found; list_for_each_entry(found, head, list) - found->full = 0; + found->full = false; } /* @@ -211,7 +211,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (btrfs_is_zoned(fs_info)) return fs_info->zone_size; - ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags); if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; @@ -262,8 +262,9 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag struct btrfs_space_info *sub_group; int ret; - ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); - ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY, + "parent->subgroup_id=%d", parent->subgroup_id); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id); sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); if (!sub_group) @@ -274,7 +275,7 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag sub_group->parent = parent; sub_group->subgroup_id = id; - ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + ret = btrfs_sysfs_add_space_info_type(sub_group); if (ret) { kfree(sub_group); parent->sub_group[index] = NULL; @@ -308,7 +309,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) return ret; } - ret = btrfs_sysfs_add_space_info_type(info, space_info); + ret = btrfs_sysfs_add_space_info_type(space_info); if (ret) return ret; @@ -372,8 +373,8 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, space_info->bytes_readonly += block_group->bytes_super; btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 
0) - space_info->full = 0; - btrfs_try_granting_tickets(info, space_info); + space_info->full = false; + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); block_group->space_info = space_info; @@ -421,10 +422,10 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) return min_t(u64, data_chunk_size, SZ_1G); } -static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(const struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 profile; u64 avail; u64 data_chunk_size; @@ -490,44 +491,77 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +static inline bool check_can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + const u64 avail = calc_available_free_space(space_info, flush); + + return (space_info_used_bytes + bytes < space_info->total_bytes + avail); +} + +static inline bool can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush); +} + +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { - u64 avail; u64 used; /* Don't overcommit when in mixed mode */ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; + return false; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; + return check_can_overcommit(space_info, used, bytes, flush); } static void remove_ticket(struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) + struct reserve_ticket *ticket, int error) { + lockdep_assert_held(&space_info->lock); + if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); - ASSERT(space_info->reclaim_size >= ticket->bytes); + ASSERT(space_info->reclaim_size >= ticket->bytes, + "space_info->reclaim_size=%llu ticket->bytes=%llu", + space_info->reclaim_size, ticket->bytes); space_info->reclaim_size -= ticket->bytes; } + + spin_lock(&ticket->lock); + /* + * If we are called from a task waiting on the ticket, it may happen + * that before it sets an error on the ticket, a reclaim task was able + * to satisfy the ticket. In that case ignore the error. + */ + if (error && ticket->bytes > 0) + ticket->error = error; + else + ticket->bytes = 0; + + wake_up(&ticket->wait); + spin_unlock(&ticket->lock); } /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. 
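 *
 * Typical caller pattern, as in btrfs_space_info_free_bytes_may_use()
 * further below (a sketch):
 *
 *	spin_lock(&space_info->lock);
 *	btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
 *	btrfs_try_granting_tickets(space_info);
 *	spin_unlock(&space_info->lock);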
*/ -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info) { struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + u64 used = btrfs_space_info_used(space_info, true); lockdep_assert_held(&space_info->lock); @@ -535,19 +569,18 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, again: while (!list_empty(head)) { struct reserve_ticket *ticket; - u64 used = btrfs_space_info_used(space_info, true); + u64 used_after; ticket = list_first_entry(head, struct reserve_ticket, list); + used_after = used + ticket->bytes; /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, ticket->bytes, - flush)) { + if (used_after <= space_info->total_bytes || + can_overcommit(space_info, used, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); - remove_ticket(space_info, ticket); - ticket->bytes = 0; + remove_ticket(space_info, ticket, 0); space_info->tickets_id++; - wake_up(&ticket->wait); + used = used_after; } else { break; } @@ -594,9 +627,9 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_space_info *info) { + const struct btrfs_fs_info *fs_info = info->fs_info; const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -613,16 +646,16 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, info->bytes_readonly, info->bytes_zone_unusable); } -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups) { + struct btrfs_fs_info *fs_info = info->fs_info; struct btrfs_block_group *cache; u64 total_avail = 0; int index = 0; spin_lock(&info->lock); - __btrfs_dump_space_info(fs_info, info); + __btrfs_dump_space_info(info); dump_global_block_rsv(fs_info); spin_unlock(&info->lock); @@ -670,11 +703,11 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void shrink_delalloc(struct btrfs_space_info *space_info, u64 to_reclaim, bool wait_ordered, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 ordered_bytes; @@ -801,10 +834,10 @@ skip_async: * and may fail for various reasons. The caller is supposed to examine the * state of @space_info to detect the outcome. 
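 *
 * Callers typically loop and re-check the ticket list, roughly
 * (a sketch of the async reclaim loop below):
 *
 *	flush_space(space_info, to_reclaim, flush_state, false);
 *	spin_lock(&space_info->lock);
 *	if (list_empty(&space_info->tickets))
 *		space_info->flush = false;
 *	spin_unlock(&space_info->lock);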
*/ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - enum btrfs_flush_state state, bool for_preempt) +static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes, + enum btrfs_flush_state state, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; @@ -833,7 +866,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC_FULL: if (state == FLUSH_DELALLOC_FULL) num_bytes = U64_MAX; - shrink_delalloc(fs_info, space_info, num_bytes, + shrink_delalloc(space_info, num_bytes, state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: @@ -900,8 +933,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -909,8 +941,7 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, lockdep_assert_held(&space_info->lock); - avail = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + avail = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = btrfs_space_info_used(space_info, true); /* @@ -925,18 +956,25 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; } -static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; u64 thresh; u64 used; - thresh = mult_perc(space_info->total_bytes, 90); - lockdep_assert_held(&space_info->lock); + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + thresh = mult_perc(space_info->total_bytes, 90); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -957,13 +995,6 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, return false; /* - * We have tickets queued, bail so we don't compete with the async - * flushers. - */ - if (space_info->reclaim_size) - return false; - - /* * If we have over half of the free space occupied by reservations or * pinned then we want to start flushing. * @@ -992,8 +1023,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * much delalloc we need for the background flusher to kick in. 
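 *
 * The threshold computed below is then scaled down by the clamp factor,
 * the 2^clamp divisor described in struct btrfs_space_info (a sketch):
 *
 *	thresh >>= space_info->clamp;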
*/ - thresh = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; if (used < space_info->total_bytes) @@ -1037,13 +1067,15 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static bool steal_from_global_rsv(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + lockdep_assert_held(&space_info->lock); + if (!ticket->steal) return false; @@ -1057,21 +1089,19 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; } global_rsv->reserved -= ticket->bytes; - remove_ticket(space_info, ticket); - ticket->bytes = 0; - wake_up(&ticket->wait); - space_info->tickets_id++; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); + remove_ticket(space_info, ticket, 0); + space_info->tickets_id++; + return true; } /* * We've exhausted our flushing, start failing tickets. * - * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * * We call this when we've exhausted our flushing ability and haven't made @@ -1084,47 +1114,44 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, * other tickets, or if it stumbles across a ticket that was smaller than the * first ticket. */ -static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - const bool aborted = BTRFS_FS_ERROR(fs_info); + const int abort_error = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); } while (!list_empty(&space_info->tickets) && tickets_id == space_info->tickets_id) { ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); + if (unlikely(abort_error)) { + remove_ticket(space_info, ticket, abort_error); + } else { + if (steal_from_global_rsv(space_info, ticket)) + return true; - if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) - return true; - - if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_info(fs_info, "failing ticket with %llu bytes", - ticket->bytes); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); - remove_ticket(space_info, ticket); - if (aborted) - ticket->error = -EIO; - else - ticket->error = -ENOSPC; - wake_up(&ticket->wait); + remove_ticket(space_info, ticket, -ENOSPC); - /* - * We're just throwing tickets away, so more flushing may not - * trip over btrfs_try_granting_tickets, so we need to call it - * here to see if we can make progress with the next ticket in - * the list. 
- */ - if (!aborted) - btrfs_try_granting_tickets(fs_info, space_info); + /* + * We're just throwing tickets away, so more flushing may + * not trip over btrfs_try_granting_tickets, so we need + * to call it here to see if we can make progress with + * the next ticket in the list. + */ + btrfs_try_granting_tickets(space_info); + } } return (tickets_id != space_info->tickets_id); } @@ -1144,9 +1171,9 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) final_state = COMMIT_TRANS; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (!to_reclaim) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1155,15 +1182,14 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state, false); + flush_space(space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -1197,11 +1223,11 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { - if (maybe_fail_all_tickets(fs_info, space_info)) { + if (maybe_fail_all_tickets(space_info)) { flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { - space_info->flush = 0; + space_info->flush = false; } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -1257,14 +1283,15 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); - while (need_preemptive_reclaim(fs_info, space_info)) { + while (need_preemptive_reclaim(space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); + const u64 bytes_may_use = space_info->bytes_may_use; + const u64 bytes_pinned = space_info->bytes_pinned; - loops++; - + spin_unlock(&space_info->lock); /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1276,8 +1303,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv) + btrfs_block_rsv_reserved(trans_rsv); - if (block_rsv_size < space_info->bytes_may_use) - delalloc_size = space_info->bytes_may_use - block_rsv_size; + if (block_rsv_size < bytes_may_use) + delalloc_size = bytes_may_use - block_rsv_size; /* * We don't want to include the global_rsv in our calculation, @@ -1294,10 +1321,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) if (delalloc_size > block_rsv_size) { to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; - } else if (space_info->bytes_pinned > + } else if (bytes_pinned > (btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv))) { - to_reclaim = space_info->bytes_pinned; + to_reclaim = bytes_pinned; flush = COMMIT_TRANS; } else if 
(btrfs_block_rsv_reserved(delayed_block_rsv) > btrfs_block_rsv_reserved(delayed_refs_rsv)) { @@ -1308,7 +1335,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } - spin_unlock(&space_info->lock); + loops++; /* * We don't want to reclaim everything, just a portion, so scale @@ -1318,7 +1345,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); - flush_space(fs_info, space_info, to_reclaim, flush, true); + flush_space(space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } @@ -1383,7 +1410,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1391,27 +1418,27 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } /* Something happened, fail everything and bail. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } while (flush_state < ARRAY_SIZE(data_flush_states)) { - flush_space(fs_info, space_info, U64_MAX, + flush_space(space_info, U64_MAX, data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1425,16 +1452,16 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) if (flush_state >= ARRAY_SIZE(data_flush_states)) { if (space_info->full) { - if (maybe_fail_all_tickets(fs_info, space_info)) + if (maybe_fail_all_tickets(space_info)) flush_state = 0; else - space_info->flush = 0; + space_info->flush = false; } else { flush_state = 0; } /* Something happened, fail everything and bail. 
*/ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; } @@ -1443,8 +1470,8 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) return; aborted_fs: - maybe_fail_all_tickets(fs_info, space_info); - space_info->flush = 0; + maybe_fail_all_tickets(space_info); + space_info->flush = false; spin_unlock(&space_info->lock); } @@ -1489,40 +1516,47 @@ static const enum btrfs_flush_state evict_flush_states[] = { RESET_ZONES, }; -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, - const enum btrfs_flush_state *states, - int states_nr) +static bool is_ticket_served(struct reserve_ticket *ticket) { + bool ret; + + spin_lock(&ticket->lock); + ret = (ticket->bytes == 0); + spin_unlock(&ticket->lock); + + return ret; +} + +static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; int flush_state = 0; - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); + spin_unlock(&space_info->lock); while (flush_state < states_nr) { - spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, to_reclaim, states[flush_state], - false); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + flush_space(space_info, to_reclaim, states[flush_state], false); + if (is_ticket_served(ticket)) return; - } + flush_state++; } + spin_lock(&space_info->lock); /* * Attempt to steal from the global rsv if we can, except if the fs was * turned into error mode due to a transaction abort when flushing space @@ -1531,48 +1565,38 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ - if (BTRFS_FS_ERROR(fs_info)) { - ticket->error = BTRFS_FS_ERROR(fs_info); - remove_ticket(space_info, ticket); - } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); - } + if (unlikely(BTRFS_FS_ERROR(fs_info))) + remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info)); + else if (!steal_from_global_rsv(space_info, ticket)) + remove_ticket(space_info, ticket, -ENOSPC); /* * We must run try_granting_tickets here because we could be a large * ticket in front of a smaller ticket that can now be satisfied with * the available space. 
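 *
 * For example (numbers purely illustrative): with 1M available, a queued
 * 4M ticket blocks a later 512K ticket; once the 4M ticket is failed,
 * granting must be retried so the 512K ticket can still be satisfied.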
*/ - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } -static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void priority_reclaim_data_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { - spin_lock(&space_info->lock); - /* We could have been granted before we got here. */ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); while (!space_info->full) { spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); } - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); + remove_ticket(space_info, ticket, -ENOSPC); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -1581,11 +1605,13 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, { DEFINE_WAIT(wait); - int ret = 0; - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); while (ticket->bytes > 0 && ticket->error == 0) { + int ret; + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + spin_unlock(&ticket->lock); if (ret) { /* * Delete us from the list. After we unlock the space @@ -1595,24 +1621,23 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * despite getting an error, resulting in a space leak * (bytes_may_use counter of our space_info). */ - remove_ticket(space_info, ticket); - ticket->error = -EINTR; - break; + spin_lock(&space_info->lock); + remove_ticket(space_info, ticket, -EINTR); + spin_unlock(&space_info->lock); + return; } - spin_unlock(&space_info->lock); schedule(); finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); } - spin_unlock(&space_info->lock); + spin_unlock(&ticket->lock); } /* * Do the appropriate flushing and waiting for a ticket. * - * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation * @start_ns: timestamp when the reservation started @@ -1622,8 +1647,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. 
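 *
 * A sketch of the dispatch below:
 *
 *	FLUSH_DATA / FLUSH_ALL / FLUSH_ALL_STEAL -> wait_reserve_ticket()
 *	FLUSH_LIMIT / FLUSH_EVICT                -> priority_reclaim_metadata_space()
 *	FLUSH_FREE_SPACE_INODE                   -> priority_reclaim_data_space()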
*/ -static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static int handle_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket, u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) @@ -1637,20 +1661,20 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, priority_flush_states, ARRAY_SIZE(priority_flush_states)); break; case BTRFS_RESERVE_FLUSH_EVICT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: - priority_reclaim_data_space(fs_info, space_info, ticket); + priority_reclaim_data_space(space_info, ticket); break; default: - ASSERT(0); + ASSERT(0, "flush=%d", flush); break; } @@ -1662,9 +1686,10 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * releasing reserved space (if an error happens the expectation is that * space wasn't reserved at all). */ - ASSERT(!(ticket->bytes == 0 && ticket->error)); - trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, - start_ns, flush, ticket->error); + ASSERT(!(ticket->bytes == 0 && ticket->error), + "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error); + trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags, + orig_bytes, start_ns, flush, ticket->error); return ret; } @@ -1678,9 +1703,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } -static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); @@ -1715,7 +1740,6 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) /* * Try to reserve bytes from the block_rsv's space. * - * @fs_info: the filesystem * @space_info: space info we want to allocate from * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1727,10 +1751,10 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. 
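 *
 * The fast path tried first under space_info->lock (a sketch of the
 * check in the function body):
 *
 *	if (!pending_tickets &&
 *	    (used + orig_bytes <= space_info->total_bytes ||
 *	     can_overcommit(space_info, used, orig_bytes, flush)))
 *		btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);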
*/ -static int __reserve_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct work_struct *async_work; struct reserve_ticket ticket; u64 start_ns = 0; @@ -1738,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, int ret = -ENOSPC; bool pending_tickets; - ASSERT(orig_bytes); + ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes); /* * If have a transaction handle (current->journal_info != NULL), then * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor @@ -1747,9 +1771,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (current->journal_info) { /* One assert per line for easier debugging. */ - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush); } if (flush == BTRFS_RESERVE_FLUSH_DATA) @@ -1777,7 +1801,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + can_overcommit(space_info, used, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1788,7 +1812,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * left to allocate for the block. */ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { - used = btrfs_space_info_used(space_info, false); + used -= space_info->bytes_may_use; if (used + orig_bytes <= space_info->total_bytes) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; @@ -1807,6 +1831,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); + spin_lock_init(&ticket.lock); ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1823,9 +1848,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * preemptive flushing in order to keep up with * the workload. */ - maybe_clamp_preempt(fs_info, space_info); + maybe_clamp_preempt(space_info); - space_info->flush = 1; + space_info->flush = true; trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, @@ -1844,7 +1869,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && !work_busy(&fs_info->preempt_reclaim_work) && - need_preemptive_reclaim(fs_info, space_info)) { + need_preemptive_reclaim(space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_dfl_wq, @@ -1855,14 +1880,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!ret || !can_ticket(flush)) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, - orig_bytes, flush); + return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush); } /* * Try to reserve metadata bytes from the block_rsv's space. 
* - * @fs_info: the filesystem * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1874,20 +1897,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); + ret = reserve_bytes(space_info, orig_bytes, flush); if (ret == -ENOSPC) { + struct btrfs_fs_info *fs_info = space_info->fs_info; + trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); + btrfs_dump_space_info(space_info, orig_bytes, false); } return ret; } @@ -1895,7 +1919,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, /* * Try to reserve data bytes for an allocation. * - * @fs_info: the filesystem + * @space_info: the space_info we're allocating for * @bytes: number of bytes we need * @flush: how we are allowed to flush * @@ -1910,15 +1934,17 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || - flush == BTRFS_RESERVE_NO_FLUSH); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA, + "current->journal_info=0x%lx flush=%d", + (unsigned long)current->journal_info, flush); - ret = __reserve_bytes(fs_info, space_info, bytes, flush); + ret = reserve_bytes(space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, bytes, false); + btrfs_dump_space_info(space_info, bytes, false); } return ret; } @@ -1931,7 +1957,7 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) btrfs_info(fs_info, "dumping space info:"); list_for_each_entry(space_info, &fs_info->space_info, list) { spin_lock(&space_info->lock); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); spin_unlock(&space_info->lock); } dump_global_block_rsv(fs_info); @@ -1948,7 +1974,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) int factor; /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) + if (data_race(list_empty(&sinfo->ro_bgs))) return 0; spin_lock(&sinfo->lock); @@ -2187,7 +2213,7 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; + global_rsv->full = true; len -= to_add; } spin_unlock(&global_rsv->lock); @@ -2195,5 +2221,5 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) grant: /* Add to any tickets we may have. 
*/ if (len) - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 679f22efb407..446c0614ad4a 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -142,11 +142,11 @@ struct btrfs_space_info { flushing. The value is >> clamp, so turns out to be a 2^clamp divisor. */ - unsigned int full:1; /* indicates that we cannot allocate any more + bool full; /* indicates that we cannot allocate any more chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + bool chunk_alloc; /* set if we are allocating a chunk */ - unsigned int flush:1; /* set if we are trying to make space */ + bool flush; /* set if we are trying to make space */ unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -224,14 +224,6 @@ struct btrfs_space_info { s64 reclaimable_bytes; }; -struct reserve_ticket { - u64 bytes; - int error; - bool steal; - struct list_head list; - wait_queue_head_t wait; -}; - static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && @@ -266,6 +258,17 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); +static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info, + bool may_use_included) +{ + lockdep_assert_held(&s_info->lock); + + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + + (may_use_included ? s_info->bytes_may_use : 0); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group); @@ -273,21 +276,15 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_space_info *space_info, @@ -295,7 +292,7 @@ static inline void btrfs_space_info_free_bytes_may_use( { spin_lock(&space_info->lock); btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); - btrfs_try_granting_tickets(space_info->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } int 
btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 5ca8d4db6722..f82e71f5d88b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -180,13 +180,14 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); + IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. */ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + ASSERT(folio_pos(folio) <= start && + start + len <= folio_next_pos(folio), "start=%llu len=%u folio_pos=%llu folio_size=%zu", start, len, folio_pos(folio), folio_size(folio)); } @@ -194,12 +195,11 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int __start_bit; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ - __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ + __start_bit += __bpf * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -217,7 +217,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; + *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, @@ -250,7 +250,9 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); return last; @@ -329,7 +331,9 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); if (last) @@ -338,24 +342,20 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_set(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + bitmap_test_range_all_set(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) 
\ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_zero(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + bitmap_test_range_all_zero(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, @@ -445,6 +445,7 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; + bool keep_write; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); @@ -455,18 +456,9 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, * assume writeback is complete, and exit too early — violating sync * ordering guarantees. */ + keep_write = folio_test_dirty(folio); if (!folio_test_writeback(folio)) - __folio_start_writeback(folio, true); - if (!folio_test_dirty(folio)) { - struct address_space *mapping = folio_mapping(folio); - XA_STATE(xas, &mapping->i_pages, folio->index); - unsigned long flags; - - xas_lock_irqsave(&xas, flags); - xas_load(&xas); - xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); - xas_unlock_irqrestore(&xas, flags); - } + __folio_start_writeback(folio, keep_write); spin_unlock_irqrestore(&bfs->lock, flags); } @@ -672,27 +664,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, #define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ - const struct btrfs_folio_state *bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_folio_state *__bfs = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio <= BITS_PER_LONG); \ - *dst = bitmap_read(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + ASSERT(__bpf <= BITS_PER_LONG); \ + *dst = bitmap_read(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ unsigned long bitmap; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ - start, len, folio_pos(folio), \ - blocks_per_folio, &bitmap); \ + start, len, folio_pos(folio), __bpf, &bitmap); \ } /* diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ad0552db7c7d..d81a0ade559f 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -7,7 +7,6 @@ #include <linux/atomic.h> #include <linux/sizes.h> #include "btrfs_inode.h" -#include "fs.h" struct address_space; struct folio; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d6e496436539..1999533b52be 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -807,17 +807,15 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); char *name = NULL, *ptr; u64 dirid; int 
len; int ret; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto err; - } + if (!path) + return ERR_PTR(-ENOMEM); name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { @@ -905,7 +903,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, fs_root = NULL; } - btrfs_free_path(path); if (ptr == name + PATH_MAX - 1) { name[0] = '/'; name[1] = '\0'; @@ -916,7 +913,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, err: btrfs_put_root(fs_root); - btrfs_free_path(path); kfree(name); return ERR_PTR(ret); } @@ -1614,7 +1610,7 @@ static inline void btrfs_descending_sort_devices( static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_device_info *devices_info; + struct btrfs_device_info AUTO_KFREE(devices_info); struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 type; @@ -1712,7 +1708,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, nr_devices--; } - kfree(devices_info); *free_bytes = avail_space; return 0; } @@ -1900,8 +1895,6 @@ static int btrfs_get_tree_super(struct fs_context *fc) return PTR_ERR(sb); } - set_device_specific_options(fs_info); - if (sb->s_root) { /* * Not the first mount of the fs thus got an existing super block. @@ -1946,6 +1939,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) deactivate_locked_super(sb); return -EACCES; } + set_device_specific_options(fs_info); bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); @@ -2069,7 +2063,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); if (!fs_info->super_copy || !fs_info->super_for_commit) { - btrfs_free_fs_info(fs_info); + /* + * Don't call btrfs_free_fs_info() to free it, as it's still + * only partially initialized. + */ + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); return -ENOMEM; } btrfs_init_fs_info(fs_info); @@ -2425,6 +2425,66 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev }; + bool can_rw; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + device = btrfs_find_device(fs_info->fs_devices, &lookup_args); + if (!device) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* Device not found, should not affect the running fs, just give a warning. */ + btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev); + return 0; + } + /* + * The to-be-removed device is already missing? + * + * That's weird, but it needs no special handling and we can exit right now. 
+ */ + if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid); + return 0; + } + + device->fs_devices->missing_devices++; + if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + list_del_init(&device->dev_alloc_list); + WARN_ON(device->fs_devices->rw_devices < 1); + device->fs_devices->rw_devices--; + } + can_rw = btrfs_check_rw_degradable(fs_info, device); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * Now the device is considered missing, btrfs_device_name() won't give a + * meaningful result anymore, so only output the devid. + */ + if (unlikely(!can_rw)) { + btrfs_crit(fs_info, + "btrfs device id %llu has gone missing, cannot maintain read-write", + device->devid); + return -EIO; + } + btrfs_warn(fs_info, + "btrfs device id %llu has gone missing, continue as degraded", + device->devid); + btrfs_set_opt(fs_info->mount_opt, DEGRADED); + return 0; +} + +static void btrfs_shutdown(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_force_shutdown(fs_info); +} +#endif + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2440,6 +2500,10 @@ static const struct super_operations btrfs_super_ops = { .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + .remove_bdev = btrfs_remove_bdev, + .shutdown = btrfs_shutdown, +#endif }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81f52c1f55ce..1f64c132b387 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,6 +10,7 @@ #include <linux/completion.h> #include <linux/bug.h> #include <linux/list.h> +#include <linux/string_choices.h> #include <crypto/hash.h> #include "messages.h" #include "ctree.h" @@ -25,6 +26,7 @@ #include "misc.h" #include "fs.h" #include "accessors.h" +#include "zoned.h" /* * Structure name Path @@ -1187,6 +1189,56 @@ static ssize_t btrfs_commit_stats_store(struct kobject *kobj, } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); +static ssize_t btrfs_zoned_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_block_group *bg; + size_t ret = 0; + + + if (!btrfs_is_zoned(fs_info)) + return ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active block-groups: %zu\n", + list_count_nodes(&fs_info->zone_active_bgs)); + spin_unlock(&fs_info->zone_active_bgs_lock); + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + ret += sysfs_emit_at(buf, ret, "\treclaimable: %zu\n", + list_count_nodes(&fs_info->reclaim_bgs)); + ret += sysfs_emit_at(buf, ret, "\tunused: %zu\n", + list_count_nodes(&fs_info->unused_bgs)); + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + + ret += sysfs_emit_at(buf, ret, "\tneed reclaim: %s\n", + str_true_false(btrfs_zoned_should_reclaim(fs_info))); + + if (fs_info->data_reloc_bg) + ret += sysfs_emit_at(buf, ret, + "data relocation block-group: %llu\n", + fs_info->data_reloc_bg); + if (fs_info->treelog_bg) + ret += sysfs_emit_at(buf, ret, + "tree-log block-group: %llu\n", + fs_info->treelog_bg); + + 
spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active zones:\n"); + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { + ret += sysfs_emit_at(buf, ret, + "\tstart: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n", + bg->start, bg->alloc_offset, bg->used, + bg->reserved, bg->zone_unusable); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + return ret; +} +BTRFS_ATTR(, zoned_stats, btrfs_zoned_stats_show); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1599,6 +1651,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), + BTRFS_ATTR_PTR(, zoned_stats), #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif @@ -1981,13 +2034,12 @@ static const char *alloc_name(struct btrfs_space_info *space_info) * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - fs_info->space_info_kobj, "%s", + space_info->fs_info->space_info_kobj, "%s", alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 0f94ae923210..05498e5346c3 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -37,8 +37,7 @@ void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); void btrfs_sysfs_update_devid(struct btrfs_device *device); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index b19328d077d3..a0187d6163df 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -505,7 +505,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - unsigned long *bitmap = NULL; + unsigned long AUTO_KFREE(bitmap); struct extent_buffer *eb = NULL; int ret; @@ -551,7 +551,6 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); - kfree(bitmap); btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 42af6c737c6e..0b9f25dd1a68 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1013,7 +1013,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { struct btrfs_chunk_map *map; - u64 *logical = NULL; + u64 AUTO_KFREE(logical); int i, out_ndaddrs, out_stripe_len; int ret; @@ -1046,7 +1046,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, if (ret) { test_err("error adding chunk map to mapping tree"); btrfs_free_chunk_map(map); - goto out_free; + return ret; } ret = btrfs_rmap_block(fs_info, map->start, 
btrfs_sb_offset(1), @@ -1079,8 +1079,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: btrfs_remove_chunk_map(fs_info, map); -out_free: - kfree(logical); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 3fc8dc3fd980..05cfda8af422 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -20,7 +20,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, struct btrfs_extent_item *item; struct btrfs_extent_inline_ref *iref; struct btrfs_tree_block_info *block_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); @@ -41,7 +41,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { test_err("couldn't insert ref %d", ret); - btrfs_free_path(path); return ret; } @@ -61,7 +60,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_free_path(path); return 0; } @@ -70,7 +68,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -90,7 +88,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - btrfs_free_path(path); return ret; } @@ -112,7 +109,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); if (ret) test_err("failed to insert backref"); - btrfs_free_path(path); return ret; } @@ -121,7 +117,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; btrfs_init_dummy_trans(&trans, NULL); @@ -139,11 +135,9 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { test_err("didn't find our key %d", ret); - btrfs_free_path(path); return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return 0; } @@ -152,7 +146,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -172,7 +166,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - btrfs_free_path(path); return ret; } @@ -198,7 +191,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89ae0c7a610a..05ee4391c83a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,8 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "ordered-data.h" +#include "delayed-inode.h" static struct kmem_cache 
*btrfs_trans_handle_cachep; @@ -138,7 +140,6 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); @@ -185,7 +186,8 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ - ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING, + "cur_trans->state=%d", cur_trans->state); down_write(&fs_info->commit_root_sem); @@ -575,7 +577,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); /* * If we are an emergency flush, which can steal from the global block @@ -585,7 +587,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); } return ret; @@ -1024,13 +1026,18 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->bytes_reserved == 0, + "trans->bytes_reserved=%llu", trans->bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } if (!trans->bytes_reserved) { - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } @@ -1229,7 +1236,8 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int ret; - ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID, + "root_id(log_root)=%llu", btrfs_root_id(log_root)); ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY_LOG1) && @@ -1334,7 +1342,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, @@ -1468,7 +1477,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. 
*/ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1486,9 +1496,15 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point we can neither have tasks logging inodes * from a root nor trying to commit a log tree. */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); + ASSERT(atomic_read(&root->log_writers) == 0, + "atomic_read(&root->log_writers)=%d", + atomic_read(&root->log_writers)); + ASSERT(atomic_read(&root->log_commit[0]) == 0, + "atomic_read(&root->log_commit[0])=%d", + atomic_read(&root->log_commit[0])); + ASSERT(atomic_read(&root->log_commit[1]) == 0, + "atomic_read(&root->log_commit[1])=%d", + atomic_read(&root->log_commit[1])); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), @@ -2157,7 +2173,8 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP, + "cur_trans->state=%d", cur_trans->state); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2184,7 +2201,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *prev_trans = NULL; int ret; - ASSERT(refcount_read(&trans->use_count) == 1); + ASSERT(refcount_read(&trans->use_count) == 1, + "refcount_read(&trans->use_count)=%d", refcount_read(&trans->use_count)); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9f7c777af635..18ef069197e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,10 +14,6 @@ #include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "extent-io-tree.h" -#include "block-rsv.h" -#include "messages.h" -#include "misc.h" struct dentry; struct inode; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ca30b15ea452..c21c21adf61e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -186,7 +186,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || - key->type == BTRFS_EXTENT_DATA_KEY); + key->type == BTRFS_EXTENT_DATA_KEY, "key->type=%u", key->type); /* * Only subvolume trees along with their reloc trees need this check. 
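The transaction and tree-checker hunks above, and the message-formatting hunks that follow, lean on two helpers whose definitions are not part of this section: BTRFS_KEY_FMT and BTRFS_KEY_FMT_VALUE(). Judging purely from the call sites, the pair is a string-literal format plus its matching argument list; a minimal sketch, assuming exactly that shape:

	/*
	 * Hypothetical reconstruction -- the real definitions live elsewhere
	 * in the series.  Because BTRFS_KEY_FMT expands to a string literal,
	 * C's adjacent-literal concatenation lets callers splice it into
	 * larger format strings, as in:
	 *   "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT
	 */
	#define BTRFS_KEY_FMT			"(%llu %u %llu)"
	#define BTRFS_KEY_FMT_VALUE(key)	\
		(key)->objectid, (key)->type, (key)->offset

Keeping the format and the argument order in one place is what makes the mechanical "(%llu %u %llu)" replacements in these hunks safe: every caller now prints a btrfs_key the same way and cannot get the objectid/type/offset triplet out of order.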
@@ -1618,10 +1618,9 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(prev_end > key->objectid)) { extent_err(leaf, slot, - "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", - prev_key->objectid, prev_key->type, - prev_key->offset, key->objectid, key->type, - key->offset); + "previous extent " BTRFS_KEY_FMT " overlaps current extent " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(prev_key), + BTRFS_KEY_FMT_VALUE(key)); return -EUCLEAN; } } @@ -1797,7 +1796,7 @@ static int check_inode_extref(struct extent_buffer *leaf, struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; u16 namelen; - if (unlikely(ptr + sizeof(*extref)) > end) { + if (unlikely(ptr + sizeof(*extref) > end)) { inode_ref_err(leaf, slot, "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ptr, end, sizeof(*extref)); @@ -2060,10 +2059,9 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) /* Make sure the keys are in the right order */ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, - "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", - prev_key.objectid, prev_key.type, - prev_key.offset, key.objectid, key.type, - key.offset); + "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&prev_key), + BTRFS_KEY_FMT_VALUE(&key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } @@ -2181,10 +2179,9 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, - "bad key order, current (%llu %u %llu) next (%llu %u %llu)", - key.objectid, key.type, key.offset, - next_key.objectid, next_key.type, - next_key.offset); + "bad key order, current " BTRFS_KEY_FMT " next " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key), + BTRFS_KEY_FMT_VALUE(&next_key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 621e0df097e3..fff37c8d96a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "orphan.h" #include "print-tree.h" #include "tree-checker.h" +#include "delayed-inode.h" #define MAX_CONFLICT_INODES 10 @@ -198,9 +199,9 @@ static void do_abort_log_replay(struct walk_control *wc, const char *function, if (wc->log_leaf) { btrfs_crit(fs_info, - "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", +"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):", btrfs_root_id(wc->root), wc->log_slot, - wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + BTRFS_KEY_FMT_VALUE(&wc->log_key)); btrfs_print_leaf(wc->log_leaf); } @@ -262,7 +263,7 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r struct btrfs_inode *inode; /* Only meant to be called for subvolume roots and not for log roots. */ - ASSERT(btrfs_is_fstree(btrfs_root_id(root))); + ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root)); /* * We're holding a transaction handle whether we are logging or @@ -501,7 +502,7 @@ static int overwrite_item(struct walk_control *wc) * the leaf before writing into the log tree. See the comments at * copy_items() for more details. 
*/ - ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root)); item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); @@ -510,9 +511,9 @@ static int overwrite_item(struct walk_control *wc) ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); return ret; } @@ -601,9 +602,9 @@ static int overwrite_item(struct walk_control *wc) insert: btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - wc->subvol_path->skip_release_on_error = 1; + wc->subvol_path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); - wc->subvol_path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = false; dst_eb = wc->subvol_path->nodes[0]; dst_slot = wc->subvol_path->slots[0]; @@ -618,9 +619,8 @@ insert: btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item for key (%llu %u %llu)", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset); + "failed to insert item for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&wc->log_key)); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -829,9 +829,9 @@ static noinline int replay_one_extent(struct walk_control *wc) &wc->log_key, sizeof(*item)); if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item with key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to insert item with key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], @@ -1348,9 +1348,9 @@ again: ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - search_key.objectid, search_key.type, - search_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&search_key), + btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1483,9 +1483,9 @@ again: } if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } @@ -2282,7 +2282,8 @@ static noinline int replay_one_dir_item(struct walk_control *wc) struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. 
*/ - ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY, + "wc->log_key.type=%u", wc->log_key.type); di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); ret = replay_one_name(wc, di); @@ -2434,7 +2435,7 @@ static noinline int check_item_in_log(struct walk_control *wc, * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). */ - ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type); eb = wc->subvol_path->nodes[0]; slot = wc->subvol_path->slots[0]; @@ -2647,7 +2648,7 @@ static noinline int replay_dir_deletes(struct walk_control *wc, int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; - struct btrfs_path *log_path; + BTRFS_PATH_AUTO_FREE(log_path); struct btrfs_inode *dir; dir_key.objectid = dirid; @@ -2664,7 +2665,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, * we replay the deletes before we copy in the inode item from the log. */ if (IS_ERR(dir)) { - btrfs_free_path(log_path); ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; @@ -2700,10 +2700,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search root %llu for key (%llu %u %llu)", + "failed to search root %llu for key " BTRFS_KEY_FMT, btrfs_root_id(root), - dir_key.objectid, dir_key.type, - dir_key.offset); + BTRFS_KEY_FMT_VALUE(&dir_key)); goto out; } @@ -2745,7 +2744,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, ret = 0; out: btrfs_release_path(wc->subvol_path); - btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; } @@ -3340,7 +3338,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); return ctx->log_ret; } - ASSERT(log_transid == root->log_transid); + ASSERT(log_transid == root->log_transid, + "log_transid=%d root->log_transid=%d", log_transid, root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ @@ -3480,7 +3479,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = root_log_ctx.log_ret; goto out; } - ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid, + "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d", + root_log_ctx.log_transid, log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { @@ -3584,7 +3585,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid, + "last_log_commit(root)=%d log_transid=%d", + btrfs_get_root_last_log_commit(root), log_transid); btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: @@ -3895,10 +3898,10 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * or the entire directory. 
*/ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { + struct btrfs_root *root = dir->root; BTRFS_PATH_AUTO_FREE(path); int ret; @@ -3933,11 +3936,11 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid) + struct btrfs_inode *inode, + struct btrfs_inode *dir) { - struct btrfs_root *log; + struct btrfs_root *root = dir->root; int ret; ret = inode_logged(trans, inode, NULL); @@ -3952,10 +3955,10 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) return; - log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); + ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode), + btrfs_ino(dir), NULL); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); @@ -4017,7 +4020,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int count) { struct btrfs_root *log = inode->root->log_root; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; @@ -4028,7 +4031,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int ret; int i; - ASSERT(count > 0); + ASSERT(count > 0, "count=%d", count); batch.nr = count; if (count == 1) { @@ -4062,7 +4065,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst = dst_path->nodes[0]; /* @@ -4081,7 +4084,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, btrfs_release_path(dst_path); last_index = batch.keys[count - 1].offset; - ASSERT(last_index > inode->last_dir_index_offset); + ASSERT(last_index > inode->last_dir_index_offset, + "last_index=%llu inode->last_dir_index_offset=%llu", + last_index, inode->last_dir_index_offset); /* * If for some unexpected reason the last item's index is not greater @@ -4094,8 +4099,6 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, if (btrfs_get_first_dir_index_to_log(inode) == 0) btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); -out: - kfree(ins_data); return ret; } @@ -4154,7 +4157,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; - int ret; btrfs_item_key_to_cpu(src, &key, i); @@ -4224,8 +4226,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } if (batch_size > 0) { - int ret; - ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) @@ -4410,7 +4410,9 @@ done: * change in the current transaction), then we don't need to log * a range, last_old_dentry_offset is == to last_offset. 
*/ - ASSERT(last_old_dentry_offset <= last_offset); + ASSERT(last_old_dentry_offset <= last_offset, + "last_old_dentry_offset=%llu last_offset=%llu", + last_old_dentry_offset, last_offset); if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, @@ -4765,7 +4767,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; - char *ins_data; + char AUTO_KFREE(ins_data); int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4893,7 +4895,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr + extent_num_bytes - 1, &ordered_sums, false); if (ret < 0) - goto out; + return ret; ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { @@ -4903,7 +4905,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, kfree(sums); } if (ret) - goto out; + return ret; add_to_batch: ins_sizes[dst_index] = btrfs_item_size(src, src_slot); @@ -4917,11 +4919,11 @@ add_to_batch: * so we don't need to do anything. */ if (batch.nr == 0) - goto out; + return 0; ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst_index = 0; for (int i = 0; i < nr; i++) { @@ -4974,8 +4976,6 @@ copy_item: } btrfs_release_path(dst_path); -out: - kfree(ins_data); return ret; } @@ -5414,12 +5414,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -5694,9 +5694,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_inode *inode, u64 *other_ino, u64 *other_parent) { - int ret; BTRFS_PATH_AUTO_FREE(search_path); - char *name = NULL; + char AUTO_KFREE(name); u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; @@ -5705,8 +5704,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path = btrfs_alloc_path(); if (!search_path) return -ENOMEM; - search_path->search_commit_root = 1; - search_path->skip_locking = 1; + search_path->search_commit_root = true; + search_path->skip_locking = true; while (cur_offset < item_size) { u64 parent; @@ -5739,10 +5738,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, char *new_name; new_name = krealloc(name, this_name_len, GFP_NOFS); - if (!new_name) { - ret = -ENOMEM; - goto out; - } + if (!new_name) + return -ENOMEM; name_len = this_name_len; name = new_name; } @@ -5760,28 +5757,24 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { if (di_key.objectid != key->objectid) { - ret = 1; *other_ino = di_key.objectid; *other_parent = parent; + return 1; } else { - ret = 0; + return 0; } } else { - ret = -EAGAIN; + return -EAGAIN; } - goto out; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } btrfs_release_path(search_path); cur_offset += this_len; } - ret = 0; -out: - kfree(name); - return ret; + + return 0; } /* @@ -6031,8 +6024,8 @@ static int 
conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (WARN_ON_ONCE(ret > 0)) { @@ -6052,8 +6045,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, } btrfs_release_path(path); - path->search_commit_root = 0; - path->skip_locking = 0; + path->search_commit_root = false; + path->skip_locking = false; return ret; } @@ -6543,7 +6536,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, curr = list_next_entry(curr, log_list); } - ASSERT(batch.nr >= 1); + ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr); ret = insert_delayed_items_batch(trans, log, path, &batch, first); curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, @@ -6587,7 +6580,9 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, } last_dir_index = curr->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); ret = insert_dir_log_key(trans, inode->root->log_root, path, ino, first_dir_index, last_dir_index); @@ -6681,7 +6676,9 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, goto next_batch; last_dir_index = last->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); /* * If this range starts right after where the previous one ends, * then we want to reuse the previous range item and change its @@ -6748,7 +6745,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, */ lockdep_assert_not_held(&inode->log_mutex); - ASSERT(!ctx->logging_new_delayed_dentries); + ASSERT(!ctx->logging_new_delayed_dentries, + "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries); ctx->logging_new_delayed_dentries = true; list_for_each_entry(item, delayed_ins_list, log_list) { @@ -7122,7 +7120,7 @@ log_extents: * a power failure unless the log was synced as part of an fsync * against any other unrelated inode. 
*/ - if (inode_only != LOG_INODE_EXISTS) + if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); @@ -7169,8 +7167,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; @@ -7203,28 +7201,24 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { - struct btrfs_key inode_key; + u64 dir_id; struct btrfs_inode *dir_inode; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - if (key.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *) (ptr + cur_offset); - inode_key.objectid = btrfs_inode_extref_parent( - leaf, extref); + dir_id = btrfs_inode_extref_parent(leaf, extref); cur_offset += sizeof(*extref); cur_offset += btrfs_inode_extref_name_len(leaf, extref); } else { - inode_key.objectid = key.offset; + dir_id = key.offset; cur_offset = item_size; } - dir_inode = btrfs_iget_logging(inode_key.objectid, root); + dir_inode = btrfs_iget_logging(dir_id, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -7910,6 +7904,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + /* The inode has a new name (ref/extref), so make sure we log it. */ + set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; @@ -7962,7 +7959,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct fscrypt_name fname; - ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX, + "old_dir_index=%llu", old_dir_index); ret = fscrypt_setup_filename(&old_dir->vfs_inode, &old_dentry->d_name, 0, &fname); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index dc313e6bb2fa..41e47fda036d 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -8,8 +8,7 @@ #include <linux/list.h> #include <linux/fs.h> -#include "messages.h" -#include "ctree.h" +#include <linux/fscrypt.h> #include "transaction.h" struct inode; @@ -80,13 +79,12 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid); + struct btrfs_inode *inode, + struct btrfs_inode *dir); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 17b5e81123a1..e3a1310fa7d5 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -27,32 +27,26 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, u8 type, u64 subid) { int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; u32 
item_size; unsigned long offset; struct btrfs_key key; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -ENOENT; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -ENOENT; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; btrfs_uuid_to_key(uuid, type, &key); ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -64,7 +58,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); - goto out; + return ret; } while (item_size) { __le64 data; @@ -78,8 +72,6 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, item_size -= sizeof(data); } -out: - btrfs_free_path(path); return ret; } @@ -89,7 +81,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -100,18 +92,14 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ if (ret != -ENOENT) return ret; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); @@ -134,15 +122,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, key.objectid, key.offset, type); - goto out; + return ret; } - ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, @@ -151,7 +136,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -161,29 +146,23 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 unsigned long move_src; unsigned long move_len; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for uuid item!", ret); - goto out; - } - if (ret > 0) { - ret = -ENOENT; - goto out; + return ret; } + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -192,8 +171,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned 
long)item_size); - ret = -ENOENT; - goto out; + return -ENOENT; } while (item_size) { __le64 read_subid; @@ -205,16 +183,12 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 item_size -= sizeof(read_subid); } - if (!item_size) { - ret = -ENOENT; - goto out; - } + if (!item_size) + return -ENOENT; item_size = btrfs_item_size(eb, slot); - if (item_size == sizeof(subid)) { - ret = btrfs_del_item(trans, uuid_root, path); - goto out; - } + if (item_size == sizeof(subid)) + return btrfs_del_item(trans, uuid_root, path); move_dst = offset; move_src = offset + sizeof(subid); @@ -222,9 +196,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 memmove_extent_buffer(eb, move_dst, move_src, move_len); btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); -out: - btrfs_free_path(path); - return ret; + return 0; } static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, @@ -293,7 +265,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret = 0; struct extent_buffer *leaf; int slot; @@ -301,10 +273,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) unsigned long offset; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = 0; @@ -312,17 +282,15 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) again_search_slot: ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; while (1) { - if (btrfs_fs_closing(fs_info)) { - ret = -EINTR; - goto out; - } + if (btrfs_fs_closing(fs_info)) + return -EINTR; + cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; @@ -353,7 +321,7 @@ again_search_slot: ret = btrfs_check_uuid_tree_entry(fs_info, uuid, key.type, subid_cpu); if (ret < 0) - goto out; + return ret; if (ret > 0) { btrfs_release_path(path); ret = btrfs_uuid_iter_rem(root, uuid, key.type, @@ -369,7 +337,7 @@ again_search_slot: goto again_search_slot; } if (ret < 0 && ret != -ENOENT) - goto out; + return ret; key.offset++; goto again_search_slot; } @@ -386,8 +354,6 @@ skip: break; } -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..06dfcb461f53 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -109,7 +109,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) { struct btrfs_trans_handle *trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int count = 0; int ret; @@ -121,10 +121,8 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) while (1) { /* 1 for the item being dropped */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); /* * Walk backwards through all the items until we find one that @@ -143,7 +141,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) path->slots[0]--; } else if (ret < 0) { btrfs_end_transaction(trans); - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -161,17 +159,14 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) ret = btrfs_del_items(trans, root, 
path, path->slots[0], 1); if (ret) { btrfs_end_transaction(trans); - goto out; + return ret; } count++; btrfs_release_path(path); btrfs_end_transaction(trans); } - ret = count; btrfs_end_transaction(trans); -out: - btrfs_free_path(path); - return ret; + return count; } /* @@ -217,7 +212,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, const char *src, u64 len) { struct btrfs_trans_handle *trans; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -233,10 +228,8 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, while (len > 0) { /* 1 for the new item being inserted */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); key.objectid = btrfs_ino(inode); key.type = key_type; @@ -267,7 +260,6 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, btrfs_end_transaction(trans); } - btrfs_free_path(path); return ret; } @@ -296,7 +288,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, char *dest, u64 len, struct folio *dest_folio) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -404,7 +396,6 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, } } out: - btrfs_free_path(path); if (!ret) ret = copied; return ret; @@ -587,6 +578,9 @@ static int btrfs_begin_enable_verity(struct file *filp) btrfs_assert_inode_locked(inode); + if (IS_ENCRYPTED(&inode->vfs_inode)) + return -EOPNOTSUPP; + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..ae1742a35e76 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -739,7 +739,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; struct path new = { .mnt = NULL, .dentry = NULL }; - char *old_path = NULL; + char AUTO_KFREE(old_path); bool is_same = false; int ret; @@ -765,7 +765,6 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) if (path_equal(&old, &new)) is_same = true; out: - kfree(old_path); path_put(&old); path_put(&new); return is_same; @@ -1681,7 +1680,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u64 search_start; u64 hole_size; u64 max_hole_start; @@ -1711,8 +1710,8 @@ again: } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = device->devid; key.type = BTRFS_DEV_EXTENT_KEY; @@ -1812,7 +1811,6 @@ next: "max_hole_start=%llu max_hole_size=%llu search_end=%llu", max_hole_start, max_hole_size, search_end); out: - btrfs_free_path(path); *start = max_hole_start; if (len) *len = max_hole_size; @@ -1826,7 +1824,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; int ret; - struct btrfs_path *path; + 
BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; @@ -1845,7 +1843,7 @@ again: ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); if (ret) - goto out; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1860,7 +1858,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - goto out; + return ret; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); @@ -1868,8 +1866,6 @@ again: ret = btrfs_del_item(trans, root, path); if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); -out: - btrfs_free_path(path); return ret; } @@ -1897,7 +1893,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -1909,13 +1905,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } ret = btrfs_previous_item(fs_info->chunk_root, path, @@ -1928,10 +1923,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, path->slots[0]); *devid_ret = found_key.offset + 1; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -1942,7 +1934,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; @@ -1961,7 +1953,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, &key, sizeof(*dev_item)); btrfs_trans_release_chunk_metadata(trans); if (ret) - goto out; + return ret; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); @@ -1987,10 +1979,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -2002,14 +1991,11 @@ out: static void update_dev_time(const char *device_path) { struct path path; - int ret; - ret = kern_path(device_path, LOOKUP_FOLLOW, &path); - if (ret) - return; - - inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); - path_put(&path); + if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { + vfs_utimes(&path, NULL); + path_put(&path); + } } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, @@ -2017,7 +2003,7 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = device->fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -2031,16 +2017,12 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); btrfs_trans_release_chunk_metadata(trans); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; + if (ret < 0) + return ret; - ret = btrfs_del_item(trans, root, path); 
-out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } /* @@ -2626,7 +2608,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; struct btrfs_device *device; @@ -2648,7 +2630,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) ret = btrfs_search_slot(trans, root, &key, path, 0, 1); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; next_slot: @@ -2657,7 +2639,7 @@ next_slot: if (ret > 0) break; if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); @@ -2688,10 +2670,7 @@ next_slot: path->slots[0]++; goto next_slot; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) @@ -2946,7 +2925,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; @@ -2962,12 +2941,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); @@ -2981,8 +2958,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); -out: - btrfs_free_path(path); return ret; } @@ -3035,7 +3010,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -3048,23 +3023,21 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - else if (unlikely(ret > 0)) { /* Logic error or corruption */ + return ret; + if (unlikely(ret > 0)) { + /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = btrfs_del_item(trans, root, path); if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } -out: - btrfs_free_path(path); return ret; } @@ -3501,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) { struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_chunk *chunk; struct btrfs_key key; @@ -3525,7 +3498,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, 
@@ -3501,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
@@ -3525,7 +3498,7 @@ again:
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0) {
 			mutex_unlock(&fs_info->reclaim_bgs_lock);
-			goto error;
+			return ret;
 		}
 		if (unlikely(ret == 0)) {
 			/*
@@ -3535,9 +3508,8 @@ again:
 			 * offset (one less than the previous one, wrong
 			 * alignment and size).
 			 */
-			ret = -EUCLEAN;
 			mutex_unlock(&fs_info->reclaim_bgs_lock);
-			goto error;
+			return -EUCLEAN;
 		}
 
 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
@@ -3545,7 +3517,7 @@ again:
 		if (ret)
 			mutex_unlock(&fs_info->reclaim_bgs_lock);
 		if (ret < 0)
-			goto error;
+			return ret;
 		if (ret > 0)
 			break;
 
@@ -3579,8 +3551,6 @@ again:
 	} else if (WARN_ON(failed && retried)) {
 		ret = -ENOSPC;
 	}
-error:
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -4081,7 +4051,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
 	u64 chunk_type;
 	struct btrfs_chunk *chunk;
-	struct btrfs_path *path = NULL;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -4252,7 +4222,6 @@ loop:
 		goto again;
 	}
 error:
-	btrfs_free_path(path);
 	if (enospc_errors) {
 		btrfs_info(fs_info, "%d enospc errors during balance",
 			   enospc_errors);
@@ -4410,7 +4379,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
 {
 	u32 size_buf = 1024;
 	char tmp_buf[192] = {'\0'};
-	char *buf;
+	char AUTO_KFREE(buf);
 	char *bp;
 	u32 size_bp = size_buf;
 	int ret;
@@ -4458,8 +4427,6 @@ out_overflow:
 	btrfs_info(fs_info, "balance: %s %s",
 		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
 		   "resume" : "start", buf);
-
-	kfree(buf);
 }
 
 /*
@@ -4660,12 +4627,12 @@ static int balance_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;
 
-	sb_start_write(fs_info->sb);
+	guard(super_write)(fs_info->sb);
+
 	mutex_lock(&fs_info->balance_mutex);
 	if (fs_info->balance_ctl)
 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
 	mutex_unlock(&fs_info->balance_mutex);
-	sb_end_write(fs_info->sb);
 
 	return ret;
 }
@@ -4709,7 +4676,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	struct btrfs_balance_control *bctl;
 	struct btrfs_balance_item *item;
 	struct btrfs_disk_balance_args disk_bargs;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	int ret;
@@ -4724,17 +4691,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 	if (ret < 0)
-		goto out;
+		return ret;
 	if (ret > 0) { /* ret = -ENOENT; */
-		ret = 0;
-		goto out;
+		return 0;
 	}
 
 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
-	if (!bctl) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!bctl)
+		return -ENOMEM;
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
@@ -4771,8 +4735,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	fs_info->balance_ctl = bctl;
 	spin_unlock(&fs_info->balance_lock);
 	mutex_unlock(&fs_info->balance_mutex);
-out:
-	btrfs_free_path(path);
 	return ret;
 }
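balance_kthread() above swaps a manual sb_start_write()/sb_end_write() pair for guard(super_write)(), which ties the release to the enclosing scope. A rough user-space analogue of such a scope guard, built on the same cleanup attribute but using a pthread mutex instead of the superblock write lock (all names here are made up for illustration):

#include <pthread.h>
#include <stdio.h>

typedef struct { pthread_mutex_t *lock; } mutex_guard_t;

/* Destructor: runs when the guard variable goes out of scope. */
static void mutex_guard_release(mutex_guard_t *g)
{
	pthread_mutex_unlock(g->lock);
}

/* Acquire in the initializer, release via the cleanup attribute. */
#define guard_mutex(m) \
	mutex_guard_t guard_var \
		__attribute__((cleanup(mutex_guard_release), unused)) = \
		{ .lock = (pthread_mutex_lock(m), (m)) }

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static int shared;

static int bump(void)
{
	guard_mutex(&big_lock);

	if (shared > 100)
		return -1;	/* unlocked automatically */
	shared++;
	return 0;		/* ... and here as well */
}

int main(void)
{
	bump();
	printf("shared=%d\n", shared);
	return 0;
}

The payoff is the same as in the diff: early-return paths (the -EBUSY exit in relocating_repair_kthread(), for example) no longer need a matching release call.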
@@ -5593,9 +5555,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
-	struct btrfs_device_info *devices_info = NULL;
+	struct btrfs_device_info AUTO_KFREE(devices_info);
 	struct alloc_chunk_ctl ctl;
-	struct btrfs_block_group *block_group;
 	int ret;
 
 	lockdep_assert_held(&info->chunk_mutex);
@@ -5628,22 +5589,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	ret = gather_device_info(fs_devices, &ctl, devices_info);
-	if (ret < 0) {
-		block_group = ERR_PTR(ret);
-		goto out;
-	}
+	if (ret < 0)
+		return ERR_PTR(ret);
 
 	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
-	if (ret < 0) {
-		block_group = ERR_PTR(ret);
-		goto out;
-	}
-
-	block_group = create_chunk(trans, &ctl, devices_info);
+	if (ret < 0)
+		return ERR_PTR(ret);
 
-out:
-	kfree(devices_info);
-	return block_group;
+	return create_chunk(trans, &ctl, devices_info);
 }
 
 /*
@@ -6076,12 +6029,7 @@ struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_io_context *bioc;
 
-	bioc = kzalloc(
-		 /* The size of btrfs_io_context */
-		sizeof(struct btrfs_io_context) +
-		/* Plus the variable array for the stripes */
-		sizeof(struct btrfs_io_stripe) * (total_stripes),
-		GFP_NOFS);
+	bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS);
 	if (!bioc)
 		return NULL;
 
@@ -6807,6 +6755,8 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
 				  const struct btrfs_device *device)
 {
+	if (args->devt)
+		return device->devt == args->devt;
 	if (args->missing) {
 		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
 		    !device->bdev)
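alloc_btrfs_io_context() above folds the open-coded "header plus flexible array" arithmetic into struct_size(). A sketch of what that helper computes, using an illustrative struct (the real struct_size() in <linux/overflow.h> additionally saturates to SIZE_MAX on multiplication overflow instead of wrapping, which is the reason to prefer it over hand-written sizeof arithmetic):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct io_ctx {					/* stand-in for btrfs_io_context */
	int nr_stripes;
	struct { uint64_t physical; } stripes[];	/* flexible array member */
};

/* Minus the overflow checking, struct_size(p, member, n) boils down to: */
#define STRUCT_SIZE_SKETCH(p, member, n) \
	(sizeof(*(p)) + (n) * sizeof((p)->member[0]))

int main(void)
{
	struct io_ctx *ctx;

	/* ctx is only used inside sizeof here, which is unevaluated. */
	ctx = calloc(1, STRUCT_SIZE_SKETCH(ctx, stripes, 4));
	if (!ctx)
		return 1;
	ctx->nr_stripes = 4;
	for (int i = 0; i < ctx->nr_stripes; i++)
		ctx->stripes[i].physical = i * 65536ULL;
	printf("allocated %zu bytes\n", STRUCT_SIZE_SKETCH(ctx, stripes, 4));
	free(ctx);
	return 0;
}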
@@ -7455,7 +7405,7 @@ static void readahead_tree_node_children(struct extent_buffer *node)
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -7494,7 +7444,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
 	 */
 	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
-	path->skip_locking = 1;
+	path->skip_locking = true;
 
 	/*
 	 * Read all device items, and then all the chunk items. All
@@ -7572,8 +7522,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	ret = 0;
 error:
 	mutex_unlock(&uuid_mutex);
-
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -7673,7 +7621,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
 	struct btrfs_device *device;
-	struct btrfs_path *path = NULL;
+	BTRFS_PATH_AUTO_FREE(path);
 	int ret = 0;
 
 	path = btrfs_alloc_path();
@@ -7695,8 +7643,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
 	}
 out:
 	mutex_unlock(&fs_devices->device_list_mutex);
-
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -7705,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *dev_root = fs_info->dev_root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_key key;
 	struct extent_buffer *eb;
 	struct btrfs_dev_stats_item *ptr;
@@ -7724,7 +7670,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 		btrfs_warn(fs_info,
 			"error %d while searching for dev_stats item for device %s",
 			ret, btrfs_dev_name(device));
-		goto out;
+		return ret;
 	}
 
 	if (ret == 0 &&
@@ -7735,7 +7681,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 			btrfs_warn(fs_info,
 				"delete too small dev_stats item for device %s failed %d",
 				btrfs_dev_name(device), ret);
-			goto out;
+			return ret;
 		}
 		ret = 1;
 	}
@@ -7749,7 +7695,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 			btrfs_warn(fs_info,
 				"insert dev_stats item for device %s failed %d",
 				btrfs_dev_name(device), ret);
-			goto out;
+			return ret;
 		}
 	}
 
@@ -7758,8 +7704,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
 		btrfs_set_dev_stats_value(eb, ptr, i,
 					  btrfs_dev_stat_read(device, i));
-out:
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -8049,7 +7993,7 @@ out:
  */
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_key key;
 	u64 prev_devid = 0;
@@ -8080,17 +8024,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 	path->reada = READA_FORWARD;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 		ret = btrfs_next_leaf(root, path);
 		if (ret < 0)
-			goto out;
+			return ret;
 		/* No dev extents at all? Not good */
-		if (unlikely(ret > 0)) {
-			ret = -EUCLEAN;
-			goto out;
-		}
+		if (unlikely(ret > 0))
+			return -EUCLEAN;
 	}
 
 	while (1) {
 		struct extent_buffer *leaf = path->nodes[0];
@@ -8116,20 +8058,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 			btrfs_err(fs_info,
 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
 				  devid, physical_offset, prev_dev_ext_end);
-			ret = -EUCLEAN;
-			goto out;
+			return -EUCLEAN;
 		}
 
 		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
 					    physical_offset, physical_len);
 		if (ret < 0)
-			goto out;
+			return ret;
 		prev_devid = devid;
 		prev_dev_ext_end = physical_offset + physical_len;
 
 		ret = btrfs_next_item(root, path);
 		if (ret < 0)
-			goto out;
+			return ret;
 		if (ret > 0) {
 			ret = 0;
 			break;
@@ -8137,10 +8078,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 	}
 
 	/* Ensure all chunks have corresponding dev extents */
-	ret = verify_chunk_dev_extent_mapping(fs_info);
-out:
-	btrfs_free_path(path);
-	return ret;
+	return verify_chunk_dev_extent_mapping(fs_info);
 }
 
 /*
@@ -8177,12 +8115,12 @@ static int relocating_repair_kthread(void *data)
 	target = cache->start;
 	btrfs_put_block_group(cache);
 
-	sb_start_write(fs_info->sb);
+	guard(super_write)(fs_info->sb);
+
 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
 		btrfs_info(fs_info,
 			   "zoned: skip relocating block group %llu to repair: EBUSY",
 			   target);
-		sb_end_write(fs_info->sb);
 		return -EBUSY;
 	}
 
@@ -8210,7 +8148,6 @@ out:
 	btrfs_put_block_group(cache);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_exclop_finish(fs_info);
-	sb_end_write(fs_info->sb);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2cbf8080eade..34b854c1a303 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,7 +45,7 @@ extern struct mutex uuid_mutex;
 #define BTRFS_STRIPE_LEN_SHIFT	(16)
 #define BTRFS_STRIPE_LEN_MASK	(BTRFS_STRIPE_LEN - 1)
 
-static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
+static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
 
 /* Used by sanity check for btrfs_raid_types. */
 #define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -58,8 +58,7 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
  */
 static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
 	      const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
-	      ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
 
 /* ilog2() can handle both constants and variables */
 #define BTRFS_BG_FLAG_TO_INDEX(profile) \
@@ -662,6 +661,11 @@ struct btrfs_dev_lookup_args {
 	u64 devid;
 	u8 *uuid;
 	u8 *fsid;
+	/*
+	 * If devt is specified, all other members will be ignored as it is
+	 * enough to uniquely locate a device.
+	 */
+	dev_t devt;
 	bool missing;
 };
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 79fb1614bd0c..ab55d10bd71f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,9 +29,8 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
 {
 	struct btrfs_dir_item *di;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
-	int ret = 0;
 	unsigned long data_ptr;
 
 	path = btrfs_alloc_path();
@@ -41,26 +40,19 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
 				name, strlen(name), 0);
-	if (!di) {
-		ret = -ENODATA;
-		goto out;
-	} else if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
+	if (!di)
+		return -ENODATA;
+	if (IS_ERR(di))
+		return PTR_ERR(di);
 
 	leaf = path->nodes[0];
 	/* if size is 0, that means we want the size of the attr */
-	if (!size) {
-		ret = btrfs_dir_data_len(leaf, di);
-		goto out;
-	}
+	if (!size)
+		return btrfs_dir_data_len(leaf, di);
 
 	/* now get the data out of our dir_item */
-	if (btrfs_dir_data_len(leaf, di) > size) {
-		ret = -ERANGE;
-		goto out;
-	}
+	if (btrfs_dir_data_len(leaf, di) > size)
+		return -ERANGE;
 
 	/*
 	 * The way things are packed into the leaf is like this
@@ -73,11 +65,7 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
 				  btrfs_dir_name_len(leaf, di));
 	read_extent_buffer(leaf, buffer, data_ptr,
 			   btrfs_dir_data_len(leaf, di));
-	ret = btrfs_dir_data_len(leaf, di);
-
-out:
-	btrfs_free_path(path);
-	return ret;
+	return btrfs_dir_data_len(leaf, di);
 }
 
 int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -85,7 +73,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 {
 	struct btrfs_dir_item *di = NULL;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	size_t name_len = strlen(name);
 	int ret = 0;
 
@@ -97,7 +85,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->skip_release_on_error = 1;
+	path->skip_release_on_error = true;
 
 	if (!value) {
 		di = btrfs_lookup_xattr(trans, root, path,
@@ -212,7 +200,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 	 */
 	}
 out:
-	btrfs_free_path(path);
 	if (!ret) {
 		set_bit(BTRFS_INODE_COPY_EVERYTHING,
 			&BTRFS_I(inode)->runtime_flags);
@@ -278,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct btrfs_key key;
 	struct inode *inode = d_inode(dentry);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	int iter_ret = 0;
 	int ret = 0;
 	size_t total_size = 0, size_left = size;
@@ -354,8 +341,6 @@ next:
 	else
 		ret = total_size;
 
-	btrfs_free_path(path);
-
 	return ret;
 }
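btrfs_getxattr() above keeps the standard xattr contract: a zero-size query returns only the value length, and a too-small buffer yields -ERANGE. From user space this maps onto the usual two-call pattern with getxattr(2); the file path and attribute name below are only examples:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *file = "testfile";		/* illustrative path */
	ssize_t len;
	char *buf;

	/* First call: size 0 asks only for the value length. */
	len = getxattr(file, "user.comment", NULL, 0);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	buf = malloc(len);
	if (!buf)
		return 1;

	/* Second call: fails with ERANGE if the value grew meanwhile. */
	len = getxattr(file, "user.comment", buf, len);
	if (len >= 0)
		printf("user.comment = %.*s\n", (int)len, buf);

	free(buf);
	return 0;
}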
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e00036672f33..359a98e6de85 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -37,8 +37,8 @@
 #define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
 #define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)
 
-#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
-#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+#define BTRFS_SB_LOG_FIRST_SHIFT	ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT	ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
 
 /* Number of superblock log zones */
 #define BTRFS_NR_SB_LOG_ZONES	2
@@ -93,7 +93,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 	sector_t sector;
 
 	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
-		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL,
+		       "zones[%d].type=%d", i, zones[i].type);
 		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
 		full[i] = sb_zone_is_full(&zones[i]);
 	}
@@ -166,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror)
 {
 	u64 zone = U64_MAX;
 
-	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror);
 	switch (mirror) {
 	case 0: zone = 0; break;
 	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
 	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
 	}
 
-	ASSERT(zone <= U32_MAX);
+	ASSERT(zone <= U32_MAX, "zone=%llu", zone);
 
 	return (u32)zone;
 }
@@ -240,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 	unsigned int i;
 	u32 zno;
 
-	ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(pos, zinfo->zone_size),
+	       "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size);
 	zno = pos >> zinfo->zone_size_shift;
 	/*
 	 * We cannot report zones beyond the zone end. So, it is OK to
@@ -264,8 +266,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 		}
 	}
 
-	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
-				  copy_zone_info_cb, zones);
+	ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+					 *nr_zones, copy_zone_info_cb, zones);
 	if (ret < 0) {
 		btrfs_err(device->fs_info,
 			  "zoned: failed to read zone %llu on %s (devid %llu)",
@@ -494,6 +496,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		case BLK_ZONE_COND_IMP_OPEN:
 		case BLK_ZONE_COND_EXP_OPEN:
 		case BLK_ZONE_COND_CLOSED:
+		case BLK_ZONE_COND_ACTIVE:
 			__set_bit(nreported, zone_info->active_zones);
 			nactive++;
 			break;
@@ -896,9 +899,9 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 	if (sb_zone + 1 >= nr_zones)
 		return -ENOENT;
 
-	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
-				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
-				  zones);
+	ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+					 BTRFS_NR_SB_LOG_ZONES,
+					 copy_zone_info_cb, zones);
 	if (ret < 0)
 		return ret;
 	if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
@@ -1055,8 +1058,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
 	bool have_sb;
 	int i;
 
-	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
-	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size),
+	       "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size);
+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size),
+	       "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size);
 
 	while (pos < hole_end) {
 		begin = pos >> shift;
@@ -1172,8 +1177,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
 	u64 pos;
 	int ret;
 
-	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
-	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(start, zinfo->zone_size),
+	       "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size);
+	ASSERT(IS_ALIGNED(size, zinfo->zone_size),
+	       "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size);
 
 	if (begin + nbits > zinfo->nr_zones)
 		return -ERANGE;
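The zoned.c hunks above extend ASSERT() with a printf-style message so a failure reports the offending values, not just the failed expression. A rough user-space equivalent of such a variadic assertion macro (the kernel's version reports through its own logging rather than stderr/abort; this sketch only shows the macro mechanics):

#include <stdio.h>
#include <stdlib.h>

#define ASSERT_FMT(cond, fmt, ...)					\
	do {								\
		if (!(cond)) {						\
			fprintf(stderr,					\
				"assertion failed: %s (" fmt ")\n",	\
				#cond, ##__VA_ARGS__);			\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	int mirror = 1, max = 3;

	/* Passes silently. */
	ASSERT_FMT(mirror < max, "mirror=%d max=%d", mirror, max);

	/* Would print: assertion failed: mirror < 0 (mirror=1 max=3) */
	/* ASSERT_FMT(mirror < 0, "mirror=%d max=%d", mirror, max); */
	return 0;
}

The ##__VA_ARGS__ form is a GNU extension that swallows the trailing comma when no arguments follow the format string, which is what lets callers pass a bare message.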
@@ -1317,6 +1324,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
 	if (!btrfs_dev_is_sequential(device, info->physical)) {
 		up_read(&dev_replace->rwsem);
 		info->alloc_offset = WP_CONVENTIONAL;
+		info->capacity = device->zone_info->zone_size;
 		return 0;
 	}
 
@@ -1522,6 +1530,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
 					u64 last_alloc)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
+	u64 stripe_nr = 0, stripe_offset = 0;
+	u32 stripe_index = 0;
 
 	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
 		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1529,28 +1539,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
 		return -EINVAL;
 	}
 
+	if (last_alloc) {
+		u32 factor = map->num_stripes;
+
+		stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+		stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+	}
+
 	for (int i = 0; i < map->num_stripes; i++) {
 		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
 			continue;
 
 		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
-			u64 stripe_nr, full_stripe_nr;
-			u64 stripe_offset;
-			int stripe_index;
-
-			stripe_nr = div64_u64(last_alloc, map->stripe_size);
-			stripe_offset = stripe_nr * map->stripe_size;
-			full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
-			div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
-
-			zone_info[i].alloc_offset =
-				full_stripe_nr * map->stripe_size;
+			zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
 
 			if (stripe_index > i)
-				zone_info[i].alloc_offset += map->stripe_size;
+				zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
 			else if (stripe_index == i)
-				zone_info[i].alloc_offset +=
-					(last_alloc - stripe_offset);
+				zone_info[i].alloc_offset += stripe_offset;
 		}
 
 		if (test_bit(0, active) != test_bit(i, active)) {
@@ -1574,6 +1582,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
 					 u64 last_alloc)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
+	u64 stripe_nr = 0, stripe_offset = 0;
+	u32 stripe_index = 0;
 
 	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
 		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1581,6 +1591,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
 		return -EINVAL;
 	}
 
+	if (last_alloc) {
+		u32 factor = map->num_stripes / map->sub_stripes;
+
+		stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+		stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+	}
+
 	for (int i = 0; i < map->num_stripes; i++) {
 		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
 			continue;
@@ -1594,26 +1612,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
 		}
 
 		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
-			u64 stripe_nr, full_stripe_nr;
-			u64 stripe_offset;
-			int stripe_index;
-
-			stripe_nr = div64_u64(last_alloc, map->stripe_size);
-			stripe_offset = stripe_nr * map->stripe_size;
-			full_stripe_nr = div_u64(stripe_nr,
-					map->num_stripes / map->sub_stripes);
-			div_u64_rem(stripe_nr,
-				    (map->num_stripes / map->sub_stripes),
-				    &stripe_index);
-
-			zone_info[i].alloc_offset =
-				full_stripe_nr * map->stripe_size;
+			zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
 
 			if (stripe_index > (i / map->sub_stripes))
-				zone_info[i].alloc_offset += map->stripe_size;
+				zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
 			else if (stripe_index == (i / map->sub_stripes))
-				zone_info[i].alloc_offset +=
-					(last_alloc - stripe_offset);
+				zone_info[i].alloc_offset += stripe_offset;
 		}
 
 		if ((i % map->sub_stripes) == 0) {
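The raid0/raid10 rewrites above decompose last_alloc with shift-and-mask on the fixed 64KiB BTRFS_STRIPE_LEN instead of dividing by map->stripe_size, then let div_u64_rem() split the stripe number into full rotations plus the index of the partially filled stripe. A worked example of the same arithmetic for a hypothetical 3-device RAID0 chunk (self-contained sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024)	/* BTRFS_STRIPE_LEN */
#define STRIPE_SHIFT	16		/* BTRFS_STRIPE_LEN_SHIFT */
#define STRIPE_MASK	(STRIPE_LEN - 1)

int main(void)
{
	uint64_t last_alloc = 200 * 1024;	/* 3 full 64K stripes + 8K */
	int num_stripes = 3;

	uint64_t stripe_nr = last_alloc >> STRIPE_SHIFT;	/* = 3 */
	uint64_t stripe_offset = last_alloc & STRIPE_MASK;	/* = 8192 */
	uint32_t stripe_index = stripe_nr % num_stripes;	/* = 0 */
	stripe_nr /= num_stripes;		/* full rotations = 1 */

	for (int i = 0; i < num_stripes; i++) {
		uint64_t off = stripe_nr << STRIPE_SHIFT;

		if ((uint32_t)i < stripe_index)
			off += STRIPE_LEN;	/* done in the current rotation */
		else if ((uint32_t)i == stripe_index)
			off += stripe_offset;	/* the partially written stripe */
		printf("dev %d: alloc_offset=%llu\n", i,
		       (unsigned long long)off);
	}
	/* dev 0: 73728, dev 1: 65536, dev 2: 65536 -- sums to 204800 */
	return 0;
}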
@@ -1631,7 +1635,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	struct btrfs_chunk_map *map;
 	u64 logical = cache->start;
 	u64 length = cache->length;
-	struct zone_info *zone_info = NULL;
+	struct zone_info AUTO_KFREE(zone_info);
 	int ret;
 	int i;
 	unsigned long *active = NULL;
@@ -1683,8 +1687,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
 
 	if (num_conventional > 0) {
-		/* Zone capacity is always zone size in emulation */
-		cache->zone_capacity = cache->length;
 		ret = calculate_alloc_pointer(cache, &last_alloc, new);
 		if (ret) {
 			btrfs_err(fs_info,
@@ -1693,6 +1695,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 			goto out;
 		} else if (map->num_stripes == num_conventional) {
 			cache->alloc_offset = last_alloc;
+			cache->zone_capacity = cache->length;
 			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
 				&cache->runtime_flags);
 			goto out;
 		}
 	}
@@ -1753,7 +1756,7 @@ out:
 	    !fs_info->stripe_root) {
 		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
 			  btrfs_bg_type_to_raid_name(map->type));
-		return -EINVAL;
+		ret = -EINVAL;
 	}
 
 	if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
@@ -1786,7 +1789,6 @@ out:
 		cache->physical_map = NULL;
 	}
 	bitmap_free(active);
-	kfree(zone_info);
 
 	return ret;
 }
@@ -1813,14 +1815,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = bbio->fs_info;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_block_group *cache;
 	bool ret = false;
 
 	if (!btrfs_is_zoned(fs_info))
 		return false;
 
-	if (!inode || !is_data_inode(inode))
+	if (!is_data_inode(inode))
 		return false;
 
 	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1871,7 +1873,7 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
 	em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
 					 ordered->num_bytes);
 	/* The em should be a new COW extent, thus it should not have an offset. */
-	ASSERT(em->offset == 0);
+	ASSERT(em->offset == 0, "em->offset=%llu", em->offset);
 	em->disk_bytenr = logical;
 	btrfs_free_extent_map(em);
 	write_unlock(&em_tree->lock);
@@ -2582,7 +2584,8 @@ again:
 		struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
 		int factor;
 
-		ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+		ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+		       "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id);
 		factor = btrfs_bg_type_to_factor(bg->flags);
 
 		down_write(&space_info->groups_sem);
@@ -2596,9 +2599,9 @@ again:
 		space_info->disk_total -= bg->length * factor;
 		space_info->disk_total -= bg->zone_unusable;
 		/* There is no allocation ever happened. */
-		ASSERT(bg->used == 0);
+		ASSERT(bg->used == 0, "bg->used=%llu", bg->used);
 		/* No super block in a block group on the zoned setup. */
-		ASSERT(bg->bytes_super == 0);
+		ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super);
 		spin_unlock(&space_info->lock);
 
 		bg->space_info = reloc_sinfo;
@@ -2624,7 +2627,8 @@ again:
 
 	/* Allocate new BG in the data relocation space_info. */
 	space_info = data_sinfo->sub_group[0];
-	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+	       "space_info->subgroup_id=%d", space_info->subgroup_id);
 	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
 	btrfs_end_transaction(trans);
 	if (ret == 1) {
@@ -2754,10 +2758,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
 	return ret < 0 ? ret : 1;
 }
 
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
-				struct btrfs_space_info *space_info,
-				bool do_finish)
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish)
 {
+	struct btrfs_fs_info *fs_info = space_info->fs_info;
 	struct btrfs_block_group *bg;
 	int index;
 
@@ -2966,7 +2969,8 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num
 		 * This holds because we currently reset fully used then freed
 		 * block group.
 		 */
-		ASSERT(reclaimed == bg->zone_capacity);
+		ASSERT(reclaimed == bg->zone_capacity,
+		       "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity);
 		bg->free_space_ctl->free_space += reclaimed;
 		space_info->bytes_zone_unusable -= reclaimed;
 		spin_unlock(&bg->lock);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 17c5656580dd..5cefdeb08b7b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -15,7 +15,6 @@
 #include "disk-io.h"
 #include "block-group.h"
 #include "btrfs_inode.h"
-#include "fs.h"
 
 struct block_device;
 struct extent_buffer;
@@ -94,8 +93,7 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
 void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
 				       u64 length);
 int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
-				struct btrfs_space_info *space_info, bool do_finish);
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish);
 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
 int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
 #else /* CONFIG_BLK_DEV_ZONED */
@@ -262,8 +260,7 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
 	return 1;
 }
 
-static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
-					      struct btrfs_space_info *space_info,
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info,
 					      bool do_finish)
 {
 	/* Consider all the block groups are active */
