diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 2 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 40 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 56 | ||||
-rw-r--r-- | fs/btrfs/qgroup.c | 6 | ||||
-rw-r--r-- | fs/btrfs/super.c | 9 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 78 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 5 |
7 files changed, 132 insertions, 64 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b99fb0273292..0387b9f43a52 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -248,7 +248,7 @@ struct btrfs_inode { u64 new_delalloc_bytes; /* * The offset of the last dir index key that was logged. - * This is used only for directories. + * This is used only for directories. Protected by 'log_mutex'. */ u64 last_dir_index_offset; }; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a..b21cb72835cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -111,6 +111,24 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). */ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; btrfs_free_extent_map(em); em = NULL; @@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -2583,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2591,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9e4aec7330cb..e7218e78bff4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5696,7 +5696,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); @@ -6805,7 +6815,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct fscrypt_name fname; u64 index; int ret; - int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) @@ -6837,44 +6846,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* There are several dir indexes for this inode, clear the cache. */ BTRFS_I(inode)->dir_index = 0ULL; - inc_nlink(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); - ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); + if (ret) + goto fail; + /* Link added now we update the inode item with the new link count. */ + inc_nlink(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { - drop_inode = 1; - } else { - struct dentry *parent = dentry->d_parent; + btrfs_abort_transaction(trans, ret); + goto fail; + } - ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) + if (inode->i_nlink == 1) { + /* + * If the new hard link count is 1, it's a file created with the + * open(2) O_TMPFILE flag. + */ + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); goto fail; - if (inode->i_nlink == 1) { - /* - * If new hard link count is 1, it's a file created - * with open(2) O_TMPFILE flag. - */ - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) - goto fail; } - d_instantiate(dentry, inode); - btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } + /* Grab reference for the new dentry passed to d_instantiate(). */ + ihold(inode); + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent); + fail: fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } btrfs_btree_balance_dirty(fs_info); return ret; } @@ -7830,6 +7839,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + /* new_delalloc_bytes and last_dir_index_offset are in a union. */ ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ccaa9a3cf1ce..da102da169fd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a262b494a89f..df1f6cc3fe21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,9 +299,12 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, + string + 3); + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -1079,7 +1082,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 69e11557fd13..7d5d90845ca9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3340,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } +static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + bool ret = false; + + /* + * Do this only if ->logged_trans is still 0 to prevent races with + * concurrent logging as we may see the inode not logged when + * inode_logged() is called but it gets logged after inode_logged() did + * not find it in the log tree and we end up setting ->logged_trans to a + * value less than trans->transid after the concurrent logging task has + * set it to trans->transid. As a consequence, subsequent rename, unlink + * and link operations may end up not logging new names and removing old + * names from the log. + */ + spin_lock(&inode->lock); + if (inode->logged_trans == 0) + inode->logged_trans = trans->transid - 1; + else if (inode->logged_trans == trans->transid) + ret = true; + spin_unlock(&inode->lock); + + return ret; +} + /* * Check if an inode was logged in the current transaction. This correctly deals * with the case where the inode was logged but has a logged_trans of 0, which @@ -3357,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - if (inode->logged_trans == trans->transid) + /* + * Quick lockless call, since once ->logged_trans is set to the current + * transaction, we never set it to a lower value anywhere else. + */ + if (data_race(inode->logged_trans) == trans->transid) return 1; /* - * If logged_trans is not 0, then we know the inode logged was not logged - * in this transaction, so we can return false right away. + * If logged_trans is not 0 and not trans->transid, then we know the + * inode was not logged in this transaction, so we can return false + * right away. We take the lock to avoid a race caused by load/store + * tearing with a concurrent btrfs_log_inode() call or a concurrent task + * in this function further below - an update to trans->transid can be + * teared into two 32 bits updates for example, in which case we could + * see a positive value that is not trans->transid and assume the inode + * was not logged when it was. */ - if (inode->logged_trans > 0) + spin_lock(&inode->lock); + if (inode->logged_trans == trans->transid) { + spin_unlock(&inode->lock); + return 1; + } else if (inode->logged_trans > 0) { + spin_unlock(&inode->lock); return 0; + } + spin_unlock(&inode->lock); /* * If no log tree was created for this root in this transaction, then @@ -3374,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * transaction's ID, to avoid the search below in a future call in case * a log tree gets created after this. */ - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { - inode->logged_trans = trans->transid - 1; - return 0; - } + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return mark_inode_as_not_logged(trans, inode); /* * We have a log tree and the inode's logged_trans is 0. We can't tell @@ -3431,8 +3471,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * Set logged_trans to a value greater than 0 and less then the * current transaction to avoid doing the search in future calls. */ - inode->logged_trans = trans->transid - 1; - return 0; + return mark_inode_as_not_logged(trans, inode); } /* @@ -3440,20 +3479,9 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * the current transacion's ID, to avoid future tree searches as long as * the inode is not evicted again. */ + spin_lock(&inode->lock); inode->logged_trans = trans->transid; - - /* - * If it's a directory, then we must set last_dir_index_offset to the - * maximum possible value, so that the next attempt to log the inode does - * not skip checking if dir index keys found in modified subvolume tree - * leaves have been logged before, otherwise it would result in attempts - * to insert duplicate dir index keys in the log tree. This must be done - * because last_dir_index_offset is an in-memory only field, not persisted - * in the inode item or any other on-disk structure, so its value is lost - * once the inode is evicted. - */ - if (S_ISDIR(inode->vfs_inode.i_mode)) - inode->last_dir_index_offset = (u64)-1; + spin_unlock(&inode->lock); return 1; } @@ -4045,7 +4073,7 @@ done: /* * If the inode was logged before and it was evicted, then its - * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * last_dir_index_offset is 0, so we don't know the value of the last index * key offset. If that's the case, search for it and update the inode. This * is to avoid lookups in the log tree every time we try to insert a dir index * key from a leaf changed in the current transaction, and to allow us to always @@ -4061,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode, lockdep_assert_held(&inode->log_mutex); - if (inode->last_dir_index_offset != (u64)-1) + if (inode->last_dir_index_offset != 0) return 0; if (!ctx->logged_before) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a0461..c6e3efd6f602 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); |