diff options
| author | Filipe Manana <fdmanana@suse.com> | 2026-02-17 14:46:50 +0000 |
|---|---|---|
| committer | David Sterba <dsterba@suse.com> | 2026-03-23 20:09:33 +0100 |
| commit | 5254d4181add9dfaa5e3519edd71cc8f752b2f85 (patch) | |
| tree | 6ef4a4698e569856ed81b881fff4d9ac0fd1d37f /fs | |
| parent | b52fe51f724385b3ed81e37e510a4a33107e8161 (diff) | |
btrfs: fix zero size inode with non-zero size after log replay
When logging that an inode exists, as part of logging a new name or
logging new dir entries for a directory, we always set the generation of
the logged inode item to 0. This is to signal during log replay (in
overwrite_item()), that we should not set the i_size since we only logged
that an inode exists, so the i_size of the inode in the subvolume tree
must be preserved (as when we log new names or that an inode exists, we
don't log extents).
This works fine except when we have already logged an inode in full mode
or it's the first time we are logging an inode created in a past
transaction, that inode has a new i_size of 0 and then we log a new name
for the inode (due to a new hardlink or a rename), in which case we log
an i_size of 0 for the inode and a generation of 0, which causes the log
replay code to not update the inode's i_size to 0 (in overwrite_item()).
An example scenario:
mkdir /mnt/dir
xfs_io -f -c "pwrite 0 64K" /mnt/dir/foo
sync
xfs_io -c "truncate 0" -c "fsync" /mnt/dir/foo
ln /mnt/dir/foo /mnt/dir/bar
xfs_io -c "fsync" /mnt/dir
<power fail>
After log replay the file remains with a size of 64K. This is because when
we first log the inode, when we fsync file foo, we log its current i_size
of 0, and then when we create a hard link we log again the inode in exists
mode (LOG_INODE_EXISTS) but we set a generation of 0 for the inode item we
add to the log tree, so during log replay overwrite_item() sees that the
generation is 0 and i_size is 0 so we skip updating the inode's i_size
from 64K to 0.
Fix this by making sure at fill_inode_item() we always log the real
generation of the inode if it was logged in the current transaction with
the i_size we logged before. Also if an inode created in a previous
transaction is logged in exists mode only, make sure we log the i_size
stored in the inode item located from the commit root, so that if we log
multiple times that the inode exists we get the correct i_size.
A test case for fstests will follow soon.
Reported-by: Vyacheslav Kovalevsky <slava.kovalevskiy.2014@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/af8c15fa-4e41-4bb2-885c-0bc4e97532a6@gmail.com/
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/btrfs/tree-log.c | 98 |
1 files changed, 65 insertions, 33 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ff3933bc382..fce1b16a882b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, bool log_inode_only, u64 logged_isize) { + u64 gen = BTRFS_I(inode)->generation; u64 flags; if (log_inode_only) { - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' + /* + * Set the generation to zero so the recover code can tell the + * difference between a logging just to say 'this inode exists' + * and a logging to say 'update this inode with these values'. + * But only if the inode was not already logged before. + * We access ->logged_trans directly since it was already set + * up in the call chain by btrfs_log_inode(), and data_race() + * to avoid false alerts from KCSAN and since it was set already + * and one can set it to 0 since that only happens on eviction + * and we are holding a ref on the inode. */ - btrfs_set_inode_generation(leaf, item, 0); + ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); + if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) + gen = 0; + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); btrfs_set_inode_size(leaf, item, inode->i_size); } + btrfs_set_inode_generation(leaf, item, gen); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); @@ -5448,42 +5459,63 @@ process: return 0; } -static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, - struct btrfs_path *path, u64 *size_ret) +static int get_inode_size_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 *size_ret) { struct btrfs_key key; + struct btrfs_inode_item *item; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - *size_ret = 0; - } else { - struct btrfs_inode_item *item; + /* + * Our caller called inode_logged(), so logged_trans is up to date. + * Use data_race() to silence any warning from KCSAN. Once logged_trans + * is set, it can only be reset to 0 after inode eviction. + */ + if (data_race(inode->logged_trans) == trans->transid) { + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + } else if (inode->generation < trans->transid) { + path->search_commit_root = true; + path->skip_locking = true; + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); + path->search_commit_root = false; + path->skip_locking = false; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - *size_ret = btrfs_inode_size(path->nodes[0], item); - /* - * If the in-memory inode's i_size is smaller then the inode - * size stored in the btree, return the inode's i_size, so - * that we get a correct inode size after replaying the log - * when before a power failure we had a shrinking truncate - * followed by addition of a new name (rename / new hard link). - * Otherwise return the inode size from the btree, to avoid - * data loss when replaying a log due to previously doing a - * write that expands the inode's size and logging a new name - * immediately after. - */ - if (*size_ret > inode->vfs_inode.i_size) - *size_ret = inode->vfs_inode.i_size; + } else { + *size_ret = 0; + return 0; } + /* + * If the inode was logged before or is from a past transaction, then + * its inode item must exist in the log root or in the commit root. + */ + ASSERT(ret <= 0); + if (WARN_ON_ONCE(ret > 0)) + ret = -ENOENT; + + if (ret < 0) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode size stored + * in the btree, return the inode's i_size, so that we get a correct + * inode size after replaying the log when before a power failure we had + * a shrinking truncate followed by addition of a new name (rename / new + * hard link). Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a write that + * expands the inode's size and logging a new name immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; + btrfs_release_path(path); return 0; } @@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = drop_inode_items(trans, log, path, inode, BTRFS_XATTR_ITEM_KEY); } else { - if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { + if (inode_only == LOG_INODE_EXISTS) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - ret = logged_inode_size(log, inode, path, &logged_isize); + ret = get_inode_size_to_log(trans, inode, path, &logged_isize); if (ret) goto out_unlock; } |
