Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/balloc.c          |   2
-rw-r--r--  fs/ext4/dir.c             |   8
-rw-r--r--  fs/ext4/ext4.h            |  50
-rw-r--r--  fs/ext4/ext4_jbd2.c       |   3
-rw-r--r--  fs/ext4/extents.c         |  28
-rw-r--r--  fs/ext4/extents_status.c  |  31
-rw-r--r--  fs/ext4/extents_status.h  |   2
-rw-r--r--  fs/ext4/hash.c            |   2
-rw-r--r--  fs/ext4/ialloc.c          |   1
-rw-r--r--  fs/ext4/inline.c          |  14
-rw-r--r--  fs/ext4/inode.c           | 193
-rw-r--r--  fs/ext4/ioctl.c           |  14
-rw-r--r--  fs/ext4/mballoc.c         | 188
-rw-r--r--  fs/ext4/mmp.c             |   8
-rw-r--r--  fs/ext4/move_extent.c     | 786
-rw-r--r--  fs/ext4/namei.c           |  18
-rw-r--r--  fs/ext4/orphan.c          |   8
-rw-r--r--  fs/ext4/readpage.c        |   7
-rw-r--r--  fs/ext4/super.c           |  72
-rw-r--r--  fs/ext4/sysfs.c           |   6
-rw-r--r--  fs/ext4/verity.c          |   2
-rw-r--r--  fs/ext4/xattr.c           |   6
22 files changed, 741 insertions, 708 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c9329ed5c094..8040c731b3e4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -752,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
- * fail EDQUOT for metdata, but we do account for it.
+ * fail EDQUOT for metadata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
dquot_alloc_block_nofail(inode,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d4164c507a90..256fe2c1d4c1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
continue;
}
if (err > 0) {
- pgoff_t index = map.m_pblk >>
- (PAGE_SHIFT - inode->i_blkbits);
+ pgoff_t index = map.m_pblk << inode->i_blkbits >>
+ PAGE_SHIFT;
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_mapping,
- &file->f_ra, file,
- index, 1);
+ &file->f_ra, file, index,
+ 1 << EXT4_SB(sb)->s_min_folio_order);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
if (IS_ERR(bh)) {
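
A minimal sketch (not part of the patch) of why the readahead index math above is rewritten: once the filesystem block size can exceed PAGE_SIZE, the old shift amount PAGE_SHIFT - i_blkbits goes negative, and a negative shift count is undefined behaviour in C. Going through a byte offset first is correct for any block size. The helper below is hypothetical, shown only to isolate the conversion:

static inline pgoff_t blk_to_page_index(u64 blk, unsigned int blkbits)
{
	/*
	 * Old form, valid only while blkbits <= PAGE_SHIFT:
	 *	return blk >> (PAGE_SHIFT - blkbits);
	 * Byte-based form, valid for any blkbits:
	 */
	return (blk << blkbits) >> PAGE_SHIFT;
}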
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 57087da6c7be..56112f201cac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -260,6 +260,7 @@ struct ext4_map_blocks {
ext4_lblk_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ u64 m_seq;
};
/*
@@ -367,7 +368,14 @@ struct ext4_io_submit {
blkbits))
#define EXT4_B_TO_LBLK(inode, offset) \
(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
-
+#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
+
+/* Translate a block number to a page index */
+#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \
+ PAGE_SHIFT)
+/* Translate a page index to a block number */
+#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \
+ (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
/* Translate a cluster number to a block number */
@@ -694,13 +702,22 @@ enum {
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
- /* caller is from the direct IO path, request to creation of an
- unwritten extents if not allocated, split the unwritten
- extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_PRE_IO 0x0008
-#define EXT4_GET_BLOCKS_CONVERT 0x0010
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
+ /*
+ * This means that we cannot merge newly allocated extents, and if we
+ * find an unwritten extent, we need to split it.
+ */
+#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008
+ /*
+ * Caller is from the dio or dioread_nolock buffered IO path; request
+ * creation of an unwritten extent if one does not exist, or split the
+ * found unwritten extent. Also do not merge the newly created
+ * unwritten extent; io end will convert unwritten to written
+ * and try to merge the written extent.
+ */
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Convert unwritten extent to initialized. */
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -1138,6 +1155,8 @@ struct ext4_inode_info {
ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
extents to shrink. Protected by
i_es_lock */
+ u64 i_es_seq; /* Change counter for extents.
+ Protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -1685,6 +1704,11 @@ struct ext4_sb_info {
/* record the last minlen when FITRIM is called. */
unsigned long s_last_trim_minblks;
+ /* minimum folio order of a page cache allocation */
+ u16 s_min_folio_order;
+ /* supported maximum folio order, 0 means not supported */
+ u16 s_max_folio_order;
+
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
@@ -2472,28 +2496,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len,
return (rec_len & ~EXT4_DIR_ROUND);
}
-/*
- * If we ever get support for fs block sizes > page_size, we'll need
- * to remove the #if statements in the next two functions...
- */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_SIZE >= 65536)
if (len == EXT4_MAX_REC_LEN || len == 0)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
-#else
- return len;
-#endif
}
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
-#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
if (len == blocksize) {
@@ -2503,9 +2518,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
return cpu_to_le16(0);
}
return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-#else
- return cpu_to_le16(len);
-#endif
}
/*
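
With the #if PAGE_SIZE guards gone, the helpers above use the extended rec_len encoding unconditionally. A self-contained userspace sketch of that encoding, assumed equivalent to the kernel helpers above: directory records are 4-byte aligned, so the low two bits of the on-disk __le16 are free to carry bits 16-17 of lengths up to the 256 KiB blocksize limit.

#include <assert.h>
#include <stdint.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* mirrors EXT4_MAX_REC_LEN */

static uint16_t rec_len_to_disk(unsigned int len, unsigned int blocksize)
{
	if (len < 65536)
		return len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	return (len & 65532) | ((len >> 16) & 3);
}

static unsigned int rec_len_from_disk(uint16_t dlen, unsigned int blocksize)
{
	if (dlen == MAX_REC_LEN || dlen == 0)
		return blocksize;
	return (dlen & 65532) | ((dlen & 3) << 16);
}

int main(void)
{
	/* a 128 KiB record in a 256 KiB block survives the round trip */
	assert(rec_len_from_disk(rec_len_to_disk(131072, 262144), 262144) == 131072);
	/* a full-block record is stored as 0 and decoded back */
	assert(rec_len_from_disk(rec_len_to_disk(262144, 262144), 262144) == 262144);
	return 0;
}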
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index a0e66bc10093..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC) &&
- !mapping_large_folio_support(inode->i_mapping))) {
+ !test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ca5499e9412b..2cf5759ba689 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -333,7 +333,7 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
int nofail)
{
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
- int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+ int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
@@ -2002,7 +2002,7 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
/* try to insert block into found extent and return */
- if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) {
/*
* Try to see whether we should rather test the extent on
@@ -2181,7 +2181,7 @@ has_space:
merge:
/* try to merge extents */
- if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, nearex);
/* time to correct all indexes above */
@@ -2213,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
while (block <= end) {
next = 0;
flags = 0;
- if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL))
break;
if (ext4_es_is_unwritten(&es))
flags |= FIEMAP_EXTENT_UNWRITTEN;
@@ -3224,7 +3224,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
else
ext4_ext_mark_initialized(ex);
- if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3368,7 +3368,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
- flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (unwritten)
split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
@@ -3721,10 +3721,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
>> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully inside i_size or new_size.
- */
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
@@ -3735,11 +3731,15 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
split_flag |= EXT4_EXT_DATA_VALID1;
/* Convert to initialized */
} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
+ */
split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
- flags |= EXT4_GET_BLOCKS_PRE_IO;
+ flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
return ext4_split_extent(handle, inode, path, map, split_flag, flags,
allocated);
}
@@ -3911,7 +3911,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
*allocated, newblock);
/* get_block() before submitting IO, split the extent */
- if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
path = ext4_split_convert_extents(handle, inode, map, path,
flags | EXT4_GET_BLOCKS_CONVERT, allocated);
if (IS_ERR(path))
@@ -4562,7 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- epos = (loff_t)(map.m_lblk + ret) << blkbits;
+ epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -5618,7 +5618,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
path = ext4_split_extent_at(handle, inode, path,
start_lblk, split_flag,
EXT4_EX_NOCACHE |
- EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_SPLIT_NOMERGE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 31dc0496f8d0..e04fbf10fe4f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
return es->es_lblk + es->es_len - 1;
}
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
+}
+
/*
 * search through the tree for a delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
- trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
@@ -955,6 +961,11 @@ retry:
}
pending = err3;
}
+ /*
+ * TODO: For caching on-disk extents, there is no need to increment
+ * the sequence counter; this requires future optimization.
+ */
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
/*
@@ -981,6 +992,7 @@ error:
if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_extent(inode, &newes);
ext4_es_print_tree(inode);
return;
}
@@ -1027,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 * Return: 1 if found, 0 if not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t *next_lblk,
- struct extent_status *es)
+ ext4_lblk_t *next_lblk, struct extent_status *es,
+ u64 *pseq)
{
struct ext4_es_tree *tree;
struct ext4_es_stats *stats;
@@ -1087,6 +1099,8 @@ out:
} else
*next_lblk = 0;
}
+ if (pseq)
+ *pseq = EXT4_I(inode)->i_es_seq;
} else {
percpu_counter_inc(&stats->es_stats_cache_misses);
}
@@ -1550,7 +1564,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1570,16 +1583,21 @@ retry:
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ if (err)
+ goto error;
/* Free preallocated extent if it didn't get used. */
if (es) {
if (!es->es_len)
__es_free_extent(es);
es = NULL;
}
+ ext4_es_inc_seq(inode);
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err)
goto retry;
+ trace_ext4_es_remove_extent(inode, lblk, len);
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
}
@@ -2140,8 +2158,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
- end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2196,11 +2212,14 @@ retry:
pr2 = NULL;
}
}
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
return;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8f9c008d11e8..f3396cf32b44 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -148,7 +148,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk,
- struct extent_status *es);
+ struct extent_status *es, u64 *pseq);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end);
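
A hedged sketch of the intended calling pattern for the new pseq argument, inferred from the ext4_map_blocks and move_extent changes in this patch: snapshot the sequence during the lookup, drop the lock, and compare against i_es_seq later to detect that the cached mapping went stale.

	u64 seq;
	struct extent_status es;

	if (ext4_es_lookup_extent(inode, lblk, NULL, &es, &seq)) {
		/* ... use the cached extent without i_es_lock held ... */
		if (seq != READ_ONCE(EXT4_I(inode)->i_es_seq)) {
			/*
			 * The extent status tree changed underneath us;
			 * the cached mapping may be stale, so retry.
			 */
		}
	}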
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 33cd5b6b02d5..48483cd015d3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -268,7 +268,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
combined_hash = fscrypt_fname_siphash(dir, &qname);
} else {
ext4_warning_inode(dir, "Siphash requires key");
- return -1;
+ return -EINVAL;
}
hash = (__u32)(combined_hash >> 32);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ba4fd9aba1c1..b20a1bf866ab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1293,7 +1293,6 @@ got:
ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
- ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = sbi->s_want_extra_isize;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1b094a4f3866..1f6bc05593df 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -418,7 +418,12 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
return -ENOSPC;
ext4_write_lock_xattr(inode, &no_expand);
-
+ /*
+ * ei->i_inline_size may have changed since the initial check
+ * if other xattrs were added. Recalculate to ensure
+ * ext4_update_inline_data() validates against current capacity.
+ */
+ (void) ext4_find_inline_data_nolock(inode);
if (ei->i_inline_off)
ret = ext4_update_inline_data(handle, inode, len);
else
@@ -446,9 +451,13 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (!ei->i_inline_off)
return 0;
+ down_write(&ei->i_data_sem);
+
error = ext4_get_inode_loc(inode, &is.iloc);
- if (error)
+ if (error) {
+ up_write(&ei->i_data_sem);
return error;
+ }
error = ext4_xattr_ibody_find(inode, &i, &is);
if (error)
@@ -487,6 +496,7 @@ out:
brelse(is.iloc.bh);
if (error == -ENODATA)
error = 0;
+ up_write(&ei->i_data_sem);
return error;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e99306a8f47c..0c466ccbed69 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode)
* the inode. Flush worker is ignoring it because of I_FREEING flag but
* we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list))
- inode_io_list_del(inode);
+ inode_io_list_del(inode);
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode)
if (!S_ISREG(inode->i_mode) ||
IS_NOQUOTA(inode) || IS_VERITY(inode) ||
is_special_ino(inode->i_sb, inode->i_ino) ||
- (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
ext4_verity_in_progress(inode))
return;
@@ -550,10 +549,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
retval = ext4_ind_map_blocks(handle, inode, map, flags);
-
- if (retval <= 0)
+ if (retval < 0)
return retval;
+ /* A hole? */
+ if (retval == 0)
+ goto out;
+
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode "
@@ -573,11 +575,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status, false);
- return retval;
+ } else {
+ retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
-
- return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
- orig_mlen);
+out:
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ return retval;
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -648,8 +652,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
- if (flags & EXT4_GET_BLOCKS_PRE_IO &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
if (ext4_es_is_written(&es))
return retval;
}
@@ -658,6 +662,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
return retval;
}
@@ -723,7 +728,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_check_map_extents_env(inode);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -810,7 +815,13 @@ found:
down_write(&EXT4_I(inode)->i_data_sem);
retval = ext4_map_create_blocks(handle, inode, map, flags);
up_write((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+
+ if (retval < 0)
+ ext_debug(inode, "failed with err %d\n", retval);
+ if (retval <= 0)
+ return retval;
+
+ if (map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
@@ -825,9 +836,8 @@ found:
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
- loff_t start_byte =
- (loff_t)map->m_lblk << inode->i_blkbits;
- loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+ loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk);
+ loff_t length = EXT4_LBLK_TO_B(inode, map->m_len);
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode,
@@ -839,12 +849,8 @@ found:
return ret;
}
}
- if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
- map->m_flags & EXT4_MAP_MAPPED))
- ext4_fc_track_range(handle, inode, map->m_lblk,
- map->m_lblk + map->m_len - 1);
- if (retval < 0)
- ext_debug(inode, "failed with err %d\n", retval);
+ ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk +
+ map->m_len - 1);
return retval;
}
@@ -1163,8 +1169,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
unsigned block_start, block_end;
sector_t block;
int err = 0;
- unsigned blocksize = inode->i_sb->s_blocksize;
- unsigned bbits;
+ unsigned int blocksize = i_blocksize(inode);
struct buffer_head *bh, *head, *wait[2];
int nr_wait = 0;
int i;
@@ -1173,12 +1178,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
BUG_ON(!folio_test_locked(folio));
BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
+ WARN_ON_ONCE(blocksize > folio_size(folio));
head = folio_buffers(folio);
if (!head)
head = create_empty_buffers(folio, blocksize, 0);
- bbits = ilog2(blocksize);
- block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
+ block = EXT4_PG_TO_LBLK(inode, folio->index);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
@@ -1319,8 +1324,8 @@ retry_grab:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
from = offset_in_folio(folio, pos);
to = from + len;
@@ -1908,7 +1913,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext4_check_map_extents_env(inode);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
map->m_len = min_t(unsigned int, map->m_len,
es.es_len - (map->m_lblk - es.es_lblk));
@@ -1961,7 +1966,7 @@ add_delayed:
* is held in write mode, before inserting a new da entry in
* the extent status tree.
*/
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
map->m_len = min_t(unsigned int, map->m_len,
es.es_len - (map->m_lblk - es.es_lblk));
@@ -1979,6 +1984,8 @@ add_delayed:
map->m_flags |= EXT4_MAP_DELAYED;
retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+ if (!retval)
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
up_write(&EXT4_I(inode)->i_data_sem);
return retval;
@@ -2225,7 +2232,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
ext4_lblk_t lblk = *m_lblk;
ext4_fsblk_t pblock = *m_pblk;
int err = 0;
- int blkbits = mpd->inode->i_blkbits;
ssize_t io_end_size = 0;
struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
@@ -2251,7 +2257,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
err = PTR_ERR(io_end_vec);
goto out;
}
- io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+ io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode,
+ mpd->map.m_lblk);
}
*map_bh = true;
goto out;
@@ -2261,7 +2268,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
- io_end_size += (1 << blkbits);
+ io_end_size += i_blocksize(mpd->inode);
} while (lblk++, (bh = bh->b_this_page) != head);
io_end_vec->size += io_end_size;
@@ -2291,15 +2298,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct folio_batch fbatch;
unsigned nr, i;
struct inode *inode = mpd->inode;
- int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
ext4_fsblk_t pblock;
int err;
bool map_bh = false;
- start = mpd->map.m_lblk >> bpp_bits;
- end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk);
+ end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1);
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2310,7 +2316,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
- lblk = folio->index << bpp_bits;
+ lblk = EXT4_PG_TO_LBLK(inode, folio->index);
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2463,7 +2469,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
io_end_vec = ext4_alloc_io_end_vec(io_end);
if (IS_ERR(io_end_vec))
return PTR_ERR(io_end_vec);
- io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -2613,16 +2619,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
- int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_folio(mpd->inode);
- if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(mpd->wbc);
mpd->map.m_len = 0;
mpd->next_pos = mpd->start_pos;
@@ -2652,7 +2654,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
*/
if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
mpd->wbc->nr_to_write <=
- mpd->map.m_len >> (PAGE_SHIFT - blkbits))
+ EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len))
goto out;
/* If we can't merge this page, we are done. */
@@ -2704,7 +2706,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->start_pos = folio_pos(folio);
- mpd->next_pos = folio_pos(folio) + folio_size(folio);
+ mpd->next_pos = folio_next_pos(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2730,8 +2732,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
mpage_folio_done(mpd, folio);
} else {
/* Add all dirty buffers to mpd */
- lblk = ((ext4_lblk_t)folio->index) <<
- (PAGE_SHIFT - blkbits);
+ lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index);
head = folio_buffers(folio);
err = mpage_process_page_bufs(mpd, head, head,
lblk);
@@ -3146,8 +3147,8 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
@@ -3473,7 +3474,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->i_private_list))
return true;
- return inode->i_state & I_DIRTY_DATASYNC;
+ return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
@@ -3503,8 +3504,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
iomap->bdev = inode->i_sb->s_bdev;
- iomap->offset = (u64) map->m_lblk << blkbits;
- iomap->length = (u64) map->m_len << blkbits;
+ iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
+ iomap->length = EXT4_LBLK_TO_B(inode, map->m_len);
if ((map->m_flags & EXT4_MAP_MAPPED) &&
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3544,7 +3545,7 @@ static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
ext4_lblk_t m_lblk = map->m_lblk;
unsigned int m_len = map->m_len;
unsigned int mapped_len = 0, m_flags = 0;
- ext4_fsblk_t next_pblk;
+ ext4_fsblk_t next_pblk = 0;
bool check_next_pblk = false;
int ret = 0;
@@ -3678,7 +3679,6 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
- u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
bool force_commit = false;
@@ -3737,7 +3737,7 @@ retry:
* i_disksize out to i_size. This could be beyond where direct I/O is
* happening and thus expose allocated blocks to direct I/O reads.
*/
- else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+ else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -4072,7 +4072,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = EXT4_PG_TO_LBLK(inode, folio->index);
bh = folio_buffers(folio);
if (!bh)
@@ -4157,9 +4157,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
struct inode *inode = mapping->host;
- unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize = inode->i_sb->s_blocksize;
- unsigned max = blocksize - (offset & (blocksize - 1));
+ unsigned int max = blocksize - (from & (blocksize - 1));
/*
* correct length if it does not fall between
@@ -4184,7 +4183,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
static int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
- unsigned offset = from & (PAGE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
@@ -4193,8 +4191,8 @@ static int ext4_block_truncate_page(handle_t *handle,
if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ blocksize = i_blocksize(inode);
+ length = blocksize - (from & (blocksize - 1));
return ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -4400,10 +4398,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
/*
* If the hole extends beyond i_size, set the hole to end after
- * the page that contains i_size.
+ * the block that contains i_size to save pointless tail block zeroing.
*/
- if (end > inode->i_size)
- end = round_up(inode->i_size, PAGE_SIZE);
+ if (end >= inode->i_size)
+ end = round_up(inode->i_size, sb->s_blocksize);
if (end > max_end)
end = max_end;
length = end - offset;
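
A worked example of the new rounding, assuming 64 KiB blocks on a 4 KiB-page system with i_size = 70000:

	loff_t end = round_up(70000, 65536);	/* 131072, block aligned */
	/*
	 * The old page-based rounding gave round_up(70000, 4096) = 73728,
	 * leaving a partial tail block whose remainder had to be zeroed.
	 */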
@@ -4552,7 +4550,7 @@ int ext4_truncate(struct inode *inode)
* or it's a completely new inode. In those cases we might not
* have i_rwsem locked because it's not necessary.
*/
- if (!(inode->i_state & (I_NEW|I_FREEING)))
+ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
@@ -5146,37 +5144,23 @@ error:
return -EFSCORRUPTED;
}
-static bool ext4_should_enable_large_folio(struct inode *inode)
+void ext4_set_inode_mapping_order(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ u16 min_order, max_order;
- if (!S_ISREG(inode->i_mode))
- return false;
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return false;
- if (ext4_has_feature_verity(sb))
- return false;
- if (ext4_has_feature_encrypt(sb))
- return false;
-
- return true;
-}
+ max_order = EXT4_SB(sb)->s_max_folio_order;
+ if (!max_order)
+ return;
-/*
- * Limit the maximum folio order to 2048 blocks to prevent overestimation
- * of reserve handle credits during the folio writeback in environments
- * where the PAGE_SIZE exceeds 4KB.
- */
-#define EXT4_MAX_PAGECACHE_ORDER(i) \
- umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
-void ext4_set_inode_mapping_order(struct inode *inode)
-{
- if (!ext4_should_enable_large_folio(inode))
+ min_order = EXT4_SB(sb)->s_min_folio_order;
+ if (!min_order && !S_ISREG(inode->i_mode))
return;
- mapping_set_folio_order_range(inode->i_mapping, 0,
- EXT4_MAX_PAGECACHE_ORDER(inode));
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ max_order = min_order;
+
+ mapping_set_folio_order_range(inode->i_mapping, min_order, max_order);
}
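
As a worked example (assuming, as the super.c side of this series is expected to arrange, s_min_folio_order = i_blkbits - PAGE_SHIFT when the block size exceeds the page size): with 64 KiB blocks on 4 KiB pages, min_order is 4, so every folio is at least 64 KiB, and a data-journalled inode is clamped to max_order == min_order so the journal credits reserved for writing back one folio stay bounded.

	/* illustrative values only: 64 KiB blocks, 4 KiB pages */
	min_order = 16 - 12;			/* folios of at least 64 KiB */
	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
		max_order = min_order;		/* large but fixed-size folios */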
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -5210,7 +5194,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
ret = check_igot_inode(inode, flags, function, line);
if (ret) {
iput(inode);
@@ -5288,7 +5272,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_projid = make_kprojid(&init_user_ns, i_projid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
@@ -5521,7 +5504,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (ret)
goto bad_inode;
brelse(iloc.bh);
-
+ /* Initialize the "no ACLs" state for the simple cases */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl)
+ cache_no_acl(inode);
unlock_new_inode(inode);
return inode;
@@ -5549,7 +5534,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
@@ -6552,14 +6537,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* dirty data which can be converted only after flushing the dirty
* data (and journalled aops don't know how to handle these cases).
*/
- if (val) {
- filemap_invalidate_lock(inode->i_mapping);
- err = filemap_write_and_wait(inode->i_mapping);
- if (err < 0) {
- filemap_invalidate_unlock(inode->i_mapping);
- return err;
- }
+ filemap_invalidate_lock(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
+ /* Before switching the inode journalling mode, evict all the page cache. */
+ truncate_pagecache(inode, 0);
alloc_ctx = ext4_writepages_down_write(inode->i_sb);
jbd2_journal_lock_updates(journal);
@@ -6579,17 +6564,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (err < 0) {
jbd2_journal_unlock_updates(journal);
ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ filemap_invalidate_unlock(inode->i_mapping);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
+ ext4_set_inode_mapping_order(inode);
jbd2_journal_unlock_updates(journal);
ext4_writepages_up_write(inode->i_sb, alloc_ctx);
-
- if (val)
- filemap_invalidate_unlock(inode->i_mapping);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a93a7baae990..7ce0fc40aec2 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1394,6 +1394,10 @@ static int ext4_ioctl_set_tune_sb(struct file *filp,
if (copy_from_user(&params, in, sizeof(params)))
return -EFAULT;
+ if (strnlen(params.mount_opts, sizeof(params.mount_opts)) ==
+ sizeof(params.mount_opts))
+ return -E2BIG;
+
if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0)
return -EOPNOTSUPP;
@@ -1641,16 +1645,6 @@ group_extend_out:
if (!(fd_file(donor)->f_mode & FMODE_WRITE))
return -EBADF;
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online defrag not supported with bigalloc");
- return -EOPNOTSUPP;
- } else if (IS_DAX(inode)) {
- ext4_msg(sb, KERN_ERR,
- "Online defrag not supported with DAX");
- return -EOPNOTSUPP;
- }
-
err = mnt_want_write_file(filp);
if (err)
return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9087183602e4..56d50fd3310b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -98,14 +98,14 @@
 * block bitmap and buddy information. The information is stored in the
* inode as:
*
- * { page }
+ * { folio }
* [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
- * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
- * blocksize) blocks. So it can have information regarding groups_per_page
- * which is blocks_per_page/2
+ * take up 2 blocks. A folio can contain blocks_per_folio (folio_size /
+ * blocksize) blocks. So it can have information regarding groups_per_folio
+ * which is blocks_per_folio/2
*
* The buddy cache inode is not stored on disk. The inode is thrown
* away when the filesystem is unmounted.
@@ -682,6 +682,24 @@ do { \
} \
} while (0)
+/*
+ * Perform a buddy integrity check with the following steps:
+ *
+ * 1. Top-down validation (from the highest order down to order 1, excluding the order-0 bitmap):
+ * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block),
+ * at most one of the two corresponding lower-order bits may be clear (free).
+ *
+ * 2. Order-0 (bitmap) validation, performed on bit pairs:
+ * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits
+ * must not be free (0).
+ * - If both bits in a pair are clear (0, free), then exactly one of the corresponding
+ * higher-order bits must be free (0).
+ *
+ * 3. Preallocation (pa) list validation:
+ * For each preallocated block (pa) in the group:
+ * - Verify that pa_pstart falls within the bounds of this block group.
+ * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1).
+ */
static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
const char *function, int line)
{
@@ -723,15 +741,6 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 1 */
- MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
- MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
-
- for (j = 0; j < (1 << order); j++) {
- k = (i * (1 << order)) + j;
- MB_CHECK_ASSERT(
- !mb_test_bit(k, e4b->bd_bitmap));
- }
count++;
}
MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
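
A toy model of the pair rule that the reworked order-0 loop below enforces (hypothetical bool arrays, one per order, where true means allocated): a fully free pair must be represented by exactly one free bit at some higher order, and an in-use pair by none.

	static void check_pair(bool *bitmap[], int nr_orders, int i)
	{
		bool in_use = bitmap[0][i] || bitmap[0][i + 1];
		int free_above = 0;

		for (int j = 1; j < nr_orders; j++)
			if (!bitmap[j][i >> j])
				free_above++;
		assert(free_above == !in_use);
	}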
@@ -747,15 +756,21 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
fragments++;
fstart = i;
}
- continue;
+ } else {
+ fstart = -1;
}
- fstart = -1;
- /* check used bits only */
- for (j = 0; j < e4b->bd_blkbits + 1; j++) {
- buddy2 = mb_find_buddy(e4b, j, &max2);
- k = i >> j;
- MB_CHECK_ASSERT(k < max2);
- MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+ if (!(i & 1)) {
+ int in_use, zero_bit_count = 0;
+
+ in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy);
+ for (j = 1; j < e4b->bd_blkbits + 2; j++) {
+ buddy2 = mb_find_buddy(e4b, j, &max2);
+ k = i >> j;
+ MB_CHECK_ASSERT(k < max2);
+ if (!mb_test_bit(k, buddy2))
+ zero_bit_count++;
+ }
+ MB_CHECK_ASSERT(zero_bit_count == !in_use);
}
}
MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
@@ -768,6 +783,8 @@ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+ if (!pa->pa_len)
+ continue;
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
MB_CHECK_ASSERT(groupnr == e4b->bd_group);
for (i = 0; i < pa->pa_len; i++)
@@ -1329,26 +1346,25 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 * block bitmap and buddy information. The information is
* stored in the inode as
*
- * { page }
+ * { folio }
* [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
- * So for each group we take up 2 blocks. A page can
- * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
- * So it can have information regarding groups_per_page which
- * is blocks_per_page/2
+ * So for each group we take up 2 blocks. A folio can
+ * contain blocks_per_folio (folio_size / blocksize) blocks.
+ * So it can have information regarding groups_per_folio which
+ * is blocks_per_folio/2
*
* Locking note: This routine takes the block group lock of all groups
- * for this page; do not hold this lock when calling this routine!
+ * for this folio; do not hold this lock when calling this routine!
*/
-
static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
unsigned int blocksize;
- int blocks_per_page;
- int groups_per_page;
+ int blocks_per_folio;
+ int groups_per_folio;
int err = 0;
int i;
ext4_group_t first_group, group;
@@ -1365,27 +1381,24 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
- blocks_per_page = PAGE_SIZE / blocksize;
+ blocks_per_folio = folio_size(folio) / blocksize;
+ WARN_ON_ONCE(!blocks_per_folio);
+ groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2);
mb_debug(sb, "init folio %lu\n", folio->index);
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
-
/* allocate buffer_heads to read bitmaps */
- if (groups_per_page > 1) {
- i = sizeof(struct buffer_head *) * groups_per_page;
+ if (groups_per_folio > 1) {
+ i = sizeof(struct buffer_head *) * groups_per_folio;
bh = kzalloc(i, gfp);
if (bh == NULL)
return -ENOMEM;
} else
bh = &bhs;
- first_group = folio->index * blocks_per_page / 2;
-
/* read all groups the folio covers into the cache */
- for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2;
+ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
if (group >= ngroups)
break;
@@ -1393,7 +1406,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
if (!grinfo)
continue;
/*
- * If page is uptodate then we came here after online resize
+ * If folio is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
* we must skip all initialized uptodate buddies on the folio,
* which may be currently in use by an allocating task.
@@ -1413,7 +1426,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
}
/* wait for I/O completion */
- for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
int err2;
if (!bh[i])
@@ -1423,8 +1436,8 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
err = err2;
}
- first_block = folio->index * blocks_per_page;
- for (i = 0; i < blocks_per_page; i++) {
+ first_block = EXT4_PG_TO_LBLK(inode, folio->index);
+ for (i = 0; i < blocks_per_folio; i++) {
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
@@ -1501,7 +1514,7 @@ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
out:
if (bh) {
- for (i = 0; i < groups_per_page; i++)
+ for (i = 0; i < groups_per_folio; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -1510,55 +1523,57 @@ out:
}
/*
- * Lock the buddy and bitmap pages. This make sure other parallel init_group
- * on the same buddy page doesn't happen whild holding the buddy page lock.
- * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
- * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
+ * Lock the buddy and bitmap folios. This makes sure other parallel init_group
+ * on the same buddy folio doesn't happen while holding the buddy folio lock.
+ * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap
+ * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+static int ext4_mb_get_buddy_folio_lock(struct super_block *sb,
ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
- int block, pnum, poff;
- int blocks_per_page;
+ int block, pnum;
struct folio *folio;
e4b->bd_buddy_folio = NULL;
e4b->bd_bitmap_folio = NULL;
- blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
* So for each group we need two blocks.
*/
block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
+ pnum = EXT4_LBLK_TO_PG(inode, block);
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
return PTR_ERR(folio);
BUG_ON(folio->mapping != inode->i_mapping);
+ WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
e4b->bd_bitmap_folio = folio;
- e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
+ e4b->bd_bitmap = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
- if (blocks_per_page >= 2) {
- /* buddy and bitmap are on the same page */
+ block++;
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+ if (folio_contains(folio, pnum)) {
+ /* buddy and bitmap are on the same folio */
return 0;
}
- /* blocks_per_page == 1, hence we need another page for the buddy */
- folio = __filemap_get_folio(inode->i_mapping, block + 1,
+ /* we need another folio for the buddy */
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
return PTR_ERR(folio);
BUG_ON(folio->mapping != inode->i_mapping);
+ WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
e4b->bd_buddy_folio = folio;
return 0;
}
-static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_folio) {
folio_unlock(e4b->bd_bitmap_folio);
@@ -1572,7 +1587,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
/*
* Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
* calling this routine!
*/
static noinline_for_stack
@@ -1592,14 +1607,14 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
/*
* This ensures that we don't reinit the buddy cache
- * page which map to the group from which we are already
+ * folio which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have pinned buddy page to page cache.
- * The call to ext4_mb_get_buddy_page_lock will mark the
- * page accessed.
+ * would have pinned buddy folio to page cache.
+ * The call to ext4_mb_get_buddy_folio_lock will mark the
+ * folio accessed.
*/
- ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
+ ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
@@ -1620,7 +1635,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
if (e4b.bd_buddy_folio == NULL) {
/*
* If both the bitmap and buddy are in
- * the same page we don't need to force
+ * the same folio we don't need to force
* init the buddy
*/
ret = 0;
@@ -1636,23 +1651,21 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
err:
- ext4_mb_put_buddy_page_lock(&e4b);
+ ext4_mb_put_buddy_folio_lock(&e4b);
return ret;
}
/*
* Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
* calling this routine!
*/
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b, gfp_t gfp)
{
- int blocks_per_page;
int block;
int pnum;
- int poff;
struct folio *folio;
int ret;
struct ext4_group_info *grp;
@@ -1662,7 +1675,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
might_sleep();
mb_debug(sb, "load group %u\n", group);
- blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
if (!grp)
return -EFSCORRUPTED;
@@ -1690,8 +1702,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
* So for each group we need two blocks.
*/
block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
+ pnum = EXT4_LBLK_TO_PG(inode, block);
/* Avoid locking the folio in the fast path ... */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
@@ -1723,7 +1734,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
goto err;
}
mb_cmp_bitmaps(e4b, folio_address(folio) +
- (poff * sb->s_blocksize));
+ offset_in_folio(folio,
+ EXT4_LBLK_TO_B(inode, block)));
}
folio_unlock(folio);
}
@@ -1739,12 +1751,18 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* Folios marked accessed already */
e4b->bd_bitmap_folio = folio;
- e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
+ e4b->bd_bitmap = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+ /* buddy and bitmap are on the same folio? */
+ if (folio_contains(folio, pnum)) {
+ folio_get(folio);
+ goto update_buddy;
+ }
+ /* we need another folio for the buddy */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
if (!IS_ERR(folio))
@@ -1779,9 +1797,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
goto err;
}
+update_buddy:
/* Folios marked accessed already */
e4b->bd_buddy_folio = folio;
- e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);
+ e4b->bd_buddy = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
return 0;
@@ -2224,7 +2244,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->ac_buddy = ret >> 16;
/*
- * take the page reference. We want the page to be pinned
+ * take the folio reference. We want the folio to be pinned
* so that we don't get a ext4_mb_init_cache_call for this
* group until we update the bitmap. That would mean we
* double allocate blocks. The reference is dropped
@@ -2930,7 +2950,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
return 0;
- /* This now checks without needing the buddy page */
+ /* This now checks without needing the buddy folio */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
if (!ac->ac_first_err)
@@ -3490,6 +3510,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
* this will avoid confusion if it ever shows up during debugging. */
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+ ext4_set_inode_mapping_order(sbi->s_buddy_cache);
+
for (i = 0; i < ngroups; i++) {
cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
@@ -4720,7 +4742,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
"ext4: mb_load_buddy failed (%d)", err))
/*
* This should never happen since we pin the
- * pages in the ext4_allocation_context so
+ * folios in the ext4_allocation_context so
* ext4_mb_load_buddy() should never fail.
*/
return;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index ab1ff51302fb..6f57c181ff77 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb,
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
- int err;
-
/*
* We protect against freezing so that we don't create dirty buffers
* on frozen filesystem.
*/
- sb_start_write(sb);
- err = write_mmp_block_thawed(sb, bh);
- sb_end_write(sb);
- return err;
+ scoped_guard(super_write, sb)
+ return write_mmp_block_thawed(sb, bh);
}
/*
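
scoped_guard(super_write, sb) brackets the statement with the freeze-protection pair and drops it automatically on every exit path; the open-coded form it replaces (the deleted lines above) was:

	int err;

	sb_start_write(sb);
	err = write_mmp_block_thawed(sb, bh);
	sb_end_write(sb);
	return err;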
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4b091c21908f..0550fd30fd10 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -13,28 +13,14 @@
#include "ext4.h"
#include "ext4_extents.h"
-/**
- * get_ext_path() - Find an extent path for designated logical block number.
- * @inode: inode to be searched
- * @lblock: logical block number to find an extent path
- * @path: pointer to an extent path
- *
- * ext4_find_extent wrapper. Return an extent path pointer on success,
- * or an error pointer on failure.
- */
-static inline struct ext4_ext_path *
-get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path *path)
-{
- path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
- if (IS_ERR(path))
- return path;
- if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_free_ext_path(path);
- return ERR_PTR(-ENODATA);
- }
- return path;
-}
+#include <trace/events/ext4.h>
+
+struct mext_data {
+ struct inode *orig_inode; /* Origin file inode */
+ struct inode *donor_inode; /* Donor file inode */
+ struct ext4_map_blocks orig_map;/* Origin file's move mapping */
+ ext4_lblk_t donor_lblk; /* Start block of the donor file */
+};
/**
 * ext4_double_down_write_data_sem() - write lock two inodes' i_data_sem
@@ -52,7 +38,6 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
} else {
down_write(&EXT4_I(second)->i_data_sem);
down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
-
}
}
@@ -71,59 +56,14 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
-/**
- * mext_check_coverage - Check that all extents in range has the same type
- *
- * @inode: inode in question
- * @from: block offset of inode
- * @count: block count to be checked
- * @unwritten: extents expected to be unwritten
- * @err: pointer to save error value
- *
- * Return 1 if all extents in range has expected type, and zero otherwise.
- */
-static int
-mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
- int unwritten, int *err)
-{
- struct ext4_ext_path *path = NULL;
- struct ext4_extent *ext;
- int ret = 0;
- ext4_lblk_t last = from + count;
- while (from < last) {
- path = get_ext_path(inode, from, path);
- if (IS_ERR(path)) {
- *err = PTR_ERR(path);
- return ret;
- }
- ext = path[ext_depth(inode)].p_ext;
- if (unwritten != ext4_ext_is_unwritten(ext))
- goto out;
- from += ext4_ext_get_actual_len(ext);
- }
- ret = 1;
-out:
- ext4_free_ext_path(path);
- return ret;
-}
-
-/**
- * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2
- *
- * @inode1: the inode structure
- * @inode2: the inode structure
- * @index1: folio index
- * @index2: folio index
- * @folio: result folio vector
- *
- * Grab two locked folio for inode's by inode order
- */
-static int
-mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
- pgoff_t index1, pgoff_t index2, struct folio *folio[2])
+/* Grab and lock a folio on each of @inode1 and @inode2, in inode order. */
+static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index1, pgoff_t index2, size_t len,
+ struct folio *folio[2])
{
struct address_space *mapping[2];
unsigned int flags;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -136,14 +76,15 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
}
flags = memalloc_nofs_save();
- folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
+ fgp_flags |= fgf_set_order(len);
+ folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
mapping_gfp_mask(mapping[0]));
if (IS_ERR(folio[0])) {
memalloc_nofs_restore(flags);
return PTR_ERR(folio[0]);
}
- folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
+ folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
mapping_gfp_mask(mapping[1]));
memalloc_nofs_restore(flags);
if (IS_ERR(folio[1])) {
@@ -164,8 +105,16 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
return 0;
}
+static void mext_folio_double_unlock(struct folio *folio[2])
+{
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ folio_unlock(folio[1]);
+ folio_put(folio[1]);
+}
+
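
fgf_set_order() encodes the preferred folio order for a byte length into the FGP flags, so __filemap_get_folio() can allocate a single folio covering the whole move range when the mapping supports it. A minimal usage sketch, assuming a byte count len and an existing mapping:

	fgf_t fgp_flags = FGP_WRITEBEGIN | fgf_set_order(len);
	struct folio *folio = __filemap_get_folio(mapping, index, fgp_flags,
						  mapping_gfp_mask(mapping));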
/* Force folio buffers uptodate w/o dropping folio's lock */
-static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
+static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to)
{
struct inode *inode = folio->mapping->host;
sector_t block;
@@ -238,267 +187,313 @@ out:
return 0;
}
-/**
- * move_extent_per_page - Move extent data per page
- *
- * @o_filp: file structure of original file
- * @donor_inode: donor inode
- * @orig_page_offset: page index on original file
- * @donor_page_offset: page index on donor file
- * @data_offset_in_page: block index where data swapping starts
- * @block_len_in_page: the number of blocks to be swapped
- * @unwritten: orig extent is unwritten or not
- * @err: pointer to save return value
- *
- * Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling ext4_swap_extents().
- * Finally, write out the saved data in new original inode blocks. Return
- * replaced block count.
+enum mext_move_type { MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA };
+
+/*
+ * Begin moving an extent between the origin inode and the donor inode:
+ * lock one folio for each inode and re-check the mapping status of the
+ * candidate moving extent under the folio locks.
*/
-static int
-move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
- pgoff_t orig_page_offset, pgoff_t donor_page_offset,
- int data_offset_in_page,
- int block_len_in_page, int unwritten, int *err)
+static int mext_move_begin(struct mext_data *mext, struct folio *folio[2],
+ enum mext_move_type *move_type)
{
- struct inode *orig_inode = file_inode(o_filp);
- struct folio *folio[2] = {NULL, NULL};
- handle_t *handle;
- ext4_lblk_t orig_blk_offset, donor_blk_offset;
- unsigned long blocksize = orig_inode->i_sb->s_blocksize;
- unsigned int tmp_data_size, data_size, replaced_size;
- int i, err2, jblocks, retries = 0;
- int replaced_count = 0;
- int from;
- int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
- struct super_block *sb = orig_inode->i_sb;
- struct buffer_head *bh = NULL;
+ struct inode *orig_inode = mext->orig_inode;
+ struct inode *donor_inode = mext->donor_inode;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ struct ext4_map_blocks donor_map = {0};
+ loff_t orig_pos, donor_pos;
+ size_t move_len;
+ int ret;
+
+ orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits;
+ donor_pos = ((loff_t)mext->donor_lblk) << blkbits;
+ ret = mext_folio_double_lock(orig_inode, donor_inode,
+ orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT,
+ ((size_t)mext->orig_map.m_len) << blkbits, folio);
+ if (ret)
+ return ret;
/*
- * It needs twice the amount of ordinary journal buffers because
- * inode and donor_inode may change each different metadata blocks.
+ * Check the origin inode's mapping information again under the
+ * folio lock; we do not hold i_data_sem the whole time, so the
+ * mapping may change under concurrent writeback.
*/
-again:
- *err = 0;
- jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
- block_len_in_page) * 2;
- handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+ if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) {
+ ret = -ESTALE;
+ goto error;
+ }
+
+ /* Adjust the moving length to the shorter of the two folios. */
+ move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos,
+ folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos);
+ move_len >>= blkbits;
+ if (move_len < mext->orig_map.m_len)
+ mext->orig_map.m_len = move_len;
+
+ donor_map.m_lblk = mext->donor_lblk;
+ donor_map.m_len = mext->orig_map.m_len;
+ donor_map.m_flags = 0;
+ ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0);
+ if (ret < 0)
+ goto error;
+
+ /* Adjust the moving length according to the donor mapping length. */
+ mext->orig_map.m_len = donor_map.m_len;
+
+ /* Skip moving if the donor range is a hole or a delalloc extent. */
+ if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)))
+ *move_type = MEXT_SKIP_EXTENT;
+ /* If both mapping ranges are unwritten, no need to copy data. */
+ else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) &&
+ (donor_map.m_flags & EXT4_MAP_UNWRITTEN))
+ *move_type = MEXT_MOVE_EXTENT;
+ else
+ *move_type = MEXT_COPY_DATA;
+
+ return 0;
+error:
+ mext_folio_double_unlock(folio);
+ return ret;
+}
+
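The move-type decision at the end of mext_move_begin() above amounts to a small classifier over the two ranges' mapping flags. A stand-alone restatement of that logic; mext_classify_sketch() is a hypothetical helper, not part of this patch:

static enum mext_move_type mext_classify_sketch(unsigned int orig_flags,
						unsigned int donor_flags)
{
	/* The donor range is a hole or a delalloc extent: nothing to swap. */
	if (!(donor_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)))
		return MEXT_SKIP_EXTENT;
	/* Both ranges are unwritten: swapping extents suffices, no copy. */
	if ((orig_flags & EXT4_MAP_UNWRITTEN) &&
	    (donor_flags & EXT4_MAP_UNWRITTEN))
		return MEXT_MOVE_EXTENT;
	/* At least one side carries data: copy through the page cache. */
	return MEXT_COPY_DATA;
}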
+/*
+ * Re-create the buffer mappings for the original inode's newly moved
+ * blocks and commit the entire written range.
+ */
+static int mext_folio_mkwrite(struct inode *inode, struct folio *folio,
+ size_t from, size_t to)
+{
+ unsigned int blocksize = i_blocksize(inode);
+ struct buffer_head *bh, *head;
+ size_t block_start, block_end;
+ sector_t block;
+ int ret;
+
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to)
+ continue;
+
+ ret = ext4_get_block(inode, block, bh, 0);
+ if (ret)
+ return ret;
+ } while (block++, (bh = bh->b_this_page) != head);
+
+ block_commit_write(folio, from, to);
+ return 0;
+}
+
+/*
+ * Save the data in the original inode's extent blocks, replace one
+ * folio-size-aligned original inode extent with one (possibly partial)
+ * donor inode extent, and then write the saved data out to the new
+ * original inode blocks. Pass the replaced block count back through
+ * m_len. Return 0 on success, or an error code otherwise.
+ */
+static int mext_move_extent(struct mext_data *mext, u64 *m_len)
+{
+ struct inode *orig_inode = mext->orig_inode;
+ struct inode *donor_inode = mext->donor_inode;
+ struct ext4_map_blocks *orig_map = &mext->orig_map;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ struct folio *folio[2] = {NULL, NULL};
+ loff_t from, length;
+ enum mext_move_type move_type = 0;
+ handle_t *handle;
+ u64 r_len = 0;
+ unsigned int credits;
+ int ret, ret2;
+
+ *m_len = 0;
+ trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode,
+ mext->donor_lblk);
+ credits = ext4_chunk_trans_extent(orig_inode, 0) * 2;
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits);
if (IS_ERR(handle)) {
- *err = PTR_ERR(handle);
- return 0;
+ ret = PTR_ERR(handle);
+ goto out;
}
- orig_blk_offset = orig_page_offset * blocks_per_page +
- data_offset_in_page;
-
- donor_blk_offset = donor_page_offset * blocks_per_page +
- data_offset_in_page;
-
- /* Calculate data_size */
- if ((orig_blk_offset + block_len_in_page - 1) ==
- ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
- /* Replace the last block */
- tmp_data_size = orig_inode->i_size & (blocksize - 1);
- /*
- * If data_size equal zero, it shows data_size is multiples of
- * blocksize. So we set appropriate value.
- */
- if (tmp_data_size == 0)
- tmp_data_size = blocksize;
-
- data_size = tmp_data_size +
- ((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else
- data_size = block_len_in_page << orig_inode->i_blkbits;
-
- replaced_size = data_size;
-
- *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset,
- donor_page_offset, folio);
- if (unlikely(*err < 0))
- goto stop_journal;
+ ret = mext_move_begin(mext, folio, &move_type);
+ if (ret)
+ goto stop_handle;
+
+ if (move_type == MEXT_SKIP_EXTENT)
+ goto unlock;
+
/*
- * If orig extent was unwritten it can become initialized
- * at any time after i_data_sem was dropped, in order to
- * serialize with delalloc we have recheck extent while we
- * hold page's lock, if it is still the case data copy is not
- * necessary, just swap data blocks between orig and donor.
+ * Copy the data. First, read the original inode data into the page
+ * cache. Then, release the existing mapping relationships and swap
+ * the extent. Finally, re-establish the new mapping relationships
+ * and dirty the page cache.
*/
- if (unwritten) {
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- /* If any of extents in range became initialized we have to
- * fallback to data copying */
- unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
- block_len_in_page, 1, err);
- if (*err)
- goto drop_data_sem;
-
- unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
- block_len_in_page, 1, err);
- if (*err)
- goto drop_data_sem;
-
- if (!unwritten) {
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto data_copy;
- }
- if (!filemap_release_folio(folio[0], 0) ||
- !filemap_release_folio(folio[1], 0)) {
- *err = -EBUSY;
- goto drop_data_sem;
- }
- replaced_count = ext4_swap_extents(handle, orig_inode,
- donor_inode, orig_blk_offset,
- donor_blk_offset,
- block_len_in_page, 1, err);
- drop_data_sem:
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto unlock_folios;
+ if (move_type == MEXT_COPY_DATA) {
+ from = offset_in_folio(folio[0],
+ ((loff_t)orig_map->m_lblk) << blkbits);
+ length = ((loff_t)orig_map->m_len) << blkbits;
+
+ ret = mext_folio_mkuptodate(folio[0], from, from + length);
+ if (ret)
+ goto unlock;
}
-data_copy:
- from = offset_in_folio(folio[0],
- orig_blk_offset << orig_inode->i_blkbits);
- *err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
- if (*err)
- goto unlock_folios;
-
- /* At this point all buffers in range are uptodate, old mapping layout
- * is no longer required, try to drop it now. */
+
if (!filemap_release_folio(folio[0], 0) ||
!filemap_release_folio(folio[1], 0)) {
- *err = -EBUSY;
- goto unlock_folios;
+ ret = -EBUSY;
+ goto unlock;
}
+
+ /* Move extent */
ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
- orig_blk_offset, donor_blk_offset,
- block_len_in_page, 1, err);
+ *m_len = ext4_swap_extents(handle, orig_inode, donor_inode,
+ orig_map->m_lblk, mext->donor_lblk,
+ orig_map->m_len, 1, &ret);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
- if (*err) {
- if (replaced_count) {
- block_len_in_page = replaced_count;
- replaced_size =
- block_len_in_page << orig_inode->i_blkbits;
- } else
- goto unlock_folios;
- }
- /* Perform all necessary steps similar write_begin()/write_end()
- * but keeping in mind that i_size will not change */
- bh = folio_buffers(folio[0]);
- if (!bh)
- bh = create_empty_buffers(folio[0],
- 1 << orig_inode->i_blkbits, 0);
- for (i = 0; i < from >> orig_inode->i_blkbits; i++)
- bh = bh->b_this_page;
- for (i = 0; i < block_len_in_page; i++) {
- *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
- if (*err < 0)
- goto repair_branches;
- bh = bh->b_this_page;
- }
- block_commit_write(folio[0], from, from + replaced_size);
+ /* A successful ext4_swap_extents() must never return a short length. */
+ if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len)))
+ ret = -EIO;
- /* Even in case of data=writeback it is reasonable to pin
- * inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_inode_add_write(handle, orig_inode,
- (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
+ if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT))
+ goto unlock;
-unlock_folios:
- folio_unlock(folio[0]);
- folio_put(folio[0]);
- folio_unlock(folio[1]);
- folio_put(folio[1]);
-stop_journal:
+ /* Copy data */
+ length = (*m_len) << blkbits;
+ ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length);
+ if (ret2) {
+ if (!ret)
+ ret = ret2;
+ goto repair_branches;
+ }
+ /*
+ * Even in the data=writeback case it is reasonable to pin the
+ * inode to the transaction, to prevent unexpected data loss.
+ */
+ ret2 = ext4_jbd2_inode_add_write(handle, orig_inode,
+ ((loff_t)orig_map->m_lblk) << blkbits, length);
+ if (!ret)
+ ret = ret2;
+unlock:
+ mext_folio_double_unlock(folio);
+stop_handle:
ext4_journal_stop(handle);
- if (*err == -ENOSPC &&
- ext4_should_retry_alloc(sb, &retries))
- goto again;
- /* Buffer was busy because probably is pinned to journal transaction,
- * force transaction commit may help to free it. */
- if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
- goto again;
- return replaced_count;
+out:
+ trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode,
+ mext->donor_lblk, orig_map->m_len, *m_len,
+ move_type, ret);
+ return ret;
repair_branches:
- /*
- * This should never ever happen!
- * Extents are swapped already, but we are not able to copy data.
- * Try to swap extents to it's original places
- */
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
- orig_blk_offset, donor_blk_offset,
- block_len_in_page, 0, &err2);
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- if (replaced_count != block_len_in_page) {
- ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
- EIO, "Unable to copy data block,"
- " data will be lost.");
- *err = -EIO;
+ ret2 = 0;
+ r_len = ext4_swap_extents(handle, donor_inode, orig_inode,
+ mext->donor_lblk, orig_map->m_lblk,
+ *m_len, 0, &ret2);
+ if (ret2 || r_len != *m_len) {
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk),
+ EIO, "Unable to copy data block, data will be lost!");
+ ret = -EIO;
}
- replaced_count = 0;
- goto unlock_folios;
+ *m_len = 0;
+ goto unlock;
}
-/**
- * mext_check_arguments - Check whether move extent can be done
- *
- * @orig_inode: original inode
- * @donor_inode: donor inode
- * @orig_start: logical start offset in block for orig
- * @donor_start: logical start offset in block for donor
- * @len: the number of blocks to be moved
- *
- * Check the arguments of ext4_move_extents() whether the files can be
- * exchanged with each other.
- * Return 0 on success, or a negative error value on failure.
+/*
+ * Check that the basic filesystem environment and both inodes support
+ * the move extent operation.
*/
-static int
-mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len)
+static int mext_check_validity(struct inode *orig_inode,
+ struct inode *donor_inode)
{
- __u64 orig_eof, donor_eof;
- unsigned int blkbits = orig_inode->i_blkbits;
- unsigned int blocksize = 1 << blkbits;
+ struct super_block *sb = orig_inode->i_sb;
+
+ /* origin and donor should be different inodes */
+ if (orig_inode == donor_inode) {
+ ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* origin and donor should belong to the same filesystem */
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
- orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
- donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+ if (IS_DAX(orig_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * TODO: it's not obvious how to swap blocks for inodes with full
+ * journaling enabled.
+ */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with data journaling");
+ return -EOPNOTSUPP;
+ }
+
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported for encrypted files");
+ return -EOPNOTSUPP;
+ }
+
+ /* Ext4 move extent supports only extent-based files */
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) ||
+ !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported for non-extent files");
+ return -EOPNOTSUPP;
+ }
if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
- ext4_debug("ext4 move extent: suid or sgid is set"
- " to donor file [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) {
+ ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EPERM;
+ }
/* Ext4 move extent does not support swap files */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
+ orig_inode->i_ino, donor_inode->i_ino);
return -ETXTBSY;
}
- if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
+ if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EOPNOTSUPP;
- }
-
- /* Ext4 move extent supports only extent based file */
- if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
- ext4_debug("ext4 move extent: orig file is not extents "
- "based file [ino:orig %lu]\n", orig_inode->i_ino);
- return -EOPNOTSUPP;
- } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
- ext4_debug("ext4 move extent: donor file is not extents "
- "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ orig_inode->i_ino, donor_inode->i_ino);
return -EOPNOTSUPP;
}
@@ -507,12 +502,25 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
+ return 0;
+}
+
+/*
+ * Check whether the moving range passed to ext4_move_extents() allows
+ * the files to be exchanged with each other, and adjust the length to
+ * fit within the file sizes. Return 0 on success, or a negative error
+ * value on failure.
+ */
+static int mext_check_adjust_range(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
+{
+ __u64 orig_eof, donor_eof;
+
/* Start offset should be same */
if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
(donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
- ext4_debug("ext4 move extent: orig and donor's start "
- "offsets are not aligned [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
+ ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -521,11 +529,14 @@ mext_check_arguments(struct inode *orig_inode,
(*len > EXT_MAX_BLOCKS) ||
(donor_start + *len >= EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
- ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
- "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
- orig_inode->i_ino, donor_inode->i_ino);
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n",
+ EXT_MAX_BLOCKS,
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
+
+ orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode));
+ donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode));
if (orig_eof <= orig_start)
*len = 0;
else if (orig_eof < orig_start + *len - 1)
@@ -535,9 +546,8 @@ mext_check_arguments(struct inode *orig_inode,
else if (donor_eof < donor_start + *len - 1)
*len = donor_eof - donor_start;
if (!*len) {
- ext4_debug("ext4 move extent: len should not be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
+ ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
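A worked example of the EOF clamping above, with assumed values only:

/*
 * Example: 4K blocks, orig i_size = 10000 bytes, so
 * orig_eof = EXT4_B_TO_LBLK(orig_inode, 10000)
 *          = round_up(10000, 4096) >> 12 = 3.
 * A request of orig_start = 1, *len = 10 then satisfies
 * orig_eof < orig_start + *len - 1 and is trimmed to
 * *len = orig_eof - orig_start = 2 blocks.
 */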
@@ -556,140 +566,81 @@ mext_check_arguments(struct inode *orig_inode,
*
* This function returns 0 and sets the moved block length in moved_len
* on success; otherwise it returns an error value.
- *
*/
-int
-ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
- __u64 donor_blk, __u64 len, __u64 *moved_len)
+int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+ __u64 donor_blk, __u64 len, __u64 *moved_len)
{
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
- struct ext4_ext_path *path = NULL;
- int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
- ext4_lblk_t o_end, o_start = orig_blk;
- ext4_lblk_t d_start = donor_blk;
+ struct mext_data mext;
+ struct super_block *sb = orig_inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int retries = 0;
+ u64 m_len;
int ret;
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* orig and donor should be different inodes */
- if (orig_inode == donor_inode) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same inode [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* Regular file check */
- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
- ext4_debug("ext4 move extent: The argument files should be "
- "regular file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* TODO: it's not obvious how to swap blocks for inodes with full
- journaling enabled */
- if (ext4_should_journal_data(orig_inode) ||
- ext4_should_journal_data(donor_inode)) {
- ext4_msg(orig_inode->i_sb, KERN_ERR,
- "Online defrag not supported with data journaling");
- return -EOPNOTSUPP;
- }
-
- if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
- ext4_msg(orig_inode->i_sb, KERN_ERR,
- "Online defrag not supported for encrypted files");
- return -EOPNOTSUPP;
- }
+ *moved_len = 0;
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
+ ret = mext_check_validity(orig_inode, donor_inode);
+ if (ret)
+ goto out;
+
/* Wait for all existing dio workers */
inode_dio_wait(orig_inode);
inode_dio_wait(donor_inode);
- /* Protect extent tree against block allocations via delalloc */
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- /* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
- donor_blk, &len);
+ /* Check and adjust the specified move_extent range. */
+ ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk,
+ donor_blk, &len);
if (ret)
goto out;
- o_end = o_start + len;
- *moved_len = 0;
- while (o_start < o_end) {
- struct ext4_extent *ex;
- ext4_lblk_t cur_blk, next_blk;
- pgoff_t orig_page_index, donor_page_index;
- int offset_in_page;
- int unwritten, cur_len;
-
- path = get_ext_path(orig_inode, o_start, path);
- if (IS_ERR(path)) {
- ret = PTR_ERR(path);
+ mext.orig_inode = orig_inode;
+ mext.donor_inode = donor_inode;
+ while (len) {
+ mext.orig_map.m_lblk = orig_blk;
+ mext.orig_map.m_len = len;
+ mext.orig_map.m_flags = 0;
+ mext.donor_lblk = donor_blk;
+
+ ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0);
+ if (ret < 0)
goto out;
- }
- ex = path[path->p_depth].p_ext;
- cur_blk = le32_to_cpu(ex->ee_block);
- cur_len = ext4_ext_get_actual_len(ex);
- /* Check hole before the start pos */
- if (cur_blk + cur_len - 1 < o_start) {
- next_blk = ext4_ext_next_allocated_block(path);
- if (next_blk == EXT_MAX_BLOCKS) {
- ret = -ENODATA;
- goto out;
+
+ /* Move only mapped or unwritten extents; skip holes and delalloc. */
+ if (mext.orig_map.m_flags &
+ (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) {
+ ret = mext_move_extent(&mext, &m_len);
+ *moved_len += m_len;
+ if (!ret)
+ goto next;
+
+ /* The move failed, possibly after partial progress. */
+ if (m_len) {
+ orig_blk += m_len;
+ donor_blk += m_len;
+ len -= m_len;
}
- d_start += next_blk - o_start;
- o_start = next_blk;
- continue;
- /* Check hole after the start pos */
- } else if (cur_blk > o_start) {
- /* Skip hole */
- d_start += cur_blk - o_start;
- o_start = cur_blk;
- /* Extent inside requested range ?*/
- if (cur_blk >= o_end)
- goto out;
- } else { /* in_range(o_start, o_blk, o_len) */
- cur_len += cur_blk - o_start;
+ if (ret == -ESTALE)
+ continue;
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(sb, &retries))
+ continue;
+ if (ret == -EBUSY &&
+ sbi->s_journal && retries++ < 4 &&
+ jbd2_journal_force_commit_nested(sbi->s_journal))
+ continue;
+
+ goto out;
}
- unwritten = ext4_ext_is_unwritten(ex);
- if (o_end - o_start < cur_len)
- cur_len = o_end - o_start;
-
- orig_page_index = o_start >> (PAGE_SHIFT -
- orig_inode->i_blkbits);
- donor_page_index = d_start >> (PAGE_SHIFT -
- donor_inode->i_blkbits);
- offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page - offset_in_page)
- cur_len = blocks_per_page - offset_in_page;
- /*
- * Up semaphore to avoid following problems:
- * a. transaction deadlock among ext4_journal_start,
- * ->write_begin via pagefault, and jbd2_journal_commit
- * b. racing with ->read_folio, ->write_begin, and
- * ext4_get_block in move_extent_per_page
- */
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- /* Swap original branches with new branches */
- *moved_len += move_extent_per_page(o_filp, donor_inode,
- orig_page_index, donor_page_index,
- offset_in_page, cur_len,
- unwritten, &ret);
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- if (ret < 0)
- break;
- o_start += cur_len;
- d_start += cur_len;
+next:
+ orig_blk += mext.orig_map.m_len;
+ donor_blk += mext.orig_map.m_len;
+ len -= mext.orig_map.m_len;
+ retries = 0;
}
out:
@@ -698,9 +649,6 @@ out:
ext4_discard_preallocations(donor_inode);
}
- ext4_free_ext_path(path);
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
-
return ret;
}
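For reference, ext4_move_extents() is driven from userspace through the EXT4_IOC_MOVE_EXT ioctl, as e4defrag does. A hedged sketch of a caller; the struct layout mirrors the one in fs/ext4/ext4.h, which userspace programs conventionally re-declare, and the block range used here is an arbitrary assumption:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

struct move_extent {
	uint32_t reserved;	/* must be zero */
	uint32_t donor_fd;	/* donor file descriptor */
	uint64_t orig_start;	/* logical start block in the orig file */
	uint64_t donor_start;	/* logical start block in the donor file */
	uint64_t len;		/* number of blocks to move */
	uint64_t moved_len;	/* filled in by the kernel */
};
#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

int main(int argc, char **argv)
{
	struct move_extent me = { .len = 256 };	/* assumed range */
	int orig, donor;

	if (argc < 3)
		return 1;
	orig = open(argv[1], O_RDONLY);
	donor = open(argv[2], O_WRONLY);	/* donor must be writable */
	if (orig < 0 || donor < 0)
		return 1;
	me.donor_fd = donor;
	if (ioctl(orig, EXT4_IOC_MOVE_EXT, &me) < 0)
		perror("EXT4_IOC_MOVE_EXT");
	/* moved_len reports progress even on a partial failure */
	printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
	close(orig);
	close(donor);
	return 0;
}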
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2cd36f59c9e3..c4b5e252af0e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1076,7 +1076,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
- (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ EXT4_LBLK_TO_B(dir, block)
+ ((char *)de - bh->b_data))) {
/* silently ignore the rest of the block */
break;
@@ -1630,7 +1630,7 @@ restart:
}
set_buffer_verified(bh);
i = search_dirblock(bh, dir, fname,
- block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+ EXT4_LBLK_TO_B(dir, block), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
ret = bh;
@@ -1710,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir)
{
- struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct buffer_head *bh;
ext4_lblk_t block;
@@ -1729,8 +1728,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
goto errout;
retval = search_dirblock(bh, dir, fname,
- block << EXT4_BLOCK_SIZE_BITS(sb),
- res_dir);
+ EXT4_LBLK_TO_B(dir, block), res_dir);
if (retval == 1)
goto success;
brelse(bh);
@@ -1762,7 +1760,7 @@ success:
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
struct buffer_head *bh;
if (dentry->d_name.len > EXT4_NAME_LEN)
@@ -1818,7 +1816,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- struct ext4_dir_entry_2 * de;
+ struct ext4_dir_entry_2 * de = NULL;
struct buffer_head *bh;
bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
@@ -3133,7 +3131,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
int retval;
struct inode *inode;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
handle_t *handle = NULL;
retval = ext4_emergency_state(dir->i_sb);
@@ -3224,7 +3222,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
{
int retval = -ENOENT;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
handle_t *handle;
int skip_remove_dentry = 0;
@@ -3688,7 +3686,7 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
{
int retval = -ENOENT;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 82d5e7501455..c9b93b670b0f 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -8,6 +8,8 @@
#include "ext4.h"
#include "ext4_jbd2.h"
+#define EXT4_MAX_ORPHAN_FILE_BLOCKS 512
+
static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
{
int i, j, start;
@@ -107,7 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
if (!sbi->s_journal || is_bad_inode(inode))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_inode_orphan_tracked(inode))
return 0;
@@ -232,7 +234,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
return ext4_orphan_file_del(handle, inode);
@@ -588,7 +590,7 @@ int ext4_init_orphan_info(struct super_block *sb)
* consuming absurd amounts of memory when pinning blocks of orphan
* file in memory.
*/
- if (inode->i_size > 8 << 20) {
+ if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) {
ext4_msg(sb, KERN_ERR, "orphan file too big: %llu",
(unsigned long long)inode->i_size);
ret = -EFSCORRUPTED;
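The new cap scales with the filesystem block size rather than being a fixed byte count; the numbers follow directly from the definition above:

/*
 * EXT4_MAX_ORPHAN_FILE_BLOCKS = 512, so:
 *   4K blocks  (i_blkbits = 12): 512 << 12 = 2MB
 *   64K blocks (i_blkbits = 16): 512 << 16 = 32MB
 */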
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f329daf6e5c7..e7f2350c725b 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -213,9 +213,7 @@ int ext4_mpage_readpages(struct inode *inode,
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
-
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t next_block;
sector_t block_in_file;
@@ -251,9 +249,8 @@ int ext4_mpage_readpages(struct inode *inode,
blocks_per_folio = folio_size(folio) >> blkbits;
first_hole = blocks_per_folio;
- block_in_file = next_block =
- (sector_t)folio->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + nr_pages * blocks_per_page;
+ block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index);
+ last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages);
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
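A worked example of the EXT4_PG_TO_LBLK() conversion above, under an assumed large-block geometry:

/*
 * Example (assumed geometry: 4K pages, 64K blocks, i_blkbits = 16):
 * EXT4_PG_TO_LBLK(inode, 32) = (32 << 12) >> 16 = 2, so sixteen pages
 * map onto each logical block. The old "index << (PAGE_SHIFT - blkbits)"
 * form would shift by a negative amount in this case.
 */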
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 33e7c08c9529..87205660c5d0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -698,7 +698,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
WARN_ON_ONCE(1);
if (!continue_fs && !ext4_emergency_ro(sb) && journal)
- jbd2_journal_abort(journal, -EIO);
+ jbd2_journal_abort(journal, -error);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, error, ino, block, func, line);
@@ -1396,6 +1396,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
inode_set_iversion(&ei->vfs_inode, 1);
ei->i_flags = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
spin_lock_init(&ei->i_raw_lock);
ei->i_prealloc_node = RB_ROOT;
atomic_set(&ei->i_prealloc_active, 0);
@@ -1406,6 +1407,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_es_all_nr = 0;
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
+ ei->i_es_seq = 0;
ei->i_reserved_data_blocks = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
@@ -2475,7 +2477,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
struct ext4_fs_context *m_ctx)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char s_mount_opts[65];
+ char s_mount_opts[64];
struct ext4_fs_context *s_ctx = NULL;
struct fs_context *fc = NULL;
int ret = -ENOMEM;
@@ -2483,7 +2485,8 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
if (!sbi->s_es->s_mount_opts[0])
return 0;
- strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts);
+ if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0)
+ return -E2BIG;
fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
if (!fc)
@@ -4188,7 +4191,7 @@ int ext4_calculate_overhead(struct super_block *sb)
unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
- char *buf = (char *) get_zeroed_page(GFP_NOFS);
+ char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO);
if (!buf)
return -ENOMEM;
@@ -4213,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb)
blks = count_overhead(sb, i, buf);
overhead += blks;
if (blks)
- memset(buf, 0, PAGE_SIZE);
+ memset(buf, 0, sb->s_blocksize);
cond_resched();
}
@@ -4236,7 +4239,7 @@ int ext4_calculate_overhead(struct super_block *sb)
}
sbi->s_overhead = overhead;
smp_wmb();
- free_page((unsigned long) buf);
+ kvfree(buf);
return 0;
}
@@ -4389,8 +4392,7 @@ static void ext4_set_def_opts(struct super_block *sb,
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- if (sb->s_blocksize <= PAGE_SIZE)
- set_opt(sb, DIOREAD_NOLOCK);
+ set_opt(sb, DIOREAD_NOLOCK);
}
static int ext4_handle_clustersize(struct super_block *sb)
@@ -5040,6 +5042,41 @@ static const char *ext4_has_journal_option(struct super_block *sb)
return NULL;
}
+/*
+ * Limit the maximum folio size to 2048 blocks to prevent overestimating
+ * the reserved handle credits during folio writeback in environments
+ * where PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(sb) \
+ umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT))
+static void ext4_set_max_mapping_order(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ sbi->s_max_folio_order = sbi->s_min_folio_order;
+ else
+ sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb);
+}
+
+static int ext4_check_large_folio(struct super_block *sb)
+{
+ const char *err_str = NULL;
+
+ if (ext4_has_feature_encrypt(sb))
+ err_str = "encrypt";
+
+ if (!err_str) {
+ ext4_set_max_mapping_order(sb);
+ } else if (sb->s_blocksize > PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s",
+ sb->s_blocksize, PAGE_SIZE, err_str);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
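Worked numbers for the EXT4_MAX_PAGECACHE_ORDER() cap above, assuming a 4K PAGE_SIZE and ignoring the umin() clamp against MAX_PAGECACHE_ORDER:

/*
 *   4K blocks: 11 + 12 - 12 = order 11 -> 8MB folios = 2048 blocks
 *   1K blocks: 11 + 10 - 12 = order 9  -> 2MB folios = 2048 blocks
 * The cap is therefore always 2048 blocks, independent of geometry.
 */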
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
int silent)
{
@@ -5107,11 +5144,8 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
* If the default block size is not the same as the real block size,
* we need to reload it.
*/
- if (sb->s_blocksize == blocksize) {
- *lsb = logical_sb_block;
- sbi->s_sbh = bh;
- return 0;
- }
+ if (sb->s_blocksize == blocksize)
+ goto success;
/*
* bh must be released before kill_bdev(), otherwise
@@ -5142,6 +5176,9 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
goto out;
}
+
+success:
+ sbi->s_min_folio_order = get_order(blocksize);
*lsb = logical_sb_block;
sbi->s_sbh = bh;
return 0;
@@ -5316,6 +5353,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_apply_options(fc, sb);
+ err = ext4_check_large_folio(sb);
+ if (err < 0)
+ goto failed_mount;
+
err = ext4_encoding_init(sb, es);
if (err)
goto failed_mount;
@@ -5842,7 +5883,7 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block)
ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
"journal bmap failed: block %llu ret %d\n",
*block, ret);
- jbd2_journal_abort(journal, ret ? ret : -EIO);
+ jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED);
return ret;
}
*block = map.m_pblk;
@@ -7412,7 +7453,8 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 987bd00f916a..0018e09b867e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -332,6 +332,9 @@ EXT4_ATTR_FEATURE(fast_commit);
#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
EXT4_ATTR_FEATURE(encrypted_casefold);
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+EXT4_ATTR_FEATURE(blocksize_gt_pagesize);
+#endif
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -352,6 +355,9 @@ static struct attribute *ext4_feat_attrs[] = {
#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
ATTR_LIST(encrypted_casefold),
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ATTR_LIST(blocksize_gt_pagesize),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index b0acb0c50313..415d9c4d8a32 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -302,7 +302,7 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
- desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+ desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk);
ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ce7253b3f549..2e02efbddaac 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1174,7 +1174,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
if (block_csum)
end = (void *)bh->b_data + bh->b_size;
else {
- ext4_get_inode_loc(parent, &iloc);
+ err = ext4_get_inode_loc(parent, &iloc);
+ if (err) {
+ EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err);
+ return;
+ }
end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
}