Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig                 |  11
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h   |   6
-rw-r--r--  fs/xfs/libxfs/xfs_rtgroup.h    |   6
-rw-r--r--  fs/xfs/scrub/common.c          |   2
-rw-r--r--  fs/xfs/scrub/inode_repair.c    |   2
-rw-r--r--  fs/xfs/scrub/nlinks.c          |  34
-rw-r--r--  fs/xfs/scrub/orphanage.c       |  13
-rw-r--r--  fs/xfs/scrub/parent.c          |   2
-rw-r--r--  fs/xfs/scrub/symlink_repair.c  |   2
-rw-r--r--  fs/xfs/scrub/xfarray.c         |   2
-rw-r--r--  fs/xfs/xfs_aops.c              |   7
-rw-r--r--  fs/xfs/xfs_bmap_util.c         |   2
-rw-r--r--  fs/xfs/xfs_buf.c               |   2
-rw-r--r--  fs/xfs/xfs_buf.h               |   1
-rw-r--r--  fs/xfs/xfs_discard.c           |   4
-rw-r--r--  fs/xfs/xfs_file.c              |  50
-rw-r--r--  fs/xfs/xfs_handle.c            |  56
-rw-r--r--  fs/xfs/xfs_health.c            |   4
-rw-r--r--  fs/xfs/xfs_icache.c            |   6
-rw-r--r--  fs/xfs/xfs_inode.c             |   6
-rw-r--r--  fs/xfs/xfs_inode_item.c        |   4
-rw-r--r--  fs/xfs/xfs_ioctl.c             |   6
-rw-r--r--  fs/xfs/xfs_iomap.c             | 120
-rw-r--r--  fs/xfs/xfs_iops.c              |   2
-rw-r--r--  fs/xfs/xfs_mount.h             |   1
-rw-r--r--  fs/xfs/xfs_reflink.h           |   2
-rw-r--r--  fs/xfs/xfs_super.c             |  58
-rw-r--r--  fs/xfs/xfs_zone_alloc.c        | 190
-rw-r--r--  fs/xfs/xfs_zone_gc.c           | 108
-rw-r--r--  fs/xfs/xfs_zone_priv.h         |   2
30 files changed, 453 insertions(+), 258 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 8930d5254e1d..b99da294e9a3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -119,6 +119,15 @@ config XFS_RT
See the xfs man page in section 5 for additional information.
+ This option is mandatory to support zoned block devices. For these
+ devices, the realtime subvolume must be backed by a zoned block
+ device and a regular block device used as the main device (for
+ metadata). If the zoned block device is a host-managed SMR hard-disk
+ containing conventional zones at the beginning of its address space,
+ XFS will use the disk conventional zones as the main device and the
+ remaining sequential write required zones as the backing storage for
+ the realtime subvolume.
+
If unsure, say N.
config XFS_DRAIN_INTENTS
@@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select DEBUG_FS
+ depends on DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index de840abc0bcd..57e47077c75a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -73,7 +73,8 @@
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
-#define XFS_ERRTAG_MAX 46
+#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
+#define XFS_ERRTAG_MAX 47
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
-XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
+XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
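The new tag follows the usual errortag pattern: the default random factor of 4 means that, once the tag is armed on a DEBUG kernel, the check fires on roughly one call in four. A minimal sketch of how a caller consumes it, condensed from the xfs_falloc_zero_range() hunk later in this diff:

	/* Randomly take the slower full-zeroing path for extra test coverage. */
	if (XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE))
		error = xfs_zero_range(ip, offset, len, ac, NULL);
	else
		error = xfs_free_file_space(ip, offset, len, ac);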
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index d36a6ae0abe5..d4fcf591e63d 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -50,6 +50,12 @@ struct xfs_rtgroup {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
+
+ /*
+ * Count of outstanding GC operations for zoned XFS. Any RTG with a
+ * non-zero rtg_gccount will not be picked as a new GC victim.
+ */
+ atomic_t rtg_gccount;
};
/*
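The counter is consumed by the victim-selection change in xfs_zone_gc.c further down in this diff; a condensed sketch of that gating logic:

	/* A zone that still has GC I/O in flight is not picked again. */
	if (atomic_read(&rtg->rtg_gccount)) {
		xfs_rtgroup_rele(rtg);
		continue;
	}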
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 2ef7742be7d3..7bfa37c99480 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1249,7 +1249,7 @@ xchk_irele(
* hits do not clear DONTCACHE, so we must do it here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
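This and the following inode_state_*() hunks are mechanical conversions away from open-coded i_state accesses. The helper definitions are not part of this diff; as an assumption, they are thin wrappers that pair READ_ONCE() readers with updates done under inode->i_lock (the *_raw variants skip the lock for inodes not yet visible to other threads), roughly along these lines:

	/* Assumed shape of the helpers, shown for illustration only. */
	static inline unsigned long inode_state_read_once(struct inode *inode)
	{
		return READ_ONCE(inode->i_state);
	}

	static inline void inode_state_clear(struct inode *inode, unsigned long flags)
	{
		lockdep_assert_held(&inode->i_lock);
		WRITE_ONCE(inode->i_state, inode->i_state & ~flags);
	}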
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index a90a011c7e5f..4f7040c9ddf0 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1933,7 +1933,7 @@ xrep_inode_pptr(
* Unlinked inodes that cannot be added to the directory tree will not
* have a parent pointer.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return 0;
/* Children of the superblock do not have parent pointers. */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 26721fab5cab..091c79e432e5 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -376,6 +376,36 @@ out_incomplete:
return error;
}
+static uint
+xchk_nlinks_ilock_dir(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /*
+ * We're going to scan the directory entries, so we must be ready to
+ * pull the data fork mappings into memory if they aren't already.
+ */
+ if (xfs_need_iread_extents(&ip->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * We're going to scan the parent pointers, so we must be ready to
+ * pull the attr fork mappings into memory if they aren't already.
+ */
+ if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) &&
+ xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * Take the IOLOCK so that other threads cannot start a directory
+ * update while we're scanning.
+ */
+ lock_mode |= XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(
return 0;
/* Prevent anyone from changing this directory while we walk it. */
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
- lock_mode = xfs_ilock_data_map_shared(dp);
+ lock_mode = xchk_nlinks_ilock_dir(dp);
/*
* The dotdot entry of an unlinked directory still points to the last
@@ -452,7 +481,6 @@ out_abort:
xchk_iscan_abort(&xnc->collect_iscan);
out_unlock:
xfs_iunlock(dp, lock_mode);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return error;
}
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 9c12cb844231..4e550a1d5353 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -152,11 +152,10 @@ xrep_orphanage_create(
}
/* Try to find the orphanage directory. */
- inode_lock_nested(root_inode, I_MUTEX_PARENT);
- orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
+ orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE));
if (IS_ERR(orphanage_dentry)) {
error = PTR_ERR(orphanage_dentry);
- goto out_unlock_root;
+ goto out_dput_root;
}
/*
@@ -167,10 +166,10 @@ xrep_orphanage_create(
*/
if (d_really_is_negative(orphanage_dentry)) {
orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
- orphanage_dentry, 0750);
+ orphanage_dentry, 0750, NULL);
error = PTR_ERR(orphanage_dentry);
if (IS_ERR(orphanage_dentry))
- goto out_unlock_root;
+ goto out_dput_orphanage;
}
/* Not a directory? Bail out. */
@@ -200,9 +199,7 @@ xrep_orphanage_create(
sc->orphanage_ilock_flags = 0;
out_dput_orphanage:
- dput(orphanage_dentry);
-out_unlock_root:
- inode_unlock(VFS_I(sc->mp->m_rootip));
+ end_creating(orphanage_dentry);
out_dput_root:
dput(root_dentry);
out:
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 3b692c4acc1e..11d5de10fd56 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -915,7 +915,7 @@ xchk_pptr_looks_zapped(
* Temporary files that cannot be linked into the directory tree do not
* have attr forks because they cannot ever have parents.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return false;
/*
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 5902398185a8..df629892462f 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -184,7 +184,7 @@ xrep_symlink_salvage_inline(
sc->ip->i_disk_size == 1 && old_target[0] == '?')
return 0;
- nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes);
memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index cdd13ed9c569..ed2e8c64b1a8 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -834,7 +834,7 @@ xfarray_sort_scan(
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
- next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ next_pos = folio_next_pos(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a26f79815533..56a544638491 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -271,7 +271,7 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
+ folio_next_pos(folio), NULL);
}
/*
@@ -742,14 +742,15 @@ xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
+ return 0;
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops);
+ iomap_bio_readahead(rac, &xfs_read_iomap_ops);
}
static int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e43..2208a720ec3f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -514,7 +514,7 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- if (!(VFS_I(ip)->i_state & I_FREEING))
+ if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 773d959965dc..47edf3041631 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1751,7 +1751,7 @@ xfs_init_buftarg(
const char *descr)
{
/* The maximum size of the buftarg is only known once the sb is read. */
- btp->bt_nr_sectors = (xfs_daddr_t)-1;
+ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8fa7bdf59c91..e25cd2a160f3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;
*/
struct xfs_buf;
+#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX)
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index ee49f20875af..6917de832191 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -726,8 +726,10 @@ xfs_trim_rtgroup_extents(
break;
}
- if (!tr.queued)
+ if (!tr.queued) {
+ kfree(tr.extents);
break;
+ }
/*
* We hand the extent list to the discard function here so the
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2702fef2c90c..6108612182e2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -674,8 +676,17 @@ xfs_file_dio_write_aligned(
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
ssize_t ret;
+ /*
+ * For always COW inodes, each bio must be aligned to the file system
+ * block size and not just the device sector size because we need to
+ * allocate a block-aligned amount of space for each write.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -693,7 +704,7 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
xfs_iunlock(ip, iolock);
return ret;
@@ -890,15 +901,7 @@ xfs_file_dio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /*
- * For always COW inodes we also must check the alignment of each
- * individual iovec segment, as they could end up with different
- * I/Os due to the way bio_iov_iter_get_pages works, and we'd
- * then overwrite an already written block.
- */
- if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
- (xfs_is_always_cow_inode(ip) &&
- (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
@@ -1254,23 +1257,36 @@ xfs_falloc_zero_range(
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
- trace_xfs_zero_file_space(XFS_I(inode));
+ trace_xfs_zero_file_space(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
- if (error)
- return error;
+ /*
+ * Zero range implements a full zeroing mechanism but is only used in
+ * limited situations. It is more efficient to allocate unwritten
+ * extents than to perform zeroing here, so use an errortag to randomly
+ * force zeroing on DEBUG kernels for added test coverage.
+ */
+ if (XFS_TEST_ERROR(ip->i_mount,
+ XFS_ERRTAG_FORCE_ZERO_RANGE)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
- len = round_up(offset + len, blksize) - round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
+ }
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index f19fce557354..5a3e3bf4e7cc 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -233,14 +233,11 @@ xfs_open_by_handle(
xfs_fsop_handlereq_t *hreq)
{
const struct cred *cred = current_cred();
- int error;
- int fd;
int permflag;
- struct file *filp;
struct inode *inode;
struct dentry *dentry;
fmode_t fmode;
- struct path path;
+ struct path path __free(path_put) = {};
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -249,12 +246,11 @@ xfs_open_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
inode = d_inode(dentry);
+ path.dentry = dentry;
/* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -EPERM;
- goto out_dput;
- }
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EPERM;
#if BITS_PER_LONG != 32
hreq->oflags |= O_LARGEFILE;
@@ -263,48 +259,30 @@ xfs_open_by_handle(
permflag = hreq->oflags;
fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ (fmode & FMODE_WRITE) && IS_APPEND(inode))
+ return -EPERM;
- if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode))
+ return -EPERM;
/* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -EISDIR;
- goto out_dput;
- }
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE))
+ return -EISDIR;
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- error = fd;
- goto out_dput;
- }
+ path.mnt = mntget(parfilp->f_path.mnt);
- path.mnt = parfilp->f_path.mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, hreq->oflags, cred);
- dput(dentry);
- if (IS_ERR(filp)) {
- put_unused_fd(fd);
- return PTR_ERR(filp);
- }
+ FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred));
+ if (fdf.err)
+ return fdf.err;
if (S_ISREG(inode->i_mode)) {
+ struct file *filp = fd_prepare_file(fdf);
+
filp->f_flags |= O_NOATIME;
filp->f_mode |= FMODE_NOCMTIME;
}
- fd_install(fd, filp);
- return fd;
-
- out_dput:
- dput(dentry);
- return error;
+ return fd_publish(fdf);
}
int
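The rewrite above relies on the scope-based cleanup helpers from <linux/cleanup.h>: a variable annotated with __free() has its cleanup routine run automatically on every return path, which is what lets the error cases collapse into plain return statements. A generic sketch of the idiom, using the well-known kfree cleanup class rather than the path_put class used above:

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	static int example_alloc(size_t len)
	{
		/* Freed automatically when the variable goes out of scope. */
		void *buf __free(kfree) = kmalloc(len, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;
		if (len < 16)
			return -EINVAL;		/* no explicit kfree() needed */
		return 0;
	}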
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 7c541fb373d5..3c1557fb1cf0 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -285,7 +285,7 @@ xfs_inode_mark_sick(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
@@ -309,7 +309,7 @@ xfs_inode_mark_corrupt(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e44040206851..f3fc4d21bfe1 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -334,7 +334,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
- unsigned long state = inode->i_state;
+ unsigned long state = inode_state_read_once(inode);
error = inode_init_always(mp->m_super, inode);
@@ -345,7 +345,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_state = state;
+ inode_state_assign_raw(inode, state);
mapping_set_folio_min_order(inode->i_mapping,
M_IGEO(mp)->min_folio_order);
return error;
@@ -411,7 +411,7 @@ xfs_iget_recycle(
ip->i_flags |= XFS_INEW;
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
- inode->i_state = I_NEW;
+ inode_state_assign_raw(inode, I_NEW);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 36b39539e561..f1f88e48fe22 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next(
next_ip->i_prev_unlinked = prev_agino;
trace_xfs_iunlink_reload_next(next_ip);
rele:
- ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE));
if (xfs_is_quotacheck_running(mp) && next_ip)
xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
xfs_irele(next_ip);
@@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout(
*/
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
- VFS_I(tmpfile)->i_state |= I_LINKABLE;
+ inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE);
*wip = tmpfile;
return 0;
@@ -2330,7 +2330,7 @@ retry:
* flag from the inode so it doesn't accidentally get misused in
* future.
*/
- VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
+ inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE);
}
out_commit:
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 1bd411a1114c..2eb0c6011a2e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -113,9 +113,9 @@ xfs_inode_item_precommit(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a6bb7ee7a27a..59eaad774371 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1408,10 +1408,8 @@ xfs_file_ioctl(
trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
- sb_start_write(mp->m_super);
- error = xfs_blockgc_free_space(mp, &icw);
- sb_end_write(mp->m_super);
- return error;
+ guard(super_write)(mp->m_super);
+ return xfs_blockgc_free_space(mp, &icw);
}
case XFS_IOC_EXCHANGE_RANGE:
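guard() comes from the same <linux/cleanup.h> machinery: a guard class pairs an acquire with a release that runs automatically at the end of the scope, so sb_end_write() can no longer be missed on an early return. A sketch of how such a class is typically declared (the actual super_write guard definition is not part of this diff, so the exact form is an assumption):

	/* Hypothetical declaration, shown for illustration. */
	DEFINE_GUARD(super_write, struct super_block *,
		     sb_start_write(_T), sb_end_write(_T))

	/* ...which lets the ioctl body read: */
	guard(super_write)(mp->m_super);
	return xfs_blockgc_free_space(mp, &icw);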
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d3f6e3e42a11..04f39ea15898 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1091,6 +1091,29 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+#ifdef DEBUG
+static void
+xfs_check_atomic_cow_conversion(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb,
+ const struct xfs_bmbt_irec *cmap)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec cmap2 = { };
+
+ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2))
+ xfs_trim_extent(&cmap2, offset_fsb, count_fsb);
+
+ ASSERT(cmap2.br_startoff == cmap->br_startoff);
+ ASSERT(cmap2.br_blockcount == cmap->br_blockcount);
+ ASSERT(cmap2.br_startblock == cmap->br_startblock);
+ ASSERT(cmap2.br_state == cmap->br_state);
+}
+#else
+# define xfs_check_atomic_cow_conversion(...) ((void)0)
+#endif
+
static int
xfs_atomic_write_cow_iomap_begin(
struct inode *inode,
@@ -1102,9 +1125,10 @@ xfs_atomic_write_cow_iomap_begin(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
- xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ const xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ xfs_filblks_t hole_count_fsb;
int nmaps = 1;
xfs_filblks_t resaligned;
struct xfs_bmbt_irec cmap;
@@ -1130,7 +1154,7 @@ xfs_atomic_write_cow_iomap_begin(
return -EAGAIN;
trace_xfs_iomap_atomic_write_cow(ip, offset, length);
-
+retry:
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
@@ -1141,14 +1165,22 @@ xfs_atomic_write_cow_iomap_begin(
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+
+ /*
+ * cmap could extend outside the write range due to previous
+ * speculative preallocations. We must trim cmap to the write
+ * range because the cow fork treats written mappings to mean
+ * "write in progress".
+ */
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
- end_fsb = cmap.br_startoff;
- count_fsb = end_fsb - offset_fsb;
+ hole_count_fsb = cmap.br_startoff - offset_fsb;
- resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb,
xfs_get_cowextsz_hint(ip));
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1169,8 +1201,10 @@ xfs_atomic_write_cow_iomap_begin(
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
- xfs_trim_extent(&cmap, offset_fsb, count_fsb);
xfs_trans_cancel(tp);
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
@@ -1182,7 +1216,7 @@ xfs_atomic_write_cow_iomap_begin(
* atomic writes to that same range will be aligned (and don't require
* this COW-based method).
*/
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
if (error) {
@@ -1195,21 +1229,43 @@ xfs_atomic_write_cow_iomap_begin(
if (error)
goto out_unlock;
+ /*
+ * cmap could map more blocks than the range we passed into bmapi_write
+ * because of EXTSZALIGN or adjacent pre-existing unwritten mappings
+ * that were merged. Trim cmap to the original write range so that we
+ * don't convert more than we were asked to do for this write.
+ */
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+
found:
if (cmap.br_state != XFS_EXT_NORM) {
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
- count_fsb);
+ error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff,
+ cmap.br_blockcount);
if (error)
goto out_unlock;
cmap.br_state = XFS_EXT_NORM;
+ xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb,
+ &cmap);
}
- length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
- trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+convert_delay:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap,
+ NULL);
+ if (error)
+ return error;
+
+ /*
+ * Try the lookup again, because the delalloc conversion might have
+ * turned the COW mapping into unwritten, but we need it to be in
+ * written state.
+ */
+ goto retry;
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -1702,6 +1758,8 @@ xfs_buffered_write_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
+ iomap);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1767,21 +1825,41 @@ xfs_buffered_write_iomap_begin(
}
/*
- * For zeroing, trim a delalloc extent that extends beyond the EOF
- * block. If it starts beyond the EOF block, convert it to an
+ * For zeroing, trim extents that extend beyond the EOF block. If a
+ * delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
- if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
- isnullstartblock(imap.br_startblock)) {
+ if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+ u64 end;
- if (offset_fsb >= eof_fsb)
+ if (isnullstartblock(imap.br_startblock) &&
+ offset_fsb >= eof_fsb)
goto convert_delay;
- if (end_fsb > eof_fsb) {
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
end_fsb = eof_fsb;
- xfs_trim_extent(&imap, offset_fsb,
- end_fsb - offset_fsb);
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF.
+ * Providing this bypasses the flush iomap uses to trigger
+ * extent conversion when unwritten mappings have dirty
+ * pagecache in need of zeroing.
+ *
+ * Trim the mapping to the end pos of the lookup, which in turn
+ * was trimmed to the end of the batch if it became full before
+ * the end of the mapping.
+ */
+ if (imap.br_state == XFS_EXT_UNWRITTEN &&
+ offset_fsb < eof_fsb) {
+ loff_t len = min(count,
+ XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+ end = iomap_fill_dirty_folios(iter, offset, len);
+ end_fsb = min_t(xfs_fileoff_t, end_fsb,
+ XFS_B_TO_FSB(mp, end));
}
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
}
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index caff0125faea..ad94fbf55014 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1420,7 +1420,7 @@ xfs_setup_inode(
bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
- inode->i_state |= I_NEW;
+ inode_state_set_raw(inode, I_NEW);
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 36cda724da89..9d1ed9bb0bee 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
- if ((inode->i_state & I_DIRTY_PAGES) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_PAGES) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
atomic_read(&inode->i_dio_count))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..bc71aa9dcee8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = {
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
+ Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
@@ -114,7 +114,21 @@ enum {
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
+#define fsparam_dead(NAME) \
+ __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL)
+
static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ /*
+ * These mount options were supposed to be deprecated in September 2025
+ * but the deprecation warning was buggy, so not all users were
+ * notified. The deprecation is now obnoxiously loud and postponed to
+ * September 2030.
+ */
+ fsparam_dead("attr2"),
+ fsparam_dead("noattr2"),
+ fsparam_dead("ikeep"),
+ fsparam_dead("noikeep"),
+
fsparam_u32("logbufs", Opt_logbufs),
fsparam_string("logbsize", Opt_logbsize),
fsparam_string("logdev", Opt_logdev),
@@ -786,6 +800,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
@@ -1373,16 +1393,25 @@ suffix_kstrtoull(
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
- struct fs_parameter *param,
- uint64_t flag,
- bool value)
+ struct fs_parameter *param)
{
- /* Don't print the warning if reconfiguring and current mount point
- * already had the flag set
+ /*
+ * Always warn about someone passing in a deprecated mount option.
+ * Previously we wouldn't print the warning if we were reconfiguring
+ * and current mount point already had the flag set, but that was not
+ * the right thing to do.
+ *
+ * Many distributions mount the root filesystem with no options in the
+ * initramfs and rely on mount -a to remount the root fs with the
+ * options in fstab. However, the old behavior meant that there would
+ * never be a warning about deprecated mount options for the root fs in
+ * /etc/fstab. On a single-fs system, that means no warning at all.
+ *
+ * Compounding this problem are distribution scripts that copy
+ * /proc/mounts to fstab, which means that we can't remove mount
+ * options unless we're 100% sure they have only ever been advertised
+ * in /proc/mounts in response to explicitly provided mount options.
*/
- if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
- !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
- return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@@ -1408,6 +1437,9 @@ xfs_fs_parse_param(
return opt;
switch (opt) {
+ case Op_deprecated:
+ xfs_fs_warn_deprecated(fc, param);
+ return 0;
case Opt_logbufs:
parsing_mp->m_logbufs = result.uint_32;
return 0;
@@ -1528,7 +1560,6 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
- /* Following mount options will be removed in September 2025 */
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1662,7 +1693,10 @@ xfs_fs_fill_super(
if (error)
return error;
- sb_min_blocksize(sb, BBSIZE);
+ if (!sb_min_blocksize(sb, BBSIZE)) {
+ xfs_err(mp, "unable to set blocksize");
+ return -EINVAL;
+ }
sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
#ifdef CONFIG_XFS_QUOTA
@@ -2221,7 +2255,7 @@ xfs_init_fs_context(
struct xfs_mount *mp;
int i;
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
return -ENOMEM;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..8dde444596f1 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
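Freeing open zones through call_rcu() is what makes the lockless cached-zone lookup added later in this diff safe: a reader that finds the pointer under rcu_read_lock() can race with the final xfs_open_zone_put(), so it must take its reference with atomic_inc_not_zero() and rely on the grace period to keep the memory valid. A condensed sketch of that reader side, mirroring xfs_get_cached_zone() below:

	rcu_read_lock();
	oz = VFS_I(ip)->i_private;
	if (oz && !atomic_inc_not_zero(&oz->oz_ref))
		oz = NULL;	/* lost the race with the last put */
	rcu_read_unlock();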
@@ -238,6 +246,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished and these blocks will be moved out eventually.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -599,7 +615,7 @@ xfs_select_open_zone_mru(
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, false))
+ if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -614,14 +630,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
- * Try to pack inodes that are written back after they were closed tight instead
- * of trying to open new zones for them or spread them to the least recently
- * used zone. This optimizes the data layout for workloads that untar or copy
- * a lot of small files. Right now this does not separate multiple such
+ * Try to tightly pack small files that are written back after they were closed
+ * instead of trying to open new zones for them or spread them to the least
+ * recently used zone. This optimizes the data layout for workloads that untar
+ * or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
+ struct xfs_mount *mp = ip->i_mount;
+ size_t zone_capacity =
+ XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /*
+ * Do not pack writes to files that are already a full zone or larger,
+ * to avoid fragmentation.
+ */
+ if (i_size_read(VFS_I(ip)) >= zone_capacity)
+ return false;
+
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
@@ -746,97 +773,54 @@ xfs_mark_rtg_boundary(
}
/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
-/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
*
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
static void
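Publishing a cache entry is the mirror image of the lookup: take a reference on behalf of i_private before the pointer becomes visible, swap it in with xchg(), and drop the reference that belonged to whatever was cached before. A condensed sketch of the ownership flow from the hunk above:

	atomic_inc(&oz->oz_ref);			/* reference now owned by i_private */
	old_oz = xchg(&VFS_I(ip)->i_private, oz);
	if (old_oz)
		xfs_open_zone_put(old_oz);		/* release the previous cache entry */

The i_private reference is dropped either here, when a newer zone replaces it, or in xfs_fs_evict_inode() when the inode is torn down (see the xfs_super.c hunk above).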
@@ -880,15 +864,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +949,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1215,6 +1204,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1245,10 +1235,33 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that, both to reduce seeks and, more
+ * importantly, so that workloads that write files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase it to the zone size, capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also gets applied to non-zoned files on the data device if
+ * there are any. On a typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
@@ -1260,8 +1273,10 @@ xfs_mount_zones(
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
error = xfs_init_zone(&iz, rtg, NULL);
- if (error)
+ if (error) {
+ xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
+ }
}
}
@@ -1279,14 +1294,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1300,5 +1307,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 064cd1a857a0..4ade54445532 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -114,6 +114,8 @@ struct xfs_gc_bio {
/* Open Zone being written to */
struct xfs_open_zone *oz;
+ struct xfs_rtgroup *victim_rtg;
+
/* Bio used for reads and writes, including the bvec used by it */
struct bio_vec bv;
struct bio bio; /* must be last */
@@ -264,6 +266,7 @@ xfs_zone_gc_iter_init(
iter->rec_count = 0;
iter->rec_idx = 0;
iter->victim_rtg = victim_rtg;
+ atomic_inc(&victim_rtg->rtg_gccount);
}
/*
@@ -362,6 +365,7 @@ xfs_zone_gc_query(
return 0;
done:
+ atomic_dec(&iter->victim_rtg->rtg_gccount);
xfs_rtgroup_rele(iter->victim_rtg);
iter->victim_rtg = NULL;
return 0;
@@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from(
if (!rtg)
continue;
+ /*
+ * If the zone is already undergoing GC, don't pick it again.
+ *
+ * This prevents us from picking one of the zones for which we
+ * already submitted GC I/O, but for which the remapping hasn't
+ * concluded yet. This won't cause data corruption, but
+ * increases write amplification and slows down GC, so this is
+ * a bad thing.
+ */
+ if (atomic_read(&rtg->rtg_gccount)) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
/* skip zones that are just waiting for a reset */
if (rtg_rmap(rtg)->i_used_blocks == 0 ||
rtg_rmap(rtg)->i_used_blocks >= victim_used) {
@@ -491,21 +509,6 @@ xfs_zone_gc_select_victim(
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int bucket;
- if (xfs_is_shutdown(mp))
- return false;
-
- if (iter->victim_rtg)
- return true;
-
- /*
- * Don't start new work if we are asked to stop or park.
- */
- if (kthread_should_stop() || kthread_should_park())
- return false;
-
- if (!xfs_zoned_need_gc(mp))
- return false;
-
spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
@@ -703,6 +706,9 @@ xfs_zone_gc_start_chunk(
chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
+ chunk->victim_rtg = iter->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
@@ -725,6 +731,8 @@ static void
xfs_zone_gc_free_chunk(
struct xfs_gc_bio *chunk)
{
+ atomic_dec(&chunk->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(chunk->victim_rtg);
list_del(&chunk->entry);
xfs_open_zone_put(chunk->oz);
xfs_irele(chunk->ip);
@@ -785,6 +793,10 @@ xfs_zone_gc_split_write(
split_chunk->oz = chunk->oz;
atomic_inc(&chunk->oz->oz_ref);
+ split_chunk->victim_rtg = chunk->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
chunk->offset += split_len;
chunk->len -= split_len;
chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
@@ -975,6 +987,27 @@ xfs_zone_gc_reset_zones(
} while (next);
}
+static bool
+xfs_zone_gc_should_start_new_work(
+ struct xfs_zone_gc_data *data)
+{
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!xfs_zone_gc_space_available(data))
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -982,7 +1015,7 @@ xfs_zone_gc_reset_zones(
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
-static bool
+static void
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
@@ -996,30 +1029,22 @@ xfs_zone_gc_handle_work(
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);
- if (!xfs_zone_gc_select_victim(data) ||
- !xfs_zone_gc_space_available(data)) {
- if (list_empty(&data->reading) &&
- list_empty(&data->writing) &&
- list_empty(&data->resetting) &&
- !reset_list)
- return false;
- }
-
- __set_current_state(TASK_RUNNING);
- try_to_freeze();
-
- if (reset_list)
+ if (reset_list) {
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_reset_zones(data, reset_list);
+ }
list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_reset(chunk);
}
list_for_each_entry_safe(chunk, next, &data->writing, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_chunk(chunk);
}
@@ -1027,15 +1052,18 @@ xfs_zone_gc_handle_work(
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);
- blk_start_plug(&plug);
- while (xfs_zone_gc_start_chunk(data))
- ;
- blk_finish_plug(&plug);
- return true;
+ if (xfs_zone_gc_should_start_new_work(data)) {
+ set_current_state(TASK_RUNNING);
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ }
}
/*
@@ -1059,8 +1087,18 @@ xfs_zoned_gcd(
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
xfs_set_zonegc_running(mp);
- if (xfs_zone_gc_handle_work(data))
+
+ xfs_zone_gc_handle_work(data);
+
+ /*
+ * Only sleep if nothing set the state to running. Else check for
+ * work again as someone might have queued up more work and woken
+ * us in the meantime.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
continue;
+ }
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&
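The reworked main loop follows the standard kthread idle pattern: declare the intent to sleep first, handle any completed work (each handler flips the task back to TASK_RUNNING), and only schedule if nothing did. A generic sketch of that pattern, with do_pending_work() standing in for xfs_zone_gc_handle_work() (hypothetical helper name, for illustration):

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}
		do_pending_work();	/* sets TASK_RUNNING if it made progress */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;	/* more work may have been queued meanwhile */
		}
		schedule();
	}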
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*