Diffstat (limited to 'fs/xfs/xfs_zone_alloc.c')
-rw-r--r--  fs/xfs/xfs_zone_alloc.c  70
1 file changed, 61 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 23cdab4515bb..bbcf21704ea0 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -103,9 +103,6 @@ xfs_zone_account_reclaimable(
*/
trace_xfs_zone_emptied(rtg);
- if (!was_full)
- xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
-
spin_lock(&zi->zi_used_buckets_lock);
if (!was_full)
xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
@@ -127,7 +124,6 @@ xfs_zone_account_reclaimable(
xfs_zone_add_to_bucket(zi, rgno, to_bucket);
spin_unlock(&zi->zi_used_buckets_lock);
- xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
wake_up_process(zi->zi_gc_thread);
} else if (to_bucket != from_bucket) {
@@ -142,6 +138,28 @@ xfs_zone_account_reclaimable(
}
}
+/*
+ * Check if we have any zones that can be reclaimed by looking at the entry
+ * counters for the zone buckets.
+ */
+bool
+xfs_zoned_have_reclaimable(
+ struct xfs_zone_info *zi)
+{
+ int i;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ if (zi->zi_used_bucket_entries[i]) {
+ spin_unlock(&zi->zi_used_buckets_lock);
+ return true;
+ }
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ return false;
+}
+
static void
xfs_open_zone_mark_full(
struct xfs_open_zone *oz)
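For context, a minimal sketch of how a caller might use the new xfs_zoned_have_reclaimable() helper, for example to decide up front whether a reclaim pass can find any work at all. The wrapper name xfs_zoned_gc_has_work() and its pairing with xfs_zoned_need_gc() are assumptions for illustration, not part of this patch:

/*
 * Illustrative only: bail out of a GC pass early when no used-space
 * bucket has any entries, i.e. no zone currently holds reclaimable
 * space.
 */
static bool
xfs_zoned_gc_has_work(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (!xfs_zoned_have_reclaimable(zi))
		return false;

	/* Let the existing need-GC heuristic make the final call. */
	return xfs_zoned_need_gc(mp);
}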
@@ -246,6 +264,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished, and these blocks will eventually be moved out.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -607,7 +633,7 @@ xfs_select_open_zone_mru(
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, false))
+ if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -1196,6 +1222,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1226,12 +1253,35 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that, both to reduce seeks and, more
+ * importantly, so that workloads that write files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase it to the zone size, capped by the maximum
+ * extent length.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also gets applied to non-zoned files on the data device if
+ * there are any. On a typical zoned setup all data is on the RT device,
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on an SMR HDD. In any combination of the above cases, enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a no-op
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
- error = blkdev_report_zones(bt->bt_bdev,
+ error = blkdev_report_zones_cached(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
if (error < 0)
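To make the new s_min_writeback_pages computation above concrete, here is a worked example; the 256 MiB zone size and 4 KiB block and page sizes are assumptions for illustration, not values taken from the patch:

/*
 * Assumed geometry: 4 KiB fs blocks, 4 KiB pages (PAGE_SHIFT == 12),
 * 256 MiB zones, so zone_blocks == 65536.  XFS_MAX_BMBT_EXTLEN
 * (2^21 - 1 blocks, ~8 GiB here) is far larger, so min() picks the
 * zone size:
 *
 *	XFS_FSB_TO_B(mp, 65536) = 65536 * 4096 = 268435456 bytes
 *	268435456 >> PAGE_SHIFT = 65536 pages  (256 MiB)
 *
 * Writeback therefore sticks with one inode for at least a zone's
 * worth of pages before switching, instead of the default 4 MiB.
 */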
@@ -1241,8 +1291,10 @@ xfs_mount_zones(
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
error = xfs_init_zone(&iz, rtg, NULL);
- if (error)
+ if (error) {
+ xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
+ }
}
}
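The last hunk closes a reference leak on the error path. A sketch of the corrected loop shape, assuming (consistent with how the loop is written here) that xfs_rtgroup_next() returns each group with a reference held that is normally dropped only when the iterator advances to the next group:

	struct xfs_rtgroup	*rtg = NULL;
	int			error;

	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		error = xfs_init_zone(&iz, rtg, NULL);
		if (error) {
			/*
			 * Breaking out early skips the iterator's own
			 * release of the current group, so drop the
			 * reference explicitly before unwinding.
			 */
			xfs_rtgroup_rele(rtg);
			goto out_free_zone_info;
		}
	}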