diff options
Diffstat (limited to 'fs/xfs')
65 files changed, 1248 insertions, 1011 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 065953475cf5..8930d5254e1d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! 
- See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y @@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d..8ac8230c3d3c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. 
*/ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 000cc7f4a3ce..ad381c73abc4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -4019,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0c..91c1b30ebaab 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. 
+ * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. 
*/ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } @@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d954f9b8071f..53ef4b7e504d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } @@ -3662,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the 
maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3849,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4200,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4545,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5900,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) 
{ xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6065,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a61211d253f1..dbe9df8c3300 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 723a0643b838..90f7fc219fcc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d..82a338458a51 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084..de840abc0bcd 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. 
*/ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + * an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ +/* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. + */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -71,49 +79,61 @@ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. */ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define 
XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1. The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob. 
+ */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) 
\ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c118..932ee4619e9e 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. */ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 750111634d9f..d97295eaebe6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. 
Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd7..b1812b2c3cce 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add55..1772d82f2d68 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050..309ce6dd5553 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new 
file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - ip->i_projid = xfs_get_initial_prid(pip); } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 0d637c276db0..6c50cb2ece19 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -86,43 +86,6 @@ struct xfs_unmount_log_format { uint32_t pad2; /* may as well make it 64 bits */ }; -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_RUI_FORMAT 21 -#define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_CUI_FORMAT 23 -#define XLOG_REG_TYPE_CUD_FORMAT 24 -#define XLOG_REG_TYPE_BUI_FORMAT 25 -#define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_ATTRI_FORMAT 27 -#define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 -#define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_XMI_FORMAT 31 -#define XLOG_REG_TYPE_XMD_FORMAT 32 -#define XLOG_REG_TYPE_ATTR_NEWNAME 33 -#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 -#define XLOG_REG_TYPE_MAX 34 - /* * Flags to log operation header * @@ 
-141,14 +104,13 @@ struct xfs_unmount_log_format { #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -typedef struct xlog_op_header { +struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; +}; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 @@ -174,12 +136,40 @@ typedef struct xlog_rec_header { __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ + + /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ + + /* fields added for log v2: */ __be32 h_size; /* iclog size : 4 */ + + /* + * When h_size was added for log v2 support, it caused structure to have + * a different size on i386 vs all other architectures because the + * sum of the size of the members is not aligned by that of the largest + * __be64-sized member, and i386 has really odd struct alignment rules. + * + * Due to the way the log headers are placed out on-disk that alone is + * not a problem because the xlog_rec_header always sits alone in a + * BBSIZEs area, and the rest of that area is padded with zeroes. + * But xlog_cksum used to calculate the checksum based on the structure + * size, and thus gives different checksums for i386 vs the rest. + * We now do two checksum validation passes for both sizes to allow + * moving v5 file systems with unclean logs between i386 and other + * (little-endian) architectures. 
+ */ + __u32 h_pad0; } xlog_rec_header_t; +#ifdef __i386__ +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) +#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#else +#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) +#endif /* __i386__ */ + typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ @@ -195,12 +185,11 @@ typedef union xlog_in_core2 { } xlog_in_core_2_t; /* not an on-disk structure, but needed by log recovery in userspace */ -typedef struct xfs_log_iovec { +struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ -} xfs_log_iovec_t; - +}; /* * Transaction Header definitions. @@ -213,12 +202,12 @@ typedef struct xfs_log_iovec { * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). 
*/ -typedef struct xfs_trans_header { +struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; +}; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ @@ -542,7 +531,7 @@ struct xfs_log_dinode { #define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) #define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) -typedef struct xfs_buf_log_format { +struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ @@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format { int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; +}; /* * All buffers now need to tell recovery where the magic number @@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) /* * EFI/EFD log format definitions */ -typedef struct xfs_extent { +struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; -} xfs_extent_t; +}; /* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. + * Since the fields in struct xfs_extent add up to 96 bits, it has + * different alignments on i386 vs all other architectures, because i386 + * does not pad structures to their natural alignment. + * + * Provide the different variants for use by a conversion routine. 
*/ -typedef struct xfs_extent_32 { +struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; +} __attribute__((packed)); -typedef struct xfs_extent_64 { +struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; -} xfs_extent_64_t; +}; /* * This is the structure used to lay out an efi log item in the * log. The efi_extents field is a variable size array whose * size is given by efi_nextents. */ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; + struct xfs_extent efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format_sizeof( @@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( @@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ +}; static 
inline size_t xfs_efi_log_format64_sizeof( @@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; + struct xfs_extent efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format_sizeof( @@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( @@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format64_sizeof( @@ -957,14 +947,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. 
*/ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. @@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 95de23095030..9e712e62369c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -111,7 +111,7 @@ struct xlog_recover_item { struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c4..b02e3d6c0868 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's 
reservation. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf7491..7bfa3242e2c5 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 897784037483..2484dc9f6d7e 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. */ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fec..83e0488ff773 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git 
a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586c..618061d898d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb..cdd16dd805d7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -1524,7 +1522,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1536,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git 
a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca..5fefd132e002 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on the number of open zones. + */ +#define XFS_DEFAULT_MAX_OPEN_ZONES 128 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, xfs_rgblock_t *write_pointer); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 38a246b8bf11..b2a83801412e 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,7 +300,7 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) @@ -385,7 +385,7 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 14939d7de349..378ec7c8d38e 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -79,7 +79,7 @@ xchk_metapath_cleanup( if (mpath->dp_ilock_flags) xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); - kfree(mpath->path); + kfree_const(mpath->path); } /* Set up a metadir path scan. 
@path must be dynamically allocated. */ @@ -98,13 +98,13 @@ xchk_setup_metapath_scan( error = xchk_install_live_inode(sc, ip); if (error) { - kfree(path); + kfree_const(path); return error; } mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); if (!mpath) { - kfree(path); + kfree_const(path); return -ENOMEM; } @@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); + kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip); } /* Scan a rtgroup inode under the /rtgroups directory. */ @@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kstrdup("quota", GFP_KERNEL), qi->qi_dirip); + kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode( return -ENOENT; return xchk_setup_metapath_scan(sc, qi->qi_dirip, - kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); + kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip); } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 1588ce971cb8..951ae8b71566 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -28,6 +28,15 @@ #include "scrub/newbt.h" /* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. The newbt code should reserve + * exactly the correct number of blocks to rebuild the btree, so there should + * not be any excess blocks to free when committing a new btree. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + +/* * Estimate proper slack values for a btree that's being reloaded. 
* * Under most circumstances, we'll take whatever default loading value the diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 8703897c0a9c..07f5bb8a6421 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -36,6 +36,12 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_bmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -91,21 +97,33 @@ struct xreap_state { struct xfs_scrub *sc; - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; + union { + struct { + /* + * For AG blocks, this is reverse mapping owner and + * metadata reservation type. + */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + }; + struct { + /* For file blocks, this is the inode and fork. */ + struct xfs_inode *ip; + int whichfork; + }; + }; - /* If true, roll the transaction before reaping the next extent. */ - bool force_roll; + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int nr_binval; - /* Number of deferred reaps attached to the current transaction. */ - unsigned int deferred; + /* Maximum number of buffers we can invalidate in a single tx. */ + unsigned int max_binval; - /* Number of invalidated buffers logged to the current transaction. */ - unsigned int invalidated; + /* Number of deferred reaps attached to the current transaction. */ + unsigned int nr_deferred; - /* Number of deferred reaps queued during the whole reap sequence. */ - unsigned long long total_deferred; + /* Maximum number of intents we can reap in a single transaction. */ + unsigned int max_deferred; }; /* Put a block back on the AGFL. */ @@ -148,71 +166,79 @@ xreap_put_freelist( } /* Are there any uncommitted reap operations? 
*/ -static inline bool xreap_dirty(const struct xreap_state *rs) +static inline bool xreap_is_dirty(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred) - return true; - if (rs->invalidated) - return true; - if (rs->total_deferred) - return true; - return false; + return rs->nr_binval > 0 || rs->nr_deferred > 0; } -#define XREAP_MAX_BINVAL (2048) - /* - * Decide if we want to roll the transaction after reaping an extent. We don't - * want to overrun the transaction reservation, so we prohibit more than - * 128 EFIs per transaction. For the same reason, we limit the number - * of buffer invalidations to 2048. + * Decide if we need to roll the transaction to clear out the log + * reservation that we allocated to buffer invalidations. */ -static inline bool xreap_want_roll(const struct xreap_state *rs) +static inline bool xreap_want_binval_roll(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) - return true; - if (rs->invalidated > XREAP_MAX_BINVAL) - return true; - return false; + return rs->nr_binval >= rs->max_binval; } -static inline void xreap_reset(struct xreap_state *rs) +/* Reset the buffer invalidation count after rolling. */ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; } -#define XREAP_MAX_DEFER_CHAIN (2048) +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction. + */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; +} /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory.
Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. */ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; +} + +/* Maximum number of fsblocks that we might find in a buffer to invalidate. */ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); } /* @@ -224,12 +250,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. 
*/ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -297,14 +319,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -416,21 +437,23 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + xreap_inc_defer(rs); return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + /* t1: unmap crosslinked metadata blocks */ + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + xreap_inc_defer(rs); + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); @@ -443,12 +466,12 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. 
We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -463,23 +486,23 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_inc_defer(rs); return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -489,12 +512,194 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = per_intent ? res / variable_overhead : 0; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? 
res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. + * + * t3: Freeing to AGFL: roll and finish deferred items for every block. + * Limits here do not matter. + * + * t4: Freeing metadata blocks: deferred freeing of the space, which + * also removes the rmap record. + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = rui; + const unsigned int t4 = rui + efi; + const unsigned int per_intent = max(t1, t4); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of EFI or + * RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int step_size = max(f1, f2); + + /* Largest buffer size (in fsblocks) that can be invalidated. 
*/ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Maximum overhead of invalidating one buffer. */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that btree + * blocks aren't usually contiguous; and that scrub likely pulled all + * the buffers into memory. From these assumptions, set the maximum + * number of deferrals we can queue before flushing the defer chain, + * and the number of invalidations we can queue before rolling to a + * clean transaction (and possibly relogging some of the deferrals) to + * the same quantity. + */ + const unsigned int variable_overhead = per_intent + per_binval; + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, rs->max_deferred); +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_agcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. 
+ */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping CoW staging extents: + * + * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks.
From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -531,11 +736,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -562,11 +767,12 @@ xrep_reap_agblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -628,7 +834,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -639,7 +845,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -674,11 +880,15 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return 
xrep_defer_finish(sc); return 0; @@ -770,7 +980,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. */ if (crosslinked) { @@ -778,14 +988,14 @@ xreap_rgextent_iter( *rglenp); xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); - rs->deferred++; + xreap_inc_defer(rs); return 0; } trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); /* - * The CoW staging extent is not crosslinked. Use deferred work items + * t2: The CoW staging extent is not crosslinked. Use deferred work * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -799,10 +1009,73 @@ xreap_rgextent_iter( if (error) return error; - rs->deferred++; + xreap_inc_defer(rs); return 0; } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_rgcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. 
+ */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping CoW staging extents: + * + * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * The only buffer for the rt device is the rtgroup super, so we don't + * need to save space for buffer invalidations.
+ */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -855,11 +1128,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -891,12 +1164,14 @@ xrep_reap_rtblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; @@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -975,7 +1249,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = 
xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1038,21 +1313,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1079,7 +1352,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1120,14 +1393,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. */ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1149,12 +1421,11 @@ out: */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. 
*/ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. */ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. 
*/ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. + */ +STATIC void +xreap_configure_bmapi_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* overhead of invalidating a buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc), + xreap_bmapi_binval_blocksize(sc)); + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int bui = xfs_bui_log_space(1) + + xfs_bud_log_space(); + + /* + * t1: Unmapping crosslinked file data blocks: one bmap deletion, + * possibly an EFI for underfilled bmbt blocks, and an rmap deletion. + * + * t2: Freeing freeing file data blocks: one bmap deletion, possibly an + * EFI for underfilled bmbt blocks, and another EFI for the space + * itself. + */ + const unsigned int t1 = (bui + efi) + rui; + const unsigned int t2 = (bui + efi) + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. 
+ */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * Each call to xreap_ifork_extent starts with a clean transaction and + * operates on a single mapping by creating a chain of log intent items + * for that mapping. We need to leave enough reservation in the + * transaction to log btree buffer and inode updates for each step in + * the chain, and to relog the log intents. + */ + const unsigned int per_extent_res = per_intent + step_size; + + xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval); + + trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, 1); +} + /* * Dispose of as much of this file extent as we can. Upon successful return, * the imap will reflect the mapping that was removed from the fork. */ STATIC int xreap_ifork_extent( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; xfs_agnumber_t agno; bool crosslinked; int error; ASSERT(sc->sa.pag == NULL); - trace_xreap_ifork_extent(sc, ip, whichfork, imap); + trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap); agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); sc->sa.pag = xfs_perag_get(sc->mp, agno); @@ -1248,11 +1617,11 @@ xreap_ifork_extent( * Decide the fate of the blocks at the beginning of the mapping, then * update the mapping to use it with the unmap calls. 
*/ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1276,6 +1645,11 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1284,6 +1658,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; @@ -1303,13 +1678,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d00c18954a26..efd5a7ccdf62 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. 
*/ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9c04295742c8..2bb125c4f9bf 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. - */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc..5902398185a8 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -185,7 +185,7 @@ xrep_symlink_salvage_inline( return 0; nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103f..987313a52e64 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a8187281eb96..39ea651cbb75 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5eef3bc30bda..c3a593319bee 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -491,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } diff --git a/fs/xfs/xfs_buf.c 
b/fs/xfs/xfs_buf.c index f9ef3b2a332a..773d959965dc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -387,8 +387,6 @@ xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { - xfs_daddr_t eofs; - /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); @@ -397,11 +395,10 @@ xfs_buf_map_verify( * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes( int xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - int error; + struct xfs_mount *mp = btp->bt_mount; - ASSERT(btp->bt_bdev != NULL); + if (btp->bt_bdev) { + int error; - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - error = 
bdev_validate_blocksize(btp->bt_bdev, sectorsize); - if (error) { - xfs_warn(btp->bt_mount, - "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } - if (bdev_can_atomic_write(btp->bt_bdev)) - xfs_configure_buftarg_atomic_writes(btp); + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1749,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = (xfs_daddr_t)-1; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; @@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. 
*/ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b269e115d9ac..8fa7bdf59c91 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -103,6 +103,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5d58e2ae4972..e4c8af873632 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. 
+ */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e137694..39830b252ac8 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - 
XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ + [XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -93,21 +49,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -118,10 +71,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, 
XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); -XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); 
-XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - 
XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, @@ -295,7 +165,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -321,36 +190,12 @@ xfs_errortag_test( return false; xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - -int xfs_errortag_add( 
struct xfs_mount *mp, unsigned int error_tag) @@ -359,9 +204,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a598..fe6a71bbe9cd 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ 
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); #define XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ee598a9827..418ddab590e0 100644 --- 
a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -202,7 +202,7 @@ xfs_efi_copy_format( sizeof(struct xfs_extent)); return 0; } else if (buf->iov_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -216,7 +216,7 @@ xfs_efi_copy_format( } return 0; } else if (buf->iov_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index c8402040410b..af1b0331f7af 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f96fbf5c54c9..2702fef2c90c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,52 +75,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. 
+ * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number than holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. 
*/ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -158,12 +153,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has an inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking.
*/ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f6f628c01feb..566fd663c95b 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4cf7abe50143..e44040206851 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -646,8 +646,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +654,7 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..36b39539e561 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/renames + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. 
+ */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected. 
- */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1656,7 +1667,6 @@ retry: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1821,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2227,16 +2245,9 @@ retry: if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. 
*/ if (flags & RENAME_EXCHANGE) { @@ -2377,8 +2388,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. */ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2394,29 +2405,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2425,8 +2434,8 @@ xfs_iflush( ip->i_nblocks, 
ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); @@ -2502,7 +2511,6 @@ flush_out: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2661,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcd..1bd411a1114c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. 
*/ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. 
- */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -205,6 +187,20 @@ xfs_inode_item_precommit( } /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines * in the eventual clearing of the ili_fields bits. See the big comment @@ -214,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -729,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. 
+ */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -858,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. 
+ */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a011..2ddcca41714f 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock. 
+ */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e1051a530a50..a6bb7ee7a27a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -512,9 +512,6 @@ xfs_fileattr_get( { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | @@ -1209,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. 
*/ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2a74f2957341..d3f6e3e42a11 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -149,9 +149,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. + */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; return 0; @@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee..caff0125faea 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = 
NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; @@ -1335,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21..4dd747bdbcca 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3..603e85c1ab4c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. 
*/ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1489,8 +1489,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1568,13 +1567,13 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ @@ -1818,7 +1817,7 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1827,7 +1826,7 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, @@ -2656,10 +2655,11 @@ restart: * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. 
* - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. */ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3153,11 +3153,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xlog_op_header); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3180,12 +3180,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3322,7 +3322,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; + struct xlog_op_header *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; @@ -3400,7 +3400,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index af6daf4f6792..dcc1f44ed68f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -20,6 +20,43 @@ struct xfs_log_vec { int lv_alloc_size; 
/* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a9a7a271c15b..0cfc654d8e87 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -499,8 +499,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 
e6ed9e09c027..549d60959aee 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,20 +2894,34 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the gory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2932,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default.
*/ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index dc32c5e34d81..0953f6ae94ab 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1057,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. - */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 97de44c32272..f046d1215b04 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -363,7 +363,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. 
*/ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -386,7 +385,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -396,7 +394,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. 
*/ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 866c71d9fbae..73b7e72944e4 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fbeddcac4792..b17672889942 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -165,7 +165,7 @@ xfs_dax_translate_range( uint64_t *bblen) { u64 dev_start = btp->bt_dax_part_off; - u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_len = BBTOB(btp->bt_nr_sectors); u64 dev_end = dev_start + dev_len - 1; /* Notify failure on the whole device. 
*/ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..e85a156dc17d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -105,8 +105,8 @@ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, - Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, @@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), - fsparam_flag("ikeep", Opt_ikeep), - fsparam_flag("noikeep", Opt_noikeep), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), - fsparam_flag("attr2", Opt_attr2), - fsparam_flag("noattr2", Opt_noattr2), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), @@ -175,13 +171,11 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_FEAT_IKEEP, ",ikeep" }, { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, - { XFS_FEAT_ATTR2, ",attr2" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, @@ -541,7 +535,8 @@ xfs_setup_devices( { int error; - error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, 
mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; @@ -551,7 +546,7 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_configure_buftarg(mp->m_logdev_targp, - log_sector_size); + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } @@ -565,7 +560,7 @@ xfs_setup_devices( mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { error = xfs_configure_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } @@ -578,19 +573,19 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; @@ -602,13 +597,14 @@ xfs_init_mount_workqueues( goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", - XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; @@ -778,7 +774,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } 
STATIC void @@ -1088,15 +1084,6 @@ xfs_finish_flags( } /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - - /* * prohibit r/w mounts of read-only filesystems */ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { @@ -1542,22 +1529,6 @@ xfs_fs_parse_param( return 0; #endif /* Following mount options will be removed in September 2025 */ - case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); - parsing_mp->m_features |= XFS_FEAT_IKEEP; - return 0; - case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); - parsing_mp->m_features &= ~XFS_FEAT_IKEEP; - return 0; - case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); - parsing_mp->m_features |= XFS_FEAT_ATTR2; - return 0; - case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1593,16 +1564,6 @@ xfs_fs_validate_params( return -EINVAL; } - /* - * We have not read the superblock at this point, so only the attr2 - * mount option can set the attr2 feature by this stage. 
- */ - if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); - return -EINVAL; - } - - if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); @@ -2177,21 +2138,6 @@ xfs_fs_reconfigure( if (error) return error; - /* attr2 -> noattr2 */ - if (xfs_has_noattr2(new_mp)) { - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "attr2 is always enabled for a V5 filesystem - can't be changed."); - return -EINVAL; - } - mp->m_features &= ~XFS_FEAT_ATTR2; - mp->m_features |= XFS_FEAT_NOATTR2; - } else if (xfs_has_attr2(new_mp)) { - /* noattr2 -> attr2 */ - mp->m_features &= ~XFS_FEAT_NOATTR2; - mp->m_features |= XFS_FEAT_ATTR2; - } - /* Validate new max_atomic_write option before making other changes */ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { error = xfs_set_max_atomic_write_opt(mp, @@ -2596,8 +2542,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. 
*/ - xfs_alloc_wq = alloc_workqueue("xfsalloc", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 0); if (!xfs_alloc_wq) return -ENOMEM; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 751dc74a3067..9918f14b4874 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC int +static inline int xfs_deprecated_dointvec_minmax( const struct ctl_table *ctl, int write, @@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax( static const struct ctl_table xfs_table[] = { { - .procname = "irix_sgid_inherit", - .data = &xfs_params.sgid_inherit.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.sgid_inherit.min, - .extra2 = &xfs_params.sgid_inherit.max - }, - { - .procname = "irix_symlink_mode", - .data = &xfs_params.symlink_mode.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.symlink_mode.min, - .extra2 = &xfs_params.symlink_mode.max - }, - { .procname = "panic_mask", .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), @@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.blockgc_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.blockgc_timer.min, - .extra2 = &xfs_params.blockgc_timer.max, - }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 51646f066c4f..ed9d896079c1 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct 
xfs_param { - xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is - * not a member of parent dir GID. */ - xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ac344e42846c..79b8641880ab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1152,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 575e7028f423..474f5a04ec63 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -452,19 +452,17 @@ xfs_trans_mod_sb( */ STATIC void xfs_trans_apply_sb_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { - struct xfs_dsb *sbp; - struct xfs_buf *bp; - int whole = 0; - - bp = xfs_trans_getsb(tp); - sbp = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp); + struct xfs_dsb *sbp = bp->b_addr; + int whole = 0; /* * Only update the superblock counters if we are logging them */ - if (!xfs_has_lazysbcount((tp->t_mountp))) { + if (!xfs_has_lazysbcount(mp)) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas( * write the correct value ondisk. 
*/ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && - !xfs_has_rtgroups(tp->t_mountp)) { - struct xfs_mount *mp = tp->t_mountp; + !xfs_has_rtgroups(mp)) { int64_t rtxdelta; rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; @@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas( if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + mp->m_ddev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_dblocks_delta); whole = 1; } if (tp->t_agcount_delta) { @@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas( * recompute the ondisk rtgroup block log. The incore values * will be recomputed in xfs_trans_unreserve_and_mod_sb. */ - if (xfs_has_rtgroups(tp->t_mountp)) { + if (xfs_has_rtgroups(mp)) { sbp->sb_rgblklog = xfs_compute_rgblklog( be32_to_cpu(sbp->sb_rgextents), be32_to_cpu(sbp->sb_rextsize)); @@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rblocks_delta) { be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + mp->m_rtdev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_rblocks_delta); whole = 1; } if (tp->t_rextents_delta) { diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 67c328d23e4a..38983c6777df 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -374,7 +374,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. 
*/ - if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f28214c28ab5..1147bacb2da8 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -493,64 +493,58 @@ xfs_try_open_zone( return oz; } +enum xfs_zone_alloc_score { + /* Any open zone will do it, we're desperate */ + XFS_ZONE_ALLOC_ANY = 0, + + /* It better fit somehow */ + XFS_ZONE_ALLOC_OK = 1, + + /* Only reuse a zone if it fits really well. */ + XFS_ZONE_ALLOC_GOOD = 2, +}; + /* - * For data with short or medium lifetime, try to colocated it into an - * already open zone with a matching temperature. + * Life time hint co-location matrix. Fields not set default to 0 + * aka XFS_ZONE_ALLOC_ANY. */ -static bool -xfs_colocate_eagerly( - enum rw_hint file_hint) -{ - switch (file_hint) { - case WRITE_LIFE_MEDIUM: - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - return true; - default: - return false; - } -} - -static bool -xfs_good_hint_match( - struct xfs_open_zone *oz, - enum rw_hint file_hint) -{ - switch (oz->oz_write_hint) { - case WRITE_LIFE_LONG: - case WRITE_LIFE_EXTREME: - /* colocate long and extreme */ - if (file_hint == WRITE_LIFE_LONG || - file_hint == WRITE_LIFE_EXTREME) - return true; - break; - case WRITE_LIFE_MEDIUM: - /* colocate medium with medium */ - if (file_hint == WRITE_LIFE_MEDIUM) - return true; - break; - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - case WRITE_LIFE_NOT_SET: - /* colocate short and none */ - if (file_hint <= WRITE_LIFE_SHORT) - return true; - break; - } - return false; -} +static const unsigned int +xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { + [WRITE_LIFE_NOT_SET] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_NONE] = { + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_SHORT] = { + [WRITE_LIFE_SHORT] = 
XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_MEDIUM] = { + [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_LONG] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_EXTREME] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, +}; static bool xfs_try_use_zone( struct xfs_zone_info *zi, enum rw_hint file_hint, struct xfs_open_zone *oz, - bool lowspace) + unsigned int goodness) { if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; - if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + + if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness) return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) return false; @@ -581,14 +575,14 @@ static struct xfs_open_zone * xfs_select_open_zone_lru( struct xfs_zone_info *zi, enum rw_hint file_hint, - bool lowspace) + unsigned int goodness) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + if (xfs_try_use_zone(zi, file_hint, oz, goodness)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -651,9 +645,11 @@ xfs_select_zone_nowait( * data. */ spin_lock(&zi->zi_open_zones_lock); - if (xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - else if (pack_tight) + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD); + if (oz) + goto out_unlock; + + if (pack_tight) oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; @@ -667,16 +663,16 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to colocate cold data with other cold data if we failed to open a - * new zone for it. + * Try to find an zone that is an ok match to colocate data with. 
+ */ + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); + if (oz) + goto out_unlock; + + /* + * Pick the least recently used zone, regardless of hint match */ - if (write_hint != WRITE_LIFE_NOT_SET && - !xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; @@ -1135,7 +1131,7 @@ xfs_calc_open_zones( if (bdev_open_zones) mp->m_max_open_zones = bdev_open_zones; else - mp->m_max_open_zones = xfs_max_open_zones(mp); + mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES; } if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { @@ -1248,7 +1244,7 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks size (%u max open)", + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); |