Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_dentry.c                         |  10
-rw-r--r--  fs/9p/vfs_inode.c                          |   8
-rw-r--r--  fs/9p/vfs_inode_dotl.c                     |   8
-rw-r--r--  fs/btrfs/delayed-inode.c                   |   2
-rw-r--r--  fs/btrfs/delayed-inode.h                   |   7
-rw-r--r--  fs/btrfs/extent_io.c                       |   8
-rw-r--r--  fs/btrfs/file.c                            |  10
-rw-r--r--  fs/btrfs/inode.c                           |   1
-rw-r--r--  fs/btrfs/qgroup.c                          |   4
-rw-r--r--  fs/btrfs/ref-verify.c                      |   2
-rw-r--r--  fs/btrfs/send.c                            |  56
-rw-r--r--  fs/btrfs/super.c                           |   8
-rw-r--r--  fs/btrfs/tree-log.c                        |   3
-rw-r--r--  fs/crypto/inline_crypt.c                   |   3
-rw-r--r--  fs/erofs/zmap.c                            |  59
-rw-r--r--  fs/hugetlbfs/inode.c                       |   9
-rw-r--r--  fs/nfsd/nfs4proc.c                         |  21
-rw-r--r--  fs/nfsd/nfs4state.c                        |   1
-rw-r--r--  fs/nfsd/nfs4xdr.c                          |  21
-rw-r--r--  fs/nfsd/nfsd.h                             |   3
-rw-r--r--  fs/nfsd/xdr4.h                             |   1
-rw-r--r--  fs/notify/fdinfo.c                         |   6
-rw-r--r--  fs/ocfs2/move_extents.c                    |   5
-rw-r--r--  fs/resctrl/monitor.c                       |  16
-rw-r--r--  fs/smb/client/cifsfs.c                     |   2
-rw-r--r--  fs/smb/client/cifsglob.h                   |   4
-rw-r--r--  fs/smb/client/cifsproto.h                  |   3
-rw-r--r--  fs/smb/client/cifssmb.c                    |   8
-rw-r--r--  fs/smb/client/connect.c                    |  46
-rw-r--r--  fs/smb/client/dfs_cache.c                  |  55
-rw-r--r--  fs/smb/client/inode.c                      |   5
-rw-r--r--  fs/smb/client/smb2ops.c                    |   7
-rw-r--r--  fs/smb/client/smb2proto.h                  |   6
-rw-r--r--  fs/smb/client/smb2transport.c              |  18
-rw-r--r--  fs/smb/client/smbdirect.c                  | 103
-rw-r--r--  fs/smb/client/trace.c                      |   1
-rw-r--r--  fs/smb/common/smbdirect/smbdirect_socket.h |  13
-rw-r--r--  fs/smb/server/transport_ipc.c              |   8
-rw-r--r--  fs/smb/server/transport_rdma.c             | 391
-rw-r--r--  fs/sysfs/group.c                           |  26
-rw-r--r--  fs/xfs/Kconfig                             |  11
-rw-r--r--  fs/xfs/libxfs/xfs_rtgroup.h                |   6
-rw-r--r--  fs/xfs/scrub/nlinks.c                      |  34
-rw-r--r--  fs/xfs/xfs_buf.c                           |   2
-rw-r--r--  fs/xfs/xfs_buf.h                           |   1
-rw-r--r--  fs/xfs/xfs_mount.h                         |   1
-rw-r--r--  fs/xfs/xfs_super.c                         |  53
-rw-r--r--  fs/xfs/xfs_zone_alloc.c                    | 156
-rw-r--r--  fs/xfs/xfs_zone_gc.c                       | 108
-rw-r--r--  fs/xfs/xfs_zone_priv.h                     |   2
50 files changed, 862 insertions, 480 deletions
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f3248a3e5402..c1acbc98465d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -66,7 +66,6 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct p9_fid *fid;
struct inode *inode;
struct v9fs_inode *v9inode;
- unsigned int cached;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -76,11 +75,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto out_valid;
v9inode = V9FS_I(inode);
- struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
- cached = v9ses->cache & (CACHE_META | CACHE_LOOSE);
-
- if (!cached || v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+ if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
int retval;
struct v9fs_session_info *v9ses;
@@ -114,6 +109,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
p9_debug(P9_DEBUG_VFS,
"refresh inode: dentry = %pd (%p), got error %pe\n",
dentry, dentry, ERR_PTR(retval));
+ if (retval < 0)
return retval;
}
}
@@ -150,8 +146,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_revalidate = v9fs_lookup_revalidate,
- .d_weak_revalidate = __v9fs_lookup_revalidate,
.d_release = v9fs_dentry_release,
.d_unalias_trylock = v9fs_dentry_unalias_trylock,
.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 69f378a83775..d0c77ec31b1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1339,14 +1339,8 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
* Don't update inode if the file type is different
*/
umode = p9mode2unixmode(v9ses, st, &rdev);
- if (inode_wrong_type(inode, umode)) {
- /*
- * Do this as a way of letting the caller know the inode should not
- * be reused
- */
- v9fs_invalidate_inode_attr(inode);
+ if (inode_wrong_type(inode, umode))
goto out;
- }
/*
* We don't want to refresh inode->i_size,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0b404e8484d2..be297e335468 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -897,14 +897,8 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
/*
* Don't update inode if the file type is different
*/
- if (inode_wrong_type(inode, st->st_mode)) {
- /*
- * Do this as a way of letting the caller know the inode should not
- * be reused
- */
- v9fs_invalidate_inode_attr(inode);
+ if (inode_wrong_type(inode, st->st_mode))
goto out;
- }
/*
* We don't want to refresh inode->i_size,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 41e37f7f67cc..3df7b9d7fbe8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -2110,9 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i],
&delayed_node_trackers[i]);
- btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
}
}
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0d949edc0caf..b09d4ec8c77d 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -219,6 +219,13 @@ static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed
if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
return;
+ /*
+ * Only print if there are leaked references. The caller is
+ * holding one reference, so if refs == 1 there is no leak.
+ */
+ if (refcount_read(&node->refs) == 1)
+ return;
+
ref_tracker_dir_print(&node->ref_dir.dir,
BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 755ec6dfd51c..23273d0e6f22 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2228,6 +2228,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_account_cgroup_owner(wbc, folio, range_len);
folio_unlock(folio);
}
+ /*
+ * If the fs is already in error status, do not submit any writeback
+ * but immediately finish it.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info))) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info)));
+ return;
+ }
btrfs_submit_bbio(bbio, 0);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7efd1f8a1912..fa82def46e39 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2854,12 +2854,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 range_start;
+ u64 range_end;
int ret;
int ret2;
if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
return 0;
+ range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
+ range_end = round_up(end, root->fs_info->sectorsize);
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
+ range_end - range_start);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b1b3a0553ee..3df5f36185a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6873,7 +6873,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1175b8192cd7..31ad8580322a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1539,8 +1539,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
ASSERT(prealloc);
/* Check the level of src and dst first */
- if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
+ kfree(prealloc);
return -EINVAL;
+ }
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index de4cb0f3fbd0..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -982,7 +982,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
extent_root = btrfs_extent_root(fs_info, 0);
/* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
- if (IS_ERR(extent_root)) {
+ if (!extent_root) {
btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
return 0;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6144e66661f5..96a030d28e09 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4102,6 +4102,48 @@ out:
return ret;
}
+static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_check_dir_ref_comp(entry, parent) < 0;
+}
+
+static int record_check_dir_ref_in_tree(struct rb_root *root,
+ struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *tmp_ref;
+ int ret;
+
+ if (rb_find(ref, root, rbtree_check_dir_ref_comp))
+ return 0;
+
+ ret = dup_ref(ref, list);
+ if (ret < 0)
+ return ret;
+
+ tmp_ref = list_last_entry(list, struct recorded_ref, list);
+ rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less);
+ tmp_ref->root = root;
+ return 0;
+}
+
static int rename_current_inode(struct send_ctx *sctx,
struct fs_path *current_path,
struct fs_path *new_path)
@@ -4129,11 +4171,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct recorded_ref *cur;
struct recorded_ref *cur2;
LIST_HEAD(check_dirs);
+ struct rb_root rbtree_check_dirs = RB_ROOT;
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- u64 last_dir_ino_rm = 0;
bool did_overwrite = false;
bool is_orphan = false;
bool can_rename = true;
@@ -4437,7 +4479,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4465,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4475,7 +4517,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* We have a moved dir. Add the old parent to check_dirs
*/
cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -4509,7 +4551,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (is_current_inode_path(sctx, cur->full_path))
fs_path_reset(&sctx->cur_inode_path);
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4552,8 +4594,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete &&
- cur->dir != last_dir_ino_rm) {
+ } else if (ret == inode_state_did_delete) {
ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -4565,7 +4606,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
- last_dir_ino_rm = cur->dir;
}
}
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aadc02374b2a..430e7419349c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2068,7 +2068,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
- btrfs_free_fs_info(fs_info);
+ /*
+ * Don't call btrfs_free_fs_info() to free it as it's still
+ * only partially initialized.
+ */
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
return -ENOMEM;
}
btrfs_init_fs_info(fs_info);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 621e0df097e3..c90b2d2cb08f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -7910,6 +7910,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
btrfs_init_log_ctx(&ctx, inode);
ctx.logging_new_name = true;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 5dee7c498bc8..ed6e926226b5 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -333,8 +333,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
inode = mapping->host;
*inode_ret = inode;
- *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) +
- (bh_offset(bh) >> inode->i_blkbits);
+ *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits;
return true;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index e5581dbeb4c2..c8d8e129eb4b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -55,10 +55,6 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_lclusterbits) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
m->pblk = le32_to_cpu(di->di_u.blkaddr);
}
return 0;
@@ -240,21 +236,29 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
+ struct erofs_inode *vi = EROFS_I(m->inode);
+ int err;
+
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) {
+ err = z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ } else {
+ DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL);
+ err = z_erofs_load_full_lcluster(m, lcn);
+ }
+ if (err)
+ return err;
+
if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
- m->type, lcn, EROFS_I(m->inode)->nid);
+ m->type, lcn, EROFS_I(m->inode)->nid);
DBG_BUGON(1);
return -EOPNOTSUPP;
+ } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD &&
+ m->clusterofs >= (1 << vi->z_lclusterbits)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
-
- switch (EROFS_I(m->inode)->datalayout) {
- case EROFS_INODE_COMPRESSED_FULL:
- return z_erofs_load_full_lcluster(m, lcn);
- case EROFS_INODE_COMPRESSED_COMPACT:
- return z_erofs_load_compact_lcluster(m, lcn, lookahead);
- default:
- return -EINVAL;
- }
+ return 0;
}
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
@@ -268,20 +272,19 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned long lcn = m->lcn - lookback_distance;
int err;
+ if (!lookback_distance)
+ break;
+
err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
-
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
- if (!lookback_distance)
- break;
continue;
- } else {
- m->headtype = m->type;
- m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
- return 0;
}
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
}
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
@@ -431,13 +434,6 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
end = inode->i_size;
} else {
if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(sb, "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
end = (m.lcn << lclusterbits) | m.clusterofs;
map->m_flags |= EROFS_MAP_FULL_MAPPED;
m.delta[0] = 1;
@@ -596,7 +592,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
vi->z_fragmentoff = map->m_plen;
if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
- } else if (map->m_plen) {
+ } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) {
map->m_flags |= EROFS_MAP_MAPPED |
EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
@@ -715,6 +711,7 @@ static int z_erofs_map_sanity_check(struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ u64 pend;
if (!(map->m_flags & EROFS_MAP_ENCODED))
return 0;
@@ -732,6 +729,10 @@ static int z_erofs_map_sanity_check(struct inode *inode,
if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
return -EOPNOTSUPP;
+ /* Filesystems beyond 48-bit physical block addresses are invalid */
+ if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) ||
+ (pend >> sbi->blkszbits) >= BIT_ULL(48)))
+ return -EFSCORRUPTED;
return 0;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c94ed8c3ab0..f42548ee9083 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -478,14 +478,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
if (!hugetlb_vma_trylock_write(vma))
continue;
- /*
- * Skip VMAs without shareable locks. Per the design in commit
- * 40549ba8f8e0, these will be handled by remove_inode_hugepages()
- * called after this function with proper locking.
- */
- if (!__vma_shareable_lock(vma))
- goto skip;
-
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
@@ -496,7 +488,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* vmas. Therefore, lock is not held when calling
* unmap_hugepage_range for private vmas.
*/
-skip:
hugetlb_vma_unlock_write(vma);
}
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e466cf52d7d7..7f7e6bb23a90 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -988,10 +988,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static void
nfsd4_read_release(union nfsd4_op_u *u)
{
- if (u->read.rd_nf)
+ if (u->read.rd_nf) {
+ trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+ u->read.rd_offset, u->read.rd_length);
nfsd_file_put(u->read.rd_nf);
- trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
- u->read.rd_offset, u->read.rd_length);
+ }
}
static __be32
@@ -2892,10 +2893,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
+ if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+ /* If there are still more operations to process,
+ * stop here and report NFS4ERR_RESOURCE. */
+ if (cstate->minorversion == 0 &&
+ args->client_opcnt > resp->opcnt) {
+ op->status = nfserr_resource;
+ goto encode_op;
+ }
+ }
+
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2972,7 +2983,7 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->opcnt, resp->opcnt,
+ trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 81fa7cc6c77b..c1b54322c412 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3902,6 +3902,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
ca->headerpadsz = 0;
ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+ ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c0a3c6a7c8bb..6040a6145dad 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2488,8 +2488,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
return false;
+ argp->opcnt = min_t(u32, argp->client_opcnt,
+ NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
@@ -2628,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
__be32 *p;
__be32 pathlen;
int pathlen_offset;
- int strlen, count=0;
char *str, *end, *next;
-
- dprintk("nfsd4_encode_components(%s)\n", components);
+ int count = 0;
pathlen_offset = xdr->buf->len;
p = xdr_reserve_space(xdr, 4);
@@ -2658,9 +2658,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
for (; *end && (*end != sep); end++)
/* find sep or end of string */;
- strlen = end - str;
- if (strlen) {
- if (xdr_stream_encode_opaque(xdr, str, strlen) < 0)
+ if (end > str) {
+ if (xdr_stream_encode_opaque(xdr, str, end - str) < 0)
return nfserr_resource;
count++;
} else
@@ -2939,6 +2938,12 @@ struct nfsd4_fattr_args {
typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args);
+static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfserr_inval;
+}
+
static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
{
@@ -3560,6 +3565,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop,
[FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support,
+ [FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval,
+ [FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval,
[FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments,
};
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index ea87b42894dd..f19320018639 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -57,6 +57,9 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 200
+
struct nfsd_genl_rqstp {
struct sockaddr rq_daddr;
struct sockaddr rq_saddr;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d4b48602b2b0..ee0570cbdd9e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -903,6 +903,7 @@ struct nfsd4_compoundargs {
char * tag;
u32 taglen;
u32 minorversion;
+ u32 client_opcnt;
u32 opcnt;
bool splice_ok;
struct nfsd4_op *ops;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1161eabf11ee..9cc7eb863643 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -17,6 +17,7 @@
#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
+#include "../internal.h"
#if defined(CONFIG_PROC_FS)
@@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f->handle_bytes >> 2;
+ if (!super_trylock_shared(inode->i_sb))
+ return;
+
ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
+ up_read(&inode->i_sb->s_umount);
+
if ((ret == FILEID_INVALID) || (ret < 0))
return;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 86f2631e6360..10923bf7c8b8 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -867,6 +867,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
mlog_errno(ret);
goto out;
}
+ /*
+ * Invalidate extent cache after moving/defragging to prevent
+ * stale cached data with outdated extent flags.
+ */
+ ocfs2_extent_map_trunc(inode, cpos);
context->clusters_moved += alloc_size;
next:
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 4076336fbba6..572a9925bd6c 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1782,15 +1782,13 @@ int resctrl_mon_resource_init(void)
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
if (r->mon.mbm_cntr_assignable) {
- if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
- resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
- if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
- resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
- mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
- mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
- (READS_TO_LOCAL_MEM |
- READS_TO_LOCAL_S_MEM |
- NON_TEMP_WRITE_TO_LOCAL_MEM);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
r->mon.mbm_assign_on_mkdir = true;
resctrl_file_fflags_init("num_mbm_cntrs",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 4f959f1e08d2..185ac41bd7e9 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -173,7 +173,7 @@ module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/1");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 16a00a61fd2c..203e2aaa3c25 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -534,8 +534,6 @@ struct smb_version_operations {
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *ses,
struct TCP_Server_Info *server);
- int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
- bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -732,7 +730,7 @@ struct TCP_Server_Info {
bool nosharesock;
bool tcp_nodelay;
bool terminate;
- unsigned int credits; /* send no more requests at once */
+ int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
unsigned int max_in_flight; /* max number of requests that were on wire */
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4976be2c47c1..3528c365a452 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -9,6 +9,7 @@
#define _CIFSPROTO_H
#include <linux/nls.h>
#include <linux/ctype.h>
+#include "cifsglob.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
@@ -615,6 +616,8 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
extern struct TCP_Server_Info *
cifs_find_tcp_session(struct smb3_fs_context *ctx);
+struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal);
+
void __cifs_put_smb_ses(struct cifs_ses *ses);
extern struct cifs_ses *
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 2881efcbe09a..7da194f29fef 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1311,6 +1311,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rreq_debug_id = rdata->rreq->debug_id,
.rreq_debug_index = rdata->subreq.debug_index,
};
+ unsigned int rreq_debug_id = rdata->rreq->debug_id;
+ unsigned int subreq_debug_index = rdata->subreq.debug_index;
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1374,6 +1376,9 @@ do_retry:
__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
}
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
+ server->credits, server->in_flight,
+ 0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0;
rdata->subreq.error = rdata->result;
rdata->subreq.transferred += rdata->got_bytes;
@@ -1381,6 +1386,9 @@ do_retry:
netfs_read_subreq_terminated(&rdata->subreq);
release_mid(mid);
add_credits(server, &credits, 0);
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
+ server->credits, server->in_flight,
+ credits.value, cifs_trace_rw_credits_read_response_add);
}
/* cifs_async_readv - send an async read, and set up mid to handle result */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dd12f3eb61dc..55cb4b0cbd48 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -310,6 +310,8 @@ cifs_abort_connection(struct TCP_Server_Info *server)
server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
+ } else if (cifs_rdma_enabled(server)) {
+ smbd_destroy(server);
}
server->sequence_number = 0;
server->session_estab = false;
@@ -338,12 +340,6 @@ cifs_abort_connection(struct TCP_Server_Info *server)
mid_execute_callback(mid);
release_mid(mid);
}
-
- if (cifs_rdma_enabled(server)) {
- cifs_server_lock(server);
- smbd_destroy(server);
- cifs_server_unlock(server);
- }
}
static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets)
@@ -2015,39 +2011,31 @@ static int match_session(struct cifs_ses *ses,
/**
* cifs_setup_ipc - helper to setup the IPC tcon for the session
* @ses: smb session to issue the request on
- * @ctx: the superblock configuration context to use for building the
- * new tree connection for the IPC (interprocess communication RPC)
+ * @seal: if encryption is requested
*
* A new IPC connection is made and stored in the session
* tcon_ipc. The IPC tcon has the same lifetime as the session.
*/
-static int
-cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
+struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal)
{
int rc = 0, xid;
struct cifs_tcon *tcon;
char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
- bool seal = false;
struct TCP_Server_Info *server = ses->server;
/*
* If the mount request that resulted in the creation of the
* session requires encryption, force IPC to be encrypted too.
*/
- if (ctx->seal) {
- if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)
- seal = true;
- else {
- cifs_server_dbg(VFS,
- "IPC: server doesn't support encryption\n");
- return -EOPNOTSUPP;
- }
+ if (seal && !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) {
+ cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n");
+ return ERR_PTR(-EOPNOTSUPP);
}
/* no need to setup directory caching on IPC share, so pass in false */
tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc);
if (tcon == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
spin_lock(&server->srv_lock);
scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname);
@@ -2057,13 +2045,13 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->ses = ses;
tcon->ipc = true;
tcon->seal = seal;
- rc = server->ops->tree_connect(xid, ses, unc, tcon, ctx->local_nls);
+ rc = server->ops->tree_connect(xid, ses, unc, tcon, ses->local_nls);
free_xid(xid);
if (rc) {
- cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
+ cifs_server_dbg(VFS | ONCE, "failed to connect to IPC (rc=%d)\n", rc);
tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail);
- goto out;
+ return ERR_PTR(rc);
}
cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
@@ -2071,9 +2059,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
spin_lock(&tcon->tc_lock);
tcon->status = TID_GOOD;
spin_unlock(&tcon->tc_lock);
- ses->tcon_ipc = tcon;
-out:
- return rc;
+ return tcon;
}
static struct cifs_ses *
@@ -2347,6 +2333,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+ struct cifs_tcon *ipc;
struct cifs_ses *ses;
unsigned int xid;
int retries = 0;
@@ -2525,7 +2512,12 @@ retry_new_session:
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- cifs_setup_ipc(ses, ctx);
+ ipc = cifs_setup_ipc(ses, ctx->seal);
+ spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ ses->tcon_ipc = !IS_ERR(ipc) ? ipc : NULL;
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
free_xid(xid);
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 4dada26d56b5..f2ad0ccd08a7 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1120,24 +1120,63 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1)
return match;
}
-static bool is_ses_good(struct cifs_ses *ses)
+static bool is_ses_good(struct cifs_tcon *tcon, struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
- struct cifs_tcon *tcon = ses->tcon_ipc;
+ struct cifs_tcon *ipc = NULL;
bool ret;
+ spin_lock(&cifs_tcp_ses_lock);
spin_lock(&ses->ses_lock);
spin_lock(&ses->chan_lock);
+
ret = !cifs_chan_needs_reconnect(ses, server) &&
- ses->ses_status == SES_GOOD &&
- !tcon->need_reconnect;
+ ses->ses_status == SES_GOOD;
+
spin_unlock(&ses->chan_lock);
+
+ if (!ret)
+ goto out;
+
+ if (likely(ses->tcon_ipc)) {
+ if (ses->tcon_ipc->need_reconnect) {
+ ret = false;
+ goto out;
+ }
+ } else {
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ ipc = cifs_setup_ipc(ses, tcon->seal);
+
+ spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if (!IS_ERR(ipc)) {
+ if (!ses->tcon_ipc) {
+ ses->tcon_ipc = ipc;
+ ipc = NULL;
+ }
+ } else {
+ ret = false;
+ ipc = NULL;
+ }
+ }
+
+out:
spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (ipc && server->ops->tree_disconnect) {
+ unsigned int xid = get_xid();
+
+ (void)server->ops->tree_disconnect(xid, ipc);
+ _free_xid(xid);
+ }
+ tconInfoFree(ipc, netfs_trace_tcon_ref_free_ipc);
return ret;
}
/* Refresh dfs referral of @ses */
-static void refresh_ses_referral(struct cifs_ses *ses)
+static void refresh_ses_referral(struct cifs_tcon *tcon, struct cifs_ses *ses)
{
struct cache_entry *ce;
unsigned int xid;
@@ -1153,7 +1192,7 @@ static void refresh_ses_referral(struct cifs_ses *ses)
}
ses = CIFS_DFS_ROOT_SES(ses);
- if (!is_ses_good(ses)) {
+ if (!is_ses_good(tcon, ses)) {
cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",
__func__);
goto out;
@@ -1241,7 +1280,7 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
up_read(&htable_rw_lock);
ses = CIFS_DFS_ROOT_SES(ses);
- if (!is_ses_good(ses)) {
+ if (!is_ses_good(tcon, ses)) {
cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",
__func__);
goto out;
@@ -1309,7 +1348,7 @@ void dfs_cache_refresh(struct work_struct *work)
tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);
list_for_each_entry(ses, &tcon->dfs_ses_list, dlist)
- refresh_ses_referral(ses);
+ refresh_ses_referral(tcon, ses);
refresh_tcon_referral(tcon, false);
queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 098a79b7a959..cac355364e43 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2484,11 +2484,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0) {
+ if (rc == 0)
d_move(from_dentry, to_dentry);
- /* Force a new lookup */
- d_drop(from_dentry);
- }
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 95cd484cfbba..1e39f2165e42 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2799,11 +2799,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
int rc;
__le16 *utf16_path;
- struct cached_fid *cfid = NULL;
+ struct cached_fid *cfid;
int retries = 0, cur_sleep = 1;
replay_again:
/* reinitialize for possible replay */
+ cfid = NULL;
flags = CIFS_CP_CREATE_CLOSE_OP;
oplock = SMB2_OPLOCK_LEVEL_NONE;
server = cifs_pick_channel(ses);
@@ -5446,7 +5447,6 @@ struct smb_version_operations smb20_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .calc_signature = smb2_calc_signature,
.is_read_op = smb2_is_read_op,
.set_oplock_level = smb2_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
@@ -5550,7 +5550,6 @@ struct smb_version_operations smb21_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .calc_signature = smb2_calc_signature,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb21_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
@@ -5660,7 +5659,6 @@ struct smb_version_operations smb30_operations = {
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
.generate_signingkey = generate_smb30signingkey,
- .calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb3_set_oplock_level,
@@ -5777,7 +5775,6 @@ struct smb_version_operations smb311_operations = {
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
.generate_signingkey = generate_smb311signingkey,
- .calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb3_set_oplock_level,
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6eb86d134abc..5241daaae543 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -39,12 +39,6 @@ extern struct mid_q_entry *smb2_setup_async_request(
struct TCP_Server_Info *server, struct smb_rqst *rqst);
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
-extern int smb2_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server,
- bool allocate_crypto);
-extern int smb3_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server,
- bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index ad6068e17a2a..6a9b80385b86 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -209,9 +209,9 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
return tcon;
}
-int
+static int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
- bool allocate_crypto)
+ bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
@@ -465,9 +465,9 @@ generate_smb311signingkey(struct cifs_ses *ses,
return generate_smb3signingkey(ses, server, &triplet);
}
-int
+static int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
- bool allocate_crypto)
+ bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
@@ -477,6 +477,9 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
+ if (server->vals->protocol_id <= SMB21_PROT_ID)
+ return smb2_calc_signature(rqst, server, allocate_crypto);
+
rc = smb3_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
if (unlikely(rc)) {
cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__);
@@ -547,7 +550,6 @@ out:
static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- int rc = 0;
struct smb2_hdr *shdr;
struct smb2_sess_setup_req *ssr;
bool is_binding;
@@ -574,9 +576,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
- rc = server->ops->calc_signature(rqst, server, false);
-
- return rc;
+ return smb3_calc_signature(rqst, server, false);
}
int
@@ -612,7 +612,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- rc = server->ops->calc_signature(rqst, server, true);
+ rc = smb3_calc_signature(rqst, server, true);
if (rc)
return rc;
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 49e2df3ad1f0..85a4c55b61b8 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -172,6 +172,7 @@ static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
* in order to notice the broken connection.
*/
wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.dec_wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
@@ -495,6 +496,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_send_io *request =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = request->socket;
+ int lcredits = 0;
log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
request, ib_wc_status_msg(wc->status));
@@ -504,22 +506,24 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
+ mempool_free(request, sc->send_io.mem.pool);
+ lcredits += 1;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->opcode);
- mempool_free(request, sc->send_io.mem.pool);
smbd_disconnect_rdma_connection(sc);
return;
}
+ atomic_add(lcredits, &sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
if (atomic_dec_and_test(&sc->send_io.pending.count))
wake_up(&sc->send_io.pending.zero_wait_queue);
wake_up(&sc->send_io.pending.dec_wait_queue);
-
- mempool_free(request, sc->send_io.mem.pool);
}
static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
@@ -567,6 +571,7 @@ static bool process_negotiation_response(
log_rdma_event(ERR, "error: credits_granted==0\n");
return false;
}
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
@@ -1114,6 +1119,24 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,
struct smbdirect_data_transfer *packet;
int new_credits = 0;
+wait_lcredit:
+ /* Wait for local send credits */
+ rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
+ atomic_read(&sc->send_io.lcredits.count) > 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (rc)
+ goto err_wait_lcredit;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ log_outgoing(ERR, "disconnected not sending on wait_lcredit\n");
+ rc = -EAGAIN;
+ goto err_wait_lcredit;
+ }
+ if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
+ atomic_inc(&sc->send_io.lcredits.count);
+ goto wait_lcredit;
+ }
+
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
@@ -1132,23 +1155,6 @@ wait_credit:
goto wait_credit;
}
-wait_send_queue:
- wait_event(sc->send_io.pending.dec_wait_queue,
- atomic_read(&sc->send_io.pending.count) < sp->send_credit_target ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
- rc = -EAGAIN;
- goto err_wait_send_queue;
- }
-
- if (unlikely(atomic_inc_return(&sc->send_io.pending.count) >
- sp->send_credit_target)) {
- atomic_dec(&sc->send_io.pending.count);
- goto wait_send_queue;
- }
-
request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
@@ -1229,10 +1235,21 @@ wait_send_queue:
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
+ /*
+ * Now that we got a local and a remote credit,
+ * add ourselves to the pending count.
+ */
+ atomic_inc(&sc->send_io.pending.count);
+
rc = smbd_post_send(sc, request);
if (!rc)
return 0;
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
+
+ wake_up(&sc->send_io.pending.dec_wait_queue);
+
err_dma:
for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
@@ -1246,14 +1263,14 @@ err_dma:
atomic_sub(new_credits, &sc->recv_io.credits.count);
err_alloc:
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
-
-err_wait_send_queue:
- /* roll back send credits and pending */
atomic_inc(&sc->send_io.credits.count);
+ wake_up(&sc->send_io.credits.wait_queue);
err_wait_credit:
+ atomic_inc(&sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
+err_wait_lcredit:
return rc;
}
@@ -1767,6 +1784,7 @@ static struct smbd_connection *_smbd_get_connection(
struct smbdirect_socket *sc;
struct smbdirect_socket_parameters *sp;
struct rdma_conn_param conn_param;
+ struct ib_qp_cap qp_cap;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
struct ib_port_immutable port_immutable;
@@ -1838,6 +1856,25 @@ static struct smbd_connection *_smbd_get_connection(
goto config_failed;
}
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ sc->ib.dev->attrs.max_qp_rd_atom);
+ log_rdma_mr(INFO, "responder_resources=%d\n",
+ sp->responder_resources);
+
+ /*
+ * We allocate sp->responder_resources * 2 MRs
+ * and each MR needs WRs for REG and INV, so
+ * we use '* 4'.
+ *
+ * +1 for ib_drain_qp()
+ */
+ memset(&qp_cap, 0, sizeof(qp_cap));
+ qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
+ qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+ qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+
sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
if (IS_ERR(sc->ib.pd)) {
rc = PTR_ERR(sc->ib.pd);
@@ -1848,7 +1885,7 @@ static struct smbd_connection *_smbd_get_connection(
sc->ib.send_cq =
ib_alloc_cq_any(sc->ib.dev, sc,
- sp->send_credit_target, IB_POLL_SOFTIRQ);
+ qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.send_cq)) {
sc->ib.send_cq = NULL;
goto alloc_cq_failed;
@@ -1856,7 +1893,7 @@ static struct smbd_connection *_smbd_get_connection(
sc->ib.recv_cq =
ib_alloc_cq_any(sc->ib.dev, sc,
- sp->recv_credit_max, IB_POLL_SOFTIRQ);
+ qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.recv_cq)) {
sc->ib.recv_cq = NULL;
goto alloc_cq_failed;
@@ -1865,11 +1902,7 @@ static struct smbd_connection *_smbd_get_connection(
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
qp_attr.qp_context = sc;
- qp_attr.cap.max_send_wr = sp->send_credit_target;
- qp_attr.cap.max_recv_wr = sp->recv_credit_max;
- qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- qp_attr.cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
- qp_attr.cap.max_inline_data = 0;
+ qp_attr.cap = qp_cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = sc->ib.send_cq;
@@ -1883,12 +1916,6 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.qp = sc->rdma.cm_id->qp;
- sp->responder_resources =
- min_t(u8, sp->responder_resources,
- sc->ib.dev->attrs.max_qp_rd_atom);
- log_rdma_mr(INFO, "responder_resources=%d\n",
- sp->responder_resources);
-
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = sp->initiator_depth;
conn_param.responder_resources = sp->responder_resources;
diff --git a/fs/smb/client/trace.c b/fs/smb/client/trace.c
index 465483787193..16b0e719731f 100644
--- a/fs/smb/client/trace.c
+++ b/fs/smb/client/trace.c
@@ -4,5 +4,6 @@
*
* Author(s): Steve French <stfrench@microsoft.com>
*/
+#include "cifsglob.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 361db7f9f623..ee5a90d691c8 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -142,7 +142,15 @@ struct smbdirect_socket {
} mem;
/*
- * The credit state for the send side
+ * The local credit state for ib_post_send()
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } lcredits;
+
+ /*
+ * The remote credit state for the send side
*/
struct {
atomic_t count;
@@ -337,6 +345,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
disable_delayed_work_sync(&sc->idle.timer_work);
+ atomic_set(&sc->send_io.lcredits.count, 0);
+ init_waitqueue_head(&sc->send_io.lcredits.wait_queue);
+
atomic_set(&sc->send_io.credits.count, 0);
init_waitqueue_head(&sc->send_io.credits.wait_queue);
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 46f87fd1ce1c..2c08cccfa680 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -263,10 +263,16 @@ static void ipc_msg_handle_free(int handle)
static int handle_response(int type, void *payload, size_t sz)
{
- unsigned int handle = *(unsigned int *)payload;
+ unsigned int handle;
struct ipc_msg_table_entry *entry;
int ret = 0;
+ /* Prevent 4-byte read beyond declared payload size */
+ if (sz < sizeof(unsigned int))
+ return -EINVAL;
+
+ handle = *(unsigned int *)payload;
+
ipc_update_last_active();
down_read(&ipc_msg_table_lock);
hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) {
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index a201c5871a77..7d86553fcc7c 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -219,6 +219,7 @@ static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
* in order to notice the broken connection.
*/
wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
wake_up_all(&sc->recv_io.reassembly.wait_queue);
@@ -417,9 +418,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
sc->ib.dev = sc->rdma.cm_id->device;
- INIT_WORK(&sc->recv_io.posted.refill_work,
- smb_direct_post_recv_credits);
- INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);
conn = ksmbd_conn_alloc();
@@ -450,11 +448,10 @@ static void free_transport(struct smb_direct_transport *t)
struct smbdirect_recv_io *recvmsg;
disable_work_sync(&sc->disconnect_work);
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
smb_direct_disconnect_rdma_work(&sc->disconnect_work);
- wait_event_interruptible(sc->status_wait,
- sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
- }
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED)
+ wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
/*
* Wake up all waiters in all wait queues
@@ -469,9 +466,11 @@ static void free_transport(struct smb_direct_transport *t)
disable_delayed_work_sync(&sc->idle.timer_work);
disable_work_sync(&sc->idle.immediate_work);
+ if (sc->rdma.cm_id)
+ rdma_lock_handler(sc->rdma.cm_id);
+
if (sc->ib.qp) {
ib_drain_qp(sc->ib.qp);
- ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);
sc->ib.qp = NULL;
rdma_destroy_qp(sc->rdma.cm_id);
}
@@ -498,8 +497,10 @@ static void free_transport(struct smb_direct_transport *t)
ib_free_cq(sc->ib.recv_cq);
if (sc->ib.pd)
ib_dealloc_pd(sc->ib.pd);
- if (sc->rdma.cm_id)
+ if (sc->rdma.cm_id) {
+ rdma_unlock_handler(sc->rdma.cm_id);
rdma_destroy_id(sc->rdma.cm_id);
+ }
smb_direct_destroy_pools(sc);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
@@ -524,6 +525,12 @@ static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,
{
int i;
+ /*
+ * The list needs to be empty!
+ * The caller should take care of it.
+ */
+ WARN_ON_ONCE(!list_empty(&msg->sibling_list));
+
if (msg->num_sge > 0) {
ib_dma_unmap_single(sc->ib.dev,
msg->sge[0].addr, msg->sge[0].length,
@@ -909,9 +916,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbdirect_send_io *sendmsg, *sibling;
+ struct smbdirect_send_io *sendmsg, *sibling, *next;
struct smbdirect_socket *sc;
- struct list_head *pos, *prev, *end;
+ int lcredits = 0;
sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
sc = sendmsg->socket;
@@ -920,27 +927,31 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
+ /*
+ * Free possible siblings and then the main send_io
+ */
+ list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
+ lcredits += 1;
+ }
+ /* Note this frees wc->wr_cqe, but not wc */
+ smb_direct_free_sendmsg(sc, sendmsg);
+ lcredits += 1;
+
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
pr_err("Send error. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
smb_direct_disconnect_rdma_connection(sc);
+ return;
}
+ atomic_add(lcredits, &sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
if (atomic_dec_and_test(&sc->send_io.pending.count))
wake_up(&sc->send_io.pending.zero_wait_queue);
-
- /* iterate and free the list of messages in reverse. the list's head
- * is invalid.
- */
- for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next;
- prev != end; pos = prev, prev = prev->prev) {
- sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
- }
-
- sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
}
static int manage_credits_prior_sending(struct smbdirect_socket *sc)
@@ -988,8 +999,6 @@ static int smb_direct_post_send(struct smbdirect_socket *sc,
ret = ib_post_send(sc->ib.qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
smb_direct_disconnect_rdma_connection(sc);
}
return ret;
@@ -1032,19 +1041,29 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
last->wr.send_flags = IB_SEND_SIGNALED;
last->wr.wr_cqe = &last->cqe;
+ /*
+ * Remove last from send_ctx->msg_list
+ * and splice the rest of send_ctx->msg_list
+ * to last->sibling_list.
+ *
+ * send_ctx->msg_list is a valid empty list
+ * at the end.
+ */
+ list_del_init(&last->sibling_list);
+ list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list);
+ send_ctx->wr_cnt = 0;
+
ret = smb_direct_post_send(sc, &first->wr);
- if (!ret) {
- smb_direct_send_ctx_init(send_ctx,
- send_ctx->need_invalidate_rkey,
- send_ctx->remote_key);
- } else {
- atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count);
- wake_up(&sc->send_io.credits.wait_queue);
- list_for_each_entry_safe(first, last, &send_ctx->msg_list,
- sibling_list) {
- smb_direct_free_sendmsg(sc, first);
+ if (ret) {
+ struct smbdirect_send_io *sibling, *next;
+
+ list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
+ smb_direct_free_sendmsg(sc, last);
}
+
return ret;
}
@@ -1070,6 +1089,23 @@ static int wait_for_credits(struct smbdirect_socket *sc,
} while (true);
}
+static int wait_for_send_lcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx)
+{
+ if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) {
+ int ret;
+
+ ret = smb_direct_flush_send_list(sc, send_ctx, false);
+ if (ret)
+ return ret;
+ }
+
+ return wait_for_credits(sc,
+ &sc->send_io.lcredits.wait_queue,
+ &sc->send_io.lcredits.count,
+ 1);
+}
+
static int wait_for_send_credits(struct smbdirect_socket *sc,
struct smbdirect_send_batch *send_ctx)
{
@@ -1257,9 +1293,13 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
int data_length;
struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
+ ret = wait_for_send_lcredit(sc, send_ctx);
+ if (ret)
+ goto lcredit_failed;
+
ret = wait_for_send_credits(sc, send_ctx);
if (ret)
- return ret;
+ goto credit_failed;
data_length = 0;
for (i = 0; i < niov; i++)
@@ -1267,10 +1307,8 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
ret = smb_direct_create_header(sc, data_length, remaining_data_length,
&msg);
- if (ret) {
- atomic_inc(&sc->send_io.credits.count);
- return ret;
- }
+ if (ret)
+ goto header_failed;
for (i = 0; i < niov; i++) {
struct ib_sge *sge;
@@ -1308,7 +1346,11 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
return 0;
err:
smb_direct_free_sendmsg(sc, msg);
+header_failed:
atomic_inc(&sc->send_io.credits.count);
+credit_failed:
+ atomic_inc(&sc->send_io.lcredits.count);
+lcredit_failed:
return ret;
}
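
The new label ladder releases the two credit types in the reverse order they were acquired, the standard kernel goto-unwind idiom. A compact sketch of the same shape; demo_* is invented, and a non-blocking atomic_add_unless() stands in for the blocking wait_for_*credit helpers:

#include <linux/atomic.h>
#include <linux/errno.h>

struct demo_sock {
	atomic_t lcredits;	/* local send slots */
	atomic_t credits;	/* peer-granted credits */
};

/* Non-blocking stand-in for the wait_for_*credit helpers. */
static int demo_take(atomic_t *c)
{
	return atomic_add_unless(c, -1, 0) ? 0 : -EAGAIN;
}

static int demo_build_header(struct demo_sock *sc)
{
	return 0;	/* stand-in for smb_direct_create_header() */
}

static int demo_post(struct demo_sock *sc)
{
	int ret;

	ret = demo_take(&sc->lcredits);		/* acquired first ... */
	if (ret)
		goto lcredit_failed;

	ret = demo_take(&sc->credits);
	if (ret)
		goto credit_failed;

	ret = demo_build_header(sc);		/* ... fails last */
	if (ret)
		goto header_failed;

	return 0;

header_failed:
	atomic_inc(&sc->credits);		/* release in reverse order */
credit_failed:
	atomic_inc(&sc->lcredits);
lcredit_failed:
	return ret;
}
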
@@ -1687,10 +1729,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
- ib_drain_qp(sc->ib.qp);
-
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
smb_direct_disconnect_rdma_work(&sc->disconnect_work);
+ if (sc->ib.qp)
+ ib_drain_qp(sc->ib.qp);
break;
}
case RDMA_CM_EVENT_CONNECT_ERROR: {
@@ -1864,27 +1906,17 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
goto out_err;
}
- smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
return 0;
out_err:
put_recvmsg(sc, recvmsg);
return ret;
}
-static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
-{
- return min_t(unsigned int,
- sc->ib.dev->attrs.max_fast_reg_page_list_len,
- 256);
-}
-
-static int smb_direct_init_params(struct smbdirect_socket *sc,
- struct ib_qp_cap *cap)
+static int smb_direct_init_params(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct ib_device *device = sc->ib.dev;
- int max_send_sges, max_rw_wrs, max_send_wrs;
- unsigned int max_sge_per_wr, wrs_per_credit;
+ int max_send_sges;
+ unsigned int maxpages;
/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
* SMB2 response could be mapped.
@@ -1895,67 +1927,20 @@ static int smb_direct_init_params(struct smbdirect_socket *sc,
return -EINVAL;
}
- /* Calculate the number of work requests for RDMA R/W.
- * The maximum number of pages which can be registered
- * with one Memory region can be transferred with one
- * R/W credit. And at least 4 work requests for each credit
- * are needed for MR registration, RDMA R/W, local & remote
- * MR invalidation.
- */
- sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
- sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
- (sc->rw_io.credits.num_pages - 1) *
- PAGE_SIZE);
-
- max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
- device->attrs.max_sge_rd);
- max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
- max_send_sges);
- wrs_per_credit = max_t(unsigned int, 4,
- DIV_ROUND_UP(sc->rw_io.credits.num_pages,
- max_sge_per_wr) + 1);
- max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
-
- max_send_wrs = sp->send_credit_target + max_rw_wrs;
- if (max_send_wrs > device->attrs.max_cqe ||
- max_send_wrs > device->attrs.max_qp_wr) {
- pr_err("consider lowering send_credit_target = %d\n",
- sp->send_credit_target);
- pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- device->attrs.max_cqe, device->attrs.max_qp_wr);
- return -EINVAL;
- }
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
- if (sp->recv_credit_max > device->attrs.max_cqe ||
- sp->recv_credit_max > device->attrs.max_qp_wr) {
- pr_err("consider lowering receive_credit_max = %d\n",
- sp->recv_credit_max);
- pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
- device->attrs.max_cqe, device->attrs.max_qp_wr);
- return -EINVAL;
- }
-
- if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
- pr_err("warning: device max_send_sge = %d too small\n",
- device->attrs.max_send_sge);
- return -EINVAL;
- }
- if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
- pr_err("warning: device max_recv_sge = %d too small\n",
- device->attrs.max_recv_sge);
- return -EINVAL;
- }
+ maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
+ sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
+ sc->rdma.cm_id->port_num,
+ maxpages);
+ sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
+ /* add one extra in order to handle unaligned pages */
+ sc->rw_io.credits.max += 1;
sc->recv_io.credits.target = 1;
atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
- cap->max_send_wr = max_send_wrs;
- cap->max_recv_wr = sp->recv_credit_max;
- cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
- cap->max_inline_data = 0;
- cap->max_rdma_ctxs = sc->rw_io.credits.max;
return 0;
}
@@ -2029,13 +2014,129 @@ err:
return -ENOMEM;
}
-static int smb_direct_create_qpair(struct smbdirect_socket *sc,
- struct ib_qp_cap *cap)
+static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr)
+{
+ /*
+ * This could be split out of rdma_rw_init_qp()
+ * and be a helper function next to rdma_rw_mr_factor()
+ *
+ * We can't check unlikely(rdma_rw_force_mr) here,
+ * but that is most likely 0 anyway.
+ */
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the device needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
+ factor += 2; /* inv + reg */
+
+ return factor * attr->cap.max_rdma_ctxs;
+}
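
Working the helper through with assumed numbers: on an iWARP device, or one with attrs.max_sgl_rd set, the factor becomes 1 + 2 = 3, so cap.max_rdma_ctxs = 16 would reserve 48 additional send WRs, while a device that needs no MRs reserves only 16 for the same 16 contexts.
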
+
+static int smb_direct_create_qpair(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
int ret;
+ struct ib_qp_cap qp_cap;
struct ib_qp_init_attr qp_attr;
- int pages_per_rw;
+ u32 max_send_wr;
+ u32 rdma_send_wr;
+
+ /*
+ * Note that {rdma,ib}_create_qp() will call
+ * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
+ * It will adjust cap->max_send_wr to the required
+ * number of additional WRs for the RDMA RW operations.
+ * It will cap cap->max_send_wr to the device limit.
+ *
+ * +1 for ib_drain_qp
+ */
+ qp_cap.max_send_wr = sp->send_credit_target + 1;
+ qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+ qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+ qp_cap.max_inline_data = 0;
+ qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
+
+ /*
+ * Find out the number of max_send_wr
+ * after rdma_rw_init_qp() adjusted it.
+ *
+ * We only do it on a temporary variable,
+ * as rdma_create_qp() will trigger
+ * rdma_rw_init_qp() again.
+ */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.cap = qp_cap;
+ qp_attr.port_num = sc->rdma.cm_id->port_num;
+ rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
+ max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
+
+ if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_send_wr %d\n",
+ qp_cap.max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d\n",
+ sp->send_credit_target);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_rdma_ctxs &&
+ (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
+ max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
+ pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
+ rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
+ sp->send_credit_target, qp_cap.max_rdma_ctxs);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_recv_wr %d\n",
+ qp_cap.max_recv_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering receive_credit_max = %d\n",
+ sp->recv_credit_max);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
+ qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
+ pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_send_sge,
+ sc->ib.dev->attrs.max_recv_sge);
+ return -EINVAL;
+ }
sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
if (IS_ERR(sc->ib.pd)) {
@@ -2046,8 +2147,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
}
sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- sp->send_credit_target +
- cap->max_rdma_ctxs,
+ max_send_wr,
IB_POLL_WORKQUEUE);
if (IS_ERR(sc->ib.send_cq)) {
pr_err("Can't create RDMA send CQ\n");
@@ -2057,7 +2157,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
}
sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- sp->recv_credit_max,
+ qp_cap.max_recv_wr,
IB_POLL_WORKQUEUE);
if (IS_ERR(sc->ib.recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
@@ -2066,10 +2166,18 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
goto err;
}
+ /*
+	 * We reset qp_attr completely here, as the use above was
+	 * only temporary, to calculate max_send_wr and rdma_send_wr.
+ *
+ * rdma_create_qp() will trigger rdma_rw_init_qp()
+ * again if max_rdma_ctxs is not 0.
+ */
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smb_direct_qpair_handler;
qp_attr.qp_context = sc;
- qp_attr.cap = *cap;
+ qp_attr.cap = qp_cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = sc->ib.send_cq;
@@ -2085,18 +2193,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
sc->ib.qp = sc->rdma.cm_id->qp;
sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
- pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
- if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
- ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
- sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
- sc->rw_io.credits.num_pages, 0);
- if (ret) {
- pr_err("failed to init mr pool count %zu pages %zu\n",
- sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
- goto err;
- }
- }
-
return 0;
err:
if (sc->ib.qp) {
@@ -2154,8 +2250,8 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
return -ECONNABORTED;
ret = smb_direct_check_recvmsg(recvmsg);
- if (ret == -ECONNABORTED)
- goto out;
+ if (ret)
+ goto put;
req = (struct smbdirect_negotiate_req *)recvmsg->packet;
sp->max_recv_size = min_t(int, sp->max_recv_size,
@@ -2170,23 +2266,46 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
- ret = smb_direct_send_negotiate_response(sc, ret);
-out:
+put:
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.queue_length--;
list_del(&recvmsg->list);
spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
put_recvmsg(sc, recvmsg);
+ if (ret == -ECONNABORTED)
+ return ret;
+
+ if (ret)
+ goto respond;
+
+ /*
+	 * The negotiation succeeded, so we need to refill the recv queue.
+ * We do that with sc->idle.immediate_work still being disabled
+ * via smbdirect_socket_init(), so that queue_work(sc->workqueue,
+ * &sc->idle.immediate_work) in smb_direct_post_recv_credits()
+ * is a no-op.
+ *
+ * The message that grants the credits to the client is
+ * the negotiate response.
+ */
+ INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits);
+ smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
+ if (unlikely(sc->first_error))
+ return sc->first_error;
+ INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
+
+respond:
+ ret = smb_direct_send_negotiate_response(sc, ret);
+
return ret;
}
static int smb_direct_connect(struct smbdirect_socket *sc)
{
- struct ib_qp_cap qp_cap;
int ret;
- ret = smb_direct_init_params(sc, &qp_cap);
+ ret = smb_direct_init_params(sc);
if (ret) {
pr_err("Can't configure RDMA parameters\n");
return ret;
@@ -2198,7 +2317,7 @@ static int smb_direct_connect(struct smbdirect_socket *sc)
return ret;
}
- ret = smb_direct_create_qpair(sc, &qp_cap);
+ ret = smb_direct_create_qpair(sc);
if (ret) {
pr_err("Can't accept RDMA client: %d\n", ret);
return ret;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2d78e94072a0..e142bac4f9f8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -498,17 +498,26 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
}
EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);
-static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
+static int sysfs_group_attrs_change_owner(struct kobject *kobj,
+ struct kernfs_node *grp_kn,
const struct attribute_group *grp,
struct iattr *newattrs)
{
struct kernfs_node *kn;
- int error;
+ int error, i;
+ umode_t mode;
if (grp->attrs) {
struct attribute *const *attr;
- for (attr = grp->attrs; *attr; attr++) {
+ for (i = 0, attr = grp->attrs; *attr; i++, attr++) {
+ if (grp->is_visible) {
+ mode = grp->is_visible(kobj, *attr, i);
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ break;
+ if (!mode)
+ continue;
+ }
kn = kernfs_find_and_get(grp_kn, (*attr)->name);
if (!kn)
return -ENOENT;
@@ -523,7 +532,14 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
if (grp->bin_attrs) {
const struct bin_attribute *const *bin_attr;
- for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+ if (grp->is_bin_visible) {
+ mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ break;
+ if (!mode)
+ continue;
+ }
kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
if (!kn)
return -ENOENT;
@@ -573,7 +589,7 @@ int sysfs_group_change_owner(struct kobject *kobj,
error = kernfs_setattr(grp_kn, &newattrs);
if (!error)
- error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
+ error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs);
kernfs_put(grp_kn);
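
For context, is_visible() is the group's visibility hook whose verdicts the ownership walk above now mirrors: SYSFS_GROUP_INVISIBLE suppresses the whole group, 0 hides a single attribute, and any other return exposes it with that mode. A skeletal callback under those rules; every demo_* name is hypothetical:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobj_attribute demo_optional_attr =
	__ATTR(demo_optional, 0444, NULL, NULL);

/* Hypothetical feature probes. */
static bool demo_hw_supported(struct kobject *kobj)     { return true; }
static bool demo_optional_enabled(struct kobject *kobj) { return false; }

static umode_t demo_attr_visible(struct kobject *kobj,
				 struct attribute *attr, int n)
{
	/* Suppress the entire group on unsupported hardware. */
	if (!demo_hw_supported(kobj))
		return SYSFS_GROUP_INVISIBLE;

	/* Hide just this one attribute. */
	if (attr == &demo_optional_attr.attr && !demo_optional_enabled(kobj))
		return 0;

	/* Otherwise expose the attribute with its default mode. */
	return attr->mode;
}
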
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 8930d5254e1d..b99da294e9a3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -119,6 +119,15 @@ config XFS_RT
See the xfs man page in section 5 for additional information.
+ This option is mandatory to support zoned block devices. For these
+ devices, the realtime subvolume must be backed by a zoned block
+ device and a regular block device used as the main device (for
+ metadata). If the zoned block device is a host-managed SMR hard-disk
+ containing conventional zones at the beginning of its address space,
+ XFS will use the disk conventional zones as the main device and the
+ remaining sequential write required zones as the backing storage for
+ the realtime subvolume.
+
If unsure, say N.
config XFS_DRAIN_INTENTS
@@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select DEBUG_FS
+ depends on DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index d36a6ae0abe5..d4fcf591e63d 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -50,6 +50,12 @@ struct xfs_rtgroup {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
+
+ /*
+ * Count of outstanding GC operations for zoned XFS. Any RTG with a
+ * non-zero rtg_gccount will not be picked as new GC victim.
+ */
+ atomic_t rtg_gccount;
};
/*
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 26721fab5cab..091c79e432e5 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -376,6 +376,36 @@ out_incomplete:
return error;
}
+static uint
+xchk_nlinks_ilock_dir(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /*
+ * We're going to scan the directory entries, so we must be ready to
+ * pull the data fork mappings into memory if they aren't already.
+ */
+ if (xfs_need_iread_extents(&ip->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * We're going to scan the parent pointers, so we must be ready to
+ * pull the attr fork mappings into memory if they aren't already.
+ */
+ if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) &&
+ xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * Take the IOLOCK so that other threads cannot start a directory
+ * update while we're scanning.
+ */
+ lock_mode |= XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(
return 0;
/* Prevent anyone from changing this directory while we walk it. */
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
- lock_mode = xfs_ilock_data_map_shared(dp);
+ lock_mode = xchk_nlinks_ilock_dir(dp);
/*
* The dotdot entry of an unlinked directory still points to the last
@@ -452,7 +481,6 @@ out_abort:
xchk_iscan_abort(&xnc->collect_iscan);
out_unlock:
xfs_iunlock(dp, lock_mode);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 773d959965dc..47edf3041631 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1751,7 +1751,7 @@ xfs_init_buftarg(
const char *descr)
{
/* The maximum size of the buftarg is only known once the sb is read. */
- btp->bt_nr_sectors = (xfs_daddr_t)-1;
+ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8fa7bdf59c91..e25cd2a160f3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;
*/
struct xfs_buf;
+#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX)
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..1067ebb3b001 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = {
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
+ Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
@@ -114,7 +114,21 @@ enum {
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
+#define fsparam_dead(NAME) \
+ __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL)
+
static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ /*
+ * These mount options were supposed to be deprecated in September 2025
+ * but the deprecation warning was buggy, so not all users were
+ * notified. The deprecation is now obnoxiously loud and postponed to
+ * September 2030.
+ */
+ fsparam_dead("attr2"),
+ fsparam_dead("noattr2"),
+ fsparam_dead("ikeep"),
+ fsparam_dead("noikeep"),
+
fsparam_u32("logbufs", Opt_logbufs),
fsparam_string("logbsize", Opt_logbsize),
fsparam_string("logdev", Opt_logdev),
@@ -786,6 +800,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
@@ -1373,16 +1393,25 @@ suffix_kstrtoull(
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
- struct fs_parameter *param,
- uint64_t flag,
- bool value)
+ struct fs_parameter *param)
{
- /* Don't print the warning if reconfiguring and current mount point
- * already had the flag set
+ /*
+ * Always warn about someone passing in a deprecated mount option.
+ * Previously we wouldn't print the warning if we were reconfiguring
+ * and current mount point already had the flag set, but that was not
+ * the right thing to do.
+ *
+ * Many distributions mount the root filesystem with no options in the
+ * initramfs and rely on mount -a to remount the root fs with the
+ * options in fstab. However, the old behavior meant that there would
+ * never be a warning about deprecated mount options for the root fs in
+ * /etc/fstab. On a single-fs system, that means no warning at all.
+ *
+ * Compounding this problem are distribution scripts that copy
+ * /proc/mounts to fstab, which means that we can't remove mount
+ * options unless we're 100% sure they have only ever been advertised
+ * in /proc/mounts in response to explicitly provided mount options.
*/
- if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
- !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
- return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@@ -1408,6 +1437,9 @@ xfs_fs_parse_param(
return opt;
switch (opt) {
+ case Op_deprecated:
+ xfs_fs_warn_deprecated(fc, param);
+ return 0;
case Opt_logbufs:
parsing_mp->m_logbufs = result.uint_32;
return 0;
@@ -1528,7 +1560,6 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
- /* Following mount options will be removed in September 2025 */
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -2221,7 +2252,7 @@ xfs_init_fs_context(
struct xfs_mount *mp;
int i;
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
return -ENOMEM;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..040402240807 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -238,6 +246,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished and these blocks will be moved out eventually.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -614,14 +630,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
- * Try to pack inodes that are written back after they were closed tight instead
- * of trying to open new zones for them or spread them to the least recently
- * used zone. This optimizes the data layout for workloads that untar or copy
- * a lot of small files. Right now this does not separate multiple such
+ * Try to tightly pack small files that are written back after they were closed
+ * instead of trying to open new zones for them or spread them to the least
+ * recently used zone. This optimizes the data layout for workloads that untar
+ * or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
+ struct xfs_mount *mp = ip->i_mount;
+ size_t zone_capacity =
+ XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /*
+	 * Do not tightly pack files that already use a full zone, to avoid
+ * fragmentation.
+ */
+ if (i_size_read(VFS_I(ip)) >= zone_capacity)
+ return false;
+
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
@@ -746,97 +773,54 @@ xfs_mark_rtg_boundary(
}
/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
-/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
*
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
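
The replacement cache is the familiar RCU weak-reference shape: lookups only gain a reference via atomic_inc_not_zero() under rcu_read_lock(), stores publish with xchg(), and the final put defers freeing through call_rcu(). A condensed sketch of that pattern with invented demo_ names; the real code keeps the pointer in VFS_I(ip)->i_private:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_zone {
	atomic_t ref;
	struct rcu_head rcu;
};

static void demo_zone_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_zone, rcu));
}

static void demo_zone_put(struct demo_zone *z)
{
	if (atomic_dec_and_test(&z->ref))
		call_rcu(&z->rcu, demo_zone_free_rcu);
}

/* Reader: may return NULL if the cached zone is going away. */
static struct demo_zone *demo_get_cached(struct demo_zone **slot)
{
	struct demo_zone *z;

	rcu_read_lock();
	z = READ_ONCE(*slot);
	if (z && !atomic_inc_not_zero(&z->ref))
		z = NULL;
	rcu_read_unlock();
	return z;
}

/* Writer: swap in a new zone, dropping the old cached reference. */
static void demo_set_cached(struct demo_zone **slot, struct demo_zone *z)
{
	struct demo_zone *old;

	atomic_inc(&z->ref);
	old = xchg(slot, z);
	if (old)
		demo_zone_put(old);
}
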
static void
@@ -880,15 +864,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +949,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1279,14 +1268,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1300,5 +1281,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 064cd1a857a0..4ade54445532 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -114,6 +114,8 @@ struct xfs_gc_bio {
/* Open Zone being written to */
struct xfs_open_zone *oz;
+ struct xfs_rtgroup *victim_rtg;
+
/* Bio used for reads and writes, including the bvec used by it */
struct bio_vec bv;
struct bio bio; /* must be last */
@@ -264,6 +266,7 @@ xfs_zone_gc_iter_init(
iter->rec_count = 0;
iter->rec_idx = 0;
iter->victim_rtg = victim_rtg;
+ atomic_inc(&victim_rtg->rtg_gccount);
}
/*
@@ -362,6 +365,7 @@ xfs_zone_gc_query(
return 0;
done:
+ atomic_dec(&iter->victim_rtg->rtg_gccount);
xfs_rtgroup_rele(iter->victim_rtg);
iter->victim_rtg = NULL;
return 0;
@@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from(
if (!rtg)
continue;
+ /*
+ * If the zone is already undergoing GC, don't pick it again.
+ *
+ * This prevents us from picking one of the zones for which we
+ * already submitted GC I/O, but for which the remapping hasn't
+ * concluded yet. This won't cause data corruption, but
+ * increases write amplification and slows down GC, so this is
+ * a bad thing.
+ */
+ if (atomic_read(&rtg->rtg_gccount)) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
/* skip zones that are just waiting for a reset */
if (rtg_rmap(rtg)->i_used_blocks == 0 ||
rtg_rmap(rtg)->i_used_blocks >= victim_used) {
@@ -491,21 +509,6 @@ xfs_zone_gc_select_victim(
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int bucket;
- if (xfs_is_shutdown(mp))
- return false;
-
- if (iter->victim_rtg)
- return true;
-
- /*
- * Don't start new work if we are asked to stop or park.
- */
- if (kthread_should_stop() || kthread_should_park())
- return false;
-
- if (!xfs_zoned_need_gc(mp))
- return false;
-
spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
@@ -703,6 +706,9 @@ xfs_zone_gc_start_chunk(
chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
+ chunk->victim_rtg = iter->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
@@ -725,6 +731,8 @@ static void
xfs_zone_gc_free_chunk(
struct xfs_gc_bio *chunk)
{
+ atomic_dec(&chunk->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(chunk->victim_rtg);
list_del(&chunk->entry);
xfs_open_zone_put(chunk->oz);
xfs_irele(chunk->ip);
@@ -785,6 +793,10 @@ xfs_zone_gc_split_write(
split_chunk->oz = chunk->oz;
atomic_inc(&chunk->oz->oz_ref);
+ split_chunk->victim_rtg = chunk->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
chunk->offset += split_len;
chunk->len -= split_len;
chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
@@ -975,6 +987,27 @@ xfs_zone_gc_reset_zones(
} while (next);
}
+static bool
+xfs_zone_gc_should_start_new_work(
+ struct xfs_zone_gc_data *data)
+{
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!xfs_zone_gc_space_available(data))
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -982,7 +1015,7 @@ xfs_zone_gc_reset_zones(
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
-static bool
+static void
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
@@ -996,30 +1029,22 @@ xfs_zone_gc_handle_work(
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);
- if (!xfs_zone_gc_select_victim(data) ||
- !xfs_zone_gc_space_available(data)) {
- if (list_empty(&data->reading) &&
- list_empty(&data->writing) &&
- list_empty(&data->resetting) &&
- !reset_list)
- return false;
- }
-
- __set_current_state(TASK_RUNNING);
- try_to_freeze();
-
- if (reset_list)
+ if (reset_list) {
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_reset_zones(data, reset_list);
+ }
list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_reset(chunk);
}
list_for_each_entry_safe(chunk, next, &data->writing, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_chunk(chunk);
}
@@ -1027,15 +1052,18 @@ xfs_zone_gc_handle_work(
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);
- blk_start_plug(&plug);
- while (xfs_zone_gc_start_chunk(data))
- ;
- blk_finish_plug(&plug);
- return true;
+ if (xfs_zone_gc_should_start_new_work(data)) {
+ set_current_state(TASK_RUNNING);
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ }
}
/*
@@ -1059,8 +1087,18 @@ xfs_zoned_gcd(
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
xfs_set_zonegc_running(mp);
- if (xfs_zone_gc_handle_work(data))
+
+ xfs_zone_gc_handle_work(data);
+
+ /*
+		 * Only sleep if nothing set the state to running. Otherwise
+		 * check for work again, as someone might have queued up more
+		 * work and woken us in the meantime.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
continue;
+ }
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&
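
The restructured xfs_zoned_gcd() loop follows the canonical kthread wait pattern: arm TASK_INTERRUPTIBLE before checking for work, let any completed work flip the task back to TASK_RUNNING, and only schedule() when the state survived untouched, so a wakeup between check and sleep is never lost. A self-contained sketch under hypothetical demo_ names:

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static bool demo_have_work(void *arg) { return false; }	/* stand-in */
static void demo_do_work(void *arg)   { }		/* stand-in */

static int demo_thread(void *arg)
{
	for (;;) {
		/* Arm the sleep before looking for work. */
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		if (demo_have_work(arg)) {
			/*
			 * Doing work sets us RUNNING again, so a wakeup
			 * racing with the check above is never lost:
			 * we loop instead of sleeping.
			 */
			set_current_state(TASK_RUNNING);
			demo_do_work(arg);
			continue;
		}

		schedule();
	}
	return 0;
}
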
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*