summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/block-group.c9
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/extent_io.c40
-rw-r--r--fs/btrfs/inode.c23
-rw-r--r--fs/btrfs/qgroup.c6
-rw-r--r--fs/btrfs/super.c9
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/zoned.c2
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c88
-rw-r--r--fs/ceph/mds_client.c172
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/internal.h1
-rw-r--r--fs/erofs/super.c12
-rw-r--r--fs/erofs/xattr.c13
-rw-r--r--fs/erofs/zmap.c67
-rw-r--r--fs/exec.c2
-rw-r--r--fs/fhandle.c8
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h14
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/fuse/passthrough.c5
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/kernfs/file.c58
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c21
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c21
-rw-r--r--fs/nfs/nfs42proc.c35
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/resctrl/ctrlmondata.c2
-rw-r--r--fs/resctrl/internal.h4
-rw-r--r--fs/resctrl/monitor.c6
-rw-r--r--fs/smb/client/cifs_debug.c31
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifsglob.h13
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/inode.c86
-rw-r--r--fs/smb/client/reparse.c2
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c204
-rw-r--r--fs/smb/client/smb2misc.c19
-rw-r--r--fs/smb/client/smb2ops.c32
-rw-r--r--fs/smb/client/smb2pdu.c4
-rw-r--r--fs/smb/client/smb2proto.h3
-rw-r--r--fs/smb/client/trace.h61
-rw-r--r--fs/smb/server/smb2pdu.c25
-rw-r--r--fs/smb/server/transport_rdma.c183
-rw-r--r--fs/smb/server/vfs_cache.h2
65 files changed, 1111 insertions, 477 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9bf282d2453c..499a9edf0ca3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..c0c1ddd46b67 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
@@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c953297aa89a..b21cb72835cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -111,6 +111,24 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
@@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -2583,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2591,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dd82dcc7b2b7..18db1053cdf0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -5696,7 +5695,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ccaa9a3cf1ce..da102da169fd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a262b494a89f..df1f6cc3fe21 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -299,9 +299,12 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
+ ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO,
+ string + 3);
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
@@ -1079,7 +1082,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d5d90845ca9..7a63afedd01e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa7a929a0461..c6e3efd6f602 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index ea662036f441..efc2a81f50e5 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2582,9 +2582,9 @@ again:
spin_lock(&space_info->lock);
space_info->total_bytes -= bg->length;
space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
/* There is no allocation ever happened. */
ASSERT(bg->used == 0);
- ASSERT(bg->zone_unusable == 0);
/* No super block in a block group on the zoned setup. */
ASSERT(bg->bytes_super == 0);
spin_unlock(&space_info->lock);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b202d789e93..322ed268f14a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1687,6 +1689,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1695,8 +1698,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8478e7e75df6..32973c62c1a2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..978acd3d4b32 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..f67025465de0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1583,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1602,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1618,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1630,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1649,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1664,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1794,6 +1848,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -2487,22 +2544,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f497c39ff82..3bc72b47fe4d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2810,16 +2809,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4744,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4768,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4779,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
diff --git a/fs/coredump.c b/fs/coredump.c
index 5dce257c67fc..60bc9685e149 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
+ if (write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
error = proc_dostring(table, write, buffer, lenp, ppos);
if (error)
return error;
+
if (!check_coredump_socket()) {
strscpy(core_pattern, old_core_pattern, retval + 1);
return -EINVAL;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 377ee12b8b96..3d5738f80072 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,10 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ccc5f0ee8df..9319c66e86c3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1b529ace4db0..db13b40a78e0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -1018,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index eaa9efd766ee..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
(void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a93efd95c555..798223e6da9c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
.map = map,
.in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
@@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
@@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -726,6 +711,30 @@ out_unlock:
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..e861a4b7ffda 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && !write)
validate_coredump_safety();
return error;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 68a7d2861c58..a907ddfac4d5 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -208,6 +208,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
return 1;
/*
+ * Verify that the decoded dentry itself has a valid id mapping.
+ * In case the decoded dentry is the mountfd root itself, this
+ * verifies that the mountfd inode itself has a valid id mapping.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+ return 0;
+
+ /*
* It's racy as we're not taking rename_lock but we're able to ignore
* permissions and we just need an approximation whether we were able
* to follow a path to the file.
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..5150aa25e64b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..5c569c3cb53f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = inode->i_sb->s_blocksize_bits;
+ blkbits = fc->blkbits;
stat->blksize = 1 << blkbits;
}
@@ -1377,6 +1377,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5525a4520b0f..4adcf09d4b01 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = len,
+ .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
.flags = flags
};
struct fuse_write_out outarg;
@@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
+ if (!err && outarg.size > len)
+ err = -EIO;
+
if (err)
goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ec248d13c8bf..cc428d04be3e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -210,6 +210,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -969,6 +975,14 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+ /*
+ * This is a workaround until fuse uses iomap for reads.
+ * For fuseblk servers, this represents the blocksize passed in at
+ * mount time and for regular fuse servers, this is equivalent to
+ * inode->i_blkbits.
+ */
+ u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 67c2318bfc42..7ddfd2b3cc9c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
+ else
+ fi->cached_i_blkbits = fc->blkbits;
+
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
* to check permissions. This prevents failures due to the
@@ -1805,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
+ /*
+ * This is a workaround until fuse hooks into iomap for reads.
+ * Use PAGE_SIZE for the blocksize else if the writeback cache
+ * is enabled, buffered writes go through iomap and a read may
+ * overwrite partially written data if blocksize < PAGE_SIZE
+ */
+ fc->blkbits = sb->s_blocksize_bits;
+ if (ctx->blksize != PAGE_SIZE &&
+ !sb_set_blocksize(sb, PAGE_SIZE))
+ goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..eb97ac009e75 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (!file)
goto out;
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index c826e7ca49f5..76c8fd0bfc75 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = fs->window_phys_addr + offset;
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a6c692cac616..9adf36e6364b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
!list_empty(&of->list));
}
+/* Get active reference to kernfs node for an open file */
+static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
+{
+ /* Skip if file was already released */
+ if (unlikely(of->released))
+ return NULL;
+
+ if (!kernfs_get_active(of->kn))
+ return NULL;
+
+ return of;
+}
+
+static void kernfs_put_active_of(struct kernfs_open_file *of)
+{
+ return kernfs_put_active(of->kn);
+}
+
/**
* kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
if (ops->seq_stop)
ops->seq_stop(sf, v);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len < 0)
@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len > 0)
@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
if (!of->vm_ops)
return;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = 0;
@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
else
file_update_time(file);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!of->vm_ops)
return -EINVAL;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&of->mutex);
rc = -ENODEV;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
goto out_unlock;
ops = kernfs_ops(of->kn);
@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
}
vma->vm_ops = &kernfs_vm_ops;
out_put:
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
out_unlock:
mutex_unlock(&of->mutex);
@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
- if (!kernfs_get_active(kn))
+ if (!kernfs_get_active_of(of))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
else
ret = kernfs_generic_poll(of, wait);
- kernfs_put_active(kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
return -ENODEV;
}
@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
else
ret = generic_file_llseek(file, offset, whence);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
return ret;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index ae6d1312b184..51f77c65c0c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2455,7 +2455,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
return ERR_PTR(-EINVAL);
}
- if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (__has_locked_children(old_mnt, path->dentry))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8fb4a950dd55..4e3dcc157a83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->xattr_support)
server->caps |= NFS_CAP_XATTR;
+ else
+ server->caps &= ~NFS_CAP_XATTR;
#endif
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86e36c630f09..8059ece82468 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/gfp.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/compaction.h>
@@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL_GPL(nfs_file_fsync);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to)
+{
+ struct folio *folio;
+
+ if (from >= to)
+ return;
+
+ folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+
+ if (folio_test_uptodate(folio)) {
+ loff_t fpos = folio_pos(folio);
+ size_t offset = from - fpos;
+ size_t end = folio_size(folio);
+
+ if (to - fpos < end)
+ end = to - fpos;
+ folio_zero_segment(folio, offset, end);
+ trace_nfs_size_truncate_folio(mapping->host, to);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(nfs_truncate_last_folio);
+
/*
* Decide whether a read/modify/write cycle may be more efficient
* then a modify/write/read cycle when writing to a page in the
@@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
fgp |= fgf_set_order(len);
start:
@@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
- if (offset != 0 || length < folio_size(folio))
- return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(inode, folio);
+ if (offset != 0 || length < folio_size(folio))
+ nfs_wb_folio(inode, folio);
+ else
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8dc921d83538..9edb5f9b0c4e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
struct pnfs_layout_segment *l2)
{
const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
- const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
u32 i;
if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
@@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ // reinitialize the error state in case if this is the last iteration
+ ds = ERR_PTR(-EINVAL);
continue;
+ }
*best_idx = idx;
break;
@@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
- if (ds)
+ if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
@@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
- if (ds || !pgio->pg_mirror_idx)
+ if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}
@@ -868,7 +871,7 @@ retry:
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ struct nfs4_pnfs_ds *ds;
- if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
- ff_layout_send_layouterror(hdr->lseg);
- else
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ else
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr, new_idx);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 338ef77ae423..49df9debb1a6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
+ loff_t oldsize = i_size_read(inode);
int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- if (attr->ia_size == i_size_read(inode))
+ if (attr->ia_size == oldsize)
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ nfs_file_block_o_direct(NFS_I(inode));
nfs_sync_inode(inode);
+ }
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
@@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
- if (error == 0)
+ if (error == 0) {
+ if (attr->ia_valid & ATTR_SIZE)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ attr->ia_size);
error = nfs_refresh_inode(inode, fattr);
+ }
nfs_free_fattr(fattr);
out:
trace_nfs_setattr_exit(inode, error);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74d712b58423..c0a44f389f8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
}
+/* Must be called with exclusively locked inode->i_rwsem */
+static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(&nfsi->vfs_inode);
+ }
+}
+
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 3388faf2acb9..d275b0a250bf 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -14,15 +14,6 @@
#include "internal.h"
-/* Call with exclusively locked inode->i_rwsem */
-static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
-{
- if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
- clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
- inode_dio_wait(inode);
- }
-}
-
/**
* nfs_start_io_read - declare the file is being used for buffered reads
* @inode: file inode
@@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (err)
return err;
- nfs_block_o_direct(nfsi, inode);
+ nfs_file_block_o_direct(nfsi);
downgrade_write(&inode->i_rwsem);
return 0;
@@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (!err)
- nfs_block_o_direct(NFS_I(inode), inode);
+ nfs_file_block_o_direct(NFS_I(inode));
return err;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index bd5fca285899..97abf62f109d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp)
return;
}
- if (nfs_client_is_local(clp)) {
- /* If already enabled, disable and re-enable */
- nfs_localio_disable_client(clp);
- }
+ if (nfs_client_is_local(clp))
+ return;
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
@@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
case -ENOMEM:
case -ENXIO:
case -ENOENT:
- /* Revalidate localio, will disable if unsupported */
+ /* Revalidate localio */
+ nfs_localio_disable_client(clp);
nfs_local_probe(clp);
}
}
@@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+
+ revert_creds(save_cred);
+
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
}
static int
@@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
nfs_local_vfs_getattr(iocb);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
- current->flags = old_flags;
}
static int
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 01c01f45358b..6a0b5871ba3b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
exception.inode = inode;
exception.state = lock->open_context->state;
+ nfs_file_block_o_direct(NFS_I(inode));
err = nfs_sync_inode(inode);
if (err)
goto out;
@@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
@@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == -EOPNOTSUPP)
+
+ if (err == 0)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
+ else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
NFS_CAP_ZERO_RANGE);
@@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
@@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == 0)
+ if (err == 0) {
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
truncate_pagecache_range(inode, offset, (offset + len) -1);
- if (err == -EOPNOTSUPP)
+ } else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
@@ -354,22 +363,27 @@ out:
/**
* nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
- * @inode: pointer to destination inode
+ * @file: pointer to destination file
* @pos: destination offset
* @len: copy length
+ * @oldsize: length of the file prior to clone/copy
*
* Punch a hole in the inode page cache, so that the NFS client will
* know to retrieve new data.
* Update the file size if necessary, and then mark the inode as having
* invalid cached values for change attribute, ctime, mtime and space used.
*/
-static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len,
+ loff_t oldsize)
{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+ nfs_truncate_last_folio(mapping, oldsize, pos);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT));
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
@@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
struct nfs_server *src_server = NFS_SERVER(src_inode);
loff_t pos_src = args->src_pos;
loff_t pos_dst = args->dst_pos;
+ loff_t oldsize_dst = i_size_read(dst_inode);
size_t count = args->count;
ssize_t status;
@@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
+ nfs_file_block_o_direct(NFS_I(dst_inode));
status = nfs_sync_inode(dst_inode);
if (status)
return status;
@@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst);
nfs_invalidate_atime(src_inode);
status = res->write_res.count;
out:
@@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct nfs42_clone_res res = {
.server = server,
};
+ loff_t oldsize_dst = i_size_read(dst_inode);
int status;
msg->rpc_argp = &args;
@@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
/* a zero-length count means clone to EOF in src */
if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
- nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1d6b5f4230c9..c9a0d1e420c6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
+ nfs_file_block_o_direct(NFS_I(src_inode));
ret = nfs_sync_inode(src_inode);
if (ret)
goto out_unlock;
+ nfs_file_block_o_direct(NFS_I(dst_inode));
ret = nfs_sync_inode(dst_inode);
if (ret)
goto out_unlock;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..ce61253efd45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
- server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
- NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->caps &=
+ ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
+ NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
+ NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
server->fattr_valid = NFS_ATTR_FATTR_V4;
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 96b1323318c2..627115179795 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class,
TP_ARGS(inode, new_size))
DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8b7c04737967..647c53d1418a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
}
/*
- * nfs_page_group_search_locked
- * @head - head request of page group
- * @page_offset - offset into page
+ * nfs_page_covers_folio
+ * @req: struct nfs_page
*
- * Search page group with head @head to find a request that contains the
- * page offset @page_offset.
- *
- * Returns a pointer to the first matching nfs request, or NULL if no
- * match is found.
- *
- * Must be called with the page group lock held
- */
-static struct nfs_page *
-nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
-{
- struct nfs_page *req;
-
- req = head;
- do {
- if (page_offset >= req->wb_pgbase &&
- page_offset < (req->wb_pgbase + req->wb_bytes))
- return req;
-
- req = req->wb_this_page;
- } while (req != head);
-
- return NULL;
-}
-
-/*
- * nfs_page_group_covers_page
- * @head - head request of page group
- *
- * Return true if the page group with head @head covers the whole page,
- * returns false otherwise
+ * Return true if the request covers the whole folio.
+ * Note that the caller should ensure all subrequests have been joined
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
- struct nfs_page *tmp;
- unsigned int pos = 0;
-
- nfs_page_group_lock(req);
- for (;;) {
- tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (!tmp)
- break;
- pos = tmp->wb_pgbase + tmp->wb_bytes;
- }
-
- nfs_page_group_unlock(req);
- return pos >= len;
+ return req->wb_pgbase == 0 && req->wb_bytes == len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -2045,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
* release it */
nfs_inode_remove_request(req);
nfs_unlock_and_release_request(req);
+ folio_cancel_dirty(folio);
}
return ret;
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09f..3c39cfacb251 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
if (d->ci_id == domid) {
- rr.ci_id = d->ci_id;
cpu = cpumask_any(&d->hdr.cpu_mask);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci)
continue;
+ rr.ci = ci;
mon_event_read(&rr, r, NULL, rdtgrp,
&ci->shared_cpu_map, evtid, false);
goto checkresult;
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03..9a8cf6f11151 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- unsigned int ci_id;
+ struct cacheinfo *ci;
int err;
u64 val;
void *arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3ac..7326c28a7908 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
int cpu = smp_processor_id();
struct rdt_mon_domain *d;
- struct cacheinfo *ci;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
@@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!ci || ci->id != rr->ci_id)
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
return -EINVAL;
/*
@@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci_id != rr->ci_id)
+ if (d->ci_id != rr->ci->id)
continue;
err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
rr->evtid, &tval, rr->arch_mon_ctx);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index beb4f18f05ef..2337cf795db3 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -304,6 +304,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cfids = tcon->cfids;
+ if (!cfids)
+ continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
seq_printf(m, "Num entries: %d\n", cfids->num_entries);
list_for_each_entry(cfid, &cfids->entries, entry) {
@@ -319,8 +321,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
-
-
}
}
}
@@ -347,6 +347,22 @@ static __always_inline const char *compression_alg_str(__le16 alg)
}
}
+static __always_inline const char *cipher_alg_str(__le16 cipher)
+{
+ switch (cipher) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ return "AES128-CCM";
+ case SMB2_ENCRYPTION_AES128_GCM:
+ return "AES128-GCM";
+ case SMB2_ENCRYPTION_AES256_CCM:
+ return "AES256-CCM";
+ case SMB2_ENCRYPTION_AES256_GCM:
+ return "AES256-GCM";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -539,6 +555,11 @@ skip_rdma:
else
seq_puts(m, "disabled (not supported by this server)");
+ /* Show negotiated encryption cipher, even if not required */
+ seq_puts(m, "\nEncryption: ");
+ if (server->cipher_type)
+ seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type));
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -576,12 +597,8 @@ skip_rdma:
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
- if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) {
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
seq_puts(m, " encrypted");
- /* can help in debugging to show encryption type */
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
- seq_puts(m, "(gcm256)");
- }
if (ses->sign)
seq_puts(m, " signed");
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index 4cc6e0896fad..f8659d36793f 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
int len;
__le16 *dst;
+ if (!src)
+ return NULL;
+
len = cifs_local_to_utf16_bytes(src, maxlen, cp);
len += 2; /* NULL */
dst = kmalloc(len, GFP_KERNEL);
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 1e64a4fb6af0..0fae95cf81c4 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 7
+#define MAX_COMPOUND 10
/*
* Default number of credits to keep available for SMB3.
@@ -1882,9 +1882,12 @@ static inline bool is_replayable_error(int error)
/* cifs_get_writable_file() flags */
-#define FIND_WR_ANY 0
-#define FIND_WR_FSUID_ONLY 1
-#define FIND_WR_WITH_DELETE 2
+enum cifs_writable_file_flags {
+ FIND_WR_ANY = 0U,
+ FIND_WR_FSUID_ONLY = (1U << 0),
+ FIND_WR_WITH_DELETE = (1U << 1),
+ FIND_WR_NO_PENDING_DELETE = (1U << 2),
+};
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
@@ -2343,6 +2346,8 @@ struct smb2_compound_vars {
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
struct smb2_file_rename_info_hdr rename_info;
struct smb2_file_link_info_hdr link_info;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 186e061068be..cb907e18cc35 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file)
/* Get the cached handle as SMB2 close is deferred */
if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) {
- rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile);
+ rc = cifs_get_writable_path(tcon, full_path,
+ FIND_WR_FSUID_ONLY |
+ FIND_WR_NO_PENDING_DELETE,
+ &cfile);
} else {
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
@@ -2530,6 +2533,9 @@ refind_writable:
continue;
if (with_delete && !(open_file->fid.access & DELETE))
continue;
+ if ((flags & FIND_WR_NO_PENDING_DELETE) &&
+ open_file->status_file_deleted)
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
spin_unlock(&tcon->open_file_lock);
free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
+ if (*ret_file) {
+ spin_lock(&cinode->open_file_lock);
+ if ((*ret_file)->status_file_deleted) {
+ spin_unlock(&cinode->open_file_lock);
+ cifsFileInfo_put(*ret_file);
+ *ret_file = NULL;
+ } else {
+ spin_unlock(&cinode->open_file_lock);
+ }
+ }
return *ret_file ? 0 : -ENOENT;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index fe453a4b3dc8..11d442e8b3d6 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode)
* but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
-int cifs_unlink(struct inode *dir, struct dentry *dentry)
+static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename)
{
int rc = 0;
unsigned int xid;
@@ -2003,7 +2003,11 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
+ if (sillyrename || (server->vals->protocol_id > SMB10_PROT_ID &&
+ d_is_positive(dentry) && d_count(dentry) > 2))
+ rc = -EBUSY;
+ else
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
@@ -2071,6 +2075,11 @@ unlink_out:
return rc;
}
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return __cifs_unlink(dir, dentry, false);
+}
+
static int
cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -2358,14 +2367,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
+ cifsInode = CIFS_I(d_inode(direntry));
+
if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
spin_lock(&d_inode(direntry)->i_lock);
i_size_write(d_inode(direntry), 0);
clear_nlink(d_inode(direntry));
spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -2458,8 +2469,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0)
+ if (rc == 0) {
d_move(from_dentry, to_dentry);
+ /* Force a new lookup */
+ d_drop(from_dentry);
+ }
cifs_put_tlink(tlink);
return rc;
}
@@ -2470,6 +2484,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct dentry *target_dentry, unsigned int flags)
{
const char *from_name, *to_name;
+ struct TCP_Server_Info *server;
void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
@@ -2505,6 +2520,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
page1 = alloc_dentry_path();
page2 = alloc_dentry_path();
@@ -2591,19 +2607,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
unlink_target:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-
- /* Try unlinking the target dentry if it's not negative */
- if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
- if (d_is_dir(target_dentry))
- tmprc = cifs_rmdir(target_dir, target_dentry);
- else
- tmprc = cifs_unlink(target_dir, target_dentry);
- if (tmprc)
- goto cifs_rename_exit;
- rc = cifs_do_rename(xid, source_dentry, from_name,
- target_dentry, to_name);
- if (!rc)
- rehash = false;
+ if (d_really_is_positive(target_dentry)) {
+ if (!rc) {
+ struct inode *inode = d_inode(target_dentry);
+ /*
+ * Samba and ksmbd servers allow renaming a target
+ * directory that is open, so make sure to update
+ * ->i_nlink and then mark it as delete pending.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, 0);
+ clear_nlink(inode);
+ spin_unlock(&inode->i_lock);
+ set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+ CIFS_I(inode)->time = 0; /* force reval */
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
+ } else if (rc == -EACCES || rc == -EEXIST) {
+ /*
+ * Rename failed, possibly due to a busy target.
+ * Retry it by unliking the target first.
+ */
+ if (d_is_dir(target_dentry)) {
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ } else {
+ tmprc = __cifs_unlink(target_dir, target_dentry,
+ server->vals->protocol_id > SMB10_PROT_ID);
+ }
+ if (tmprc) {
+ /*
+ * Some servers will return STATUS_ACCESS_DENIED
+ * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+ * rename a non-empty directory. Make sure to
+ * propagate the appropriate error back to
+ * userspace.
+ */
+ if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+ rc = tmprc;
+ goto cifs_rename_exit;
+ }
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
+ if (!rc)
+ rehash = false;
+ }
}
/* force revalidate to go get info when needed */
@@ -2629,6 +2679,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct cached_fid *cfid = NULL;
+ if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+ return false;
if (cifs_i->time == 0)
return true;
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 7869cec58f52..10c84c095fe7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
}
/*
- * For absolute symlinks it is not possible to determinate
+ * For absolute symlinks it is not possible to determine
* if it should point to directory or file.
*/
if (symname[0] == '/') {
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 893a1ea8c000..a02d41d1ce4a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
rc = -EOPNOTSUPP;
}
- /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ /* Fallback to SMB_COM_SETATTR command when absolutely needed. */
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
rc = SMBSetInformation(xid, tcon, full_path,
@@ -1039,7 +1039,7 @@ set_via_filehandle:
cifsFileInfo_put(open_file);
/*
- * Setting the read-only bit is not honered on non-NT servers when done
+ * Setting the read-only bit is not honored on non-NT servers when done
* via open-semantics. So for setting it, use SMB_COM_SETATTR command.
* This command works only after the file is closed, so use it only when
* operation was called without the filehandle.
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 224495322a05..e56e4d402f13 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -30,10 +30,9 @@ enum smb2_compound_ops {
SMB2_OP_QUERY_DIR,
SMB2_OP_MKDIR,
SMB2_OP_RENAME,
- SMB2_OP_DELETE,
SMB2_OP_HARDLINK,
SMB2_OP_SET_EOF,
- SMB2_OP_RMDIR,
+ SMB2_OP_UNLINK,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 31c13fb5b85b..7cadc8ca4f55 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -346,9 +346,6 @@ replay_again:
trace_smb3_posix_query_info_compound_enter(xid, tcon->tid,
ses->Suid, full_path);
break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path);
- break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
@@ -356,23 +353,40 @@ replay_again:
*/
trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path);
break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ case SMB2_OP_UNLINK:
+ rqst[num_rqst].rq_iov = vars->unlink_iov;
rqst[num_rqst].rq_nvec = 1;
size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
data[0] = &delete_pending[0];
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path);
+ }
+ num_rqst++;
+ trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path);
break;
case SMB2_OP_SET_EOF:
rqst[num_rqst].rq_iov = &vars->si_iov[0];
@@ -442,7 +456,7 @@ replay_again:
ses->Suid, full_path);
break;
case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_iov = vars->rename_iov;
rqst[num_rqst].rq_nvec = 2;
len = in_iov[i].iov_len;
@@ -732,19 +746,6 @@ finished:
trace_smb3_posix_query_info_compound_done(xid, tcon->tid,
ses->Suid);
break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc);
- else {
- /*
- * If dentry (hence, inode) is NULL, lease break is going to
- * take care of degrading leases on handles for deleted files.
- */
- if (inode)
- cifs_mark_open_handles_for_deleted_file(inode, full_path);
- trace_smb3_delete_done(xid, tcon->tid, ses->Suid);
- }
- break;
case SMB2_OP_MKDIR:
if (rc)
trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc);
@@ -765,11 +766,11 @@ finished:
trace_smb3_rename_done(xid, tcon->tid, ses->Suid);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc);
+ case SMB2_OP_UNLINK:
+ if (!rc)
+ trace_smb3_unlink_done(xid, tcon->tid, ses->Suid);
else
- trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid);
+ trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
@@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
return smb2_compound_op(xid, tcon, cifs_sb,
name, &oparms, NULL,
- &(int){SMB2_OP_RMDIR}, 1,
+ &(int){SMB2_OP_UNLINK}, 1,
NULL, NULL, NULL, NULL);
}
@@ -1175,20 +1176,29 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
struct cifs_open_parms oparms;
+ struct inode *inode = NULL;
+ int rc;
- oparms = CIFS_OPARMS(cifs_sb, tcon, name,
- DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE);
- int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, dentry);
+ if (dentry)
+ inode = d_inode(dentry);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE,
+ FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+ NULL, &(int){SMB2_OP_UNLINK},
+ 1, NULL, NULL, NULL, dentry);
if (rc == -EINVAL) {
cifs_dbg(FYI, "invalid lease key, resending request without lease");
rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL);
+ NULL, &(int){SMB2_OP_UNLINK},
+ 1, NULL, NULL, NULL, NULL);
}
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (!rc && inode)
+ cifs_mark_open_handles_for_deleted_file(inode, name);
return rc;
}
@@ -1441,3 +1451,113 @@ out:
cifs_free_open_info(&data);
return rc;
}
+
+static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb,
+ const char *name, size_t namelen)
+{
+ int len;
+
+ if (*name == '\\' ||
+ (cifs_sb_master_tlink(cifs_sb) &&
+ cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/'))
+ name++;
+ return cifs_strndup_to_utf16(name, namelen, &len,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+}
+
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry));
+ __le16 *utf16_path __free(kfree) = NULL;
+ __u32 co = file_create_options(dentry);
+ int cmds[] = {
+ SMB2_OP_SET_INFO,
+ SMB2_OP_RENAME,
+ SMB2_OP_UNLINK,
+ };
+ const int num_cmds = ARRAY_SIZE(cmds);
+ char *to_name __free(kfree) = NULL;
+ __u32 attrs = cinode->cifsAttrs;
+ struct cifs_open_parms oparms;
+ static atomic_t sillycounter;
+ struct cifsFileInfo *cfile;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct kvec iov[2];
+ const char *ppath;
+ void *page;
+ size_t len;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ page = alloc_dentry_path();
+
+ ppath = build_path_from_dentry(dentry->d_parent, page);
+ if (IS_ERR(ppath)) {
+ rc = PTR_ERR(ppath);
+ goto out;
+ }
+
+ len = strlen(ppath) + strlen("/.__smb1234") + 1;
+ to_name = kmalloc(len, GFP_KERNEL);
+ if (!to_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb),
+ atomic_inc_return(&sillycounter) & 0xffff);
+
+ utf16_path = utf16_smb2_path(cifs_sb, to_name, len);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ DELETE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, co, ACL_NO_MODE);
+
+ attrs &= ~ATTR_READONLY;
+ if (!attrs)
+ attrs = ATTR_NORMAL;
+ if (d_inode(dentry)->i_nlink <= 1)
+ attrs |= ATTR_HIDDEN;
+ iov[0].iov_base = &(FILE_BASIC_INFO) {
+ .Attributes = cpu_to_le32(attrs),
+ };
+ iov[0].iov_len = sizeof(FILE_BASIC_INFO);
+ iov[1].iov_base = utf16_path;
+ iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path);
+
+ cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease\n");
+ cifs_get_writable_path(tcon, full_path,
+ FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, NULL);
+ }
+ if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags);
+ } else {
+ cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n",
+ __func__, full_path, to_name, rc);
+ rc = -EIO;
+ }
+out:
+ cifs_put_tlink(tlink);
+ free_dentry_path(page);
+ return rc;
+}
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cddf273c14ae..89d933b4a8bc 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
+ /* Trace receipt of lease break request from server */
+ trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
@@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
- le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
- le64_to_cpu(rsp->hdr.SessionId),
- *((u64 *)rsp->LeaseKey),
- *((u64 *)&rsp->LeaseKey[8]));
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
return false;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 94b1d7a395d5..e586f3f4b5c9 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/* SMB headers in a compound are 8 byte aligned. */
- if (!IS_ALIGNED(len, 8)) {
- num_padding = 8 - (len & 7);
+ if (IS_ALIGNED(len, 8))
+ goto out;
+
+ num_padding = 8 - (len & 7);
+ if (smb3_encryption_required(tcon)) {
+ int i;
+
+ /*
+ * Flatten request into a single buffer with required padding as
+ * the encryption layer can't handle the padding iovs.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ rqst->rq_nvec = 1;
+ } else {
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len += num_padding;
}
+ len += num_padding;
+out:
shdr->NextCommand = cpu_to_le32(len);
}
@@ -5376,6 +5398,7 @@ struct smb_version_operations smb20_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -5481,6 +5504,7 @@ struct smb_version_operations smb21_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb30_operations = {
@@ -5597,6 +5621,7 @@ struct smb_version_operations smb30_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb311_operations = {
@@ -5713,6 +5738,7 @@ struct smb_version_operations smb311_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2df93a75e3b8..c3b9d3f6210f 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
} else
- trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6e805ece6a7b..b3f1398c9f79 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 93e5b2bb9f28..fd650e2afc76 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
@@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
@@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
@@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
__u64 lease_key_high), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done);
+/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */
+DECLARE_EVENT_CLASS(smb3_lease_enter_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 flags,
+ __u16 epoch,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, flags)
+ __field(__u16, epoch)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->flags = flags;
+ __entry->epoch = epoch;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch)
+)
+
+#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 flags, \
+ __u16 epoch, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter);
+/* Lease not found: reuse lease_enter payload (includes epoch and flags) */
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
int rc), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
-DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err);
DECLARE_EVENT_CLASS(smb3_connect_class,
TP_PROTO(char *hostname,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0d92ce49aed7..a565fc36cee6 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -2951,18 +2951,19 @@ int smb2_open(struct ksmbd_work *work)
}
ksmbd_debug(SMB, "converted name = %s\n", name);
- if (strchr(name, ':')) {
- if (!test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_STREAMS)) {
- rc = -EBADF;
- goto err_out2;
- }
- rc = parse_stream_name(name, &stream_name, &s_type);
- if (rc < 0)
- goto err_out2;
- }
if (posix_ctxt == false) {
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out2;
+ }
+
rc = ksmbd_validate_filename(name);
if (rc < 0)
goto err_out2;
@@ -3443,6 +3444,8 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ fp->is_posix_ctxt = posix_ctxt;
+
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
* initialized.
@@ -5988,7 +5991,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (IS_ERR(new_name))
return PTR_ERR(new_name);
- if (strchr(new_name, ':')) {
+ if (fp->is_posix_ctxt == false && strchr(new_name, ':')) {
int s_type;
char *xattr_stream_name, *stream_name = NULL;
size_t xattr_stream_size;
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 5466aa8c39b1..6550bd9f002c 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- unsigned int data_length;
+ u32 remaining_data_length, data_offset, data_length;
int avail_recvmsg_count, receive_credits;
if (wc->byte_len <
@@ -564,15 +564,25 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
- if (data_length) {
- if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
- (u64)data_length) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
- return;
- }
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ if (wc->byte_len < data_offset ||
+ wc->byte_len < (u64)data_offset + data_length) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (remaining_data_length > t->max_fragmented_recv_size ||
+ data_length > t->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length >
+ (u64)t->max_fragmented_recv_size) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (data_length) {
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -1209,78 +1219,130 @@ static int smb_direct_writev(struct ksmbd_transport *t,
bool need_invalidate, unsigned int remote_key)
{
struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- int remaining_data_length;
- int start, i, j;
- int max_iov_size = st->max_send_size -
+ size_t remaining_data_length;
+ size_t iov_idx;
+ size_t iov_ofs;
+ size_t max_iov_size = st->max_send_size -
sizeof(struct smb_direct_data_transfer);
int ret;
- struct kvec vec;
struct smb_direct_send_ctx send_ctx;
+ int error = 0;
if (st->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
+ while (remaining_data_length) {
+ struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+ * For the last message remaining_data_length should be
+ * have been 0 already!
+ */
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+ * while one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
+
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+ * is to much (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+ * We need to calculate the number extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+ * first page (fplen), this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */
+ error = -EINVAL;
goto done;
- } else {
- /* iov[start] is too big, break it */
- int nvec = (buflen + max_iov_size - 1) /
- max_iov_size;
-
- for (j = 0; j < nvec; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j * max_iov_size;
- vec.iov_len =
- min_t(int, max_iov_size,
- buflen - max_iov_size * j);
- remaining_data_length -= vec.iov_len;
- ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
- remaining_data_length);
- if (ret)
- goto done;
}
- i++;
- if (i == niovs)
- break;
}
- start = i;
- buflen = 0;
- } else {
- i++;
- if (i == niovs) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- break;
+ possible_vecs -= page_count;
+ nvecs += 1;
+ possible_bytes -= v->iov_len;
+ bytes += v->iov_len;
+
+ iov_ofs += v->iov_len;
+ if (iov_ofs >= iov[iov_idx].iov_len) {
+ iov_idx += 1;
+ iov_ofs = 0;
}
}
+
+ remaining_data_length -= bytes;
+
+ ret = smb_direct_post_send_data(st, &send_ctx,
+ vecs, nvecs,
+ remaining_data_length);
+ if (unlikely(ret)) {
+ error = ret;
+ goto done;
+ }
}
done:
ret = smb_direct_flush_send_list(st, &send_ctx, true);
+ if (unlikely(!ret && error))
+ ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@@ -1744,6 +1806,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
+ if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@@ -1767,7 +1834,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = max_sge_per_wr;
+ cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
cap->max_rdma_ctxs = t->max_rw_credits;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 0708155b5caf..78b506c5ef03 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -112,6 +112,8 @@ struct ksmbd_file {
bool is_durable;
bool is_persistent;
bool is_resilient;
+
+ bool is_posix_ctxt;
};
static inline void set_ctx_actor(struct dir_context *ctx,