summaryrefslogtreecommitdiff
path: root/fs/nfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/blocklayout/blocklayout.c8
-rw-r--r--fs/nfs/blocklayout/dev.c8
-rw-r--r--fs/nfs/callback.c10
-rw-r--r--fs/nfs/dir.c26
-rw-r--r--fs/nfs/file.c29
-rw-r--r--fs/nfs/filelayout/filelayout.c10
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c10
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c786
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h64
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c115
-rw-r--r--fs/nfs/fs_context.c3
-rw-r--r--fs/nfs/inode.c15
-rw-r--r--fs/nfs/internal.h10
-rw-r--r--fs/nfs/localio.c405
-rw-r--r--fs/nfs/namespace.c3
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs42proc.c4
-rw-r--r--fs/nfs/nfs42xdr.c2
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4proc.c12
-rw-r--r--fs/nfs/nfs4state.c3
-rw-r--r--fs/nfs/nfs4super.c44
-rw-r--r--fs/nfs/nfs4xdr.c4
-rw-r--r--fs/nfs/nfstrace.h215
-rw-r--r--fs/nfs/write.c34
26 files changed, 1294 insertions, 531 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5d6edafbed20..0e4c67373e4f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
struct pnfs_layout_segment *lseg;
struct xdr_buf buf;
struct xdr_stream xdr;
- struct page *scratch;
+ struct folio *scratch;
int status, i;
uint32_t count;
__be32 *p;
@@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
return ERR_PTR(-ENOMEM);
status = -ENOMEM;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf,
lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
status = -EIO;
p = xdr_inline_decode(&xdr, 4);
@@ -744,7 +744,7 @@ process_extents:
}
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
dprintk("%s returns %d\n", __func__, status);
switch (status) {
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 44306ac22353..ab76120705e2 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct pnfs_block_dev *top;
struct xdr_stream xdr;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
int nr_volumes, ret, i;
__be32 *p;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
p = xdr_inline_decode(&xdr, sizeof(__be32));
if (!p)
@@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
out_free_volumes:
kfree(volumes);
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
return node;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 86bdc7d23fb9..c8b837006bb2 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_xprt_destroy_all(serv, net);
+ svc_xprt_destroy_all(serv, net, false);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
ret = svc_bind(serv, net);
if (ret < 0) {
printk(KERN_WARNING "NFS: bind callback service failed\n");
- goto err_bind;
+ goto err;
}
ret = 0;
@@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
- goto err_socks;
+ goto err;
}
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
-err_bind:
+err:
nn->cb_users[minorversion]--;
dprintk("NFS: Couldn't create callback socket: err = %d; "
"net = %x\n", ret, net->ns.inum);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d81217923936..46d9c65d50f8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -829,17 +829,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
struct address_space *mapping = desc->file->f_mapping;
struct folio *new, *folio = *arrays;
struct xdr_stream stream;
- struct page *scratch;
+ struct folio *scratch;
struct xdr_buf buf;
u64 cookie;
int status;
- scratch = alloc_page(GFP_KERNEL);
+ scratch = folio_alloc(GFP_KERNEL, 0);
if (scratch == NULL)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
do {
status = nfs_readdir_entry_decode(desc, entry, &stream);
@@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
if (folio != *arrays)
nfs_readdir_folio_unlock_and_put(folio);
- put_page(scratch);
+ folio_put(scratch);
return status;
}
@@ -2198,8 +2198,6 @@ no_open:
else
dput(dentry);
}
- if (IS_ERR(res))
- return PTR_ERR(res);
return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
@@ -2260,7 +2258,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned int open_flags,
umode_t mode)
{
-
+ struct dentry *res = NULL;
/* Same as look+open from lookup_open(), but with different O_TRUNC
* handling.
*/
@@ -2275,21 +2273,15 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
if (error)
return error;
return finish_open(file, dentry, NULL);
- } else if (d_in_lookup(dentry)) {
+ }
+ if (d_in_lookup(dentry)) {
/* The only flags nfs_lookup considers are
* LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
* we want those to be zero so the lookup isn't skipped.
*/
- struct dentry *res = nfs_lookup(dir, dentry, 0);
-
- d_lookup_done(dentry);
- if (unlikely(res)) {
- if (IS_ERR(res))
- return PTR_ERR(res);
- return finish_no_open(file, res);
- }
+ res = nfs_lookup(dir, dentry, 0);
}
- return finish_no_open(file, NULL);
+ return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8059ece82468..d020aab40c64 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
+ trace_nfs_file_read(iocb, to);
+
if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_read(iocb, to, false);
@@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
if (pnfs_ld_read_whole_page(file_inode(file)))
return true;
+ if (folio_test_dropbehind(folio))
+ return false;
/* Open for reading too? */
if (file->f_mode & FMODE_READ)
return true;
@@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb,
loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
- fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
struct file *file = iocb->ki_filp;
int once_thru = 0;
int ret;
+ trace_nfs_write_begin(file_inode(file), pos, len);
+
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
- fgp |= fgf_set_order(len);
start:
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out;
+ }
*foliop = folio;
ret = nfs_flush_incompatible(file, folio);
@@ -405,11 +410,14 @@ start:
} else if (!once_thru &&
nfs_want_read_modify_write(file, folio, pos, len)) {
once_thru = 1;
+ folio_clear_dropbehind(folio);
ret = nfs_read_folio(file, folio);
folio_put(folio);
if (!ret)
goto start;
}
+out:
+ trace_nfs_write_begin_done(file_inode(file), pos, len, ret);
return ret;
}
@@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb,
unsigned offset = offset_in_folio(folio, pos);
int status;
+ trace_nfs_write_end(file_inode(file), pos, len);
dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
@@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (status < 0)
+ if (status < 0) {
+ trace_nfs_write_end_done(file_inode(file), pos, len, status);
return status;
+ }
NFS_I(mapping->host)->write_io += copied;
if (nfs_ctx_key_to_expire(ctx, mapping->host))
nfs_wb_all(mapping->host);
+ trace_nfs_write_end_done(file_inode(file), pos, len, copied);
return copied;
}
@@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
errseq_t since;
int error;
+ trace_nfs_file_write(iocb, from);
+
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
@@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = {
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+ .fop_flags = FOP_DONTCACHE,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d39a1f58e18d..5c4551117c58 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
__be32 *p;
uint32_t nfl_util;
int i;
dprintk("%s: set_layout_map Begin\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
* num_fh (4) */
@@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
fl->fh_array[i]->size);
}
- __free_page(scratch);
+ folio_put(scratch);
return 0;
out_err:
- __free_page(scratch);
+ folio_put(scratch);
return -EIO;
}
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 29d9234d5c08..df79aeb68db4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* Get the stripe count (number of stripe index) */
p = xdr_inline_decode(&stream, 4);
@@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
}
}
- __free_page(scratch);
+ folio_put(scratch);
return dsaddr;
out_err_drain_dsaddrs:
@@ -204,7 +204,7 @@ out_err_free_deviceid:
out_err_free_stripe_indices:
kfree(stripe_indices);
out_err_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 9edb5f9b0c4e..df01d2876b68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror);
+ struct nfs4_ff_layout_ds_stripe *dss_info);
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
@@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id)
}
static struct nfsd_file *
-ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id,
struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, fmode_t mode)
{
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode);
+ return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode);
#else
return NULL;
#endif
}
-static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
- const struct nfs4_ff_layout_mirror *m2)
+static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1,
+ const struct nfs4_ff_layout_ds_stripe *dss2)
{
int i, j;
- if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ if (dss1->fh_versions_cnt != dss2->fh_versions_cnt)
return false;
- for (i = 0; i < m1->fh_versions_cnt; i++) {
+
+ for (i = 0; i < dss1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; j++) {
- if (nfs_compare_fh(&m1->fh_versions[i],
- &m2->fh_versions[j]) == 0) {
+ for (j = 0; j < dss2->fh_versions_cnt; j++) {
+ if (nfs_compare_fh(&dss1->fh_versions[i],
+ &dss2->fh_versions[j]) == 0) {
found_fh = true;
break;
}
@@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return true;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id]))
+ return false;
+
+ return true;
+}
+
+static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (memcmp(&m1->dss[dss_id].devid,
+ &m2->dss[dss_id].devid,
+ sizeof(m1->dss[dss_id].devid)) != 0)
+ return false;
+
+ return true;
+}
+
static struct nfs4_ff_layout_mirror *
ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
struct nfs4_ff_layout_mirror *mirror)
@@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
spin_lock(&inode->i_lock);
list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
- if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
+ if (!ff_mirror_match_devid(mirror, pos))
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
@@ -240,29 +273,37 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
{
struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
mirror = kzalloc(sizeof(*mirror), gfp_flags);
if (mirror != NULL) {
spin_lock_init(&mirror->lock);
refcount_set(&mirror->ref, 1);
INIT_LIST_HEAD(&mirror->mirrors);
- nfs_localio_file_init(&mirror->nfl);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ nfs_localio_file_init(&mirror->dss[dss_id].nfl);
}
return mirror;
}
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- const struct cred *cred;
+ const struct cred *cred;
+ u32 dss_id;
ff_layout_remove_mirror(mirror);
- kfree(mirror->fh_versions);
- nfs_close_local_fh(&mirror->nfl);
- cred = rcu_access_pointer(mirror->ro_cred);
- put_cred(cred);
- cred = rcu_access_pointer(mirror->rw_cred);
- put_cred(cred);
- nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ kfree(mirror->dss[dss_id].fh_versions);
+ cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
+ put_cred(cred);
+ cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
+ put_cred(cred);
+ nfs_close_local_fh(&mirror->dss[dss_id].nfl);
+ nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+ }
+
+ kfree(mirror->dss);
kfree(mirror);
}
@@ -366,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
free_me);
}
+static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror)
+{
+ u32 dss_id, sum = 0;
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ sum += mirror->dss[dss_id].efficiency;
+
+ return sum;
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
for (j = i + 1; j < fls->mirror_array_cnt; j++)
- if (fls->mirror_array[i]->efficiency <
- fls->mirror_array[j]->efficiency)
+ if (ff_mirror_efficiency_sum(fls->mirror_array[i]) <
+ ff_mirror_efficiency_sum(fls->mirror_array[j]))
swap(fls->mirror_array[i],
fls->mirror_array[j]);
}
@@ -388,20 +439,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_ff_layout_segment *fls = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
u64 stripe_unit;
u32 mirror_array_cnt;
__be32 *p;
int i, rc;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
dprintk("--> %s\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return ERR_PTR(-ENOMEM);
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* stripe unit and mirror_array_cnt */
rc = -EIO;
@@ -427,23 +479,32 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
+ u32 dss_count = 0;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
- u32 ds_count, fh_count, id;
- int j;
+ u32 fh_count, id;
+ int j, dss_id;
rc = -EIO;
p = xdr_inline_decode(&stream, 4);
if (!p)
goto out_err_free;
- ds_count = be32_to_cpup(p);
- /* FIXME: allow for striping? */
- if (ds_count != 1)
+ // Ensure all mirrors have same stripe count.
+ if (dss_count == 0)
+ dss_count = be32_to_cpup(p);
+ else if (dss_count != be32_to_cpup(p))
+ goto out_err_free;
+
+ if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT ||
+ dss_count == 0)
+ goto out_err_free;
+
+ if (dss_count > 1 && stripe_unit == 0)
goto out_err_free;
fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
@@ -452,91 +513,105 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
}
- fls->mirror_array[i]->ds_count = ds_count;
+ fls->mirror_array[i]->dss_count = dss_count;
+ fls->mirror_array[i]->dss =
+ kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
+ gfp_flags);
- /* deviceid */
- rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
- if (rc)
- goto out_err_free;
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ dss_info->mirror = fls->mirror_array[i];
- /* efficiency */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+ /* deviceid */
+ rc = decode_deviceid(&stream, &dss_info->devid);
+ if (rc)
+ goto out_err_free;
- /* stateid */
- rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
- if (rc)
- goto out_err_free;
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ dss_info->efficiency = be32_to_cpup(p);
- /* fh */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fh_count = be32_to_cpup(p);
+ /* stateid */
+ rc = decode_pnfs_stateid(&stream, &dss_info->stateid);
+ if (rc)
+ goto out_err_free;
- fls->mirror_array[i]->fh_versions =
- kcalloc(fh_count, sizeof(struct nfs_fh),
- gfp_flags);
- if (fls->mirror_array[i]->fh_versions == NULL) {
- rc = -ENOMEM;
- goto out_err_free;
- }
+ /* fh */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
- for (j = 0; j < fh_count; j++) {
- rc = decode_nfs_fh(&stream,
- &fls->mirror_array[i]->fh_versions[j]);
+ dss_info->fh_versions =
+ kcalloc(fh_count, sizeof(struct nfs_fh),
+ gfp_flags);
+ if (dss_info->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &dss_info->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ dss_info->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
- }
- fls->mirror_array[i]->fh_versions_cnt = fh_count;
+ uid = make_kuid(&init_user_ns, id);
- /* user */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ /* group */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
- uid = make_kuid(&init_user_ns, id);
+ gid = make_kgid(&init_user_ns, id);
- /* group */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(&init_task);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
- gid = make_kgid(&init_user_ns, id);
+ kcred = prepare_kernel_cred(&init_task);
+ memalloc_nofs_restore(nofs_flags);
+ }
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = RCU_INITIALIZER(kcred);
- if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(&init_task);
- else {
- unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(&init_task);
- memalloc_nofs_restore(nofs_flags);
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ else
+ rcu_assign_pointer(dss_info->rw_cred, cred);
}
- rc = -ENOMEM;
- if (!kcred)
- goto out_err_free;
- kcred->fsuid = uid;
- kcred->fsgid = gid;
- cred = RCU_INITIALIZER(kcred);
-
- if (lgr->range.iomode == IOMODE_READ)
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- else
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
if (mirror != fls->mirror_array[i]) {
- /* swap cred ptrs so free_mirror will clean up old */
- if (lgr->range.iomode == IOMODE_READ) {
- cred = xchg(&mirror->ro_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- } else {
- cred = xchg(&mirror->rw_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->dss[dss_id].ro_cred,
+ dss_info->ro_cred);
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->dss[dss_id].rw_cred,
+ dss_info->rw_cred);
+ rcu_assign_pointer(dss_info->rw_cred, cred);
+ }
}
ff_layout_free_mirror(fls->mirror_array[i]);
fls->mirror_array[i] = mirror;
@@ -564,7 +639,7 @@ out_sort_mirrors:
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
- __free_page(scratch);
+ folio_put(scratch);
return ret;
out_err_free:
_ff_layout_free_lseg(fls);
@@ -593,6 +668,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
_ff_layout_free_lseg(fls);
}
+static u32 calc_commit_idx(struct pnfs_layout_segment *lseg,
+ u32 mirror_idx, u32 dss_id)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id;
+}
+
+static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
+
+static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
+
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -617,6 +712,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
struct nfs4_ff_layoutstat *layoutstat,
ktime_t now)
{
@@ -624,8 +720,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
- if (!mirror->start_time)
- mirror->start_time = now;
+ if (!mirror->dss[dss_id].start_time)
+ mirror->dss[dss_id].start_time = now;
if (mirror->report_interval != 0)
report_interval = (s64)mirror->report_interval * 1000LL;
else if (layoutstats_timer != 0)
@@ -675,13 +771,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
static void
nfs4_ff_layout_stat_io_start_read(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror, dss_id, &mirror->dss[dss_id].read_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].read_stat, requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -692,11 +791,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed)
{
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat,
requested, completed,
ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
@@ -706,13 +806,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
static void
nfs4_ff_layout_stat_io_start_write(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror,
+ dss_id,
+ &mirror->dss[dss_id].write_stat,
+ now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].write_stat,
+ requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -723,6 +830,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed,
enum nfs3_stable_how committed)
@@ -731,25 +839,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
requested = completed = 0;
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat,
requested, completed, ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_unavailable(devid);
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_available(devid);
@@ -758,6 +866,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
@@ -768,12 +877,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ *dss_id = nfs4_ff_layout_calc_dss_id(
+ fls->stripe_unit,
+ fls->mirror_array[idx]->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false);
if (IS_ERR(ds))
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) {
// reinitialize the error state in case if this is the last iteration
ds = ERR_PTR(-EINVAL);
continue;
@@ -788,42 +901,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
if (!IS_ERR(ds))
return ds;
- return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
}
static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
- u32 *best_idx)
+ u32 *best_idx,
+ u32 offset,
+ u32 *dss_id)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
- best_idx);
+ best_idx, offset, dss_id);
if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
- return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx,
+ offset, dss_id);
}
static void
@@ -842,6 +965,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
}
+static bool
+ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls)
+{
+ return fls->mirror_array[0]->dss_count > 1;
+}
+
+/*
+ * ff_layout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ unsigned int size;
+ u64 p_stripe, r_stripe;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
+ else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
+
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ if (p_stripe != r_stripe)
+ return 0;
+ }
+
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
@@ -849,7 +1022,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- u32 ds_idx;
+ u32 ds_idx, dss_id;
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
@@ -870,7 +1043,8 @@ retry:
/* Reset wb_nio, since getting layout segment was successful */
req->wb_nio = 0;
- ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx,
+ req_offset(req), &dss_id);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -882,7 +1056,7 @@ retry:
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize;
pgio->pg_mirror_idx = ds_idx;
return;
@@ -919,7 +1093,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- u32 i;
+ u32 i, dss_id;
retry:
pnfs_generic_pg_check_layout(pgio, req);
@@ -944,7 +1118,12 @@ retry:
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit,
+ mirror->dss_count,
+ req_offset(req));
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror,
+ dss_id, true);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -954,7 +1133,7 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize;
}
if (NFS_SERVER(pgio->pg_inode)->flags &
@@ -1020,14 +1199,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
.pg_init = ff_layout_pg_init_read,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
.pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
.pg_init = ff_layout_pg_init_write,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
.pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
.pg_cleanup = pnfs_generic_pg_cleanup,
@@ -1075,9 +1254,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ u32 dss_id = 0;
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx,
+ hdr->args.offset, &dss_id);
if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
else
@@ -1114,11 +1295,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
switch (op_status) {
@@ -1215,9 +1396,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
u32 op_status,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
switch (op_status) {
case NFS_OK:
@@ -1281,12 +1462,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
if (task->tk_status >= 0) {
- ff_layout_mark_ds_reachable(lseg, idx);
+ ff_layout_mark_ds_reachable(lseg, idx, dss_id);
return 0;
}
@@ -1297,10 +1478,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
return ff_layout_async_handle_error_v3(task, op_status, clp,
- lseg, idx);
+ lseg, idx, dss_id);
case 4:
return ff_layout_async_handle_error_v4(task, op_status, state,
- clp, lseg, idx);
+ clp, lseg, idx, dss_id);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1309,7 +1490,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- u32 idx, u64 offset, u64 length,
+ u32 idx, u32 dss_id, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1347,7 +1528,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
+ mirror, dss_id, offset, length, status, opnum,
nfs_io_gfp_mask());
switch (status) {
@@ -1356,7 +1537,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
- ff_layout_mark_ds_unreachable(lseg, idx);
+ ff_layout_mark_ds_unreachable(lseg, idx, dss_id);
/*
* Don't return the layout if this is a read and we still
* have layouts to try
@@ -1376,10 +1557,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_READ,
task->tk_status);
@@ -1389,7 +1576,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_read(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1445,23 +1633,47 @@ ff_layout_set_layoutcommit(struct inode *inode,
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_read(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_read(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- hdr->res.count);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_read(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1549,11 +1761,17 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
loff_t end_offs = 0;
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_WRITE,
task->tk_status);
@@ -1563,7 +1781,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_write(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1601,9 +1820,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
int err;
+ u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index);
+ u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index);
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ ff_layout_io_track_ds_error(data->lseg, idx, dss_id,
data->args.offset, data->args.count,
&data->res.op_status, OP_COMMIT,
task->tk_status);
@@ -1611,8 +1832,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
err = ff_layout_async_handle_error(task, data->res.op_status,
- NULL, data->ds_clp, data->lseg,
- data->ds_commit_index);
+ NULL, data->ds_clp, data->lseg, idx,
+ dss_id);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
@@ -1631,30 +1852,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
-
return 0;
}
static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_write(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_write(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_write(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count,
+ hdr->res.verf->committed);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1737,10 +1982,16 @@ static void ff_layout_write_release(void *data)
static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ u32 idx, dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_start_write(cdata->inode,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
0, task->tk_start);
}
@@ -1749,6 +2000,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
{
struct nfs_page *req;
__u64 count = 0;
+ u32 idx, dss_id;
if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
@@ -1757,8 +2009,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
list_for_each_entry(req, &cdata->pages, wb_list)
count += req->wb_bytes;
}
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
count, count, NFS_FILE_SYNC);
set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}
@@ -1872,6 +2128,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
+ u32 dss_id;
bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
@@ -1879,22 +2136,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->args.pgbase, (size_t)hdr->args.count, offset);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1902,11 +2163,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1916,7 +2177,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Start IO accounting for local read */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ);
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
+ FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
@@ -1953,25 +2215,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
int vers;
struct nfs_fh *fh;
u32 idx = hdr->pgio_mirror_idx;
+ u32 dss_id;
bool ds_fatal_error = false;
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1981,12 +2248,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->pgio_done_cb = ff_layout_write_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1995,7 +2262,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->args.offset = offset;
/* Start IO accounting for local write */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
hdr->task.tk_start = ktime_get();
@@ -2019,20 +2286,15 @@ out_failed:
return PNFS_NOT_ATTEMPTED;
}
-static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
-{
- return i;
-}
-
static struct nfs_fh *
-select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id)
{
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
/* FIXME: Assume that there is only one NFS version available
* for the DS.
*/
- return &flseg->mirror_array[i]->fh_versions[0];
+ return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0];
}
static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
@@ -2043,7 +2305,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
- u32 idx;
+ u32 idx, dss_id;
int vers, ret;
struct nfs_fh *fh;
@@ -2051,22 +2313,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
goto out_err;
- idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds))
goto out_err;
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- data->inode);
+ data->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2075,12 +2338,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
data->cred = ds_cred;
refcount_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
- fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ fh = select_ds_fh_from_commit(lseg, idx, dss_id);
if (fh)
data->args.fh = fh;
/* Start IO accounting for local commit */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
data->task.tk_start = ktime_get();
@@ -2144,25 +2407,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
struct nfs4_pnfs_ds *ds;
struct nfs_client *ds_clp;
struct rpc_clnt *clnt;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
mirror = flseg->mirror_array[idx];
- mirror_ds = mirror->mirror_ds;
- if (IS_ERR_OR_NULL(mirror_ds))
- continue;
- ds = mirror->mirror_ds->ds;
- if (!ds)
- continue;
- ds_clp = ds->ds_clp;
- if (!ds_clp)
- continue;
- clnt = ds_clp->cl_rpcclient;
- if (!clnt)
- continue;
- if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
- continue;
- rpc_clnt_disconnect(clnt);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ mirror_ds = mirror->dss[dss_id].mirror_ds;
+ if (IS_ERR_OR_NULL(mirror_ds))
+ continue;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN,
+ ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
}
}
@@ -2184,8 +2450,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
struct inode *inode = lseg->pls_layout->plh_inode;
struct pnfs_commit_array *array, *new;
+ u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count;
- new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ new = pnfs_alloc_commit_array(size,
nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
@@ -2549,11 +2816,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr,
static void
ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_ds_stripe *dss_info)
{
struct nfs4_pnfs_ds_addr *da;
- struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
- struct nfs_fh *fh = &mirror->fh_versions[0];
+ struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds;
+ struct nfs_fh *fh = &dss_info->fh_versions[0];
__be32 *p;
da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
@@ -2565,13 +2832,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
p = xdr_reserve_space(xdr, 4 + fh->size);
xdr_encode_opaque(p, fh->data, fh->size);
/* ff_io_latency4 read */
- spin_lock(&mirror->lock);
- ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ spin_lock(&dss_info->mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->read_stat.io_stat);
/* ff_io_latency4 write */
- ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
- spin_unlock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->write_stat.io_stat);
+ spin_unlock(&dss_info->mirror->lock);
/* nfstime4 */
- ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ ff_layout_encode_nfstime(xdr,
+ ktime_sub(ktime_get(),
+ dss_info->start_time));
/* bool */
p = xdr_reserve_space(xdr, 4);
*p = cpu_to_be32(false);
@@ -2595,7 +2866,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
static void
ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
{
- struct nfs4_ff_layout_mirror *mirror = opaque->data;
+ struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data;
+ struct nfs4_ff_layout_mirror *mirror = dss_info->mirror;
ff_layout_put_mirror(mirror);
}
@@ -2612,37 +2884,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
struct nfs4_deviceid_node *dev;
- int i = 0;
+ int i = 0, dss_id;
list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
- if (i >= dev_limit)
- break;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
- continue;
- if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
- &mirror->flags) &&
- type != NFS4_FF_OP_LAYOUTRETURN)
- continue;
- /* mirror refcount put in cleanup_layoutstats */
- if (!refcount_inc_not_zero(&mirror->ref))
- continue;
- dev = &mirror->mirror_ds->id_node;
- memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = 0;
- devinfo->length = NFS4_MAX_UINT64;
- spin_lock(&mirror->lock);
- devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
- devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
- devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
- spin_unlock(&mirror->lock);
- devinfo->layout_type = LAYOUT_FLEX_FILES;
- devinfo->ld_private.ops = &layoutstat_ops;
- devinfo->ld_private.data = mirror;
-
- devinfo++;
- i++;
+ for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) {
+ dss_info = &mirror->dss[dss_id];
+ if (i >= dev_limit)
+ break;
+ if (IS_ERR_OR_NULL(dss_info->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!refcount_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &dss_info->mirror_ds->id_node;
+ memcpy(&devinfo->dev_id,
+ &dev->deviceid,
+ NFS4_DEVICEID4_SIZE);
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
+ devinfo->read_count =
+ dss_info->read_stat.io_stat.ops_completed;
+ devinfo->read_bytes =
+ dss_info->read_stat.io_stat.bytes_completed;
+ devinfo->write_count =
+ dss_info->write_stat.io_stat.ops_completed;
+ devinfo->write_bytes =
+ dss_info->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = &mirror->dss[dss_id];
+
+ devinfo++;
+ i++;
+ }
}
return i;
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 095df09017a5..17a008c8e97c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -21,6 +21,8 @@
* due to network error etc. */
#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096
+
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
#define FF_LAYOUTSTATS_MAXDEV 4
@@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat {
struct nfs4_ff_busy_timer busy_timer;
};
-struct nfs4_ff_layout_mirror {
- struct pnfs_layout_hdr *layout;
- struct list_head mirrors;
- u32 ds_count;
- u32 efficiency;
+struct nfs4_ff_layout_mirror;
+
+struct nfs4_ff_layout_ds_stripe {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
+ u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
@@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror {
const struct cred __rcu *ro_cred;
const struct cred __rcu *rw_cred;
struct nfs_file_localio nfl;
- refcount_t ref;
- spinlock_t lock;
- unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
+};
+
+struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
+ u32 dss_count;
+ struct nfs4_ff_layout_ds_stripe *dss;
+ refcount_t ref;
+ spinlock_t lock;
+ unsigned long flags;
u32 report_interval;
};
@@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
}
static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
if (mirror != NULL) {
- struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds;
if (!IS_ERR_OR_NULL(mirror_ds))
return &mirror_ds->id_node;
@@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
}
static inline int
-nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
+{
+ return mirror->dss[dss_id].mirror_ds->ds_versions[0].version;
+}
+
+static inline u32
+nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset)
{
- return mirror->mirror_ds->ds_versions[0].version;
+ u64 tmp = offset;
+
+ if (dss_count == 1 || stripe_unit == 0)
+ return 0;
+
+ do_div(tmp, stripe_unit);
+
+ return do_div(tmp, dss_count);
}
struct nfs4_ff_layout_ds *
@@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags);
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags);
void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
@@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id);
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid);
+ u32 dss_id,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return);
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
- struct inode *inode);
+ struct inode *inode,
+ u32 dss_id);
const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred);
+ const struct cred *mdscred,
+ u32 dss_id);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 30365ec782bb..c55ea8fa3bfa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
@@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
int i, ret = -ENOMEM;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
@@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
INIT_LIST_HEAD(&dsaddrs);
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* multipath count */
p = xdr_inline_decode(&stream, 4);
@@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
kfree(da);
}
- __free_page(scratch);
+ folio_put(scratch);
return new_ds;
out_err_drain_dsaddrs:
@@ -177,7 +177,7 @@ out_err_drain_dsaddrs:
kfree(ds_versions);
out_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
kfree(new_ds);
@@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
}
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags)
{
struct nfs4_ff_layout_ds_err *dserr;
if (status == 0)
return 0;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds))
return -EINVAL;
dserr = kmalloc(sizeof(*dserr), gfp_flags);
@@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
dserr->length = length;
dserr->status = status;
dserr->opnum = opnum;
- nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
- memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid);
+ memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid,
NFS4_DEVICEID4_SIZE);
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
@@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
}
static const struct cred *
-ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id)
{
const struct cred *cred, __rcu **pcred;
if (iomode == IOMODE_READ)
- pcred = &mirror->ro_cred;
+ pcred = &mirror->dss[dss_id].ro_cred;
else
- pcred = &mirror->rw_cred;
+ pcred = &mirror->dss[dss_id].rw_cred;
rcu_read_lock();
do {
@@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
{
/* FIXME: For now assume there is only 1 version available for the DS */
- return &mirror->fh_versions[0];
+ return &mirror->dss[dss_id].fh_versions[0];
}
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid)
+ u32 dss_id,
+ nfs4_stateid *stateid)
{
- if (nfs4_ff_layout_ds_version(mirror) == 4)
- nfs4_stateid_copy(stateid, &mirror->stateid);
+ if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4)
+ nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid);
}
static bool
ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id)
{
if (mirror == NULL)
goto outerr;
- if (mirror->mirror_ds == NULL) {
+ if (mirror->dss[dss_id].mirror_ds == NULL) {
struct nfs4_deviceid_node *node;
struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
- &mirror->devid, lo->plh_lc_cred,
+ &mirror->dss[dss_id].devid, lo->plh_lc_cred,
GFP_KERNEL);
if (node)
mirror_ds = FF_LAYOUT_MIRROR_DS(node);
/* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) &&
mirror_ds != ERR_PTR(-ENODEV))
nfs4_put_deviceid_node(node);
}
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
goto outerr;
return true;
@@ -352,6 +354,7 @@ outerr:
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
* @mirror: layout mirror describing the DS to use
+ * @dss_id: DS stripe id to select stripe to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -368,6 +371,7 @@ outerr:
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return)
{
struct nfs4_pnfs_ds *ds;
@@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
unsigned int max_payload;
int status = -EAGAIN;
- if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id))
goto noconnect;
- ds = mirror->mirror_ds->ds;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
if (READ_ONCE(ds->ds_clp))
goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node,
dataserver_timeo, dataserver_retrans,
- mirror->mirror_ds->ds_versions[0].version,
- mirror->mirror_ds->ds_versions[0].minor_version);
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].version,
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
if (!status) {
@@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
- if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
- mirror->mirror_ds->ds_versions[0].rsize = max_payload;
- if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
- mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, lseg->pls_range.offset,
+ mirror, dss_id, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
ff_layout_send_layouterror(lseg);
@@ -426,12 +430,13 @@ out:
const struct cred *
ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred)
+ const struct cred *mdscred,
+ u32 dss_id)
{
const struct cred *cred;
- if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, range->iomode);
+ if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) {
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
* @mirror: pointer to the mirror
* @ds_clp: nfs_client for the DS
* @inode: pointer to inode
+ * @dss_id: DS stripe id
*
* Find or create a DS rpc client with th MDS server rpc client auth flavor
* in the nfs_client cl_ds_clients list.
*/
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
- struct nfs_client *ds_clp, struct inode *inode)
+ struct nfs_client *ds_clp, struct inode *inode,
+ u32 dss_id)
{
- switch (mirror->mirror_ds->ds_versions[0].version) {
+ switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
return ds_clp->cl_rpcclient;
@@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (mirror) {
- if (!mirror->mirror_ds)
+ if (!mirror)
+ continue;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (!mirror->dss[dss_id].mirror_ds)
return true;
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
continue;
- devid = &mirror->mirror_ds->id_node;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
@@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (!mirror || IS_ERR(mirror->mirror_ds))
- return false;
- if (!mirror->mirror_ds)
- continue;
- devid = &mirror->mirror_ds->id_node;
- if (nfs4_test_deviceid_unavailable(devid))
+ if (!mirror)
return false;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
+ return false;
+ if (!mirror->dss[dss_id].mirror_ds)
+ continue;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return false;
+ }
}
return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 9e94d18448ff..b4679b7161b0 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
int ret;
data->context[NFS_MAX_CONTEXT_LEN] = '\0';
- ret = vfs_parse_fs_string(fc, "context",
- data->context, strlen(data->context));
+ ret = vfs_parse_fs_string(fc, "context", data->context);
if (ret < 0)
return ret;
#else
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9bdaf7f38bed..18b57c7c2f97 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1073,6 +1073,21 @@ out_no_revalidate:
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
stat->btime = NFS_I(inode)->btime;
+
+ /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN
+ * - NFS doesn't have DIO alignment constraints, avoid getting
+ * these DIO attrs from remote and just respond with most
+ * accommodating limits (so client will issue supported DIO).
+ * - this is unintuitive, but the most coarse-grained
+ * dio_offset_align is the most accommodating.
+ */
+ if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) &&
+ S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = 4; /* 4-byte alignment */
+ stat->dio_offset_align = PAGE_SIZE;
+ stat->dio_read_offset_align = stat->dio_offset_align;
+ }
out:
trace_nfs_getattr_exit(inode, err);
return err;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c0a44f389f8f..2ecd38e1d17a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 97abf62f109d..2c0455e91571 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -30,6 +30,8 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#define NFSLOCAL_MAX_IOS 3
+
struct nfs_local_kiocb {
struct kiocb kiocb;
struct bio_vec *bvec;
@@ -37,6 +39,14 @@ struct nfs_local_kiocb {
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
struct nfsd_file *localio;
+ /* Begin mostly DIO-specific members */
+ size_t end_len;
+ short int end_iter_index;
+ short int n_iters;
+ bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+ loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+ struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ /* End mostly DIO-specific members */
};
struct nfs_local_fsync_ctx {
@@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx {
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);
-static bool localio_O_DIRECT_semantics __read_mostly = false;
-module_param(localio_O_DIRECT_semantics, bool, 0644);
-MODULE_PARM_DESC(localio_O_DIRECT_semantics,
- "LOCALIO will use O_DIRECT semantics to filesystem.");
-
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
+ int status = 0;
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
- int status = PTR_ERR(localio);
- trace_nfs_local_open_fh(fh, mode, status);
+ status = PTR_ERR(localio);
switch (status) {
case -ENOMEM:
case -ENXIO:
@@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
nfs_local_probe(clp);
}
}
+ trace_nfs_local_open_fh(fh, mode, status);
return localio;
}
@@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
-static struct bio_vec *
-nfs_bvec_alloc_and_import_pagevec(struct page **pagevec,
- unsigned int npages, gfp_t flags)
-{
- struct bio_vec *bvec, *p;
-
- bvec = kmalloc_array(npages, sizeof(*bvec), flags);
- if (bvec != NULL) {
- for (p = bvec; npages > 0; p++, pagevec++, npages--) {
- p->bv_page = *pagevec;
- p->bv_len = PAGE_SIZE;
- p->bv_offset = 0;
- }
- }
- return bvec;
-}
-
static void
nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
{
@@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
{
struct nfs_local_kiocb *iocb;
- iocb = kmalloc(sizeof(*iocb), flags);
+ iocb = kzalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
- iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec,
- hdr->page_array.npages, flags);
+
+ iocb->bvec = kmalloc_array(hdr->page_array.npages,
+ sizeof(struct bio_vec), flags);
if (iocb->bvec == NULL) {
kfree(iocb);
return NULL;
}
- if (localio_O_DIRECT_semantics &&
- test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
- iocb->kiocb.ki_filp = file;
- iocb->kiocb.ki_flags = IOCB_DIRECT;
- } else
- init_sync_kiocb(&iocb->kiocb, file);
+ init_sync_kiocb(&iocb->kiocb, file);
- iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
iocb->aio_complete_work = NULL;
+ iocb->end_iter_index = -1;
+
return iocb;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+ size_t len, struct nfs_local_dio *local_dio)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ loff_t offset = hdr->args.offset;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align, &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+ return false;
+ if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+ return false;
+ if (unlikely(len < nf_dio_offset_align))
+ return false;
+
+ local_dio->mem_align = nf_dio_mem_align;
+ local_dio->offset_align = nf_dio_offset_align;
+
+ start_end = round_up(offset, nf_dio_offset_align);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, nf_dio_offset_align);
+
+ local_dio->middle_offset = start_end;
+ local_dio->end_offset = middle_end;
+
+ local_dio->start_len = start_end - offset;
+ local_dio->middle_len = middle_end - start_end;
+ local_dio->end_len = orig_end - middle_end;
+
+ if (rw == ITER_DEST)
+ trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio);
+ else
+ trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio);
+ return true;
+}
+
+static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
+/*
+ * Setup as many as 3 iov_iter based on extents described by @local_dio.
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+ unsigned int nvecs, size_t len,
+ struct nfs_local_dio *local_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = iocb->iters;
+
+ /* Setup misaligned start? */
+ if (local_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iters[n_iters].count = local_dio->start_len;
+ iocb->offset[n_iters] = iocb->hdr->args.offset;
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Setup misaligned end?
+ * If so, the end is purposely setup to be issued using buffered IO
+ * before the middle (which will use DIO, if DIO-aligned, with AIO).
+ * This creates problems if/when the end results in a partial write.
+ * So must save index and length of end to handle this corner case.
+ */
+ if (local_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iocb->offset[n_iters] = local_dio->end_offset;
+ iov_iter_advance(&iters[n_iters],
+ local_dio->start_len + local_dio->middle_len);
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ /* Save index and length of end */
+ iocb->end_iter_index = n_iters;
+ iocb->end_len = local_dio->end_len;
+ ++n_iters;
+ }
+
+ /* Setup DIO-aligned middle to be issued last, to allow for
+ * DIO with AIO completion (see nfs_local_call_{read,write}).
+ */
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ if (local_dio->start_len)
+ iov_iter_advance(&iters[n_iters], local_dio->start_len);
+ iters[n_iters].count -= local_dio->end_len;
+ iocb->offset[n_iters] = local_dio->middle_offset;
+
+ iocb->iter_is_dio_aligned[n_iters] =
+ nfs_iov_iter_aligned_bvec(&iters[n_iters],
+ local_dio->mem_align-1, local_dio->offset_align-1);
+
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
+ trace_nfs_local_dio_misaligned(iocb->hdr->inode,
+ iocb->hdr->args.offset, len, local_dio);
+ return 0; /* no DIO-aligned IO possible */
+ }
+ ++n_iters;
+
+ iocb->n_iters = n_iters;
+ return n_iters;
+}
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
+ struct page **pagevec = hdr->page_array.pagevec;
+ unsigned long v, total;
+ unsigned int base;
+ size_t len;
+
+ v = 0;
+ total = hdr->args.count;
+ base = hdr->args.pgbase;
+ while (total && v < hdr->page_array.npages) {
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&iocb->bvec[v], *pagevec, len, base);
+ total -= len;
+ ++pagevec;
+ ++v;
+ base = 0;
+ }
+ len = hdr->args.count - total;
+
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ struct nfs_local_dio local_dio;
+
+ if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+ return; /* is DIO-aligned */
+ }
- iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages,
- hdr->args.count + hdr->args.pgbase);
- if (hdr->args.pgbase != 0)
- iov_iter_advance(i, hdr->args.pgbase);
+ /* Use buffered IO */
+ iocb->offset[0] = hdr->args.offset;
+ iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+ iocb->n_iters = 1;
}
static void
@@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
static void
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
{
+ /* Must handle partial completions */
if (status >= 0) {
- hdr->res.count = status;
- hdr->res.op_status = NFS4_OK;
- hdr->task.tk_status = 0;
+ hdr->res.count += status;
+ /* @hdr was initialized to 0 (zeroed during allocation) */
+ if (hdr->task.tk_status == 0)
+ hdr->res.op_status = NFS4_OK;
} else {
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
@@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
static void
+nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_file_put(iocb->localio);
+ nfs_local_iocb_free(iocb);
+}
+
+static void
nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- nfs_local_file_put(iocb->localio);
- nfs_local_iocb_free(iocb);
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
- nfs_local_pgio_done(hdr, status);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
+ }
/*
* Must clear replen otherwise NFSv3 data corruption will occur
@@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_read_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
@@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
revert_creds(save_cred);
@@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_read);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -529,7 +676,7 @@ nfs_set_local_verifier(struct inode *inode,
}
/* Factored out from fs/nfsd/vfs.h:fh_getattr() */
-static int __vfs_getattr(struct path *p, struct kstat *stat, int version)
+static int __vfs_getattr(const struct path *p, struct kstat *stat, int version)
{
u32 request_mask = STATX_BASIC_STATS;
@@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
+ }
+
/* Handle short writes as if they are ENOSPC */
+ status = hdr->res.count;
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
hdr->args.offset += status;
@@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
hdr->args.count -= status;
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
+ /* record -ENOSPC in terms of nfs_local_pgio_done */
+ nfs_local_pgio_done(hdr, status);
}
- if (status < 0)
+ if (hdr->task.tk_status < 0)
nfs_reset_boot_verifier(inode);
-
- nfs_local_pgio_done(hdr, status);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_write_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
@@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
-
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+retry:
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+ /* partial write */
+ if (i == iocb->end_iter_index) {
+ /* Must not account partial end, otherwise, due
+ * to end being issued before middle: the partial
+ * write accounting in nfs_local_write_done()
+ * would incorrectly advance hdr->args.offset
+ */
+ status = 0;
+ } else {
+ /* Partial write at start or buffered middle,
+ * exit early.
+ */
+ nfs_local_pgio_done(iocb->hdr, status);
+ break;
+ }
+ } else if (unlikely(status == -ENOTBLK &&
+ (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ iocb->kiocb.ki_complete = NULL;
+ iocb->aio_complete_work = NULL;
+ goto retry;
+ }
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
file_end_write(filp);
revert_creds(save_cred);
@@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
switch (hdr->args.stable) {
default:
break;
@@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_write);
queue_work(nfslocaliod_workqueue, &iocb->work);
return 0;
}
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = localio;
+
+ nfs_local_iters_init(iocb, rw);
+
+ return iocb;
+}
+
int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
hdr->rw_mode);
- status = -EINVAL;
+ status = -EOPNOTSUPP;
}
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f9a3a1fbf44c..5a4d193da1a9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc)
nfs_errorf(fc, "NFS: Couldn't determine submount pathname");
ret = PTR_ERR(p);
} else {
- ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(p, buffer + 4096 - p));
if (!ret)
ret = vfs_get_tree(fc);
}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 6e75c6c2d234..9eff09158518 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 4ae01c10b7e2..e17d72908412 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfsacl.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6a0b5871ba3b..d537fb0c230e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
ret = -ENOMEM;
- res.scratch = alloc_page(GFP_KERNEL);
+ res.scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.scratch)
goto out;
@@ -1552,7 +1552,7 @@ out_free_pages:
}
kfree(pages);
out_free_scratch:
- __free_page(res.scratch);
+ folio_put(res.scratch);
out:
return ret;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 4cc915d5741d..e10d83ba835e 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
- xdr_set_scratch_page(xdr, res->scratch);
+ xdr_set_scratch_folio(xdr, res->scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c9a0d1e420c6..7f43e890d356 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = {
#else
.llseek = nfs_file_llseek,
#endif
+ .fop_flags = FOP_DONTCACHE,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce61253efd45..f58098417142 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
*p++ = htonl(attrs); /* bitmap */
*p++ = htonl(12); /* attribute buffer length */
*p++ = htonl(NF4DIR);
+ spin_lock(&dentry->d_lock);
p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+ spin_unlock(&dentry->d_lock);
readdir->pgbase = (char *)p - (char *)start;
readdir->count -= readdir->pgbase;
@@ -6160,7 +6162,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
}
/* for decoding across pages */
- res.acl_scratch = alloc_page(GFP_KERNEL);
+ res.acl_scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.acl_scratch)
goto out_free;
@@ -6196,7 +6198,7 @@ out_free:
while (--i >= 0)
__free_page(pages[i]);
if (res.acl_scratch)
- __free_page(res.acl_scratch);
+ folio_put(res.acl_scratch);
kfree(pages);
return ret;
}
@@ -7872,10 +7874,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
return err;
do {
err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
- if (err != -NFS4ERR_DELAY)
+ if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE)
break;
ssleep(1);
- } while (err == -NFS4ERR_DELAY);
+ } while (err == -NFS4ERR_DELAY || err == -NFSERR_GRACE);
return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
}
@@ -9442,7 +9444,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
goto out;
if (rcvd->max_rqst_sz > sent->max_rqst_sz)
return -EINVAL;
- if (rcvd->max_resp_sz < sent->max_resp_sz)
+ if (rcvd->max_resp_sz > sent->max_resp_sz)
return -EINVAL;
if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
return -EINVAL;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7612e977e80b..01179f7de322 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2744,6 +2744,9 @@ out_error:
case -ENETUNREACH:
nfs_mark_client_ready(clp, -EIO);
break;
+ case -EINVAL:
+ nfs_mark_client_ready(clp, status);
+ break;
default:
ssleep(1);
break;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index b29a26923ce0..5ec9c83f1ef0 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server,
struct fs_context *root_fc;
struct vfsmount *root_mnt;
struct dentry *dentry;
- size_t len;
+ char *source;
int ret;
- struct fs_parameter param = {
- .key = "source",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
- struct fs_parameter param_fsc = {
- .key = "fsc",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
if (IS_ERR(server))
return PTR_ERR(server);
@@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server,
root_ctx->server = server;
if (ctx->fscache_uniq) {
- len = strlen(ctx->fscache_uniq);
- param_fsc.size = len;
- param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
- if (param_fsc.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
- ret = vfs_parse_fs_param(root_fc, &param_fsc);
- kfree(param_fsc.string);
+ ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
@@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server,
}
/* We leave export_path unset as it's not used to find the root. */
- len = strlen(hostname) + 5;
- param.string = kmalloc(len, GFP_KERNEL);
- if (param.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
-
/* Does hostname needs to be enclosed in brackets? */
if (strchr(hostname, ':'))
- param.size = snprintf(param.string, len, "[%s]:/", hostname);
+ source = kasprintf(GFP_KERNEL, "[%s]:/", hostname);
else
- param.size = snprintf(param.string, len, "%s:/", hostname);
- ret = vfs_parse_fs_param(root_fc, &param);
- kfree(param.string);
+ source = kasprintf(GFP_KERNEL, "%s:/", hostname);
+
+ if (!source) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+ ret = vfs_parse_fs_string(root_fc, "source", source);
+ kfree(source);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 49ff98571fa5..1d0e6c10f921 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
}
/*
- * The prefered block size for layout directed io
+ * The preferred block size for layout directed io
*/
static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
uint32_t *res)
@@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
if (res->acl_scratch != NULL)
- xdr_set_scratch_page(xdr, res->acl_scratch);
+ xdr_set_scratch_folio(xdr, res->acl_scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 627115179795..6ce55e8e6b67 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -45,6 +45,23 @@
{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
{ BIT(NFS_INO_ODIRECT), "ODIRECT" })
+#define nfs_show_wb_flags(v) \
+ __print_flags(v, "|", \
+ { BIT(PG_BUSY), "BUSY" }, \
+ { BIT(PG_MAPPED), "MAPPED" }, \
+ { BIT(PG_FOLIO), "FOLIO" }, \
+ { BIT(PG_CLEAN), "CLEAN" }, \
+ { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \
+ { BIT(PG_INODE_REF), "INODE_REF" }, \
+ { BIT(PG_HEADLOCK), "HEADLOCK" }, \
+ { BIT(PG_TEARDOWN), "TEARDOWN" }, \
+ { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \
+ { BIT(PG_UPTODATE), "UPTODATE" }, \
+ { BIT(PG_WB_END), "WB_END" }, \
+ { BIT(PG_REMOVE), "REMOVE" }, \
+ { BIT(PG_CONTENDED1), "CONTENDED1" }, \
+ { BIT(PG_CONTENDED2), "CONTENDED2" })
+
DECLARE_EVENT_CLASS(nfs_inode_event,
TP_PROTO(
const struct inode *inode
@@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
+ __entry->offset = offset;
__entry->count = count;
),
@@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
- __entry->count = count,
+ __entry->offset = offset;
+ __entry->count = count;
__entry->ret = ret;
),
@@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done);
DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio);
DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done);
+DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_update_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_begin);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_end);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_writepages);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done);
+
+DECLARE_EVENT_CLASS(nfs_kiocb_event,
+ TP_PROTO(
+ const struct kiocb *iocb,
+ const struct iov_iter *iter
+ ),
+
+ TP_ARGS(iocb, iter),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
+ __entry->flags = iocb->ki_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->count,
+ __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS)
+ )
+);
+
+#define DEFINE_NFS_KIOCB_EVENT(name) \
+ DEFINE_EVENT(nfs_kiocb_event, name, \
+ TP_PROTO( \
+ const struct kiocb *iocb, \
+ const struct iov_iter *iter \
+ ), \
+ TP_ARGS(iocb, iter))
+
+DEFINE_NFS_KIOCB_EVENT(nfs_file_read);
+DEFINE_NFS_KIOCB_EVENT(nfs_file_write);
+
TRACE_EVENT(nfs_aop_readahead,
TP_PROTO(
const struct inode *inode,
@@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done,
)
);
+DECLARE_EVENT_CLASS(nfs_page_class,
+ TP_PROTO(
+ const struct nfs_page *req
+ ),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(const struct nfs_page *__private, req)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
+ __field(unsigned long, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = folio_inode(req->wb_folio);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->req = req;
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
+ __entry->flags = req->wb_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->req, __entry->offset, __entry->count,
+ nfs_show_wb_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_PAGE_EVENT(name) \
+ DEFINE_EVENT(nfs_page_class, name, \
+ TP_PROTO( \
+ const struct nfs_page *req \
+ ), \
+ TP_ARGS(req))
+
+DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup);
+DEFINE_NFS_PAGE_EVENT(nfs_do_writepage);
+
DECLARE_EVENT_CLASS(nfs_page_error_class,
TP_PROTO(
const struct inode *inode,
@@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+DECLARE_EVENT_CLASS(nfs_local_dio_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ const struct nfs_local_dio *local_dio
+ ),
+ TP_ARGS(inode, offset, count, local_dio),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(loff_t, start)
+ __field(ssize_t, start_len)
+ __field(loff_t, middle)
+ __field(ssize_t, middle_len)
+ __field(loff_t, end)
+ __field(ssize_t, end_len)
+ ),
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->mem_align = local_dio->mem_align;
+ __entry->offset_align = local_dio->offset_align;
+ __entry->start = offset;
+ __entry->start_len = local_dio->start_len;
+ __entry->middle = local_dio->middle_offset;
+ __entry->middle_len = local_dio->middle_len;
+ __entry->end = local_dio->end_offset;
+ __entry->end_len = local_dio->end_len;
+ ),
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd "
+ "mem_align=%u offset_align=%u "
+ "start=%llu+%zd middle=%llu+%zd end=%llu+%zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset, __entry->count,
+ __entry->mem_align, __entry->offset_align,
+ __entry->start, __entry->start_len,
+ __entry->middle, __entry->middle_len,
+ __entry->end, __entry->end_len)
+)
+
+#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \
+DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \
+ TP_PROTO(const struct inode *inode, \
+ loff_t offset, \
+ ssize_t count, \
+ const struct nfs_local_dio *local_dio),\
+ TP_ARGS(inode, offset, count, local_dio))
+
+DEFINE_NFS_LOCAL_DIO_EVENT(read);
+DEFINE_NFS_LOCAL_DIO_EVENT(write);
+DEFINE_NFS_LOCAL_DIO_EVENT(misaligned);
+
+#endif /* CONFIG_NFS_LOCALIO */
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
@@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh,
),
TP_printk(
- "error=%d fhandle=0x%08x mode=%s",
- __entry->error,
+ "fhandle=0x%08x mode=%s result=%d",
__entry->fhandle,
- show_fs_fmode_flags(__entry->fmode)
+ show_fs_fmode_flags(__entry->fmode),
+ __entry->error
)
);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 647c53d1418a..0fb6905736d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio)
{
struct nfs_server *nfss = NFS_SERVER(folio->mapping->host);
- folio_end_writeback(folio);
+ folio_end_writeback_no_dropbehind(folio);
if (atomic_long_dec_return(&nfss->writeback) <
NFS_CONGESTION_OFF_THRESH) {
nfss->write_congested = 0;
@@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_do_writepage(req);
nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
@@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start);
+
/* Wait with writeback until write congestion eases */
if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) {
err = wait_event_killable(nfss->write_congestion_wait,
nfss->write_congested == 0);
if (err)
- return err;
+ goto out_err;
}
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
@@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
- if (err < 0)
- goto out_err;
- return 0;
+ if (err > 0)
+ err = 0;
out_err:
+ trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err);
return err;
}
@@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
spin_unlock(&mapping->i_private_lock);
+
+ folio_end_dropbehind(folio);
}
nfs_page_group_unlock(req);
@@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
req->wb_nio = 0;
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo,
- hdr->pgio_mirror_idx);
+ hdr->ds_commit_idx);
goto next;
}
remove_req:
@@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
unsigned int end;
int error;
+ trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes);
end = offset + bytes;
req = nfs_lock_and_join_requests(folio);
if (IS_ERR_OR_NULL(req))
- return req;
+ goto out;
rqend = req->wb_offset + req->wb_bytes;
/*
@@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
else
req->wb_bytes = rqend - req->wb_offset;
req->wb_nio = 0;
+out:
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes,
+ PTR_ERR_OR_ZERO(req));
return req;
out_flushme:
/*
@@ -1053,6 +1062,7 @@ out_flushme:
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
error = nfs_wb_folio(folio->mapping->host, folio);
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error);
return (error < 0) ? ERR_PTR(error) : NULL;
}
@@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx,
req = nfs_setup_write_request(ctx, folio, offset, count);
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_writepage_setup(req);
/* Update file length */
nfs_grow_file(folio, offset, count);
nfs_mark_uptodate(req);
@@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+ trace_nfs_update_folio(inode, offset, count);
+
dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
(long long)(folio_pos(folio) + offset));
@@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio,
if (status < 0)
nfs_set_pageerror(mapping);
out:
+ trace_nfs_update_folio_done(inode, offset, count, status);
dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1806,7 +1820,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
nfs_mapping_set_error(folio, status);
nfs_inode_remove_request(req);
}
- dprintk_cont(", error = %d\n", status);
+ dprintk(", error = %d\n", status);
goto next;
}
@@ -1816,11 +1830,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a match */
if (folio)
nfs_inode_remove_request(req);
- dprintk_cont(" OK\n");
+ dprintk(" OK\n");
goto next;
}
/* We have a mismatch. Write the page again */
- dprintk_cont(" mismatch\n");
+ dprintk(" mismatch\n");
nfs_mark_request_dirty(req);
atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next: