From fc82228a5e3860502dbf3bfa4a9570cb7093cf7f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 3 Dec 2017 20:38:01 -0500 Subject: ext4: support fast symlinks from ext3 file systems 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) broke ~10 years old ext3 file systems created by 2.6.17. Any ELF executable fails because the /lib/ld-linux.so.2 fast symlink cannot be read anymore. The patch assumed fast symlinks were created in a specific way, but that's not true on these really old file systems. The new behavior is apparently needed only with the large EA inode feature. Revert to the old behavior if the large EA inode feature is not set. This makes my old VM boot again. Fixes: 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) Signed-off-by: Andi Kleen Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7df2c5644e59..534a9130f625 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,6 +149,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } -- cgit v1.2.3 From c894aa97577e47d3066b27b32499ecf899bfa8b0 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sun, 3 Dec 2017 22:52:51 -0500 Subject: ext4: fix fdatasync(2) after fallocate(2) operation Currently, fallocate(2) with KEEP_SIZE followed by a fdatasync(2) then crash, we'll see wrong allocated block number (stat -c %b), the blocks allocated beyond EOF are all lost. fstests generic/468 exposes this bug. Commit 67a7d5f561f4 ("ext4: fix fdatasync(2) after extent manipulation operations") fixed all the other extent manipulation operation paths such as hole punch, zero range, collapse range etc., but forgot the fallocate case. So similarly, fix it by recording the correct journal tid in ext4 inode in fallocate(2) path, so that ext4_sync_file() will wait for the right tid to be committed on fdatasync(2). This addresses the test failure in xfstests test generic/468. Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 07bca11749d4..c941251ac0c0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4722,6 +4722,7 @@ retry: EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; -- cgit v1.2.3 From ca0168e8a77cf833f8c9ac1d26a3a4012bab4f72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 09:32:25 -0500 Subject: alloc_super(): do ->s_umount initialization earlier ... so that failure exits could count on it having been done. Signed-off-by: Al Viro --- fs/super.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index d4e33e8f1e6f..7ff1349609e4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; @@ -218,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; - - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); -- cgit v1.2.3 From a821df3f1af72aa6a0d573eea94a7dd2613e9f4e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 21 Nov 2017 09:36:33 +1100 Subject: cifs: fix NULL deref in SMB2_read Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5331631386a2..01346b8b6edb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2678,27 +2678,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; - shdr = get_sync_hdr(rsp); - if (shdr->Status == STATUS_END_OF_FILE) { + if (rc) { + if (rc != -ENODATA) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); - return 0; + return rc == -ENODATA ? 0 : rc; } - if (rc) { - cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cifs_dbg(VFS, "Send error in read = %d\n", rc); - } else { - *nbytes = le32_to_cpu(rsp->DataLength); - if ((*nbytes > CIFS_MAX_MSGSIZE) || - (*nbytes > io_parms->length)) { - cifs_dbg(FYI, "bad length %d for count %d\n", - *nbytes, io_parms->length); - rc = -EIO; - *nbytes = 0; - } + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; } + shdr = get_sync_hdr(rsp); + if (*buf) { memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); -- cgit v1.2.3 From 5702591fc6a3f409f460def104ee149330dac82d Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Tue, 21 Nov 2017 14:47:56 +0100 Subject: CIFS: don't log STATUS_NOT_FOUND errors for DFS cifs.ko makes DFS queries regardless of the type of the server and non-DFS servers are common. This often results in superfluous logging of non-critical errors. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e06740436b92..ed88ab8a4774 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1406,7 +1406,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + if (rc != -ENOENT) + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } -- cgit v1.2.3 From 040d786032bf59002d374b86d75b04d97624005c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Nov 2017 11:59:22 +0800 Subject: ceph: drop negative child dentries before try pruning inode's alias Negative child dentry holds reference on inode's alias, it makes d_prune_aliases() do nothing. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ab69dcb70e8a..1b468250e947 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1440,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc, return request_close_session(mdsc, session); } +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + /* * Trim old(er) caps. * @@ -1490,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); + session->s_trim_caps--; } else { + struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + session->s_trim_caps--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } return 0; } -- cgit v1.2.3 From eaf0ec303bd73f6b2c18f48542974a710fadfeb9 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Wed, 6 Dec 2017 10:16:15 -0800 Subject: fs: xfs: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/scrub.c | 1 - fs/xfs/scrub/trace.c | 1 - fs/xfs/xfs_reflink.c | 2 -- fs/xfs/xfs_trace.c | 1 - 4 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9c42c4efd01e..ab3aef2ae823 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -46,7 +46,6 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/scrub.h" #include "scrub/btree.h" /* diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 472080e75788..86daed0e3a45 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -26,7 +26,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc041a29eb70..cf7c8f81bebb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe348294..35f3546b6af5 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" -- cgit v1.2.3 From f59cf5c29919d17b61913c3360a7bd29b72975c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2017 17:32:55 -0800 Subject: xfs: remove "no-allocation" reservations for file creations If we create a new file we will need an inode, and usually some metadata in the parent direction. Aiming for everything to go well despite the lack of a reservation leads to dirty transactions cancelled under a heavy create/delete load. This patch removes those nospace transactions, which will lead to slightly earlier ENOSPC on some workloads, but instead prevent file system shutdowns due to cancelling dirty transactions for others. A customer could observe assertations failures and shutdowns due to cancelation of dirty transactions during heavy NFS workloads as shown below: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728125] XFS: Assertion failed: error != -ENOSPC, file: fs/xfs/xfs_inode.c, line: 1262 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728222] Call Trace: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728246] [] dump_stack+0x63/0x81 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728262] [] warn_slowpath_common+0x8a/0xc0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728264] [] warn_slowpath_null+0x1a/0x20 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728285] [] asswarn+0x33/0x40 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728308] [] xfs_create+0x7be/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728329] [] xfs_generic_create+0x1fb/0x2e0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728348] [] xfs_vn_mknod+0x14/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728366] [] xfs_vn_create+0x13/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728380] [] vfs_create+0xd5/0x140 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728390] [] do_nfsd_create+0x499/0x610 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728396] [] nfsd3_proc_create+0x135/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728401] [] nfsd_dispatch+0xc3/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728416] [] svc_process_common+0x453/0x6f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728423] [] svc_process+0x113/0x1f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728427] [] nfsd+0x10f/0x180 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728432] [] ? nfsd_destroy+0x80/0x80 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728438] [] kthread+0xd8/0xf0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728441] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728451] [] ret_from_fork+0x42/0x70 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728453] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728454] ---[ end trace f9822c842fec81d4 ]--- 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728477] XFS (sdb): Internal error xfs_trans_cancel at line 983 of file fs/xfs/xfs_trans.c. Caller xfs_create+0x4ee/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728684] XFS (sdb): Corruption of in-memory data detected. Shutting down filesystem 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728685] XFS (sdb): Please umount the filesystem and rectify the problem(s) Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 10 +++------- fs/xfs/libxfs/xfs_ialloc.h | 1 - fs/xfs/xfs_inode.c | 33 +++++++-------------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_qm.c | 4 ++-- fs/xfs/xfs_symlink.c | 15 +-------------- 6 files changed, 14 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de3f04a98656..3b57ef0f2f76 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -920,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -978,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1680,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1692,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1707,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index d2bdcd5e7312..66a8de0b1caa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ xfs_ino_t *inop); /* inode number allocated */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 801274126648..b41952a4ddd8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -749,7 +749,6 @@ xfs_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -765,7 +764,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -957,7 +956,6 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -988,8 +986,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1061,7 +1059,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1182,11 +1180,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1203,19 +1196,13 @@ xfs_create( if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1340,11 +1327,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1353,8 +1335,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cc13c3763721..b2136af9289f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a201aa..ec952dfad359 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -793,8 +793,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2c4968..2e9e793a8f9d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -259,14 +254,6 @@ xfs_symlink( if (error) goto out_trans_cancel; - /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. @@ -277,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; -- cgit v1.2.3 From b7e0b6ff54dd92febbb1914ab93cd6a21622e169 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2017 16:13:35 -0800 Subject: xfs: make iomap_begin functions trim iomaps consistently Historically, the XFS iomap_begin function only returned mappings for exactly the range queried, i.e. it doesn't do XFS_BMAPI_ENTIRE lookups. The current vfs iomap consumers are only set up to deal with trimmed mappings. xfs_xattr_iomap_begin does BMAPI_ENTIRE lookups, which is inconsistent with the current iomap usage. Remove the flag so that both iomap_begin functions behave the same way. FWIW this also fixes a behavioral regression in xattr FIEMAP that was introduced in 4.8 wherein attr fork extents are no longer trimmed like they used to be. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 33eb4fb2e3fd..7ab52a8bc0a9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1213,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); -- cgit v1.2.3 From d7ee946942bdd12394809305e3df05aa4c8b7b8f Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Wed, 11 Oct 2017 07:01:31 +0200 Subject: VFS: Handle lazytime in do_mount() Since commit e462ec50cb5fa ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") the lazytime mount option doesn't get passed on anymore. Fix the issue by handling the option in do_mount(). Reviewed-by: Lukas Czerner Signed-off-by: Markus Trippelsdorf Signed-off-by: Al Viro --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index e158ec6b527b..9d1374ab6e06 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_DIRSYNC | SB_SILENT | SB_POSIXACL | + SB_LAZYTIME | SB_I_VERSION); if (flags & MS_REMOUNT) -- cgit v1.2.3 From 996fc4477a0ea28226b30d175f053fb6f9a4fa36 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Dec 2017 23:44:11 -0500 Subject: ext4: add missing error check in __ext4_new_inode() It's possible for ext4_get_acl() to return an ERR_PTR. So we need to add a check for this case in __ext4_new_inode(). Otherwise on an error we can end up oops the kernel. This was getting triggered by xfstests generic/388, which is a test which exercises the shutdown code path. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b4267d72f249..b32cf263750d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return ERR_CAST(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); -- cgit v1.2.3 From 438c84c2f0c794f75ab55ce65c505b01bfce4480 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 11 Dec 2017 11:28:10 +0100 Subject: ovl: don't follow redirects if redirect_dir=off Overlayfs is following redirects even when redirects are disabled. If this is unintentional (probably the majority of cases) then this can be a problem. E.g. upper layer comes from untrusted USB drive, and attacker crafts a redirect to enable read access to otherwise unreadable directories. If "redirect_dir=off", then turn off following as well as creation of redirects. If "redirect_dir=follow", then turn on following, but turn off creation of redirects (which is what "redirect_dir=off" does now). This is a backward incompatible change, so make it dependent on a config option. Reported-by: David Howells Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 34 +++++++++++++++++ fs/overlayfs/Kconfig | 10 +++++ fs/overlayfs/namei.c | 16 ++++++++ fs/overlayfs/ovl_entry.h | 2 + fs/overlayfs/super.c | 68 ++++++++++++++++++++++++--------- 5 files changed, 113 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 8caa60734647..e6a5f4912b6d 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -156,6 +156,40 @@ handle it in two different ways: root of the overlay. Finally the directory is moved to the new location. +There are several ways to tune the "redirect_dir" feature. + +Kernel config options: + +- OVERLAY_FS_REDIRECT_DIR: + If this is enabled, then redirect_dir is turned on by default. +- OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW: + If this is enabled, then redirects are always followed by default. Enabling + this results in a less secure configuration. Enable this option only when + worried about backward compatibility with kernels that have the redirect_dir + feature and follow redirects even if turned off. + +Module options (can also be changed through /sys/module/overlay/parameters/*): + +- "redirect_dir=BOOL": + See OVERLAY_FS_REDIRECT_DIR kernel config option above. +- "redirect_always_follow=BOOL": + See OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW kernel config option above. +- "redirect_max=NUM": + The maximum number of bytes in an absolute redirect (default is 256). + +Mount options: + +- "redirect_dir=on": + Redirects are enabled. +- "redirect_dir=follow": + Redirects are not created, but followed. +- "redirect_dir=off": + Redirects are not created and only followed if "redirect_always_follow" + feature is enabled in the kernel/module config. +- "redirect_dir=nofollow": + Redirects are not created and not followed (equivalent to "redirect_dir=off" + if "redirect_always_follow" feature is not enabled). + Non-directories --------------- diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index cbfc196e5dc5..5ac415466861 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + For more information, see Documentation/filesystems/overlayfs.txt + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 625ed8066570..2a12dc2e9840 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -681,6 +681,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + goto out_put; + } + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 752bab645879..9d0bc03bf6e4 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,6 +14,8 @@ struct ovl_config { char *workdir; bool default_permissions; bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; bool index; }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 288d20f9a55a..13a8a8617e44 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -33,6 +33,13 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(ovl_redirect_dir_def, "Default to on or off for the redirect_dir feature"); +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, @@ -232,6 +239,7 @@ static void ovl_free_fs(struct ovl_fs *ofs) kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); if (ofs->creator_cred) put_cred(ofs->creator_cred); kfree(ofs); @@ -295,6 +303,11 @@ static bool ovl_force_readonly(struct ovl_fs *ofs) return (!ofs->upper_mnt || !ofs->workdir); } +static const char *ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? "on" : "off"; +} + /** * ovl_show_options * @@ -313,12 +326,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) } if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ofs->config.redirect_dir != ovl_redirect_dir_def) - seq_printf(m, ",redirect_dir=%s", - ofs->config.redirect_dir ? "on" : "off"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", - ofs->config.index ? "on" : "off"); + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); return 0; } @@ -348,8 +359,7 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR_ON, - OPT_REDIRECT_DIR_OFF, + OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, OPT_ERR, @@ -360,8 +370,7 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, - {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, {OPT_ERR, NULL} @@ -390,10 +399,37 @@ static char *ovl_next_opt(char **s) return sbegin; } +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -428,12 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; - case OPT_REDIRECT_DIR_ON: - config->redirect_dir = true; - break; - - case OPT_REDIRECT_DIR_OFF: - config->redirect_dir = false; + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; break; case OPT_INDEX_ON: @@ -458,7 +493,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return 0; + return ovl_parse_redirect_mode(config, config->redirect_mode); } #define OVL_WORKDIR_NAME "work" @@ -1160,7 +1195,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!cred) goto out_err; - ofs->config.redirect_dir = ovl_redirect_dir_def; ofs->config.index = ovl_index_def; err = ovl_parse_opt((char *) data, &ofs->config); if (err) -- cgit v1.2.3 From 08d8f8a5b094b66b29936e8751b4a818b8db1207 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 27 Nov 2017 10:12:44 -0500 Subject: ovl: Pass ovl_get_nlink() parameters in right order Right now we seem to be passing index as "lowerdentry" and origin.dentry as "upperdentry". IIUC, we should pass these parameters in reversed order and this looks like a bug. Signed-off-by: Vivek Goyal Acked-by: Amir Goldstein Fixes: caf70cb2ba5d ("ovl: cleanup orphan index entries") Cc: #v4.13 Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 2a12dc2e9840..beb945e1963c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -435,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct ovl_path *lower, /* Check if index is orphan and don't warn before cleaning it */ if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(index, origin.dentry, 0) == 0) + ovl_get_nlink(origin.dentry, index, 0) == 0) err = -ENOENT; dput(origin.dentry); -- cgit v1.2.3 From b02a16e6413a2f782e542ef60bad9ff6bf212f8a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Nov 2017 07:35:21 +0200 Subject: ovl: update ctx->pos on impure dir iteration This fixes a regression with readdir of impure dir in overlayfs that is shared to VM via 9p fs. Reported-by: Miguel Bernal Marin Fixes: 4edb83bb1041 ("ovl: constant d_ino for non-merge dirs") Cc: #4.14 Signed-off-by: Amir Goldstein Tested-by: Miguel Bernal Marin Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0daa4354fec4..51088849ce97 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -663,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return PTR_ERR(rdt.cache); } - return iterate_dir(od->realfile, &rdt.ctx); + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; } -- cgit v1.2.3 From e8d4bfe3a71537284a90561f77c85dea6c154369 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 29 Nov 2017 10:01:32 +0800 Subject: ovl: Sync upper dirty data when syncing overlayfs When executing filesystem sync or umount on overlayfs, dirty data does not get synced as expected on upper filesystem. This patch fixes sync filesystem method to keep data consistency for overlayfs. Signed-off-by: Chengguang Xu Fixes: e593b2bf513d ("ovl: properly implement sync_filesystem()") Cc: #4.11 Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 13a8a8617e44..76440feb79f6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -252,6 +252,7 @@ static void ovl_put_super(struct super_block *sb) ovl_free_fs(ofs); } +/* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { struct ovl_fs *ofs = sb->s_fs_info; @@ -260,14 +261,24 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!ofs->upper_mnt) return 0; - upper_sb = ofs->upper_mnt->mnt_sb; - if (!upper_sb->s_op->sync_fs) + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) return 0; - /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); - ret = upper_sb->s_op->sync_fs(upper_sb, wait); + ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); + return ret; } -- cgit v1.2.3 From 7879cb43f9a75710af439c6bd81c94de1aa3d740 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Tue, 28 Nov 2017 00:09:23 +0100 Subject: ovl: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: fs/overlayfs/overlayfs.h:179:11-17: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 13eab09a6b6f..b489099ccd49 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -180,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { struct dentry *ret = vfs_tmpfile(dentry, mode, 0); - int err = IS_ERR(ret) ? PTR_ERR(ret) : 0; + int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; -- cgit v1.2.3 From 9d5afec6b8bd46d6ed821aa1579634437f58ef1f Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 11 Dec 2017 15:00:57 -0500 Subject: ext4: fix crash when a directory's i_size is too small On a ppc64 machine, when mounting a fuzzed ext2 image (generated by fsfuzzer) the following call trace is seen, VFS: brelse: Trying to free free buffer WARNING: CPU: 1 PID: 6913 at /root/repos/linux/fs/buffer.c:1165 .__brelse.part.6+0x24/0x40 .__brelse.part.6+0x20/0x40 (unreliable) .ext4_find_entry+0x384/0x4f0 .ext4_lookup+0x84/0x250 .lookup_slow+0xdc/0x230 .walk_component+0x268/0x400 .path_lookupat+0xec/0x2d0 .filename_lookup+0x9c/0x1d0 .vfs_statx+0x98/0x140 .SyS_newfstatat+0x48/0x80 system_call+0x58/0x6c This happens because the directory that ext4_find_entry() looks up has inode->i_size that is less than the block size of the filesystem. This causes 'nblocks' to have a value of zero. ext4_bread_batch() ends up not reading any of the directory file's blocks. This renders the entries in bh_use[] array to continue to have garbage data. buffer_uptodate() on bh_use[0] can then return a zero value upon which brelse() function is invoked. This commit fixes the bug by returning -ENOENT when the directory file has no associated blocks. Reported-by: Abdul Haleem Signed-off-by: Chandan Rajendra Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 798b3ac680db..e750d68fbcb5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1399,6 +1399,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; -- cgit v1.2.3 From da2e6b7eeda8919f677c790ef51161dd02e513a6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2017 20:27:34 +0200 Subject: ovl: fix overlay: warning prefix Conform two stray warning messages to the standard overlayfs: prefix. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 3 ++- fs/overlayfs/readdir.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index e13921824c70..f9788bc116a8 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -887,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); /* Fall back to userspace copy-up */ err = -EXDEV; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 51088849ce97..8c98578d27a1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -499,7 +499,7 @@ out: return err; fail: - pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } -- cgit v1.2.3 From b7b2846fe26f2c0d7f317c874a13d3ecf22670ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:02 -0800 Subject: xfs: add the ability to join a held buffer to a defer_ops In certain cases, defer_ops callers will lock a buffer and want to hold the lock across transaction rolls. Similar to ijoined inodes, we want to dirty & join the buffer with each transaction roll in defer_finish so that afterwards the caller still owns the buffer lock and we haven't inadvertently pinned the log. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 39 ++++++++++++++++++++++++++++++++++++--- fs/xfs/libxfs/xfs_defer.h | 5 ++++- 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? */ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { -- cgit v1.2.3 From 6e643cd094de3bd0f97edcc1db0089afa24d909f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:02 -0800 Subject: xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute The new attribute leaf buffer is not held locked across the transaction roll between the shortform->leaf modification and the addition of the new entry. As a result, the attribute buffer modification being made is not atomic from an operational perspective. Hence the AIL push can grab it in the transient state of "just created" after the initial transaction is rolled, because the buffer has been released. This leads to xfs_attr3_leaf_verify() asserting that hdr.count is zero, treating this as in-memory corruption, and shutting down the filesystem. Darrick ported the original patch to 4.15 and reworked it use the xfs_defer_bjoin helper and hold/join the buffer correctly across the second transaction roll. Signed-off-by: Alex Lyakas Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_attr.c | 20 +++++++++++++++----- fs/xfs/libxfs/xfs_attr_leaf.c | 9 ++++++--- fs/xfs/libxfs/xfs_attr_leaf.h | 3 ++- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6249c92671de..a76914db72ef 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. */ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 53cc8b986eac..601eaa36f1ad 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -735,10 +735,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -818,7 +821,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..894124efb421 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,7 +48,8 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -- cgit v1.2.3 From 8c57b88637d78a723e0854fc3d06c6d4c31a1e0c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:53 -0800 Subject: xfs: account for null transactions in bunmapi In e1a4e37cc7b665 ("xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent"), we try to constrain the amount of real extents we unmap from the data fork in a given call so that we don't blow out transaction reservations. However, not all bunmapi operations require a transaction -- if we're only removing a delalloc extent, no transaction is needed, so we have to code against that. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1210f684d3c2..1bddbba6b80c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5136,7 +5136,7 @@ __xfs_bunmapi( * blowing out the transaction with a mix of EFIs and reflink * adjustments. */ - if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); else max_len = len; -- cgit v1.2.3 From c54854a437a447a6bb1dcb11f60dd01cef3fa597 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:54 -0800 Subject: xfs: move xfs_iext_insert tracepoint to report useful information Move the tracepoint in xfs_iext_insert to after the point where we've inserted the extent because otherwise we report stale extent data in the ftrace output. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_iext_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 89bf16b4d937..b0f31791c7e6 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -632,8 +632,6 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); else if (ifp->if_height == 1) @@ -661,6 +659,8 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } -- cgit v1.2.3 From 5c989a0ee06eb77a44baffd1779a5dbb9a7e873f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:54 -0800 Subject: xfs: remove dest file's post-eof preallocations before reflinking If we try to reflink into a file with post-eof preallocations at an offset well past the preallocations, we increase i_size as one would expect. However, those allocations do not have page cache backing them, so they won't get cleaned out on their own. This leads to asserts in the collapse/insert range code and xfs_destroy_inode when they encounter delalloc extents they weren't expecting to find. Since there are plenty of other places where we dump those post-eof blocks, do the same to the reflink destination file before we start remapping extents. This was found by adding clonerange support to fsstress and running it in write-only mode. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cf7c8f81bebb..e13f5ad57a03 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1291,6 +1291,17 @@ xfs_reflink_remap_range( trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* + * Clear out post-eof preallocations because we don't have page cache + * backing the delayed allocations and they'll never get freed on + * their own. + */ + if (xfs_can_free_eofblocks(dest, true)) { + ret = xfs_free_eofblocks(dest); + if (ret) + goto out_unlock; + } + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) -- cgit v1.2.3 From 73353f486c9b5b2407ec32be1004174dbbaf6c18 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:55 -0800 Subject: xfs: relax is_reflink_inode assert in xfs_reflink_find_cow_mapping We don't hold the ilock through the entire sequence of xfs_writepage_map -> xfs_map_cow -> xfs_reflink_find_cow_mapping. This means that we can race with another thread that is trying to clear the inode reflink flag, with the result that the flag is set for the xfs_map_cow check but cleared before we get to the assert in find_cow_mapping. When this happens, we blow the assert even though everything is fine. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e13f5ad57a03..99c5852f9fe7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -490,8 +490,9 @@ xfs_reflink_find_cow_mapping( struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_is_reflink_inode(ip)) + return false; offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; -- cgit v1.2.3 From 9d40fba8b2056773b9744a95df9ddd6cc33a4f83 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:55 -0800 Subject: xfs: avoid infinite loop when cancelling CoW blocks after writeback failure When we're cancelling a cow range, we don't always delete each extent that we iterate, so we have to move icur backwards in the list to avoid an infinite loop. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 99c5852f9fe7..6931b0c79cac 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -611,6 +611,9 @@ xfs_reflink_cancel_cow_blocks( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + } else { + /* Didn't do anything, push cursor back. */ + xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) -- cgit v1.2.3 From a192de265b26c525672884630d5376c405e83b2a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Dec 2017 18:03:56 -0800 Subject: xfs: allow CoW remap transactions to use reserve blocks Since we as yet have no way of holding on to the indlen blocks that are reserved as part of CoW fork delalloc reservations, let the CoW remap transaction dip into the reserves so that we avoid failing writes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6931b0c79cac..e49e6db415f7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -729,7 +729,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, 0, &tp); + resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; -- cgit v1.2.3 From 302ec300ef8a545a7fc7f667e5fd743b091c2eeb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Dec 2017 15:32:38 -0800 Subject: autofs: fix careless error in recent commit Commit ecc0c469f277 ("autofs: don't fail mount for transient error") was meant to replace an 'if' with a 'switch', but instead added the 'switch' leaving the case in place. Link: http://lkml.kernel.org/r/87zi6wstmw.fsf@notabene.neil.brown.name Fixes: ecc0c469f277 ("autofs: don't fail mount for transient error") Reported-by: Ben Hutchings Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 8fc41705c7cd..961a12dc6dc8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -170,7 +170,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { case 0: break; -- cgit v1.2.3 From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- include/linux/sched.h | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6be2aa0ab26f..156f56acfe8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; -- cgit v1.2.3 From c156618e15101a9cc8c815108fec0300a0ec6637 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 5 Dec 2017 13:55:44 -0500 Subject: nfs: fix a deadlock in nfs client initialization The following deadlock can occur between a process waiting for a client to initialize in while walking the client list during nfsv4 server trunking detection and another process waiting for the nfs_clid_init_mutex so it can initialize that client: Process 1 Process 2 --------- --------- spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTA->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTB->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); mutex_lock(&nfs_clid_init_mutex); nfs41_walk_client_list(clp, result, cred); nfs_wait_client_init_complete(CLIENTA); (waiting for nfs_clid_init_mutex) Make sure nfs_match_client() only evaluates clients that have completed initialization in order to prevent that deadlock. This patch also fixes v4.0 trunking behavior by not marking the client NFS_CS_READY until the clientid has been confirmed. Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 11 +++++++++++ fs/nfs/nfs4client.c | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0ac2fb1c6b63..b9129e2befea 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 12bbab0becb4..65a7e5da508c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. + */ + nfs_mark_client_ready(clp, -EPERM); + } nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); return old; @@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,6 +566,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ +found: refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; -- cgit v1.2.3 From dc4fd9ab01ab379ae5af522b3efd4187a7c30a31 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 8 Dec 2017 16:00:12 -0500 Subject: nfs: don't wait on commit in nfs_commit_inode() if there were no commit requests If there were no commit requests, then nfs_commit_inode() should not wait on the commit or mark the inode dirty, otherwise the following BUG_ON can be triggered: [ 1917.130762] kernel BUG at fs/inode.c:578! [ 1917.130766] Oops: Exception in kernel mode, sig: 5 [#1] [ 1917.130768] SMP NR_CPUS=2048 NUMA pSeries [ 1917.130772] Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi blocklayoutdriver rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc sg nx_crypto pseries_rng ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic crct10dif_common ibmvscsi scsi_transport_srp ibmveth scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 1917.130805] CPU: 2 PID: 14923 Comm: umount.nfs4 Tainted: G ------------ T 3.10.0-768.el7.ppc64 #1 [ 1917.130810] task: c0000005ecd88040 ti: c00000004cea0000 task.ti: c00000004cea0000 [ 1917.130813] NIP: c000000000354178 LR: c000000000354160 CTR: c00000000012db80 [ 1917.130816] REGS: c00000004cea3720 TRAP: 0700 Tainted: G ------------ T (3.10.0-768.el7.ppc64) [ 1917.130820] MSR: 8000000100029032 CR: 22002822 XER: 20000000 [ 1917.130828] CFAR: c00000000011f594 SOFTE: 1 GPR00: c000000000354160 c00000004cea39a0 c0000000014c4700 c0000000018cc750 GPR04: 000000000000c750 80c0000000000000 0600000000000000 04eeb76bea749a03 GPR08: 0000000000000034 c0000000018cc758 0000000000000001 d000000005e619e8 GPR12: c00000000012db80 c000000007b31200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c000000000dfc3ec 0000000000000000 c0000005eefc02c0 GPR28: d0000000079dbd50 c0000005b94a02c0 c0000005b94a0250 c0000005b94a01c8 [ 1917.130867] NIP [c000000000354178] .evict+0x1c8/0x350 [ 1917.130871] LR [c000000000354160] .evict+0x1b0/0x350 [ 1917.130873] Call Trace: [ 1917.130876] [c00000004cea39a0] [c000000000354160] .evict+0x1b0/0x350 (unreliable) [ 1917.130880] [c00000004cea3a30] [c0000000003558cc] .evict_inodes+0x13c/0x270 [ 1917.130884] [c00000004cea3af0] [c000000000327d20] .kill_anon_super+0x70/0x1e0 [ 1917.130896] [c00000004cea3b80] [d000000005e43e30] .nfs_kill_super+0x20/0x60 [nfs] [ 1917.130900] [c00000004cea3c00] [c000000000328a20] .deactivate_locked_super+0xa0/0x1b0 [ 1917.130903] [c00000004cea3c80] [c00000000035ba54] .cleanup_mnt+0xd4/0x180 [ 1917.130907] [c00000004cea3d10] [c000000000119034] .task_work_run+0x114/0x150 [ 1917.130912] [c00000004cea3db0] [c00000000001ba6c] .do_notify_resume+0xcc/0x100 [ 1917.130916] [c00000004cea3e30] [c00000000000a7b0] .ret_from_except_lite+0x5c/0x60 [ 1917.130919] Instruction dump: [ 1917.130921] 7fc3f378 486734b5 60000000 387f00a0 38800003 4bdcb365 60000000 e95f00a0 [ 1917.130927] 694a0060 7d4a0074 794ad182 694a0001 <0b0a0000> 892d02a4 2f890000 40de0134 Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org # 4.5+ Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5b5f464f6f2a..4a379d7918f2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1890,6 +1890,8 @@ int nfs_commit_inode(struct inode *inode, int how) if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); nfs_commit_end(cinfo.mds); + if (res == 0) + return res; if (error < 0) goto out_error; if (!may_wait) -- cgit v1.2.3 From f6f3732162b5ae3c771b9285a5a32d72b8586920 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Dec 2017 18:53:22 -0800 Subject: Revert "mm: replace p??_write with pte_access_permitted in fault + gup paths" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commits 5c9d2d5c269c, c7da82b894e9, and e7fe7b5cae90. We'll probably need to revisit this, but basically we should not complicate the get_user_pages_fast() case, and checking the actual page table protection key bits will require more care anyway, since the protection keys depend on the exact state of the VM in question. Particularly when doing a "remote" page lookup (ie in somebody elses VM, not your own), you need to be much more careful than this was. Dave Hansen says: "So, the underlying bug here is that we now a get_user_pages_remote() and then go ahead and do the p*_access_permitted() checks against the current PKRU. This was introduced recently with the addition of the new p??_access_permitted() calls. We have checks in the VMA path for the "remote" gups and we avoid consulting PKRU for them. This got missed in the pkeys selftests because I did a ptrace read, but not a *write*. I also didn't explicitly test it against something where a COW needed to be done" It's also not entirely clear that it makes sense to check the protection key bits at this level at all. But one possible eventual solution is to make the get_user_pages_fast() case just abort if it sees protection key bits set, which makes us fall back to the regular get_user_pages() case, which then has a vma and can do the check there if we want to. We'll see. Somewhat related to this all: what we _do_ want to do some day is to check the PAGE_USER bit - it should obviously always be set for user pages, but it would be a good check to have back. Because we have no generic way to test for it, we lost it as part of moving over from the architecture-specific x86 GUP implementation to the generic one in commit e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). Cc: Peter Zijlstra Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Andrew Morton Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 6 ------ arch/sparc/mm/gup.c | 4 ++-- fs/dax.c | 3 +-- mm/gup.c | 2 +- mm/hmm.c | 8 ++++---- mm/huge_memory.c | 6 +++--- mm/memory.c | 8 ++++---- 7 files changed, 15 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57d7bc92e0b8..0a6b0286c32e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1264,12 +1264,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud; } -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; -} - static inline pud_t pud_mkclean(pud_t pud) { if (pud_large(pud)) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 33c0f8bb0f33..5335ba3c850e 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, if (!(pmd_val(pmd) & _PAGE_VALID)) return 0; - if (!pmd_access_permitted(pmd, write)) + if (write && !pmd_write(pmd)) return 0; refs = 0; @@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, if (!(pud_val(pud) & _PAGE_VALID)) return 0; - if (!pud_access_permitted(pud, write)) + if (write && !pud_write(pud)) return 0; refs = 0; diff --git a/fs/dax.c b/fs/dax.c index 78b72c48374e..95981591977a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -627,8 +627,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) - && !pmd_access_permitted(*pmdp, WRITE)) + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) goto unlock_pmd; flush_cache_page(vma, address, pfn); diff --git a/mm/gup.c b/mm/gup.c index d3fb60e5bfac..e0d82b6706d7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_access_permitted(pte, WRITE) || + return pte_write(pte) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } diff --git a/mm/hmm.c b/mm/hmm.c index 3a5c172af560..ea19742a5d60 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (!pmd_access_permitted(pmd, write_fault)) + if (write_fault && !pmd_write(pmd)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -456,11 +456,11 @@ again: continue; } - if (!pte_access_permitted(pte, write_fault)) + if (write_fault && !pte_write(pte)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2f5e774902..0e7ded98d114 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) @@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pud_lockptr(mm, pud)); - if (!pud_access_permitted(*pud, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1386,7 +1386,7 @@ out_unlock: */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_access_permitted(pmd, WRITE) || + return pmd_write(pmd) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } diff --git a/mm/memory.c b/mm/memory.c index cfaba6287702..ca5674cbaff2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3949,7 +3949,7 @@ static int handle_pte_fault(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { - if (!pte_access_permitted(entry, WRITE)) + if (!pte_write(entry)) return do_wp_page(vmf); entry = pte_mkdirty(entry); } @@ -4014,7 +4014,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* NUMA case for anonymous PUDs would go here */ - if (dirty && !pud_access_permitted(orig_pud, WRITE)) { + if (dirty && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4047,7 +4047,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4337,7 +4337,7 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = *ptep; - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); -- cgit v1.2.3 From b9f5fb1800d8a4a3bc6cd3152c5f3d252986cf79 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 10 Nov 2017 15:57:21 +0100 Subject: cramfs: fix MTD dependency With CONFIG_MTD=m and CONFIG_CRAMFS=y, we now get a link failure: fs/cramfs/inode.o: In function `cramfs_mount': inode.c:(.text+0x220): undefined reference to `mount_mtd' fs/cramfs/inode.o: In function `cramfs_mtd_fill_super': inode.c:(.text+0x6d8): undefined reference to `mtd_point' inode.c:(.text+0xae4): undefined reference to `mtd_unpoint' This adds a more specific Kconfig dependency to avoid the broken configuration. Alternatively we could make CRAMFS itself depend on "MTD || !MTD" with a similar result. Fixes: 99c18ce580c6 ("cramfs: direct memory access support") Signed-off-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: Linus Torvalds --- fs/cramfs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index f937082f3244..58e2fe40b2a0 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -34,6 +34,7 @@ config CRAMFS_BLOCKDEV config CRAMFS_MTD bool "Support CramFs image directly mapped in physical memory" depends on CRAMFS && MTD + depends on CRAMFS=m || MTD=y default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from -- cgit v1.2.3 From 779f4e1c6c7c661db40dfebd6dd6bda7b5f88aa3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Dec 2017 11:28:38 -0800 Subject: Revert "exec: avoid RLIMIT_STACK races with prlimit()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 04e35f4495dd560db30c25efca4eecae8ec8c375. SELinux runs with secureexec for all non-"noatsecure" domain transitions, which means lots of processes end up hitting the stack hard-limit change that was introduced in order to fix a race with prlimit(). That race fix will need to be redesigned. Reported-by: Laura Abbott Reported-by: Tomáš Trnka Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 156f56acfe8e..5688b5e1b937 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1339,15 +1339,10 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * races from other threads changing the limits. This also - * must be protected from races with prlimit() calls. + * needing to clean up the change on failure. */ - task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; - if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; - task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); -- cgit v1.2.3 From 91aae6be4139b9e3902656d819e6af66e051bd7a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2017 15:42:22 -0800 Subject: xfs: track cowblocks separately in i_flags The EOFBLOCKS/COWBLOCKS tags are totally separate things, so track them with separate i_flags. Right now we're abusing IEOFBLOCKS for both, which is totally bogus because we won't tag the inode with COWBLOCKS if IEOFBLOCKS was set by a previous tagging of the inode with EOFBLOCKS. Found by wiring up clonerange to fsstress in xfs/017. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_icache.c | 33 ++++++++++++++++++++++++--------- fs/xfs/xfs_inode.h | 1 + 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 43005fbe8b1e..58d2d4253c8e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1536,8 +1536,23 @@ xfs_inode_free_quota_eofblocks( return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); } +static inline unsigned long +xfs_iflag_for_tag( + int tag) +{ + switch (tag) { + case XFS_ICI_EOFBLOCKS_TAG: + return XFS_IEOFBLOCKS; + case XFS_ICI_COWBLOCKS_TAG: + return XFS_ICOWBLOCKS; + default: + ASSERT(0); + return 0; + } +} + static void -__xfs_inode_set_eofblocks_tag( +__xfs_inode_set_blocks_tag( xfs_inode_t *ip, void (*execute)(struct xfs_mount *mp), void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1552,10 +1567,10 @@ __xfs_inode_set_eofblocks_tag( * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & XFS_IEOFBLOCKS) + if (ip->i_flags & xfs_iflag_for_tag(tag)) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= XFS_IEOFBLOCKS; + ip->i_flags |= xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1587,13 +1602,13 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, trace_xfs_perag_set_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } static void -__xfs_inode_clear_eofblocks_tag( +__xfs_inode_clear_blocks_tag( xfs_inode_t *ip, void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, int error, unsigned long caller_ip), @@ -1603,7 +1618,7 @@ __xfs_inode_clear_eofblocks_tag( struct xfs_perag *pag; spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_IEOFBLOCKS; + ip->i_flags &= ~xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1630,7 +1645,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } @@ -1724,7 +1739,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1734,6 +1749,6 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b2136af9289f..d383e392ec9d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) +#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during -- cgit v1.2.3 From 363e59baa4f76d3f97c0133ff7014cba3d90a7c3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2017 15:42:59 -0800 Subject: xfs: don't be so eager to clear the cowblocks tag on truncate Currently, xfs_itruncate_extents clears the cowblocks tag if i_cnextents is zero. This is wrong, since i_cnextents only tracks real extents in the CoW fork, which means that we could have some delayed CoW reservations still in there that will now never get cleaned. Fix a further bug where we /don't/ clear the reflink iflag if there are any attribute blocks -- really, it's only safe to clear the reflink flag if there are no data fork extents and no cow fork extents. Found by adding clonerange to fsstress in xfs/017. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b41952a4ddd8..6f95bdb408ce 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1487,6 +1487,24 @@ xfs_link( return error; } +/* Clear the reflink flag and the cowblocks tag if possible. */ +static void +xfs_itruncate_clear_reflink_flags( + struct xfs_inode *ip) +{ + struct xfs_ifork *dfork; + struct xfs_ifork *cfork; + + if (!xfs_is_reflink_inode(ip)) + return; + dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + if (dfork->if_bytes == 0 && cfork->if_bytes == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (cfork->if_bytes == 0) + xfs_inode_clear_cowblocks_tag(ip); +} + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1583,15 +1601,7 @@ xfs_itruncate_extents( if (error) goto out; - /* - * Clear the reflink flag if there are no data fork blocks and - * there are no extents staged in the cow fork. - */ - if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { - if (ip->i_d.di_nblocks == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - xfs_inode_clear_cowblocks_tag(ip); - } + xfs_itruncate_clear_reflink_flags(ip); /* * Always re-log the inode so that our permanent transaction can keep -- cgit v1.2.3 From 10ddf64e420f7f6c1a871bfb4ff2de08faef8235 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2017 15:46:05 -0800 Subject: xfs: remove leftover CoW reservations when remounting ro When we're remounting the filesystem readonly, remove all CoW preallocations prior to going ro. If the fs goes down after the ro remount, we never clean up the staging extents, which means xfs_check will trip over them on a subsequent run. Practically speaking, the next mount will clean them up too, so this is unlikely to be seen. Since we shut down the cowblocks cleaner on remount-ro, we also have to make sure we start it back up if/when we remount-rw. Found by adding clonerange to fsstress and running xfs/017. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_super.c | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 58d2d4253c8e..3861d61fb265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -870,7 +870,7 @@ xfs_eofblocks_worker( * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). * (We'll just piggyback on the post-EOF prealloc space workqueue.) */ -STATIC void +void xfs_queue_cowblocks( struct xfs_mount *mp) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index bff4d85e5498..d4a77588eca1 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); +void xfs_queue_cowblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..2db6a40a96bd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1360,6 +1360,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } + xfs_queue_cowblocks(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1369,6 +1370,14 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* Get rid of any leftover CoW reservations... */ + cancel_delayed_work_sync(&mp->m_cowblocks_work); + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { -- cgit v1.2.3 From 86d692bfad1b0097fa866f5fcfa5f5adf4cd82e8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2017 15:46:06 -0800 Subject: xfs: set cowblocks tag for direct cow writes too If a user performs a direct CoW write, we end up loading the CoW fork with preallocated extents. Therefore, we must set the cowblocks tag so that they can be cleared out if we run low on space. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e49e6db415f7..47aea2e82c26 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -454,6 +454,8 @@ retry: if (error) goto out_bmap_cancel; + xfs_inode_set_cowblocks_tag(ip); + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) -- cgit v1.2.3 From 0525e952dcceb9fc947c6d395de7f72220c7d081 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:03 -0800 Subject: xfs: queue deferred rmap ops for cow staging extent alloc/free in the right order Under the deferred rmap operation scheme, there's a certain order in which the rmap deferred ops have to be queued to maintain integrity during log replay. For alloc/map operations that order is cui -> rui; for free/unmap operations that order is cui -> rui -> efi. However, the initial refcount code got the ordering wrong in the free side of things because it queued refcount free op and an EFI and the refcount free op queued a rmap free op, resulting in the order cui -> efi -> rui. If we fail before the efd finishes, the efi recovery will try to do a wildcard rmap removal and the subsequent rui will fail to find the rmap and blow up. This didn't ever happen due to other screws up in handling unknown owner rmap removals, but those other screw ups broke recovery in other ways, so fix the ordering to follow the intended rules. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 52 ++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 585b35d34142..c40d26763075 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Add refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); - if (error) - return error; - - /* Add rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* @@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Remove refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_FREE, dfops); - if (error) - return error; - - /* Remove rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_free_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* Record a CoW staging extent in the refcount btree. */ @@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, fsb, len); + if (error) + return error; + + /* Add rmap entry */ + return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ @@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; + /* Remove rmap entry */ + error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + if (error) + return error; + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, fsb, len); } -- cgit v1.2.3 From 33df3a9cf925183a6a169bc3eff2bd0febd1298a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:27 -0800 Subject: xfs: always honor OWN_UNKNOWN rmap removal requests Calling xfs_rmap_free with an unknown owner is supposed to remove any rmaps covering that range regardless of owner. This is used by the EFI recovery code to say "we're freeing this, it mustn't be owned by anything anymore", but for whatever reason xfs_free_ag_extent filters them out. Therefore, remove the filter and make xfs_rmap_unmap actually treat it as a wildcard owner -- free anything that's already there, and if there's no owner at all then that's fine too. There are two existing callers of bmap_add_free that take care the rmap deferred ops themselves and use OWN_UNKNOWN to skip the EFI-based rmap cleanup; convert these to use OWN_NULL (via helpers), and now we really require that an RUI (if any) gets added to the defer ops before any EFI. Lastly, now that xfs_free_extent filters out OWN_NULL rmap free requests, growfs will have to consult directly with the rmap to ensure that there aren't any rmaps in the grown region. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 4 ++-- fs/xfs/libxfs/xfs_rmap.c | 25 +++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rmap.h | 16 +++++++++++++++- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_fsops.c | 5 +++++ 5 files changed, 48 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0da80019a917..83ed7715f856 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -702,7 +702,7 @@ xfs_alloc_ag_vextent( ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ - if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, args->agbno, args->len, &args->oinfo); if (error) @@ -1682,7 +1682,7 @@ xfs_free_ag_extent( bno_cur = cnt_cur = NULL; mp = tp->t_mountp; - if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(oinfo)) { error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index dd019cee1b3b..7465cfb39276 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -444,6 +444,30 @@ xfs_rmap_unmap( goto out_done; } + /* + * If we're doing an unknown-owner removal for EFI recovery, we expect + * to find the full range in the rmapbt or nothing at all. If we + * don't find any rmaps overlapping either end of the range, we're + * done. Hopefully this means that the EFI creator already queued + * (and finished) a RUI to remove the rmap. + */ + if (owner == XFS_RMAP_OWN_UNKNOWN && + ltrec.rm_startblock + ltrec.rm_blockcount <= bno) { + struct xfs_rmap_irec rtrec; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + if (i == 0) + goto out_done; + error = xfs_rmap_get_rec(cur, &rtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (rtrec.rm_startblock >= bno + len) + goto out_done; + } + /* Make sure the unwritten flag matches. */ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); @@ -664,6 +688,7 @@ xfs_rmap_map( flags |= XFS_RMAP_UNWRITTEN; trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, unwritten, oinfo); + ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* * For the initial lookup, look for an exact match or the left-adjacent diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 466ede637080..0fcd5b1ba729 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -61,7 +61,21 @@ static inline void xfs_rmap_skip_owner_update( struct xfs_owner_info *oi) { - oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); +} + +static inline bool +xfs_rmap_should_skip_owner_update( + struct xfs_owner_info *oi) +{ + return oi->oi_owner == XFS_RMAP_OWN_NULL; +} + +static inline void +xfs_rmap_any_owner_update( + struct xfs_owner_info *oi) +{ + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); } /* Reverse mapping functions. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44f8c5451210..64da90655e95 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -538,7 +538,7 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_skip_owner_update(&oinfo); + xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8f22fc579dbb..60a2e128cb6a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -571,6 +571,11 @@ xfs_growfs_data_private( * this doesn't actually exist in the rmap btree. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, agno, + be32_to_cpu(agf->agf_length) - new, + new, &oinfo); + if (error) + goto error0; error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), -- cgit v1.2.3 From 68c58e9b9a88c1a9d0c2eaf6c7acefb00f5fbbfb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:55 -0800 Subject: xfs: only skip rmap owner checks for unknown-owner rmap removal For rmap removal, refactor the rmap owner checks into a separate function, then skip the checks if we are performing an unknown-owner removal. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 76 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 7465cfb39276..50db920ceeeb 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -367,6 +367,51 @@ xfs_rmap_lookup_le_range( return error; } +/* + * Perform all the relevant owner checks for a removal op. If we're doing an + * unknown-owner removal then we have no owner information to check. + */ +static int +xfs_rmap_free_check_owner( + struct xfs_mount *mp, + uint64_t ltoff, + struct xfs_rmap_irec *rec, + xfs_fsblock_t bno, + xfs_filblks_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int error = 0; + + if (owner == XFS_RMAP_OWN_UNKNOWN) + return 0; + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + + /* Check the offset, if necessary. */ + if (XFS_RMAP_NON_INODE_OWNER(owner)) + goto out; + + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, + out); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + rec->rm_blockcount >= offset + len, + out); + } + +out: + return error; +} + /* * Find the extent in the rmap btree and remove it. * @@ -468,33 +513,16 @@ xfs_rmap_unmap( goto out_done; } - /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); - /* Make sure the extent we found covers the entire freeing range. */ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); - - /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || - XFS_RMAP_NON_INODE_OWNER(owner), out_error); + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); - /* Check the offset, if necessary. */ - if (!XFS_RMAP_NON_INODE_OWNER(owner)) { - if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, - out_error); - } else { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + ltrec.rm_blockcount >= offset + len, - out_error); - } - } + /* Check owner information. */ + error = xfs_rmap_free_check_owner(mp, ltoff, <rec, bno, len, owner, + offset, flags); + if (error) + goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ -- cgit v1.2.3