-rw-r--r--  .mailmap | 1
-rw-r--r--  Documentation/filesystems/overlayfs.rst | 50
-rw-r--r--  MAINTAINERS | 13
-rw-r--r--  Makefile | 2
-rw-r--r--  arch/s390/kvm/dat.c | 100
-rw-r--r--  arch/s390/kvm/dat.h | 23
-rw-r--r--  arch/s390/kvm/gaccess.c | 71
-rw-r--r--  arch/s390/kvm/gmap.c | 160
-rw-r--r--  arch/s390/kvm/gmap.h | 33
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 18
-rw-r--r--  arch/s390/kvm/vsie.c | 4
-rw-r--r--  arch/x86/coco/sev/noinstr.c | 6
-rw-r--r--  arch/x86/entry/entry_fred.c | 14
-rw-r--r--  arch/x86/kernel/cpu/common.c | 33
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 17
-rw-r--r--  drivers/dma/dw-edma/dw-edma-core.c | 8
-rw-r--r--  drivers/dma/dw-edma/dw-hdma-v0-core.c | 6
-rw-r--r--  drivers/dma/fsl-edma-main.c | 26
-rw-r--r--  drivers/dma/idxd/cdev.c | 8
-rw-r--r--  drivers/dma/idxd/device.c | 45
-rw-r--r--  drivers/dma/idxd/dma.c | 18
-rw-r--r--  drivers/dma/idxd/idxd.h | 1
-rw-r--r--  drivers/dma/idxd/init.c | 14
-rw-r--r--  drivers/dma/idxd/irq.c | 16
-rw-r--r--  drivers/dma/idxd/submit.c | 2
-rw-r--r--  drivers/dma/idxd/sysfs.c | 1
-rw-r--r--  drivers/dma/sh/rz-dmac.c | 66
-rw-r--r--  drivers/dma/xilinx/xdma.c | 4
-rw-r--r--  drivers/dma/xilinx/xilinx_dma.c | 46
-rw-r--r--  drivers/i2c/busses/i2c-designware-amdisp.c | 11
-rw-r--r--  drivers/i2c/busses/i2c-imx.c | 51
-rw-r--r--  drivers/irqchip/irq-qcom-mpm.c | 3
-rw-r--r--  drivers/irqchip/irq-renesas-rzv2h.c | 2
-rw-r--r--  drivers/phy/Kconfig | 5
-rw-r--r--  drivers/phy/freescale/phy-fsl-lynx-28g.c | 2
-rw-r--r--  drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 3
-rw-r--r--  drivers/phy/spacemit/phy-k1-usb2.c | 14
-rw-r--r--  drivers/phy/ti/phy-j721e-wiz.c | 2
-rw-r--r--  drivers/xen/privcmd.c | 3
-rw-r--r--  fs/btrfs/block-group.c | 2
-rw-r--r--  fs/btrfs/disk-io.c | 4
-rw-r--r--  fs/btrfs/tree-log.c | 98
-rw-r--r--  fs/btrfs/volumes.c | 5
-rw-r--r--  fs/btrfs/zlib.c | 4
-rw-r--r--  fs/ext4/Makefile | 5
-rw-r--r--  fs/ext4/crypto.c | 9
-rw-r--r--  fs/ext4/ext4.h | 6
-rw-r--r--  fs/ext4/ext4_extents.h | 12
-rw-r--r--  fs/ext4/extents-test.c | 12
-rw-r--r--  fs/ext4/extents.c | 80
-rw-r--r--  fs/ext4/fast_commit.c | 17
-rw-r--r--  fs/ext4/fsync.c | 16
-rw-r--r--  fs/ext4/ialloc.c | 6
-rw-r--r--  fs/ext4/inline.c | 10
-rw-r--r--  fs/ext4/inode.c | 75
-rw-r--r--  fs/ext4/mballoc-test.c | 81
-rw-r--r--  fs/ext4/mballoc.c | 132
-rw-r--r--  fs/ext4/mballoc.h | 30
-rw-r--r--  fs/ext4/page-io.c | 10
-rw-r--r--  fs/ext4/super.c | 37
-rw-r--r--  fs/ext4/sysfs.c | 10
-rw-r--r--  fs/fs-writeback.c | 36
-rw-r--r--  fs/fuse/file.c | 4
-rw-r--r--  fs/fuse/inode.c | 1
-rw-r--r--  fs/iomap/bio.c | 51
-rw-r--r--  fs/iomap/buffered-io.c | 15
-rw-r--r--  fs/jbd2/checkpoint.c | 15
-rw-r--r--  fs/namei.c | 10
-rw-r--r--  fs/netfs/buffered_read.c | 3
-rw-r--r--  fs/netfs/direct_read.c | 3
-rw-r--r--  fs/netfs/direct_write.c | 15
-rw-r--r--  fs/netfs/iterator.c | 43
-rw-r--r--  fs/netfs/read_collect.c | 4
-rw-r--r--  fs/netfs/read_retry.c | 5
-rw-r--r--  fs/netfs/read_single.c | 1
-rw-r--r--  fs/netfs/write_collect.c | 4
-rw-r--r--  fs/netfs/write_issue.c | 3
-rw-r--r--  fs/overlayfs/copy_up.c | 6
-rw-r--r--  fs/overlayfs/overlayfs.h | 21
-rw-r--r--  fs/overlayfs/ovl_entry.h | 7
-rw-r--r--  fs/overlayfs/params.c | 33
-rw-r--r--  fs/overlayfs/super.c | 2
-rw-r--r--  fs/overlayfs/util.c | 5
-rw-r--r--  include/linux/fs/super_types.h | 1
-rw-r--r--  include/linux/leafops.h | 32
-rw-r--r--  include/linux/mempolicy.h | 1
-rw-r--r--  include/linux/netfs.h | 1
-rw-r--r--  include/linux/pagemap.h | 11
-rw-r--r--  include/trace/events/btrfs.h | 11
-rw-r--r--  include/trace/events/netfs.h | 8
-rw-r--r--  kernel/futex/core.c | 2
-rw-r--r--  kernel/futex/pi.c | 3
-rw-r--r--  kernel/futex/syscalls.c | 8
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/trace/trace_events_trigger.c | 79
-rw-r--r--  kernel/trace/trace_osnoise.c | 10
-rw-r--r--  lib/bug.c | 7
-rw-r--r--  mm/damon/sysfs.c | 10
-rw-r--r--  mm/memory.c | 18
-rw-r--r--  mm/mempolicy.c | 10
-rw-r--r--  mm/mseal.c | 3
-rw-r--r--  mm/pagewalk.c | 25
-rw-r--r--  mm/swap_state.c | 9
-rw-r--r--  tools/testing/selftests/mount_setattr/mount_setattr_test.c | 2
104 files changed, 1501 insertions, 602 deletions
diff --git a/.mailmap b/.mailmap
index 7d14504daf24..2d04aeba68b4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -316,6 +316,7 @@ Hans Verkuil <hverkuil@kernel.org> <hverkuil-cisco@xs4all.nl>
Hans Verkuil <hverkuil@kernel.org> <hansverk@cisco.com>
Hao Ge <hao.ge@linux.dev> <gehao@kylinos.cn>
Harry Yoo <harry.yoo@oracle.com> <42.hyeyoo@gmail.com>
+Harry Yoo <harry@kernel.org> <harry.yoo@oracle.com>
Heiko Carstens <hca@linux.ibm.com> <h.carstens@de.ibm.com>
Heiko Carstens <hca@linux.ibm.com> <heiko.carstens@de.ibm.com>
Heiko Stuebner <heiko@sntech.de> <heiko.stuebner@bqreaders.com>
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index af5a69f87da4..eb846518e6ac 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -783,6 +783,56 @@ controlled by the "uuid" mount option, which supports these values:
mounted with "uuid=on".
+Durability and copy up
+----------------------
+
+The fsync(2) system call ensures that the data and metadata of a file
+are safely written to the backing storage, which is expected to
+guarantee the existence of the information post system crash.
+
+Without an fsync(2) call, there is no guarantee that the observed
+data after a system crash will be either the old or the new data, but
+in practice, the observed data after crash is often the old or new data
+or a mix of both.
+
+When an overlayfs file is modified for the first time, copy up will
+create a copy of the lower file and its parent directories in the upper
+layer. Since the Linux filesystem API does not enforce any particular
+ordering on storing changes without explicit fsync(2) calls, in case
+of a system crash, the upper file could end up with no data at all
+(i.e. zeros), which would be an unusual outcome. To avoid this
+experience, overlayfs calls fsync(2) on the upper file before completing
+data copy up with rename(2) or link(2) to make the copy up "atomic".
+
+By default, overlayfs does not explicitly call fsync(2) on copied up
+directories or on metadata-only copy up, so it provides no guarantee to
+persist the user's modification unless the user calls fsync(2).
+The fsync during copy up only guarantees that if a copy up is observed
+after a crash, the observed data is not zeroes or intermediate values
+from the copy up staging area.
+
+On traditional local filesystems with a single journal (e.g. ext4, xfs),
+fsync on a file also persists the parent directory changes, because they
+are usually modified in the same transaction, so metadata durability during
+data copy up effectively comes for free. Overlayfs further limits risk by
+disallowing network filesystems as upper layer.
+
+Overlayfs can be tuned to prefer performance or durability when storing
+to the underlying upper layer. This is controlled by the "fsync" mount
+option, which supports these values:
+
+- "auto": (default)
+ Call fsync(2) on upper file before completion of data copy up.
+ No explicit fsync(2) on directory or metadata-only copy up.
+- "strict":
+ Call fsync(2) on upper file and directories before completion of any
+ copy up.
+- "volatile": [*]
+ Prefer performance over durability (see `Volatile mount`_)
+
+[*] The mount option "volatile" is an alias to "fsync=volatile".
+
+
Volatile mount
--------------
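
For reference, a minimal user-space sketch of the "write, fsync, rename" pattern that the new documentation above describes; the helper and paths are hypothetical, not overlayfs internals:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Stage data in a temp file, fsync it, then rename into place so a
 * crash observes either the old file or the fully written new one. */
static int atomic_replace(const char *tmp, const char *dst,
			  const char *buf, size_t len)
{
	int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return -1;
	if (write(fd, buf, len) != (ssize_t)len || fsync(fd) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return rename(tmp, dst);	/* the "atomic" step */
}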
diff --git a/MAINTAINERS b/MAINTAINERS
index 23d88c825175..c3fe46d7c4bc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9619,7 +9619,12 @@ F: include/linux/ext2*
EXT4 FILE SYSTEM
M: "Theodore Ts'o" <tytso@mit.edu>
-M: Andreas Dilger <adilger.kernel@dilger.ca>
+R: Andreas Dilger <adilger.kernel@dilger.ca>
+R: Baokun Li <libaokun@linux.alibaba.com>
+R: Jan Kara <jack@suse.cz>
+R: Ojaswin Mujoo <ojaswin@linux.ibm.com>
+R: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+R: Zhang Yi <yi.zhang@huawei.com>
L: linux-ext4@vger.kernel.org
S: Maintained
W: http://ext4.wiki.kernel.org
@@ -12015,7 +12020,6 @@ I2C SUBSYSTEM
M: Wolfram Sang <wsa+renesas@sang-engineering.com>
L: linux-i2c@vger.kernel.org
S: Maintained
-W: https://i2c.wiki.kernel.org/
Q: https://patchwork.ozlabs.org/project/linux-i2c/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git
F: Documentation/i2c/
@@ -12041,7 +12045,6 @@ I2C SUBSYSTEM HOST DRIVERS
M: Andi Shyti <andi.shyti@kernel.org>
L: linux-i2c@vger.kernel.org
S: Maintained
-W: https://i2c.wiki.kernel.org/
Q: https://patchwork.ozlabs.org/project/linux-i2c/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git
F: Documentation/devicetree/bindings/i2c/
@@ -16883,7 +16886,7 @@ M: Lorenzo Stoakes <ljs@kernel.org>
R: Rik van Riel <riel@surriel.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
R: Vlastimil Babka <vbabka@kernel.org>
-R: Harry Yoo <harry.yoo@oracle.com>
+R: Harry Yoo <harry@kernel.org>
R: Jann Horn <jannh@google.com>
L: linux-mm@kvack.org
S: Maintained
@@ -24349,7 +24352,7 @@ F: drivers/nvmem/layouts/sl28vpd.c
SLAB ALLOCATOR
M: Vlastimil Babka <vbabka@kernel.org>
-M: Harry Yoo <harry.yoo@oracle.com>
+M: Harry Yoo <harry@kernel.org>
M: Andrew Morton <akpm@linux-foundation.org>
R: Hao Li <hao.li@linux.dev>
R: Christoph Lameter <cl@gentwo.org>
diff --git a/Makefile b/Makefile
index e1279c4d5b24..6b1d9fb1a6b4 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 7
PATCHLEVEL = 0
SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
NAME = Baby Opossum Posse
# *DOCUMENTATION*
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 670404d4fa44..7b8d70fe406d 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -135,32 +135,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt
}
/**
- * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
- * @crstep: Pointer to the CRST entry
- * @new: Replacement entry.
- * @gfn: The affected guest address.
- * @asce: The ASCE of the address space.
- *
- * Context: This function is assumed to be called with kvm->mmu_lock held.
- */
-void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
-{
- if (crstep->h.i) {
- WRITE_ONCE(*crstep, new);
- return;
- } else if (cpu_has_edat2()) {
- crdte_crste(crstep, *crstep, new, gfn, asce);
- return;
- }
-
- if (machine_has_tlb_guest())
- idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
- else
- idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
- WRITE_ONCE(*crstep, new);
-}
-
-/**
* dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
* @crstep: Pointer to the CRST entry.
* @old: Expected old value.
@@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce
*
* Return: %true if the exchange was successful.
*/
-bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
- union asce asce)
+bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+ gfn_t gfn, union asce asce)
{
if (old.h.i)
return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
@@ -292,6 +266,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g
pt->ptes[i].val = init.val | i * PAGE_SIZE;
/* No need to take locks as the page table is not installed yet. */
pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+ pgste_init.vsie_notif = old.s.fc1.vsie_notif;
pgste_init.pcl = uses_skeys && init.h.i;
dat_init_pgstes(pt, pgste_init.val);
} else {
@@ -893,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d
/* This table entry needs to be updated. */
if (walk->start <= gfn && walk->end >= next) {
- dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+ if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+ return -EINVAL;
/* A lower level table was present, needs to be freed. */
if (!crste.h.fc && !crste.h.i) {
if (is_pmd(crste))
@@ -1021,67 +997,21 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
}
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
- bool uses_skeys, struct guest_fault *f)
-{
- union crste oldval, newval;
- union pte newpte, oldpte;
- union pgste pgste;
- int rc = 0;
-
- rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
- if (rc == -EINVAL || rc == -ENOMEM)
- return rc;
- if (rc)
- return -EAGAIN;
-
- if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
- return -EINVAL;
-
- if (f->ptep) {
- pgste = pgste_get_lock(f->ptep);
- oldpte = *f->ptep;
- newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
- newpte.s.sd = oldpte.s.sd;
- oldpte.s.sd = 0;
- if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
- pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
- if (f->callback)
- f->callback(f);
- } else {
- rc = -EAGAIN;
- }
- pgste_set_unlock(f->ptep, pgste);
- } else {
- oldval = READ_ONCE(*f->crstep);
- newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
- f->write_attempt | oldval.s.fc1.d);
- newval.s.fc1.sd = oldval.s.fc1.sd;
- if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
- crste_origin_large(oldval) != crste_origin_large(newval))
- return -EAGAIN;
- if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
- return -EAGAIN;
- if (f->callback)
- f->callback(f);
- }
-
- return rc;
-}
-
static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
- union crste crste = READ_ONCE(*crstep);
+ union crste newcrste, oldcrste;
int *n = walk->priv;
- if (!crste.h.fc || crste.h.i || crste.h.p)
- return 0;
-
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
+ return 0;
+ if (oldcrste.s.fc1.prefix_notif)
+ break;
+ newcrste = oldcrste;
+ newcrste.s.fc1.prefix_notif = 1;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
*n = 2;
- if (crste.s.fc1.prefix_notif)
- return 0;
- crste.s.fc1.prefix_notif = 1;
- dat_crstep_xchg(crstep, crste, gfn, walk->asce);
return 0;
}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 123e11dcd70d..874cc962e196 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -160,14 +160,14 @@ union pmd {
unsigned long :44; /* HW */
unsigned long : 3; /* Unused */
unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
@@ -183,14 +183,14 @@ union pud {
unsigned long :33; /* HW */
unsigned long :14; /* Unused */
unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
@@ -254,14 +254,14 @@ union crste {
struct {
unsigned long :47;
unsigned long : 1; /* HW (should be 0) */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable */
unsigned long r : 1; /* Readable */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1;
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
@@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf
u16 type, u16 param);
int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
- bool uses_skeys, struct guest_fault *f);
int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
long dat_reset_cmma(union asce asce, gfn_t start_gfn);
@@ -938,11 +936,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu
return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
}
-static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
+static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
{
- union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
+ union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);
- dat_crstep_xchg(crstep, newcrste, gfn, asce);
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
+ return oldcrste;
}
static inline int get_level(union crste *crstep, union pte *ptep)
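
The conversions above replace plain CRSTE stores with compare-and-exchange retry loops. A generic sketch of that idiom, using an illustrative entry type rather than the real s390 DAT structures:

#include <linux/atomic.h>

/* Re-read on contention and retry until the update applies to the
 * value we actually observed; try_cmpxchg() refreshes 'old' on failure. */
static void set_flag_atomically(unsigned long *entry, unsigned long flag)
{
	unsigned long old = READ_ONCE(*entry);

	do {
		if (old & flag)		/* already set, nothing to do */
			return;
	} while (!try_cmpxchg(entry, &old, old | flag));
}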
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index a9da9390867d..53a8550e7102 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1436,13 +1436,21 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
if (!pgste_get_trylock(ptep_h, &pgste))
return -EAGAIN;
- newpte = _pte(f->pfn, f->writable, !p, 0);
- newpte.s.d |= ptep->s.d;
- newpte.s.sd |= ptep->s.sd;
- newpte.h.p &= ptep->h.p;
- pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
- pgste.vsie_notif = 1;
+ newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
+ newpte.s.d |= ptep_h->s.d;
+ newpte.s.sd |= ptep_h->s.sd;
+ newpte.h.p &= ptep_h->h.p;
+ if (!newpte.h.p && !f->writable) {
+ rc = -EOPNOTSUPP;
+ } else {
+ pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
+ pgste.vsie_notif = 1;
+ }
pgste_set_unlock(ptep_h, pgste);
+ if (rc)
+ return rc;
+ if (!sg->parent)
+ return -EAGAIN;
newpte = _pte(f->pfn, 0, !p, 0);
if (!pgste_get_trylock(ptep, &pgste))
@@ -1456,7 +1464,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
struct guest_fault *f, bool p)
{
- union crste newcrste;
+ union crste newcrste, oldcrste;
gfn_t gfn;
int rc;
@@ -1469,16 +1477,28 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
if (rc)
return rc;
- newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
- newcrste.s.fc1.d |= host->s.fc1.d;
- newcrste.s.fc1.sd |= host->s.fc1.sd;
- newcrste.h.p &= host->h.p;
- newcrste.s.fc1.vsie_notif = 1;
- newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
- _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
+ do {
+ /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
+ if (!sg->parent)
+ return -EAGAIN;
+ oldcrste = READ_ONCE(*host);
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
+ newcrste.s.fc1.d |= oldcrste.s.fc1.d;
+ newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
+ newcrste.h.p &= oldcrste.h.p;
+ newcrste.s.fc1.vsie_notif = 1;
+ newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+ newcrste.s.fc1.s = oldcrste.s.fc1.s;
+ if (!newcrste.h.p && !f->writable)
+ return -EOPNOTSUPP;
+ } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
+ if (!sg->parent)
+ return -EAGAIN;
- newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
- dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
+ gfn = gpa_to_gfn(raddr);
+ while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+ ;
return 0;
}
@@ -1502,21 +1522,31 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
if (rc)
return rc;
- /* A race occourred. The shadow mapping is already valid, nothing to do */
- if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+ /* A race occurred. The shadow mapping is already valid, nothing to do */
+ if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
+ (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
return 0;
gl = get_level(table, ptep);
+ /* In case of a real address space */
+ if (w->level <= LEVEL_MEM) {
+ l = TABLE_TYPE_PAGE_TABLE;
+ hl = TABLE_TYPE_REGION1;
+ goto real_address_space;
+ }
+
/*
* Skip levels that are already protected. For each level, protect
* only the page containing the entry, not the whole table.
*/
for (i = gl ; i >= w->level; i--) {
- rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
- entries[i - 1].pfn, i, entries[i - 1].writable);
+ rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
+ entries[i].pfn, i + 1, entries[i].writable);
if (rc)
return rc;
+ if (!sg->parent)
+ return -EAGAIN;
}
rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
@@ -1528,6 +1558,7 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
/* Get the smallest granularity */
l = min3(gl, hl, w->level);
+real_address_space:
flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
/* If necessary, create the shadow mapping */
if (l < gl) {
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index ef0c6ebfdde2..645c32c767d2 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
struct clear_young_pte_priv *priv = walk->priv;
union crste crste, new;
- crste = READ_ONCE(*crstep);
+ do {
+ crste = READ_ONCE(*crstep);
+
+ if (!crste.h.fc)
+ return 0;
+ if (!crste.s.fc1.y && crste.h.i)
+ return 0;
+ if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
+ break;
- if (!crste.h.fc)
- return 0;
- if (!crste.s.fc1.y && crste.h.i)
- return 0;
- if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
new = crste;
new.h.i = 1;
new.s.fc1.y = 0;
@@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
new.s.fc1.d = 0;
new.h.p = 1;
- dat_crstep_xchg(crstep, new, gfn, walk->asce);
- }
+ } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
+
priv->young = 1;
return 0;
}
@@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
{
struct gmap_unmap_priv *priv = walk->priv;
struct folio *folio = NULL;
+ union crste old = *crstep;
- if (crstep->h.fc) {
- if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
- folio = phys_to_folio(crste_origin_large(*crstep));
- gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
- if (folio)
- uv_convert_from_secure_folio(folio);
- }
+ if (!old.h.fc)
+ return 0;
+
+ if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+ folio = phys_to_folio(crste_origin_large(old));
+ /* No races should happen because kvm->mmu_lock is held in write mode */
+ KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
+ priv->gmap->kvm);
+ if (folio)
+ uv_convert_from_secure_folio(folio);
return 0;
}
@@ -474,23 +481,24 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t
if (fatal_signal_pending(current))
return 1;
- crste = READ_ONCE(*table);
- if (!crste.h.fc)
- return 0;
- if (crste.h.p && !crste.s.fc1.sd)
- return 0;
+ do {
+ crste = READ_ONCE(*table);
+ if (!crste.h.fc)
+ return 0;
+ if (crste.h.p && !crste.s.fc1.sd)
+ return 0;
- /*
- * If this large page contains one or more prefixes of vCPUs that are
- * currently running, do not reset the protection, leave it marked as
- * dirty.
- */
- if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
+ /*
+ * If this large page contains one or more prefixes of vCPUs that are
+ * currently running, do not reset the protection, leave it marked as
+ * dirty.
+ */
+ if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
+ break;
new = crste;
new.h.p = 1;
new.s.fc1.sd = 0;
- gmap_crstep_xchg(gmap, table, new, gfn);
- }
+ } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
for ( ; gfn < end; gfn++)
mark_page_dirty(gmap->kvm, gfn);
@@ -511,7 +519,7 @@ void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
}
-static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
+static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
{
union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
@@ -536,10 +544,8 @@ static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
newcrste.s.fc1.d = 1;
newcrste.s.fc1.sd = 1;
}
- if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
- SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
/* In case of races, let the slow path deal with it. */
- return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
+ return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
}
/* Trying to write on a read-only page, let the slow path deal with it. */
return 1;
@@ -568,8 +574,6 @@ static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
newpte.s.d = 1;
newpte.s.sd = 1;
}
- if (!oldpte.s.d && newpte.s.d)
- SetPageDirty(pfn_to_page(newpte.h.pfra));
*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
return 0;
@@ -606,7 +610,7 @@ int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
fault->callback(fault);
pgste_set_unlock(fault->ptep, pgste);
} else {
- rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
+ rc = gmap_handle_minor_crste_fault(gmap, fault);
if (!rc && fault->callback)
fault->callback(fault);
}
@@ -623,10 +627,61 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
}
+static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
+ struct guest_fault *f)
+{
+ union crste oldval, newval;
+ union pte newpte, oldpte;
+ union pgste pgste;
+ int rc = 0;
+
+ rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
+ &f->crstep, &f->ptep);
+ if (rc == -ENOMEM)
+ return rc;
+ if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
+ return rc;
+ if (rc)
+ return -EAGAIN;
+ if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
+ return -EINVAL;
+
+ if (f->ptep) {
+ pgste = pgste_get_lock(f->ptep);
+ oldpte = *f->ptep;
+ newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+ newpte.s.sd = oldpte.s.sd;
+ oldpte.s.sd = 0;
+ if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+ pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
+ if (f->callback)
+ f->callback(f);
+ } else {
+ rc = -EAGAIN;
+ }
+ pgste_set_unlock(f->ptep, pgste);
+ } else {
+ do {
+ oldval = READ_ONCE(*f->crstep);
+ newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+ f->write_attempt | oldval.s.fc1.d);
+ newval.s.fc1.s = !f->page;
+ newval.s.fc1.sd = oldval.s.fc1.sd;
+ if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+ crste_origin_large(oldval) != crste_origin_large(newval))
+ return -EAGAIN;
+ } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
+ if (f->callback)
+ f->callback(f);
+ }
+
+ return rc;
+}
+
int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
{
unsigned int order;
- int rc, level;
+ int level;
lockdep_assert_held(&gmap->kvm->mmu_lock);
@@ -638,16 +693,14 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
level = TABLE_TYPE_SEGMENT;
}
- rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
- KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
- return rc;
+ return _gmap_link(mc, gmap, level, f);
}
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
+ union crste newcrste, oldcrste;
struct page_table *pt;
- union crste newcrste;
union crste *crstep;
union pte *ptep;
int rc;
@@ -673,7 +726,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
&crstep, &ptep);
if (rc)
return rc;
- dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (oldcrste.val == newcrste.val)
+ break;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
return 0;
}
@@ -777,8 +834,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
int rc;
rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
- if (!rc)
- dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
+ if (rc)
+ return;
+ while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
+ ;
}
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
@@ -1017,8 +1076,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
return;
}
- crste = READ_ONCE(*crstep);
- dat_crstep_clear(crstep, r_gfn, sg->asce);
+
+ crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
if (crste_leaf(crste) || crste.h.i)
return;
if (is_pmd(crste))
@@ -1101,6 +1160,7 @@ struct gmap_protect_asce_top_level {
static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
struct gmap_protect_asce_top_level *context)
{
+ struct gmap *parent;
int rc, i;
guard(write_lock)(&sg->kvm->mmu_lock);
@@ -1108,7 +1168,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s
if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
return -EAGAIN;
- scoped_guard(spinlock, &sg->parent->children_lock) {
+ parent = READ_ONCE(sg->parent);
+ if (!parent)
+ return -EAGAIN;
+ scoped_guard(spinlock, &parent->children_lock) {
+ if (READ_ONCE(sg->parent) != parent)
+ return -EAGAIN;
for (i = 0; i < CRST_TABLE_PAGES; i++) {
if (!context->f[i].valid)
continue;
@@ -1191,6 +1256,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare
struct gmap *sg, *new;
int rc;
+ if (WARN_ON(!parent))
+ return ERR_PTR(-EINVAL);
+
scoped_guard(spinlock, &parent->children_lock) {
sg = gmap_find_shadow(parent, asce, edat_level);
if (sg) {
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index ccb5cd751e31..579399ef5480 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -185,6 +185,8 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
}
+ if (!ptep->s.d && newpte.s.d && !newpte.s.s)
+ SetPageDirty(pfn_to_page(newpte.h.pfra));
return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
}
@@ -194,35 +196,42 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}
-static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
- gfn_t gfn, bool needs_lock)
+static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn, bool needs_lock)
{
- unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
+ unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;
+
+ if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
+ return true;
lockdep_assert_held(&gmap->kvm->mmu_lock);
if (!needs_lock)
lockdep_assert_held(&gmap->children_lock);
gfn = ALIGN_DOWN(gfn, align);
- if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
- ne.s.fc1.prefix_notif = 0;
+ if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
+ newcrste.s.fc1.prefix_notif = 0;
gmap_unmap_prefix(gmap, gfn, gfn + align);
}
- if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
- (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
- ne.s.fc1.vsie_notif = 0;
+ if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
+ (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
+ newcrste.s.fc1.vsie_notif = 0;
if (needs_lock)
gmap_handle_vsie_unshadow_event(gmap, gfn);
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
}
- dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
+ if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
+ SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
+ return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
}
-static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
- gfn_t gfn)
+static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn)
{
- return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
+ return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
}
/**
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b2c01fa7b852..d7838334a338 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5520,9 +5520,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_S390_VCPU_FAULT: {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = vcpu_dat_fault_handler(vcpu, arg, 0);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ gpa_t gaddr = arg;
+
+ scoped_guard(srcu, &vcpu->kvm->srcu) {
+ r = vcpu_ucontrol_translate(vcpu, &gaddr);
+ if (r)
+ break;
+
+ r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false);
+ if (r == PGM_ADDRESSING)
+ r = -EFAULT;
+ if (r <= 0)
+ break;
+ r = -EIO;
+ KVM_BUG_ON(r, vcpu->kvm);
+ }
break;
}
case KVM_ENABLE_CAP:
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 0330829b4046..72895dddc39a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1328,7 +1328,7 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- struct gmap *sg;
+ struct gmap *sg = NULL;
int rc = 0;
while (1) {
@@ -1368,6 +1368,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
sg = gmap_put(sg);
cond_resched();
}
+ if (sg)
+ sg = gmap_put(sg);
if (rc == -EFAULT) {
/*
diff --git a/arch/x86/coco/sev/noinstr.c b/arch/x86/coco/sev/noinstr.c
index 9d94aca4a698..5afd663a1c21 100644
--- a/arch/x86/coco/sev/noinstr.c
+++ b/arch/x86/coco/sev/noinstr.c
@@ -121,6 +121,9 @@ noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
WARN_ON(!irqs_disabled());
+ if (!sev_cfg.ghcbs_initialized)
+ return boot_ghcb;
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
@@ -164,6 +167,9 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state)
WARN_ON(!irqs_disabled());
+ if (!sev_cfg.ghcbs_initialized)
+ return;
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 88c757ac8ccd..fbe2d10dd737 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -177,6 +177,16 @@ static noinstr void fred_extint(struct pt_regs *regs)
}
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+noinstr void exc_vmm_communication(struct pt_regs *regs, unsigned long error_code)
+{
+ if (user_mode(regs))
+ return user_exc_vmm_communication(regs, error_code);
+ else
+ return kernel_exc_vmm_communication(regs, error_code);
+}
+#endif
+
static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
{
/* Optimize for #PF. That's the only exception which matters performance wise */
@@ -207,6 +217,10 @@ static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_CET
case X86_TRAP_CP: return exc_control_protection(regs, error_code);
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ case X86_TRAP_VC: return exc_vmm_communication(regs, error_code);
+#endif
+
default: return fred_bad_type(regs, error_code);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a8ff4376c286..ec0670114efa 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -433,7 +433,20 @@ static __always_inline void setup_lass(struct cpuinfo_x86 *c)
/* These bits should not change their value after CPU init is finished. */
static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
- X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
+ X86_CR4_FSGSBASE | X86_CR4_CET;
+
+/*
+ * The CR pinning protects against ROP on the 'mov %reg, %CRn' instruction(s).
+ * Since you can ROP directly to these instructions (barring shadow stack),
+ * any protection must follow immediately and unconditionally after that.
+ *
+ * Specifically, the CR[04] write functions below will have the value
+ * validation controlled by the @cr_pinning static_branch which is
+ * __ro_after_init, just like the cr4_pinned_bits value.
+ *
+ * Once set, an attacker will have to defeat page-tables to get around these
+ * restrictions. Which is a much bigger ask than 'simple' ROP.
+ */
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -2050,12 +2063,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_umip(c);
setup_lass(c);
- /* Enable FSGSBASE instructions if available. */
- if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
- cr4_set_bits(X86_CR4_FSGSBASE);
- elf_hwcap2 |= HWCAP2_FSGSBASE;
- }
-
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -2416,6 +2423,18 @@ void cpu_init_exception_handling(bool boot_cpu)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
+ /*
+ * On CPUs with FSGSBASE support, paranoid_entry() uses
+ * ALTERNATIVE-patched RDGSBASE/WRGSBASE instructions. Secondary CPUs
+ * boot after alternatives are patched globally, so early exceptions
+ * execute patched code that depends on FSGSBASE. Enable the feature
+ * before any exceptions occur.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
if (cpu_feature_enabled(X86_FEATURE_FRED)) {
/* The boot CPU has enabled FRED during early boot */
if (!boot_cpu)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b922a8b00057..dd06453d5b72 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3044,12 +3044,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
bool prefetch = !fault || fault->prefetch;
bool write_fault = fault && fault->write;
- if (unlikely(is_noslot_pfn(pfn))) {
- vcpu->stat.pf_mmio_spte_created++;
- mark_mmio_spte(vcpu, sptep, gfn, pte_access);
- return RET_PF_EMULATE;
- }
-
if (is_shadow_present_pte(*sptep)) {
if (prefetch && is_last_spte(*sptep, level) &&
pfn == spte_to_pfn(*sptep))
@@ -3066,13 +3060,22 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
child = spte_to_child_sp(pte);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
- } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ WARN_ON_ONCE(vcpu->arch.mmu->root_role.direct);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
was_rmapped = 1;
}
+ if (unlikely(is_noslot_pfn(pfn))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ if (flush)
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
+ return RET_PF_EMULATE;
+ }
+
wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
false, host_writable, &spte);
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index e7d698b352d3..5397dbda4f72 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -844,6 +844,7 @@ static int dw_edma_irq_request(struct dw_edma *dw,
{
struct dw_edma_chip *chip = dw->chip;
struct device *dev = dw->chip->dev;
+ struct msi_desc *msi_desc;
u32 wr_mask = 1;
u32 rd_mask = 1;
int i, err = 0;
@@ -895,9 +896,12 @@ static int dw_edma_irq_request(struct dw_edma *dw,
&dw->irq[i]);
if (err)
goto err_irq_free;
-
- if (irq_get_msi_desc(irq))
+ msi_desc = irq_get_msi_desc(irq);
+ if (msi_desc) {
get_cached_msi_msg(irq, &dw->irq[i].msi);
+ if (!msi_desc->pci.msi_attrib.is_msix)
+ dw->irq[i].msi.data = dw->irq[0].msi.data + i;
+ }
}
dw->nr_irqs = i;
diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c
index e3f8db4fe909..ce8f7254bab2 100644
--- a/drivers/dma/dw-edma/dw-hdma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c
@@ -252,10 +252,10 @@ static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
lower_32_bits(chunk->ll_region.paddr));
SET_CH_32(dw, chan->dir, chan->id, llp.msb,
upper_32_bits(chunk->ll_region.paddr));
+ /* Set consumer cycle */
+ SET_CH_32(dw, chan->dir, chan->id, cycle_sync,
+ HDMA_V0_CONSUMER_CYCLE_STAT | HDMA_V0_CONSUMER_CYCLE_BIT);
}
- /* Set consumer cycle */
- SET_CH_32(dw, chan->dir, chan->id, cycle_sync,
- HDMA_V0_CONSUMER_CYCLE_STAT | HDMA_V0_CONSUMER_CYCLE_BIT);
dw_hdma_v0_sync_ll_data(chunk);
diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c
index dbcdd1e68319..b596baa0a182 100644
--- a/drivers/dma/fsl-edma-main.c
+++ b/drivers/dma/fsl-edma-main.c
@@ -317,10 +317,8 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec,
return NULL;
i = fsl_chan - fsl_edma->chans;
- fsl_chan->priority = dma_spec->args[1];
- fsl_chan->is_rxchan = dma_spec->args[2] & FSL_EDMA_RX;
- fsl_chan->is_remote = dma_spec->args[2] & FSL_EDMA_REMOTE;
- fsl_chan->is_multi_fifo = dma_spec->args[2] & FSL_EDMA_MULTI_FIFO;
+ if (!b_chmux && i != dma_spec->args[0])
+ continue;
if ((dma_spec->args[2] & FSL_EDMA_EVEN_CH) && (i & 0x1))
continue;
@@ -328,17 +326,15 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec,
if ((dma_spec->args[2] & FSL_EDMA_ODD_CH) && !(i & 0x1))
continue;
- if (!b_chmux && i == dma_spec->args[0]) {
- chan = dma_get_slave_channel(chan);
- chan->device->privatecnt++;
- return chan;
- } else if (b_chmux && !fsl_chan->srcid) {
- /* if controller support channel mux, choose a free channel */
- chan = dma_get_slave_channel(chan);
- chan->device->privatecnt++;
- fsl_chan->srcid = dma_spec->args[0];
- return chan;
- }
+ fsl_chan->srcid = dma_spec->args[0];
+ fsl_chan->priority = dma_spec->args[1];
+ fsl_chan->is_rxchan = dma_spec->args[2] & FSL_EDMA_RX;
+ fsl_chan->is_remote = dma_spec->args[2] & FSL_EDMA_REMOTE;
+ fsl_chan->is_multi_fifo = dma_spec->args[2] & FSL_EDMA_MULTI_FIFO;
+
+ chan = dma_get_slave_channel(chan);
+ chan->device->privatecnt++;
+ return chan;
}
return NULL;
}
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c37d233535f9..0366c7cf3502 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -158,11 +158,7 @@ static const struct device_type idxd_cdev_file_type = {
static void idxd_cdev_dev_release(struct device *dev)
{
struct idxd_cdev *idxd_cdev = dev_to_cdev(dev);
- struct idxd_cdev_context *cdev_ctx;
- struct idxd_wq *wq = idxd_cdev->wq;
- cdev_ctx = &ictx[wq->idxd->data->type];
- ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor);
kfree(idxd_cdev);
}
@@ -582,11 +578,15 @@ int idxd_wq_add_cdev(struct idxd_wq *wq)
void idxd_wq_del_cdev(struct idxd_wq *wq)
{
+ struct idxd_cdev_context *cdev_ctx;
struct idxd_cdev *idxd_cdev;
idxd_cdev = wq->idxd_cdev;
wq->idxd_cdev = NULL;
cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev));
+
+ cdev_ctx = &ictx[wq->idxd->data->type];
+ ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor);
put_device(cdev_dev(idxd_cdev));
}
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index c26128529ff4..131138483b87 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -175,6 +175,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq)
free_descs(wq);
dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr);
sbitmap_queue_free(&wq->sbq);
+ wq->type = IDXD_WQT_NONE;
}
EXPORT_SYMBOL_NS_GPL(idxd_wq_free_resources, "IDXD");
@@ -382,7 +383,6 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq)
lockdep_assert_held(&wq->wq_lock);
wq->state = IDXD_WQ_DISABLED;
memset(wq->wqcfg, 0, idxd->wqcfg_size);
- wq->type = IDXD_WQT_NONE;
wq->threshold = 0;
wq->priority = 0;
wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
@@ -831,8 +831,7 @@ static void idxd_device_evl_free(struct idxd_device *idxd)
struct device *dev = &idxd->pdev->dev;
struct idxd_evl *evl = idxd->evl;
- gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET);
- if (!gencfg.evl_en)
+ if (!evl)
return;
mutex_lock(&evl->lock);
@@ -1125,7 +1124,11 @@ int idxd_device_config(struct idxd_device *idxd)
{
int rc;
- lockdep_assert_held(&idxd->dev_lock);
+ guard(spinlock)(&idxd->dev_lock);
+
+ if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+ return 0;
+
rc = idxd_wqs_setup(idxd);
if (rc < 0)
return rc;
@@ -1332,6 +1335,11 @@ void idxd_wq_free_irq(struct idxd_wq *wq)
free_irq(ie->vector, ie);
idxd_flush_pending_descs(ie);
+
+ /* The interrupt might have been already released by FLR */
+ if (ie->int_handle == INVALID_INT_HANDLE)
+ return;
+
if (idxd->request_int_handles)
idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX);
idxd_device_clear_perm_entry(idxd, ie);
@@ -1340,6 +1348,23 @@ void idxd_wq_free_irq(struct idxd_wq *wq)
ie->pasid = IOMMU_PASID_INVALID;
}
+void idxd_wq_flush_descs(struct idxd_wq *wq)
+{
+ struct idxd_irq_entry *ie = &wq->ie;
+ struct idxd_device *idxd = wq->idxd;
+
+ guard(mutex)(&wq->wq_lock);
+
+ if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL)
+ return;
+
+ idxd_flush_pending_descs(ie);
+ if (idxd->request_int_handles)
+ idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX);
+ idxd_device_clear_perm_entry(idxd, ie);
+ ie->int_handle = INVALID_INT_HANDLE;
+}
+
int idxd_wq_request_irq(struct idxd_wq *wq)
{
struct idxd_device *idxd = wq->idxd;
@@ -1454,11 +1479,7 @@ int idxd_drv_enable_wq(struct idxd_wq *wq)
}
}
- rc = 0;
- spin_lock(&idxd->dev_lock);
- if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
- rc = idxd_device_config(idxd);
- spin_unlock(&idxd->dev_lock);
+ rc = idxd_device_config(idxd);
if (rc < 0) {
dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc);
goto err;
@@ -1533,7 +1554,6 @@ void idxd_drv_disable_wq(struct idxd_wq *wq)
idxd_wq_reset(wq);
idxd_wq_free_resources(wq);
percpu_ref_exit(&wq->wq_active);
- wq->type = IDXD_WQT_NONE;
wq->client_count = 0;
}
EXPORT_SYMBOL_NS_GPL(idxd_drv_disable_wq, "IDXD");
@@ -1554,10 +1574,7 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev)
}
/* Device configuration */
- spin_lock(&idxd->dev_lock);
- if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
- rc = idxd_device_config(idxd);
- spin_unlock(&idxd->dev_lock);
+ rc = idxd_device_config(idxd);
if (rc < 0)
return -ENXIO;
diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c
index dbecd699237e..9937b671f637 100644
--- a/drivers/dma/idxd/dma.c
+++ b/drivers/dma/idxd/dma.c
@@ -194,6 +194,22 @@ static void idxd_dma_release(struct dma_device *device)
kfree(idxd_dma);
}
+static int idxd_dma_terminate_all(struct dma_chan *c)
+{
+ struct idxd_wq *wq = to_idxd_wq(c);
+
+ idxd_wq_flush_descs(wq);
+
+ return 0;
+}
+
+static void idxd_dma_synchronize(struct dma_chan *c)
+{
+ struct idxd_wq *wq = to_idxd_wq(c);
+
+ idxd_wq_drain(wq);
+}
+
int idxd_register_dma_device(struct idxd_device *idxd)
{
struct idxd_dma_dev *idxd_dma;
@@ -224,6 +240,8 @@ int idxd_register_dma_device(struct idxd_device *idxd)
dma->device_issue_pending = idxd_dma_issue_pending;
dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources;
dma->device_free_chan_resources = idxd_dma_free_chan_resources;
+ dma->device_terminate_all = idxd_dma_terminate_all;
+ dma->device_synchronize = idxd_dma_synchronize;
rc = dma_async_device_register(dma);
if (rc < 0) {
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index ea8c4daed38d..ce78b9a7c641 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -803,6 +803,7 @@ void idxd_wq_quiesce(struct idxd_wq *wq);
int idxd_wq_init_percpu_ref(struct idxd_wq *wq);
void idxd_wq_free_irq(struct idxd_wq *wq);
int idxd_wq_request_irq(struct idxd_wq *wq);
+void idxd_wq_flush_descs(struct idxd_wq *wq);
/* submission */
int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index fb80803d5b57..f1cfc7790d95 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -973,7 +973,8 @@ static void idxd_device_config_restore(struct idxd_device *idxd,
idxd->rdbuf_limit = idxd_saved->saved_idxd.rdbuf_limit;
- idxd->evl->size = saved_evl->size;
+ if (idxd->evl)
+ idxd->evl->size = saved_evl->size;
for (i = 0; i < idxd->max_groups; i++) {
struct idxd_group *saved_group, *group;
@@ -1104,12 +1105,10 @@ static void idxd_reset_done(struct pci_dev *pdev)
idxd_device_config_restore(idxd, idxd->idxd_saved);
/* Re-configure IDXD device if allowed. */
- if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) {
- rc = idxd_device_config(idxd);
- if (rc < 0) {
- dev_err(dev, "HALT: %s config fails\n", idxd_name);
- goto out;
- }
+ rc = idxd_device_config(idxd);
+ if (rc < 0) {
+ dev_err(dev, "HALT: %s config fails\n", idxd_name);
+ goto out;
}
/* Bind IDXD device to driver. */
@@ -1147,6 +1146,7 @@ static void idxd_reset_done(struct pci_dev *pdev)
}
out:
kfree(idxd->idxd_saved);
+ idxd->idxd_saved = NULL;
}
static const struct pci_error_handlers idxd_error_handler = {
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 7782f8c51c32..6a25e1fd0e62 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -397,6 +397,17 @@ static void idxd_device_flr(struct work_struct *work)
dev_err(&idxd->pdev->dev, "FLR failed\n");
}
+static void idxd_wqs_flush_descs(struct idxd_device *idxd)
+{
+ int i;
+
+ for (i = 0; i < idxd->max_wqs; i++) {
+ struct idxd_wq *wq = idxd->wqs[i];
+
+ idxd_wq_flush_descs(wq);
+ }
+}
+
static irqreturn_t idxd_halt(struct idxd_device *idxd)
{
union gensts_reg gensts;
@@ -415,6 +426,11 @@ static irqreturn_t idxd_halt(struct idxd_device *idxd)
} else if (gensts.reset_type == IDXD_DEVICE_RESET_FLR) {
idxd->state = IDXD_DEV_HALTED;
idxd_mask_error_interrupts(idxd);
+ /* Flush all pending descriptors, and disable
+ * interrupts, they will be re-enabled when FLR
+ * concludes.
+ */
+ idxd_wqs_flush_descs(idxd);
dev_dbg(&idxd->pdev->dev,
"idxd halted, doing FLR. After FLR, configs are restored\n");
INIT_WORK(&idxd->work, idxd_device_flr);
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index 6db1c5fcedc5..03217041b8b3 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -138,7 +138,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
*/
list_for_each_entry_safe(d, t, &flist, list) {
list_del_init(&d->list);
- idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, true,
+ idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true,
NULL, NULL);
}
}
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index cc2c83d7f710..6d251095c350 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1836,6 +1836,7 @@ static void idxd_conf_device_release(struct device *dev)
{
struct idxd_device *idxd = confdev_to_idxd(dev);
+ destroy_workqueue(idxd->wq);
kfree(idxd->groups);
bitmap_free(idxd->wq_enable_map);
kfree(idxd->wqs);
diff --git a/drivers/dma/sh/rz-dmac.c b/drivers/dma/sh/rz-dmac.c
index d84ca551b2bf..f30bdf69c740 100644
--- a/drivers/dma/sh/rz-dmac.c
+++ b/drivers/dma/sh/rz-dmac.c
@@ -10,6 +10,7 @@
*/
#include <linux/bitfield.h>
+#include <linux/cleanup.h>
#include <linux/dma-mapping.h>
#include <linux/dmaengine.h>
#include <linux/interrupt.h>
@@ -296,13 +297,10 @@ static void rz_dmac_disable_hw(struct rz_dmac_chan *channel)
{
struct dma_chan *chan = &channel->vc.chan;
struct rz_dmac *dmac = to_rz_dmac(chan->device);
- unsigned long flags;
dev_dbg(dmac->dev, "%s channel %d\n", __func__, channel->index);
- local_irq_save(flags);
rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
- local_irq_restore(flags);
}
static void rz_dmac_set_dmars_register(struct rz_dmac *dmac, int nr, u32 dmars)
@@ -447,6 +445,7 @@ static int rz_dmac_alloc_chan_resources(struct dma_chan *chan)
if (!desc)
break;
+ /* No need to lock. This is called only for the 1st client. */
list_add_tail(&desc->node, &channel->ld_free);
channel->descs_allocated++;
}
@@ -502,18 +501,21 @@ rz_dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
dev_dbg(dmac->dev, "%s channel: %d src=0x%pad dst=0x%pad len=%zu\n",
__func__, channel->index, &src, &dest, len);
- if (list_empty(&channel->ld_free))
- return NULL;
+ scoped_guard(spinlock_irqsave, &channel->vc.lock) {
+ if (list_empty(&channel->ld_free))
+ return NULL;
- desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
+ desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
- desc->type = RZ_DMAC_DESC_MEMCPY;
- desc->src = src;
- desc->dest = dest;
- desc->len = len;
- desc->direction = DMA_MEM_TO_MEM;
+ desc->type = RZ_DMAC_DESC_MEMCPY;
+ desc->src = src;
+ desc->dest = dest;
+ desc->len = len;
+ desc->direction = DMA_MEM_TO_MEM;
+
+ list_move_tail(channel->ld_free.next, &channel->ld_queue);
+ }
- list_move_tail(channel->ld_free.next, &channel->ld_queue);
return vchan_tx_prep(&channel->vc, &desc->vd, flags);
}
@@ -529,27 +531,29 @@ rz_dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
int dma_length = 0;
int i = 0;
- if (list_empty(&channel->ld_free))
- return NULL;
+ scoped_guard(spinlock_irqsave, &channel->vc.lock) {
+ if (list_empty(&channel->ld_free))
+ return NULL;
- desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
+ desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
- for_each_sg(sgl, sg, sg_len, i) {
- dma_length += sg_dma_len(sg);
- }
+ for_each_sg(sgl, sg, sg_len, i)
+ dma_length += sg_dma_len(sg);
- desc->type = RZ_DMAC_DESC_SLAVE_SG;
- desc->sg = sgl;
- desc->sgcount = sg_len;
- desc->len = dma_length;
- desc->direction = direction;
+ desc->type = RZ_DMAC_DESC_SLAVE_SG;
+ desc->sg = sgl;
+ desc->sgcount = sg_len;
+ desc->len = dma_length;
+ desc->direction = direction;
- if (direction == DMA_DEV_TO_MEM)
- desc->src = channel->src_per_address;
- else
- desc->dest = channel->dst_per_address;
+ if (direction == DMA_DEV_TO_MEM)
+ desc->src = channel->src_per_address;
+ else
+ desc->dest = channel->dst_per_address;
+
+ list_move_tail(channel->ld_free.next, &channel->ld_queue);
+ }
- list_move_tail(channel->ld_free.next, &channel->ld_queue);
return vchan_tx_prep(&channel->vc, &desc->vd, flags);
}
@@ -561,8 +565,8 @@ static int rz_dmac_terminate_all(struct dma_chan *chan)
unsigned int i;
LIST_HEAD(head);
- rz_dmac_disable_hw(channel);
spin_lock_irqsave(&channel->vc.lock, flags);
+ rz_dmac_disable_hw(channel);
for (i = 0; i < DMAC_NR_LMDESC; i++)
lmdesc[i].header = 0;
@@ -699,7 +703,9 @@ static void rz_dmac_irq_handle_channel(struct rz_dmac_chan *channel)
if (chstat & CHSTAT_ER) {
dev_err(dmac->dev, "DMAC err CHSTAT_%d = %08X\n",
channel->index, chstat);
- rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
+
+ scoped_guard(spinlock_irqsave, &channel->vc.lock)
+ rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
goto done;
}
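
The rz-dmac changes adopt the <linux/cleanup.h> guard idiom, which releases the lock on every exit path of the scope, including early returns. A self-contained sketch with a stand-in structure (illustrative, not the driver's types):

#include <linux/cleanup.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct demo {
	spinlock_t lock;
	struct list_head free;
};

static struct list_head *demo_take_first(struct demo *d)
{
	scoped_guard(spinlock_irqsave, &d->lock) {
		if (list_empty(&d->free))
			return NULL;	/* lock is dropped here as well */
		return d->free.next;
	}
	return NULL;	/* not reached; satisfies the compiler */
}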
diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c
index d02a4dac2291..782a55edc55b 100644
--- a/drivers/dma/xilinx/xdma.c
+++ b/drivers/dma/xilinx/xdma.c
@@ -1234,8 +1234,8 @@ static int xdma_probe(struct platform_device *pdev)
xdev->rmap = devm_regmap_init_mmio(&pdev->dev, reg_base,
&xdma_regmap_config);
- if (!xdev->rmap) {
- xdma_err(xdev, "config regmap failed: %d", ret);
+ if (IS_ERR(xdev->rmap)) {
+ xdma_err(xdev, "config regmap failed: %pe", xdev->rmap);
goto failed;
}
INIT_LIST_HEAD(&xdev->dma_dev.channels);
diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index b53292e02448..e3a18ee42aa2 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -997,16 +997,16 @@ static u32 xilinx_dma_get_residue(struct xilinx_dma_chan *chan,
struct xilinx_cdma_tx_segment,
node);
cdma_hw = &cdma_seg->hw;
- residue += (cdma_hw->control - cdma_hw->status) &
- chan->xdev->max_buffer_len;
+ residue += (cdma_hw->control & chan->xdev->max_buffer_len) -
+ (cdma_hw->status & chan->xdev->max_buffer_len);
} else if (chan->xdev->dma_config->dmatype ==
XDMA_TYPE_AXIDMA) {
axidma_seg = list_entry(entry,
struct xilinx_axidma_tx_segment,
node);
axidma_hw = &axidma_seg->hw;
- residue += (axidma_hw->control - axidma_hw->status) &
- chan->xdev->max_buffer_len;
+ residue += (axidma_hw->control & chan->xdev->max_buffer_len) -
+ (axidma_hw->status & chan->xdev->max_buffer_len);
} else {
aximcdma_seg =
list_entry(entry,
@@ -1014,8 +1014,8 @@ static u32 xilinx_dma_get_residue(struct xilinx_dma_chan *chan,
node);
aximcdma_hw = &aximcdma_seg->hw;
residue +=
- (aximcdma_hw->control - aximcdma_hw->status) &
- chan->xdev->max_buffer_len;
+ (aximcdma_hw->control & chan->xdev->max_buffer_len) -
+ (aximcdma_hw->status & chan->xdev->max_buffer_len);
}
}
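
In the residue hunks above, the control and status words carry flag bits above the hardware length field, so each word is masked with max_buffer_len before the subtraction and only the programmed and completed byte counts enter the arithmetic. A standalone illustration with made-up register values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical layout: byte count in the low 26 bits,
             * flag bits (SOP/EOP, completion) above the length field. */
            const uint32_t max_buffer_len = (1u << 26) - 1;
            const uint32_t control = (1u << 27) | 0x4000; /* flag + 16 KiB programmed */
            const uint32_t status  = (1u << 31) | 0x1000; /* flag + 4 KiB transferred */

            /* Mask first so the flags never take part in the subtraction. */
            uint32_t residue = (control & max_buffer_len) -
                               (status & max_buffer_len);

            printf("residue = %u bytes\n", residue); /* prints 12288 */
            return 0;
    }
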
@@ -1235,14 +1235,6 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan)
dma_cookie_init(dchan);
- if (chan->xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) {
- /* For AXI DMA resetting once channel will reset the
- * other channel as well so enable the interrupts here.
- */
- dma_ctrl_set(chan, XILINX_DMA_REG_DMACR,
- XILINX_DMA_DMAXR_ALL_IRQ_MASK);
- }
-
if ((chan->xdev->dma_config->dmatype == XDMA_TYPE_CDMA) && chan->has_sg)
dma_ctrl_set(chan, XILINX_DMA_REG_DMACR,
XILINX_CDMA_CR_SGMODE);
@@ -1564,8 +1556,29 @@ static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan)
if (chan->err)
return;
- if (list_empty(&chan->pending_list))
+ if (list_empty(&chan->pending_list)) {
+ if (chan->cyclic) {
+ struct xilinx_dma_tx_descriptor *desc;
+ struct list_head *entry;
+
+ desc = list_last_entry(&chan->done_list,
+ struct xilinx_dma_tx_descriptor, node);
+ list_for_each(entry, &desc->segments) {
+ struct xilinx_axidma_tx_segment *axidma_seg;
+ struct xilinx_axidma_desc_hw *axidma_hw;
+ axidma_seg = list_entry(entry,
+ struct xilinx_axidma_tx_segment,
+ node);
+ axidma_hw = &axidma_seg->hw;
+ axidma_hw->status = 0;
+ }
+
+ list_splice_tail_init(&chan->done_list, &chan->active_list);
+ chan->desc_pendingcount = 0;
+ chan->idle = false;
+ }
return;
+ }
if (!chan->idle)
return;
@@ -1591,6 +1604,7 @@ static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan)
head_desc->async_tx.phys);
reg &= ~XILINX_DMA_CR_DELAY_MAX;
reg |= chan->irq_delay << XILINX_DMA_CR_DELAY_SHIFT;
+ reg |= XILINX_DMA_DMAXR_ALL_IRQ_MASK;
dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg);
xilinx_dma_start(chan);
@@ -3024,7 +3038,7 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev,
return -EINVAL;
}
- xdev->common.directions |= chan->direction;
+ xdev->common.directions |= BIT(chan->direction);
/* Request the interrupt */
chan->irq = of_irq_get(node, chan->tdest);
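
The one-line directions fix corrects a units mismatch: chan->direction is an enum value, while dma_device.directions is a bitmask of BIT(direction) capability flags, so with DMA_DEV_TO_MEM == 2 the bare enum would have set BIT(1) instead. A hedged sketch (demo_advertise_dir is hypothetical):

    #include <linux/bits.h>
    #include <linux/dmaengine.h>

    static void demo_advertise_dir(struct dma_device *ddev,
                                   enum dma_transfer_direction dir)
    {
            /* directions is a capability bitmask indexed by the enum:
             * bit n set means direction n is supported. */
            ddev->directions |= BIT(dir);
    }
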
diff --git a/drivers/i2c/busses/i2c-designware-amdisp.c b/drivers/i2c/busses/i2c-designware-amdisp.c
index c48728ad9f6f..9f0ec0fae6f2 100644
--- a/drivers/i2c/busses/i2c-designware-amdisp.c
+++ b/drivers/i2c/busses/i2c-designware-amdisp.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>
#include <linux/soc/amd/isp4_misc.h>
@@ -76,22 +77,20 @@ static int amd_isp_dw_i2c_plat_probe(struct platform_device *pdev)
device_enable_async_suspend(&pdev->dev);
- pm_runtime_enable(&pdev->dev);
- pm_runtime_get_sync(&pdev->dev);
-
+ dev_pm_genpd_resume(&pdev->dev);
ret = i2c_dw_probe(isp_i2c_dev);
if (ret) {
dev_err_probe(&pdev->dev, ret, "i2c_dw_probe failed\n");
goto error_release_rpm;
}
-
- pm_runtime_put_sync(&pdev->dev);
+ dev_pm_genpd_suspend(&pdev->dev);
+ pm_runtime_set_suspended(&pdev->dev);
+ pm_runtime_enable(&pdev->dev);
return 0;
error_release_rpm:
amd_isp_dw_i2c_plat_pm_cleanup(isp_i2c_dev);
- pm_runtime_put_sync(&pdev->dev);
return ret;
}
diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 85f554044cf1..452d120a210b 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -1018,8 +1018,9 @@ static inline int i2c_imx_isr_read(struct imx_i2c_struct *i2c_imx)
return 0;
}
-static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx)
+static inline enum imx_i2c_state i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx)
{
+ enum imx_i2c_state next_state = IMX_I2C_STATE_READ_CONTINUE;
unsigned int temp;
if ((i2c_imx->msg->len - 1) == i2c_imx->msg_buf_idx) {
@@ -1033,18 +1034,20 @@ static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx)
i2c_imx->stopped = 1;
temp &= ~(I2CR_MSTA | I2CR_MTX);
imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR);
- } else {
- /*
- * For i2c master receiver repeat restart operation like:
- * read -> repeat MSTA -> read/write
- * The controller must set MTX before read the last byte in
- * the first read operation, otherwise the first read cost
- * one extra clock cycle.
- */
- temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR);
- temp |= I2CR_MTX;
- imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR);
+
+ return IMX_I2C_STATE_DONE;
}
+ /*

+ * For an i2c master receiver repeated-start operation like:
+ * read -> repeat MSTA -> read/write
+ * The controller must set MTX before reading the last byte in
+ * the first read operation, otherwise the first read costs
+ * one extra clock cycle.
+ */
+ temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR);
+ temp |= I2CR_MTX;
+ imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR);
+ next_state = IMX_I2C_STATE_DONE;
} else if (i2c_imx->msg_buf_idx == (i2c_imx->msg->len - 2)) {
temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR);
temp |= I2CR_TXAK;
@@ -1052,6 +1055,7 @@ static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx)
}
i2c_imx->msg->buf[i2c_imx->msg_buf_idx++] = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2DR);
+ return next_state;
}
static inline void i2c_imx_isr_read_block_data_len(struct imx_i2c_struct *i2c_imx)
@@ -1088,11 +1092,9 @@ static irqreturn_t i2c_imx_master_isr(struct imx_i2c_struct *i2c_imx, unsigned i
break;
case IMX_I2C_STATE_READ_CONTINUE:
- i2c_imx_isr_read_continue(i2c_imx);
- if (i2c_imx->msg_buf_idx == i2c_imx->msg->len) {
- i2c_imx->state = IMX_I2C_STATE_DONE;
+ i2c_imx->state = i2c_imx_isr_read_continue(i2c_imx);
+ if (i2c_imx->state == IMX_I2C_STATE_DONE)
wake_up(&i2c_imx->queue);
- }
break;
case IMX_I2C_STATE_READ_BLOCK_DATA:
@@ -1490,6 +1492,7 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs,
bool is_lastmsg)
{
int block_data = msgs->flags & I2C_M_RECV_LEN;
+ int ret = 0;
dev_dbg(&i2c_imx->adapter.dev,
"<%s> write slave address: addr=0x%x\n",
@@ -1522,10 +1525,20 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs,
dev_err(&i2c_imx->adapter.dev, "<%s> read timedout\n", __func__);
return -ETIMEDOUT;
}
- if (!i2c_imx->stopped)
- return i2c_imx_bus_busy(i2c_imx, 0, false);
+ if (i2c_imx->is_lastmsg) {
+ if (!i2c_imx->stopped)
+ ret = i2c_imx_bus_busy(i2c_imx, 0, false);
+ /*
+ * Only read the last byte of the last message after the bus is
+ * not busy. Else the controller generates another clock which
+ * might confuse devices.
+ */
+ if (!ret)
+ i2c_imx->msg->buf[i2c_imx->msg_buf_idx++] = imx_i2c_read_reg(i2c_imx,
+ IMX_I2C_I2DR);
+ }
- return 0;
+ return ret;
}
static int i2c_imx_xfer_common(struct i2c_adapter *adapter,
diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c
index 83f31ea657b7..181320528a47 100644
--- a/drivers/irqchip/irq-qcom-mpm.c
+++ b/drivers/irqchip/irq-qcom-mpm.c
@@ -306,6 +306,8 @@ static int mpm_pd_power_off(struct generic_pm_domain *genpd)
if (ret < 0)
return ret;
+ mbox_client_txdone(priv->mbox_chan, 0);
+
return 0;
}
@@ -434,6 +436,7 @@ static int qcom_mpm_probe(struct platform_device *pdev, struct device_node *pare
}
priv->mbox_client.dev = dev;
+ priv->mbox_client.knows_txdone = true;
priv->mbox_chan = mbox_request_channel(&priv->mbox_client, 0);
if (IS_ERR(priv->mbox_chan)) {
ret = PTR_ERR(priv->mbox_chan);
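
Setting knows_txdone obliges the mailbox client to report completion itself, which is what the new mbox_client_txdone() call in mpm_pd_power_off() does; without it the framework's tx ticker never advances and later sends can stall. A minimal sketch of the contract (demo_mbox_send is hypothetical, and assumes knows_txdone was set before the channel was requested):

    #include <linux/mailbox_client.h>

    static int demo_mbox_send(struct mbox_chan *chan, void *msg)
    {
            int ret = mbox_send_message(chan, msg);

            if (ret < 0)
                    return ret;

            /* Client-driven completion: 0 means the transfer succeeded. */
            mbox_client_txdone(chan, 0);
            return 0;
    }
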
diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c
index da2bc43a0e12..03e93b061edd 100644
--- a/drivers/irqchip/irq-renesas-rzv2h.c
+++ b/drivers/irqchip/irq-renesas-rzv2h.c
@@ -621,7 +621,7 @@ static int rzv2h_icu_probe_common(struct platform_device *pdev, struct device_no
return 0;
pm_put:
- pm_runtime_put(&pdev->dev);
+ pm_runtime_put_sync(&pdev->dev);
return ret;
}
diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index 02467dfd4fb0..1875d5b784f6 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -6,7 +6,7 @@
menu "PHY Subsystem"
config PHY_COMMON_PROPS
- bool
+ bool "PHY common properties" if KUNIT_ALL_TESTS
help
This parses properties common between generic PHYs and Ethernet PHYs.
@@ -16,8 +16,7 @@ config PHY_COMMON_PROPS
config PHY_COMMON_PROPS_TEST
tristate "KUnit tests for PHY common props" if !KUNIT_ALL_TESTS
- select PHY_COMMON_PROPS
- depends on KUNIT
+ depends on KUNIT && PHY_COMMON_PROPS
default KUNIT_ALL_TESTS
help
This builds KUnit tests for the PHY common property API.
diff --git a/drivers/phy/freescale/phy-fsl-lynx-28g.c b/drivers/phy/freescale/phy-fsl-lynx-28g.c
index 2b0fd95ba62f..63427fc34e26 100644
--- a/drivers/phy/freescale/phy-fsl-lynx-28g.c
+++ b/drivers/phy/freescale/phy-fsl-lynx-28g.c
@@ -1069,6 +1069,8 @@ static void lynx_28g_cdr_lock_check(struct work_struct *work)
for (i = 0; i < LYNX_28G_NUM_LANE; i++) {
lane = &priv->lane[i];
+ if (!lane->phy)
+ continue;
mutex_lock(&lane->phy->mutex);
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
index df138a5442eb..771bc7c2ab50 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
@@ -990,6 +990,7 @@ static const struct qmp_phy_init_tbl sm8650_ufsphy_pcs[] = {
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x02),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_MID_TERM_CTRL1, 0x43),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PCS_CTRL1, 0xc1),
+ QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x33),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_LARGE_AMP_DRV_LVL, 0x0f),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_SIGDET_CTRL2, 0x68),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_POST_EMP_LVL_S4, 0x0e),
@@ -999,13 +1000,11 @@ static const struct qmp_phy_init_tbl sm8650_ufsphy_pcs[] = {
};
static const struct qmp_phy_init_tbl sm8650_ufsphy_g4_pcs[] = {
- QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x13),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x04),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x04),
};
static const struct qmp_phy_init_tbl sm8650_ufsphy_g5_pcs[] = {
- QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x33),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x05),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x05),
QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HS_G5_SYNC_LENGTH_CAPABILITY, 0x4d),
diff --git a/drivers/phy/spacemit/phy-k1-usb2.c b/drivers/phy/spacemit/phy-k1-usb2.c
index 342061380012..9215d0b223b2 100644
--- a/drivers/phy/spacemit/phy-k1-usb2.c
+++ b/drivers/phy/spacemit/phy-k1-usb2.c
@@ -48,6 +48,9 @@
#define PHY_CLK_HSTXP_EN BIT(3) /* clock hstxp enable */
#define PHY_HSTXP_MODE BIT(4) /* 0: force en_txp to be 1; 1: no force */
+#define PHY_K1_HS_HOST_DISC 0x40
+#define PHY_K1_HS_HOST_DISC_CLR BIT(0)
+
#define PHY_PLL_DIV_CFG 0x98
#define PHY_FDIV_FRACT_8_15 GENMASK(7, 0)
#define PHY_FDIV_FRACT_16_19 GENMASK(11, 8)
@@ -142,9 +145,20 @@ static int spacemit_usb2phy_exit(struct phy *phy)
return 0;
}
+static int spacemit_usb2phy_disconnect(struct phy *phy, int port)
+{
+ struct spacemit_usb2phy *sphy = phy_get_drvdata(phy);
+
+ regmap_update_bits(sphy->regmap_base, PHY_K1_HS_HOST_DISC,
+ PHY_K1_HS_HOST_DISC_CLR, PHY_K1_HS_HOST_DISC_CLR);
+
+ return 0;
+}
+
static const struct phy_ops spacemit_usb2phy_ops = {
.init = spacemit_usb2phy_init,
.exit = spacemit_usb2phy_exit,
+ .disconnect = spacemit_usb2phy_disconnect,
.owner = THIS_MODULE,
};
diff --git a/drivers/phy/ti/phy-j721e-wiz.c b/drivers/phy/ti/phy-j721e-wiz.c
index 6e9ecb88dc8b..6b584706b913 100644
--- a/drivers/phy/ti/phy-j721e-wiz.c
+++ b/drivers/phy/ti/phy-j721e-wiz.c
@@ -1425,6 +1425,7 @@ static int wiz_get_lane_phy_types(struct device *dev, struct wiz *wiz)
dev_err(dev,
"%s: Reading \"reg\" from \"%s\" failed: %d\n",
__func__, subnode->name, ret);
+ of_node_put(serdes);
return ret;
}
of_property_read_u32(subnode, "cdns,num-lanes", &num_lanes);
@@ -1439,6 +1440,7 @@ static int wiz_get_lane_phy_types(struct device *dev, struct wiz *wiz)
}
}
+ of_node_put(serdes);
return 0;
}
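
The wiz fix above is the classic OF refcount pattern: a node returned by a phandle lookup holds a raised refcount that must be dropped on the early error return as well as on success. Sketched in isolation (the property names are hypothetical):

    #include <linux/of.h>

    static int demo_read_serdes_reg(struct device_node *np, u32 *val)
    {
            struct device_node *serdes = of_parse_phandle(np, "serdes", 0);
            int ret;

            if (!serdes)
                    return -ENODEV;

            ret = of_property_read_u32(serdes, "reg", val);
            of_node_put(serdes);    /* balanced on every exit path */
            return ret;
    }
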
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index bbf9ee21306c..15ba592236e8 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -1765,6 +1765,9 @@ err_privcmdbuf:
static void __exit privcmd_exit(void)
{
+ if (!xen_initial_domain())
+ unregister_xenstore_notifier(&xenstore_notifier);
+
privcmd_ioeventfd_exit();
privcmd_irqfd_exit();
misc_deregister(&privcmd_dev);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ebf5079096af..c0d17a369bda 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -4583,7 +4583,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info)
for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
if (space_info->sub_group[i]) {
check_removing_space_info(space_info->sub_group[i]);
- kfree(space_info->sub_group[i]);
+ btrfs_sysfs_remove_space_info(space_info->sub_group[i]);
space_info->sub_group[i] = NULL;
}
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 01f2dbb69832..1b0eb246b714 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2531,8 +2531,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
if (mirror_num >= 0 &&
btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
- btrfs_err(fs_info, "super offset mismatch %llu != %u",
- btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ btrfs_err(fs_info, "super offset mismatch %llu != %llu",
+ btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num));
ret = -EINVAL;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ab0d460c7139..ac871efb9763 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
+ u64 gen = BTRFS_I(inode)->generation;
u64 flags;
if (log_inode_only) {
- /* set the generation to zero so the recover code
- * can tell the difference between an logging
- * just to say 'this inode exists' and a logging
- * to say 'update this inode with these values'
+ /*
+ * Set the generation to zero so the recover code can tell the
+ * difference between a logging just to say 'this inode exists'
+ * and a logging to say 'update this inode with these values'.
+ * But only if the inode was not already logged before.
+ * We access ->logged_trans directly since it was already set up
+ * in the call chain by btrfs_log_inode(), and we use data_race()
+ * to avoid false alerts from KCSAN. No one can reset it to 0,
+ * since that only happens on eviction and we are holding a ref
+ * on the inode.
*/
- btrfs_set_inode_generation(leaf, item, 0);
+ ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0);
+ if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid)
+ gen = 0;
+
btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
btrfs_set_inode_size(leaf, item, inode->i_size);
}
+ btrfs_set_inode_generation(leaf, item, gen);
+
btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
btrfs_set_inode_mode(leaf, item, inode->i_mode);
@@ -5448,42 +5459,63 @@ process:
return 0;
}
-static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
- struct btrfs_path *path, u64 *size_ret)
+static int get_inode_size_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
{
struct btrfs_key key;
+ struct btrfs_inode_item *item;
int ret;
key.objectid = btrfs_ino(inode);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- *size_ret = 0;
- } else {
- struct btrfs_inode_item *item;
+ /*
+ * Our caller called inode_logged(), so logged_trans is up to date.
+ * Use data_race() to silence any warning from KCSAN. Once logged_trans
+ * is set, it can only be reset to 0 after inode eviction.
+ */
+ if (data_race(inode->logged_trans) == trans->transid) {
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ } else if (inode->generation < trans->transid) {
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
+ path->search_commit_root = false;
+ path->skip_locking = false;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- *size_ret = btrfs_inode_size(path->nodes[0], item);
- /*
- * If the in-memory inode's i_size is smaller then the inode
- * size stored in the btree, return the inode's i_size, so
- * that we get a correct inode size after replaying the log
- * when before a power failure we had a shrinking truncate
- * followed by addition of a new name (rename / new hard link).
- * Otherwise return the inode size from the btree, to avoid
- * data loss when replaying a log due to previously doing a
- * write that expands the inode's size and logging a new name
- * immediately after.
- */
- if (*size_ret > inode->vfs_inode.i_size)
- *size_ret = inode->vfs_inode.i_size;
+ } else {
+ *size_ret = 0;
+ return 0;
}
+ /*
+ * If the inode was logged before or is from a past transaction, then
+ * its inode item must exist in the log root or in the commit root.
+ */
+ ASSERT(ret <= 0);
+ if (WARN_ON_ONCE(ret > 0))
+ ret = -ENOENT;
+
+ if (ret < 0)
+ return ret;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller than the inode size stored
+ * in the btree, return the inode's i_size, so that we get a correct
+ * inode size after replaying the log when before a power failure we had
+ * a shrinking truncate followed by addition of a new name (rename / new
+ * hard link). Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a write that
+ * expands the inode's size and logging a new name immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
+
btrfs_release_path(path);
return 0;
}
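
Both btrfs hunks read ->logged_trans locklessly and wrap the access in data_race(), which documents the race as intentional and silences KCSAN without implying any memory ordering. A hedged sketch:

    #include <linux/compiler.h>
    #include <linux/types.h>

    struct demo_inode {
            u64 logged_trans;       /* written under lock elsewhere */
    };

    static bool demo_logged_in(const struct demo_inode *di, u64 transid)
    {
            /* Deliberately unlocked read: once set, the field is only
             * reset at eviction, which the caller's reference prevents.
             * data_race() adds no ordering, it only marks the intent. */
            return data_race(di->logged_trans) == transid;
    }
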
@@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ret = drop_inode_items(trans, log, path, inode,
BTRFS_XATTR_ITEM_KEY);
} else {
- if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
+ if (inode_only == LOG_INODE_EXISTS) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- ret = logged_inode_size(log, inode, path, &logged_isize);
+ ret = get_inode_size_to_log(trans, inode, path, &logged_isize);
if (ret)
goto out_unlock;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab51e6ecfdef..6b8e810a35ce 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -8099,8 +8099,9 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
smp_rmb();
ret = update_dev_stat_item(trans, device);
- if (!ret)
- atomic_sub(stats_cnt, &device->dev_stats_ccnt);
+ if (ret)
+ break;
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 1b7544aa88a2..c676e715b4f8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -308,7 +308,9 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb)
}
/* Queue the remaining part of the folio. */
if (workspace->strm.total_out > bio->bi_iter.bi_size) {
- u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out);
+ const u32 cur_len = workspace->strm.total_out - bio->bi_iter.bi_size;
+
+ ASSERT(cur_len <= folio_size(out_folio));
if (!bio_add_folio(bio, out_folio, cur_len, 0)) {
ret = -E2BIG;
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 72206a292676..3baee4e7c1cf 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -14,7 +14,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
-ext4-inode-test-objs += inode-test.o
-obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
+ext4-test-objs += inode-test.o mballoc-test.o \
+ extents-test.o
+obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index cf0a0970c095..f41f320f4437 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -163,10 +163,17 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
*/
if (handle) {
+ /*
+ * Since the inode is new it is ok to pass the
+ * XATTR_CREATE flag. This is necessary to match the
+ * remaining journal credits check in
+ * ext4_xattr_set_handle() with the credits allocated
+ * for the new inode.
+ */
res = ext4_xattr_set_handle(handle, inode,
EXT4_XATTR_INDEX_ENCRYPTION,
EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
+ ctx, len, XATTR_CREATE);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
ext4_clear_inode_state(inode,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..7617e2d454ea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1570,6 +1570,7 @@ struct ext4_sb_info {
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
struct completion s_kobj_unregister;
+ struct mutex s_error_notify_mutex; /* protects sysfs_notify vs kobject_del */
struct super_block *s_sb;
struct buffer_head *s_mmp_bh;
@@ -3944,6 +3945,11 @@ static inline bool ext4_inode_can_atomic_write(struct inode *inode)
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);
+
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+#define EXPORT_SYMBOL_FOR_EXT4_TEST(sym) \
+ EXPORT_SYMBOL_FOR_MODULES(sym, "ext4-test")
+#endif
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
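
EXPORT_SYMBOL_FOR_EXT4_TEST wraps EXPORT_SYMBOL_FOR_MODULES() so a symbol can be imported only by the ext4-test module instead of by any module. A hedged usage sketch (demo_helper is hypothetical and assumes "ext4.h" is in scope):

    /* In some fs/ext4/*.c file: */
    int demo_helper(int x)
    {
            return x + 1;   /* stand-in for a real helper under test */
    }
    /* Export restricted to the "ext4-test" module namespace. */
    EXPORT_SYMBOL_FOR_EXT4_TEST(demo_helper);
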
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index c484125d963f..ebaf7cc42430 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -264,5 +264,17 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
+extern int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path);
+extern int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex);
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+extern int ext4_ext_space_root_idx_test(struct inode *inode, int check);
+extern struct ext4_ext_path *ext4_split_convert_extents_test(
+ handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path,
+ int flags, unsigned int *allocated);
+#endif
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index 7c4690eb7dad..5496b2c8e2cd 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -142,8 +142,10 @@ static struct file_system_type ext_fs_type = {
static void extents_kunit_exit(struct kunit *test)
{
- struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info;
+ struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb;
+ struct ext4_sb_info *sbi = sb->s_fs_info;
+ ext4_es_unregister_shrinker(sbi);
kfree(sbi);
kfree(k_ctx.k_ei);
kfree(k_ctx.k_data);
@@ -280,8 +282,8 @@ static int extents_kunit_init(struct kunit *test)
eh->eh_depth = 0;
eh->eh_entries = cpu_to_le16(1);
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max =
- cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root_idx_test(
+ &k_ctx.k_ei->vfs_inode, 0));
eh->eh_generation = 0;
/*
@@ -384,8 +386,8 @@ static void test_split_convert(struct kunit *test)
switch (param->type) {
case TEST_SPLIT_CONVERT:
- path = ext4_split_convert_extents(NULL, inode, &map, path,
- param->split_flags, NULL);
+ path = ext4_split_convert_extents_test(NULL, inode, &map,
+ path, param->split_flags, NULL);
break;
case TEST_CREATE_BLOCKS:
ext4_map_create_blocks_helper(test, inode, &map, param->split_flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ae3804f36535..8cce1479be6d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -184,9 +184,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-static int __ext4_ext_dirty(const char *where, unsigned int line,
- handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
{
int err;
@@ -1736,6 +1736,13 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
return err;
+ if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_idx %p > EXT_LAST_INDEX %p",
+ k, path[k].p_idx,
+ EXT_LAST_INDEX(path[k].p_hdr));
+ return -EFSCORRUPTED;
+ }
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
@@ -1748,6 +1755,14 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
goto clean;
+ if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_idx %p > EXT_LAST_INDEX %p",
+ k, path[k].p_idx,
+ EXT_LAST_INDEX(path[k].p_hdr));
+ err = -EFSCORRUPTED;
+ goto clean;
+ }
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
@@ -3144,7 +3159,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
}
/* FIXME!! we need to try to merge to left or right after zero-out */
-static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
@@ -3239,6 +3254,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
insert_err = PTR_ERR(path);
err = 0;
+ if (insert_err != -ENOSPC && insert_err != -EDQUOT &&
+ insert_err != -ENOMEM)
+ goto out_path;
/*
* Get a new path to try to zeroout or fix the extent length.
@@ -3255,13 +3273,20 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
goto out_path;
}
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode,
+ "bad extent address lblock: %lu, depth: %d pblock %llu",
+ (unsigned long)ee_block, depth, path[depth].p_block);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
-
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3363,7 +3388,7 @@ static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
ext4_ext_mark_initialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
return err;
@@ -4457,9 +4482,13 @@ got_allocated_blocks:
path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- if (allocated_clusters) {
+ /*
+ * Gracefully handle out of space conditions. If the filesystem
+ * is inconsistent, we'll just leak allocated blocks to avoid
+ * causing even more damage.
+ */
+ if (allocated_clusters && (err == -EDQUOT || err == -ENOSPC)) {
int fb_flags = 0;
-
/*
* free data blocks we just allocated.
* not a good idea to call discard here directly,
@@ -6238,6 +6267,33 @@ out:
return 0;
}
-#ifdef CONFIG_EXT4_KUNIT_TESTS
-#include "extents-test.c"
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+int ext4_ext_space_root_idx_test(struct inode *inode, int check)
+{
+ return ext4_ext_space_root_idx(inode, check);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_space_root_idx_test);
+
+struct ext4_ext_path *ext4_split_convert_extents_test(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int *allocated)
+{
+ return ext4_split_convert_extents(handle, inode, map, path,
+ flags, allocated);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_split_convert_extents_test);
+
+EXPORT_SYMBOL_FOR_EXT4_TEST(__ext4_ext_dirty);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_zeroout);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_register_shrinker);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_unregister_shrinker);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_create_blocks);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_init_tree);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_lookup_extent);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_insert_extent);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_insert_extent);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_find_extent);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_issue_zeroout);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_query_blocks);
#endif
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index f575751f1cae..2f0057e04934 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -975,13 +975,13 @@ static int ext4_fc_flush_data(journal_t *journal)
int ret = 0;
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ret = jbd2_submit_inode_data(journal, ei->jinode);
+ ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode));
if (ret)
return ret;
}
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ret = jbd2_wait_inode_data(journal, ei->jinode);
+ ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode));
if (ret)
return ret;
}
@@ -1613,19 +1613,21 @@ static int ext4_fc_replay_inode(struct super_block *sb,
/* Immediately update the inode on disk. */
ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
if (ret)
- goto out;
+ goto out_brelse;
ret = sync_dirty_buffer(iloc.bh);
if (ret)
- goto out;
+ goto out_brelse;
ret = ext4_mark_inode_used(sb, ino);
if (ret)
- goto out;
+ goto out_brelse;
/* Given that we just wrote the inode on disk, this SHOULD succeed. */
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
ext4_debug("Inode not found.");
- return -EFSCORRUPTED;
+ inode = NULL;
+ ret = -EFSCORRUPTED;
+ goto out_brelse;
}
/*
@@ -1642,13 +1644,14 @@ static int ext4_fc_replay_inode(struct super_block *sb,
ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
sync_dirty_buffer(iloc.bh);
+out_brelse:
brelse(iloc.bh);
out:
iput(inode);
if (!ret)
blkdev_issue_flush(sb->s_bdev);
- return 0;
+ return ret;
}
/*
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e476c6de3074..bd8f230fa507 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -83,11 +83,23 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
int datasync, bool *needs_barrier)
{
struct inode *inode = file->f_inode;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ };
int ret;
ret = generic_buffers_fsync_noflush(file, start, end, datasync);
- if (!ret)
- ret = ext4_sync_parent(inode);
+ if (ret)
+ return ret;
+
+ /* Force writeout of inode table buffer to disk */
+ ret = ext4_write_inode(inode, &wbc);
+ if (ret)
+ return ret;
+
+ ret = ext4_sync_parent(inode);
+
if (test_opt(inode->i_sb, BARRIER))
*needs_barrier = true;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b20a1bf866ab..b1bc1950c9f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -686,6 +686,12 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (unlikely(!gdp))
return 0;
+ /* Inode was never used in this filesystem? */
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT) ||
+ ino >= EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp)))
+ return 0;
+
bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
(ino / inodes_per_block));
if (!bh || !buffer_uptodate(bh))
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1f6bc05593df..408677fa8196 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -522,7 +522,15 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
goto out;
len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
- BUG_ON(len > PAGE_SIZE);
+
+ if (len > PAGE_SIZE) {
+ ext4_error_inode(inode, __func__, __LINE__, 0,
+ "inline size %zu exceeds PAGE_SIZE", len);
+ ret = -EFSCORRUPTED;
+ brelse(iloc.bh);
+ goto out;
+ }
+
kaddr = kmap_local_folio(folio, 0);
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
kaddr = folio_zero_tail(folio, len, kaddr + len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 396dc3a5d16b..1123d995494b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -128,6 +128,8 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode);
+
trace_ext4_begin_ordered_truncate(inode, new_size);
/*
* If jinode is zero, then we never opened the file for
@@ -135,10 +137,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
* jbd2_journal_begin_ordered_truncate() since there's no
* outstanding writes we need to flush.
*/
- if (!EXT4_I(inode)->jinode)
+ if (!jinode)
return 0;
return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
- EXT4_I(inode)->jinode,
+ jinode,
new_size);
}
@@ -184,6 +186,14 @@ void ext4_evict_inode(struct inode *inode)
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
+ /*
+ * If there are dirty pages at this point, evicting the inode
+ * loses that data and the user could later see stale data.
+ */
+ if (unlikely(!ext4_emergency_state(inode->i_sb) &&
+ mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY)))
+ ext4_warning_inode(inode, "data will be lost");
+
truncate_inode_pages_final(&inode->i_data);
goto no_delete;
@@ -4451,8 +4461,13 @@ int ext4_inode_attach_jinode(struct inode *inode)
spin_unlock(&inode->i_lock);
return -ENOMEM;
}
- ei->jinode = jinode;
- jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jbd2_journal_init_jbd_inode(jinode, inode);
+ /*
+ * Publish ->jinode only after it is fully initialized so that
+ * readers never observe a partially initialized jbd2_inode.
+ */
+ smp_wmb();
+ WRITE_ONCE(ei->jinode, jinode);
jinode = NULL;
}
spin_unlock(&inode->i_lock);
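
The jinode hunks use the lockless publish pattern: fully initialize the object, issue smp_wmb(), then WRITE_ONCE() the pointer, while readers pair with READ_ONCE(). A hedged sketch with hypothetical demo_* names (smp_store_release()/smp_load_acquire() would be the more modern spelling):

    #include <linux/compiler.h>
    #include <asm/barrier.h>

    struct demo_obj {
            int a;
            int b;
    };

    static struct demo_obj *demo_shared;

    static void demo_publish(struct demo_obj *o)
    {
            o->a = 1;
            o->b = 2;
            smp_wmb();      /* order the init before the pointer store */
            WRITE_ONCE(demo_shared, o);
    }

    static int demo_read(void)
    {
            /* READ_ONCE() provides an address dependency, so the field
             * loads below cannot be reordered before the pointer load. */
            struct demo_obj *o = READ_ONCE(demo_shared);

            return o ? o->a + o->b : 0;
    }
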
@@ -5401,18 +5416,36 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- if (inode->i_size == 0 ||
- inode->i_size >= sizeof(ei->i_data) ||
- strnlen((char *)ei->i_data, inode->i_size + 1) !=
- inode->i_size) {
- ext4_error_inode(inode, function, line, 0,
- "invalid fast symlink length %llu",
- (unsigned long long)inode->i_size);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+
+ /*
+ * Orphan cleanup can see inodes with i_size == 0
+ * and i_data uninitialized. Skip size checks in
+ * that case. This is safe because the first thing
+ * ext4_evict_inode() does for fast symlinks is
+ * clearing of i_data and i_size.
+ */
+ if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if (inode->i_nlink != 0) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid orphan symlink nlink %d",
+ inode->i_nlink);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ } else {
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ inode_set_cached_link(inode, (char *)ei->i_data,
+ inode->i_size);
}
- inode_set_cached_link(inode, (char *)ei->i_data,
- inode->i_size);
} else {
inode->i_op = &ext4_symlink_inode_operations;
}
@@ -5849,6 +5882,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (attr->ia_size == inode->i_size)
inc_ivers = false;
+ /*
+ * If file has inline data but new size exceeds inline capacity,
+ * convert to extent-based storage first to prevent inconsistent
+ * state (inline flag set but size exceeds inline capacity).
+ */
+ if (ext4_has_inline_data(inode) &&
+ attr->ia_size > EXT4_I(inode)->i_inline_size) {
+ error = ext4_convert_inline_data(inode);
+ if (error)
+ goto err_out;
+ }
+
if (shrink) {
if (ext4_should_order_data(inode)) {
error = ext4_begin_ordered_truncate(inode,
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 9fbdf6a09489..6f5bfbb0e8a4 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -8,6 +8,7 @@
#include <linux/random.h>
#include "ext4.h"
+#include "mballoc.h"
struct mbt_grp_ctx {
struct buffer_head bitmap_bh;
@@ -336,7 +337,7 @@ ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
if (state)
mb_set_bits(bitmap_bh->b_data, blkoff, len);
else
- mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+ mb_clear_bits_test(bitmap_bh->b_data, blkoff, len);
return 0;
}
@@ -413,14 +414,14 @@ static void test_new_blocks_simple(struct kunit *test)
/* get block at goal */
ar.goal = ext4_group_first_block_no(sb, goal_group);
- found = ext4_mb_new_blocks_simple(&ar, &err);
+ found = ext4_mb_new_blocks_simple_test(&ar, &err);
KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
"failed to alloc block at goal, expected %llu found %llu",
ar.goal, found);
/* get block after goal in goal group */
ar.goal = ext4_group_first_block_no(sb, goal_group);
- found = ext4_mb_new_blocks_simple(&ar, &err);
+ found = ext4_mb_new_blocks_simple_test(&ar, &err);
KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
"failed to alloc block after goal in goal group, expected %llu found %llu",
ar.goal + 1, found);
@@ -428,7 +429,7 @@ static void test_new_blocks_simple(struct kunit *test)
/* get block after goal group */
mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
ar.goal = ext4_group_first_block_no(sb, goal_group);
- found = ext4_mb_new_blocks_simple(&ar, &err);
+ found = ext4_mb_new_blocks_simple_test(&ar, &err);
KUNIT_ASSERT_EQ_MSG(test,
ext4_group_first_block_no(sb, goal_group + 1), found,
"failed to alloc block after goal group, expected %llu found %llu",
@@ -438,7 +439,7 @@ static void test_new_blocks_simple(struct kunit *test)
for (i = goal_group; i < ext4_get_groups_count(sb); i++)
mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
ar.goal = ext4_group_first_block_no(sb, goal_group);
- found = ext4_mb_new_blocks_simple(&ar, &err);
+ found = ext4_mb_new_blocks_simple_test(&ar, &err);
KUNIT_ASSERT_EQ_MSG(test,
ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
"failed to alloc block before goal group, expected %llu found %llu",
@@ -448,7 +449,7 @@ static void test_new_blocks_simple(struct kunit *test)
for (i = 0; i < ext4_get_groups_count(sb); i++)
mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
ar.goal = ext4_group_first_block_no(sb, goal_group);
- found = ext4_mb_new_blocks_simple(&ar, &err);
+ found = ext4_mb_new_blocks_simple_test(&ar, &err);
KUNIT_ASSERT_NE_MSG(test, err, 0,
"unexpectedly get block when no block is available");
}
@@ -492,16 +493,16 @@ validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
continue;
bitmap = mbt_ctx_bitmap(sb, i);
- bit = mb_find_next_zero_bit(bitmap, max, 0);
+ bit = mb_find_next_zero_bit_test(bitmap, max, 0);
KUNIT_ASSERT_EQ_MSG(test, bit, max,
"free block on unexpected group %d", i);
}
bitmap = mbt_ctx_bitmap(sb, goal_group);
- bit = mb_find_next_zero_bit(bitmap, max, 0);
+ bit = mb_find_next_zero_bit_test(bitmap, max, 0);
KUNIT_ASSERT_EQ(test, bit, start);
- bit = mb_find_next_bit(bitmap, max, bit + 1);
+ bit = mb_find_next_bit_test(bitmap, max, bit + 1);
KUNIT_ASSERT_EQ(test, bit, start + len);
}
@@ -524,7 +525,7 @@ test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
block = ext4_group_first_block_no(sb, goal_group) +
EXT4_C2B(sbi, start);
- ext4_free_blocks_simple(inode, block, len);
+ ext4_free_blocks_simple_test(inode, block, len);
validate_free_blocks_simple(test, sb, goal_group, start, len);
mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
}
@@ -566,15 +567,15 @@ test_mark_diskspace_used_range(struct kunit *test,
bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
memset(bitmap, 0, sb->s_blocksize);
- ret = ext4_mb_mark_diskspace_used(ac, NULL);
+ ret = ext4_mb_mark_diskspace_used_test(ac, NULL);
KUNIT_ASSERT_EQ(test, ret, 0);
max = EXT4_CLUSTERS_PER_GROUP(sb);
- i = mb_find_next_bit(bitmap, max, 0);
+ i = mb_find_next_bit_test(bitmap, max, 0);
KUNIT_ASSERT_EQ(test, i, start);
- i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ i = mb_find_next_zero_bit_test(bitmap, max, i + 1);
KUNIT_ASSERT_EQ(test, i, start + len);
- i = mb_find_next_bit(bitmap, max, i + 1);
+ i = mb_find_next_bit_test(bitmap, max, i + 1);
KUNIT_ASSERT_EQ(test, max, i);
}
@@ -617,54 +618,54 @@ static void mbt_generate_buddy(struct super_block *sb, void *buddy,
max = EXT4_CLUSTERS_PER_GROUP(sb);
bb_h = buddy + sbi->s_mb_offsets[1];
- off = mb_find_next_zero_bit(bb, max, 0);
+ off = mb_find_next_zero_bit_test(bb, max, 0);
grp->bb_first_free = off;
while (off < max) {
grp->bb_counters[0]++;
grp->bb_free++;
- if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) {
grp->bb_free++;
grp->bb_counters[0]--;
- mb_clear_bit(off >> 1, bb_h);
+ mb_clear_bit_test(off >> 1, bb_h);
grp->bb_counters[1]++;
grp->bb_largest_free_order = 1;
off++;
}
- off = mb_find_next_zero_bit(bb, max, off + 1);
+ off = mb_find_next_zero_bit_test(bb, max, off + 1);
}
for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
bb = buddy + sbi->s_mb_offsets[order];
bb_h = buddy + sbi->s_mb_offsets[order + 1];
max = max >> 1;
- off = mb_find_next_zero_bit(bb, max, 0);
+ off = mb_find_next_zero_bit_test(bb, max, 0);
while (off < max) {
- if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) {
mb_set_bits(bb, off, 2);
grp->bb_counters[order] -= 2;
- mb_clear_bit(off >> 1, bb_h);
+ mb_clear_bit_test(off >> 1, bb_h);
grp->bb_counters[order + 1]++;
grp->bb_largest_free_order = order + 1;
off++;
}
- off = mb_find_next_zero_bit(bb, max, off + 1);
+ off = mb_find_next_zero_bit_test(bb, max, off + 1);
}
}
max = EXT4_CLUSTERS_PER_GROUP(sb);
- off = mb_find_next_zero_bit(bitmap, max, 0);
+ off = mb_find_next_zero_bit_test(bitmap, max, 0);
while (off < max) {
grp->bb_fragments++;
- off = mb_find_next_bit(bitmap, max, off + 1);
+ off = mb_find_next_bit_test(bitmap, max, off + 1);
if (off + 1 >= max)
break;
- off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ off = mb_find_next_zero_bit_test(bitmap, max, off + 1);
}
}
@@ -706,7 +707,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
/* needed by validation in ext4_mb_generate_buddy */
ext4_grp->bb_free = mbt_grp->bb_free;
memset(ext4_buddy, 0xff, sb->s_blocksize);
- ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
ext4_grp);
KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
@@ -760,7 +761,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
ex.fe_group = TEST_GOAL_GROUP;
ext4_lock_group(sb, TEST_GOAL_GROUP);
- mb_mark_used(e4b, &ex);
+ mb_mark_used_test(e4b, &ex);
ext4_unlock_group(sb, TEST_GOAL_GROUP);
mb_set_bits(bitmap, start, len);
@@ -769,7 +770,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
memset(buddy, 0xff, sb->s_blocksize);
for (i = 0; i < MB_NUM_ORDERS(sb); i++)
grp->bb_counters[i] = 0;
- ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+ ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
0);
@@ -798,7 +799,7 @@ static void test_mb_mark_used(struct kunit *test)
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
- ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
@@ -809,7 +810,7 @@ static void test_mb_mark_used(struct kunit *test)
test_mb_mark_used_range(test, &e4b, ranges[i].start,
ranges[i].len, bitmap, buddy, grp);
- ext4_mb_unload_buddy(&e4b);
+ ext4_mb_unload_buddy_test(&e4b);
}
static void
@@ -825,16 +826,16 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
return;
ext4_lock_group(sb, e4b->bd_group);
- mb_free_blocks(NULL, e4b, start, len);
+ mb_free_blocks_test(NULL, e4b, start, len);
ext4_unlock_group(sb, e4b->bd_group);
- mb_clear_bits(bitmap, start, len);
+ mb_clear_bits_test(bitmap, start, len);
/* bypass bb_free validation in ext4_mb_generate_buddy */
grp->bb_free += len;
memset(buddy, 0xff, sb->s_blocksize);
for (i = 0; i < MB_NUM_ORDERS(sb); i++)
grp->bb_counters[i] = 0;
- ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+ ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
0);
@@ -865,7 +866,7 @@ static void test_mb_free_blocks(struct kunit *test)
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
- ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
ex.fe_start = 0;
@@ -873,7 +874,7 @@ static void test_mb_free_blocks(struct kunit *test)
ex.fe_group = TEST_GOAL_GROUP;
ext4_lock_group(sb, TEST_GOAL_GROUP);
- mb_mark_used(&e4b, &ex);
+ mb_mark_used_test(&e4b, &ex);
ext4_unlock_group(sb, TEST_GOAL_GROUP);
grp->bb_free = 0;
@@ -886,7 +887,7 @@ static void test_mb_free_blocks(struct kunit *test)
test_mb_free_blocks_range(test, &e4b, ranges[i].start,
ranges[i].len, bitmap, buddy, grp);
- ext4_mb_unload_buddy(&e4b);
+ ext4_mb_unload_buddy_test(&e4b);
}
#define COUNT_FOR_ESTIMATE 100000
@@ -904,7 +905,7 @@ static void test_mb_mark_used_cost(struct kunit *test)
if (sb->s_blocksize > PAGE_SIZE)
kunit_skip(test, "blocksize exceeds pagesize");
- ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
ex.fe_group = TEST_GOAL_GROUP;
@@ -918,7 +919,7 @@ static void test_mb_mark_used_cost(struct kunit *test)
ex.fe_start = ranges[i].start;
ex.fe_len = ranges[i].len;
ext4_lock_group(sb, TEST_GOAL_GROUP);
- mb_mark_used(&e4b, &ex);
+ mb_mark_used_test(&e4b, &ex);
ext4_unlock_group(sb, TEST_GOAL_GROUP);
}
end = jiffies;
@@ -929,14 +930,14 @@ static void test_mb_mark_used_cost(struct kunit *test)
continue;
ext4_lock_group(sb, TEST_GOAL_GROUP);
- mb_free_blocks(NULL, &e4b, ranges[i].start,
+ mb_free_blocks_test(NULL, &e4b, ranges[i].start,
ranges[i].len);
ext4_unlock_group(sb, TEST_GOAL_GROUP);
}
}
kunit_info(test, "cost %lu jiffies\n", all);
- ext4_mb_unload_buddy(&e4b);
+ ext4_mb_unload_buddy_test(&e4b);
}
static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20e9fdaf4301..bb58eafb87bc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1199,6 +1199,8 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
/* searching for the right group start from the goal value specified */
start = ac->ac_g_ex.fe_group;
+ if (start >= ngroups)
+ start = 0;
ac->ac_prefetch_grp = start;
ac->ac_prefetch_nr = 0;
@@ -2443,8 +2445,12 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
- if (err)
+ if (err) {
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info) &&
+ !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return 0;
return err;
+ }
ext4_lock_group(ac->ac_sb, group);
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
@@ -3580,9 +3586,7 @@ err_freebuddy:
rcu_read_unlock();
iput(sbi->s_buddy_cache);
err_freesgi:
- rcu_read_lock();
- kvfree(rcu_dereference(sbi->s_group_info));
- rcu_read_unlock();
+ kvfree(rcu_access_pointer(sbi->s_group_info));
return -ENOMEM;
}
@@ -3889,15 +3893,14 @@ void ext4_mb_release(struct super_block *sb)
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
int count;
- if (test_opt(sb, DISCARD)) {
- /*
- * wait the discard work to drain all of ext4_free_data
- */
- flush_work(&sbi->s_discard_work);
- WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
- }
+ /*
+ * Wait for the discard work to drain all queued ext4_free_data.
+ */
+ flush_work(&sbi->s_discard_work);
+ WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
- if (sbi->s_group_info) {
+ group_info = rcu_access_pointer(sbi->s_group_info);
+ if (group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
@@ -3915,12 +3918,9 @@ void ext4_mb_release(struct super_block *sb)
num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
- rcu_read_lock();
- group_info = rcu_dereference(sbi->s_group_info);
for (i = 0; i < num_meta_group_infos; i++)
kfree(group_info[i]);
kvfree(group_info);
- rcu_read_unlock();
}
ext4_mb_avg_fragment_size_destroy(sbi);
ext4_mb_largest_free_orders_destroy(sbi);
@@ -4084,7 +4084,7 @@ void ext4_exit_mballoc(void)
#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
#define EXT4_MB_SYNC_UPDATE 0x0002
-static int
+int
ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
ext4_group_t group, ext4_grpblk_t blkoff,
ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
@@ -7188,6 +7188,102 @@ out_unload:
return error;
}
-#ifdef CONFIG_EXT4_KUNIT_TESTS
-#include "mballoc-test.c"
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+void mb_clear_bits_test(void *bm, int cur, int len)
+{
+ mb_clear_bits(bm, cur, len);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test);
+
+ext4_fsblk_t
+ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar,
+ int *errp)
+{
+ return ext4_mb_new_blocks_simple(ar, errp);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_new_blocks_simple_test);
+
+int mb_find_next_zero_bit_test(void *addr, int max, int start)
+{
+ return mb_find_next_zero_bit(addr, max, start);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_zero_bit_test);
+
+int mb_find_next_bit_test(void *addr, int max, int start)
+{
+ return mb_find_next_bit(addr, max, start);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_bit_test);
+
+void mb_clear_bit_test(int bit, void *addr)
+{
+ mb_clear_bit(bit, addr);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bit_test);
+
+int mb_test_bit_test(int bit, void *addr)
+{
+ return mb_test_bit(bit, addr);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_test_bit_test);
+
+int ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac,
+ handle_t *handle)
+{
+ return ext4_mb_mark_diskspace_used(ac, handle);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_diskspace_used_test);
+
+int mb_mark_used_test(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{
+ return mb_mark_used(e4b, ex);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_mark_used_test);
+
+void ext4_mb_generate_buddy_test(struct super_block *sb, void *buddy,
+ void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
+{
+ ext4_mb_generate_buddy(sb, buddy, bitmap, group, grp);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_generate_buddy_test);
+
+int ext4_mb_load_buddy_test(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+{
+ return ext4_mb_load_buddy(sb, group, e4b);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_load_buddy_test);
+
+void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b)
+{
+ ext4_mb_unload_buddy(e4b);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_unload_buddy_test);
+
+void mb_free_blocks_test(struct inode *inode, struct ext4_buddy *e4b,
+ int first, int count)
+{
+ mb_free_blocks(inode, e4b, first, count);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_free_blocks_test);
+
+void ext4_free_blocks_simple_test(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ return ext4_free_blocks_simple(inode, block, count);
+}
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_blocks_simple_test);
+
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_wait_block_bitmap);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_init);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_desc);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_count_free_clusters);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_info);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_group_clusters_set);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_release);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_read_block_bitmap_nowait);
+EXPORT_SYMBOL_FOR_EXT4_TEST(mb_set_bits);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_fc_init_inode);
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_context);
#endif
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 15a049f05d04..39333ce72cbd 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -270,4 +270,34 @@ ext4_mballoc_query_range(
ext4_mballoc_query_range_fn formatter,
void *priv);
+extern int ext4_mb_mark_context(handle_t *handle,
+ struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed);
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+extern void mb_clear_bits_test(void *bm, int cur, int len);
+extern ext4_fsblk_t
+ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar,
+ int *errp);
+extern int mb_find_next_zero_bit_test(void *addr, int max, int start);
+extern int mb_find_next_bit_test(void *addr, int max, int start);
+extern void mb_clear_bit_test(int bit, void *addr);
+extern int mb_test_bit_test(int bit, void *addr);
+extern int
+ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac,
+ handle_t *handle);
+extern int mb_mark_used_test(struct ext4_buddy *e4b,
+ struct ext4_free_extent *ex);
+extern void ext4_mb_generate_buddy_test(struct super_block *sb,
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp);
+extern int ext4_mb_load_buddy_test(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b);
+extern void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b);
+extern void mb_free_blocks_test(struct inode *inode,
+ struct ext4_buddy *e4b, int first, int count);
+extern void ext4_free_blocks_simple_test(struct inode *inode,
+ ext4_fsblk_t block, unsigned long count);
+#endif
#endif
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index a8c95eee91b7..39fe50b3c662 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -524,9 +524,15 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
- /* Nothing to submit? Just unlock the folio... */
- if (!nr_to_submit)
+ if (!nr_to_submit) {
+ /*
+ * We have nothing to submit. Just cycle the folio through
+ * writeback state to properly update xarray tags.
+ */
+ __folio_start_writeback(folio, keep_towrite);
+ folio_end_writeback(folio);
return 0;
+ }
bh = head = folio_buffers(folio);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43f680c750ae..a34efb44e73d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1254,12 +1254,10 @@ static void ext4_group_desc_free(struct ext4_sb_info *sbi)
struct buffer_head **group_desc;
int i;
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
+ group_desc = rcu_access_pointer(sbi->s_group_desc);
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(group_desc[i]);
kvfree(group_desc);
- rcu_read_unlock();
}
static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
@@ -1267,14 +1265,12 @@ static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
struct flex_groups **flex_groups;
int i;
- rcu_read_lock();
- flex_groups = rcu_dereference(sbi->s_flex_groups);
+ flex_groups = rcu_access_pointer(sbi->s_flex_groups);
if (flex_groups) {
for (i = 0; i < sbi->s_flex_groups_allocated; i++)
kvfree(flex_groups[i]);
kvfree(flex_groups);
}
- rcu_read_unlock();
}
static void ext4_put_super(struct super_block *sb)
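
Several ext4 teardown paths above drop rcu_read_lock()/rcu_dereference() in favour of rcu_access_pointer(): once no concurrent readers or updaters can exist, the __rcu pointer may be fetched directly and freed outside any read-side critical section. Sketched with a hypothetical structure:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct demo_table {
            int __rcu *slots;       /* RCU-published while live */
    };

    static void demo_table_free(struct demo_table *t)
    {
            /* Teardown only: nobody can still be traversing the table,
             * so no rcu_read_lock() is needed around the fetch. */
            kfree(rcu_access_pointer(t->slots));
    }
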
@@ -1527,6 +1523,27 @@ void ext4_clear_inode(struct inode *inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
ext4_discard_preallocations(inode);
+ /*
+ * We must remove the inode from the hash before ext4_free_inode()
+ * clears the bit in the inode bitmap; otherwise another process reusing
+ * the inode could block in insert_inode_hash(), waiting for inode
+ * eviction to complete with a transaction handle held open, and
+ * ext4_evict_inode() still running for that inode could in turn block
+ * waiting for the transaction commit if the inode is IS_SYNC => deadlock.
+ *
+ * Removing the inode from the hash here is safe. There are two cases
+ * to consider:
+ * 1) The inode still has references to it (i_nlink > 0). In that case
+ * we are keeping the inode and once we remove the inode from the hash,
+ * iget() can create a new inode structure for the same inode number
+ * and we are fine with that as all IO on behalf of the inode is
+ * finished.
+ * 2) We are deleting the inode (i_nlink == 0). In that case the inode
+ * number cannot be reused until ext4_free_inode() clears the bit in
+ * the inode bitmap, at which point all IO is done and reuse is safe
+ * again.
+ */
+ remove_inode_hash(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -3633,6 +3650,13 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
"extents feature\n");
return 0;
}
+ if (ext4_has_feature_bigalloc(sb) &&
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ ext4_msg(sb, KERN_WARNING,
+ "bad geometry: bigalloc file system with non-zero "
+ "first_data_block\n");
+ return 0;
+ }
#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
if (!readonly && (ext4_has_feature_quota(sb) ||
@@ -5403,6 +5427,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
+ mutex_init(&sbi->s_error_notify_mutex);
INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b87d7bdab06a..923b375e017f 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -597,7 +597,10 @@ static const struct kobj_type ext4_feat_ktype = {
void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
{
- sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
+ mutex_lock(&sbi->s_error_notify_mutex);
+ if (sbi->s_kobj.state_in_sysfs)
+ sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
+ mutex_unlock(&sbi->s_error_notify_mutex);
}
static struct kobject *ext4_root;
@@ -610,8 +613,10 @@ int ext4_register_sysfs(struct super_block *sb)
int err;
init_completion(&sbi->s_kobj_unregister);
+ mutex_lock(&sbi->s_error_notify_mutex);
err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root,
"%s", sb->s_id);
+ mutex_unlock(&sbi->s_error_notify_mutex);
if (err) {
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
@@ -644,7 +649,10 @@ void ext4_unregister_sysfs(struct super_block *sb)
if (sbi->s_proc)
remove_proc_subtree(sb->s_id, ext4_proc_root);
+
+ mutex_lock(&sbi->s_error_notify_mutex);
kobject_del(&sbi->s_kobj);
+ mutex_unlock(&sbi->s_error_notify_mutex);
}
int __init ext4_init_sysfs(void)
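
For context, sysfs_notify() on errors_count wakes pollers of that attribute; the mutex above only makes the notification safe against concurrent (un)registration. A hedged userspace sketch of the standard sysfs poll loop (path assumed, error handling elided):

	#include <fcntl.h>
	#include <poll.h>
	#include <unistd.h>

	int watch_errors_count(const char *path /* e.g. /sys/fs/ext4/<dev>/errors_count */)
	{
		char buf[32];
		int fd = open(path, O_RDONLY);
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		pread(fd, buf, sizeof(buf), 0);		/* arm: must read first */
		while (poll(&pfd, 1, -1) > 0) {
			pread(fd, buf, sizeof(buf), 0);	/* re-read after wakeup */
			/* ... react to the new error count ... */
		}
		close(fd);
		return 0;
	}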
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7c75ed7e8979..3c75ee025bda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1711,6 +1711,19 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
}
}
+static bool __sync_lazytime(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (!(inode_state_read(inode) & I_DIRTY_TIME)) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ inode_state_clear(inode, I_DIRTY_TIME);
+ spin_unlock(&inode->i_lock);
+ inode->i_op->sync_lazytime(inode);
+ return true;
+}
+
bool sync_lazytime(struct inode *inode)
{
if (!(inode_state_read_once(inode) & I_DIRTY_TIME))
@@ -1718,9 +1731,8 @@ bool sync_lazytime(struct inode *inode)
trace_writeback_lazytime(inode);
if (inode->i_op->sync_lazytime)
- inode->i_op->sync_lazytime(inode);
- else
- mark_inode_dirty_sync(inode);
+ return __sync_lazytime(inode);
+ mark_inode_dirty_sync(inode);
return true;
}
@@ -2775,13 +2787,8 @@ static void wait_sb_inodes(struct super_block *sb)
* The mapping can appear untagged while still on-list since we
* do not have the mapping lock. Skip it here, wb completion
* will remove it.
- *
- * If the mapping does not have data integrity semantics,
- * there's no need to wait for the writeout to complete, as the
- * mapping cannot guarantee that data is persistently stored.
*/
- if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) ||
- mapping_no_data_integrity(mapping))
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
continue;
spin_unlock_irq(&sb->s_inode_wblist_lock);
@@ -2916,6 +2923,17 @@ void sync_inodes_sb(struct super_block *sb)
*/
if (bdi == &noop_backing_dev_info)
return;
+
+ /*
+ * If the superblock has SB_I_NO_DATA_INTEGRITY set, there's no need to
+ * wait for the writeout to complete, as the filesystem cannot guarantee
+ * data persistence on sync. Just kick off writeback and return.
+ */
+ if (sb->s_iflags & SB_I_NO_DATA_INTEGRITY) {
+ wakeup_flusher_threads_bdi(bdi, WB_REASON_SYNC);
+ return;
+ }
+
WARN_ON(!rwsem_is_locked(&sb->s_umount));
/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
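
With this split, __sync_lazytime() clears I_DIRTY_TIME under i_lock before calling into the filesystem, so a ->sync_lazytime method only has to persist the deferred timestamps. A hypothetical implementation (the myfs_ name is illustrative, not an in-tree filesystem):

	static void myfs_sync_lazytime(struct inode *inode)
	{
		/*
		 * I_DIRTY_TIME has already been cleared by the caller; just
		 * fold the deferred a/m/ctime updates into ordinary dirty
		 * state so the next writeback pass persists them.
		 */
		mark_inode_dirty_sync(inode);
	}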
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b1bb7153cb78..676fd9856bfb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3201,10 +3201,8 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
- if (fc->writeback_cache) {
+ if (fc->writeback_cache)
mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data);
- mapping_set_no_data_integrity(&inode->i_data);
- }
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e57b8af06be9..c795abe47a4f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1709,6 +1709,7 @@ static void fuse_sb_defaults(struct super_block *sb)
sb->s_export_op = &fuse_export_operations;
sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
sb->s_iflags |= SB_I_NOIDMAP;
+ sb->s_iflags |= SB_I_NO_DATA_INTEGRITY;
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
index fc045f2e4c45..edd908183058 100644
--- a/fs/iomap/bio.c
+++ b/fs/iomap/bio.c
@@ -8,7 +8,10 @@
#include "internal.h"
#include "trace.h"
-static void iomap_read_end_io(struct bio *bio)
+static DEFINE_SPINLOCK(failed_read_lock);
+static struct bio_list failed_read_list = BIO_EMPTY_LIST;
+
+static void __iomap_read_end_io(struct bio *bio)
{
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
@@ -18,6 +21,52 @@ static void iomap_read_end_io(struct bio *bio)
bio_put(bio);
}
+static void iomap_fail_reads(struct work_struct *work)
+{
+ struct bio *bio;
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ unsigned long flags;
+
+ spin_lock_irqsave(&failed_read_lock, flags);
+ bio_list_merge_init(&tmp, &failed_read_list);
+ spin_unlock_irqrestore(&failed_read_lock, flags);
+
+ while ((bio = bio_list_pop(&tmp)) != NULL) {
+ __iomap_read_end_io(bio);
+ cond_resched();
+ }
+}
+
+static DECLARE_WORK(failed_read_work, iomap_fail_reads);
+
+static void iomap_fail_buffered_read(struct bio *bio)
+{
+ unsigned long flags;
+
+ /*
+ * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
+ * in the fserror code. The caller no longer owns the bio reference
+ * after the spinlock drops.
+ */
+ spin_lock_irqsave(&failed_read_lock, flags);
+ if (bio_list_empty(&failed_read_list))
+ WARN_ON_ONCE(!schedule_work(&failed_read_work));
+ bio_list_add(&failed_read_list, bio);
+ spin_unlock_irqrestore(&failed_read_lock, flags);
+}
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ if (bio->bi_status) {
+ iomap_fail_buffered_read(bio);
+ return;
+ }
+
+ __iomap_read_end_io(bio);
+}
+
static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
{
struct bio *bio = ctx->read_ctx;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 00f0efaf12b2..92a831cf4bf1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -514,6 +514,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
size_t folio_len = folio_size(folio);
+ struct iomap_folio_state *ifs;
size_t poff, plen;
loff_t pos_diff;
int ret;
@@ -525,7 +526,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
return iomap_iter_advance(iter, length);
}
- ifs_alloc(iter->inode, folio, iter->flags);
+ ifs = ifs_alloc(iter->inode, folio, iter->flags);
length = min_t(loff_t, length, folio_len - offset_in_folio(folio, pos));
while (length) {
@@ -560,11 +561,15 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
*bytes_submitted += plen;
/*
- * If the entire folio has been read in by the IO
- * helper, then the helper owns the folio and will end
- * the read on it.
+ * Hand off folio ownership to the IO helper when:
+ * 1) The entire folio has been submitted for IO, or
+ * 2) There is no ifs attached to the folio
+ *
+ * Case (2) occurs when 1 << i_blkbits matches the folio
+ * size but the underlying filesystem or block device
+ * uses a smaller granularity for IO.
*/
- if (*bytes_submitted == folio_len)
+ if (*bytes_submitted == folio_len || !ifs)
ctx->cur_folio = NULL;
}
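
A worked illustration of case (2), under assumed numbers: with a 4 KiB folio and 4 KiB filesystem blocks, ifs_alloc() returns NULL since there is no sub-folio state to track. If the IO helper nevertheless splits the read (say into two 2 KiB ranges), no single iteration sees *bytes_submitted == folio_len, so the "|| !ifs" test is what transfers folio ownership. Expressed as a hypothetical predicate:

	static bool iomap_read_folio_handoff(size_t bytes_submitted,
					     size_t folio_len,
					     struct iomap_folio_state *ifs)
	{
		/*
		 * Either the whole folio was submitted, or there is no ifs
		 * to count partial completions; the helper must own it.
		 */
		return bytes_submitted == folio_len || !ifs;
	}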
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index de89c5bef607..1508e2f54462 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -267,7 +267,15 @@ restart:
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ if (WARN_ON_ONCE(buffer_jwrite(bh))) {
+ put_bh(bh); /* drop the ref we just took */
+ spin_unlock(&journal->j_list_lock);
+ /* Clean up any previously batched buffers */
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
+ jbd2_journal_abort(journal, -EFSCORRUPTED);
+ return -EFSCORRUPTED;
+ }
journal->j_chkpt_bhs[batch_count++] = bh;
transaction->t_chp_stats.cs_written++;
transaction->t_checkpoint_list = jh->b_cpnext;
@@ -325,7 +333,10 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
- J_ASSERT(blocknr != 0);
+ if (WARN_ON_ONCE(blocknr == 0)) {
+ jbd2_journal_abort(journal, -EFSCORRUPTED);
+ return -EFSCORRUPTED;
+ }
/*
* We need to make sure that any blocks that were recently written out
diff --git a/fs/namei.c b/fs/namei.c
index 58f715f7657e..9e5500dad14f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2437,8 +2437,14 @@ inside:
EXPORT_SYMBOL(hashlen_string);
/*
- * Calculate the length and hash of the path component, and
- * return the length as the result.
+ * hash_name - Calculate the length and hash of the path component
+ * @nd: the path resolution state
+ * @name: the pathname to read the component from
+ * @lastword: set to LAST_WORD_IS_DOT or LAST_WORD_IS_DOTDOT when the
+ * component fits in a single word and is '.' or '..' respectively, to
+ * some other value for any other single-word component, and to 0 otherwise.
+ *
+ * Returns: a pointer to the terminating '/' or NUL character in @name.
*/
static inline const char *hash_name(struct nameidata *nd,
const char *name,
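
A hedged caller sketch based on the documentation above (the real consumer is link_path_walk(); the third parameter is assumed to be the @lastword out-pointer):

	unsigned long lastword;

	name = hash_name(nd, name, &lastword);
	if (lastword == LAST_WORD_IS_DOT) {
		/* component was "." : stay in the current directory */
	} else if (lastword == LAST_WORD_IS_DOTDOT) {
		/* component was ".." : step up to the parent */
	} else {
		/* any other component; lastword is 0 if it spans words */
	}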
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 88a0d801525f..a8c0d86118c5 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -171,9 +171,8 @@ static void netfs_queue_read(struct netfs_io_request *rreq,
spin_lock(&rreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
- stream->front = subreq;
if (!stream->active) {
- stream->collected_to = stream->front->start;
+ stream->collected_to = subreq->start;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
}
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index a498ee8d6674..f72e6da88cca 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -71,9 +71,8 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
spin_lock(&rreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
- stream->front = subreq;
if (!stream->active) {
- stream->collected_to = stream->front->start;
+ stream->collected_to = subreq->start;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
}
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index dd1451bf7543..f9ab69de3e29 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -111,7 +111,6 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq)
netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
subreq = stream->construct;
stream->construct = NULL;
- stream->front = NULL;
}
/* Check if (re-)preparation failed. */
@@ -186,10 +185,18 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq)
stream->sreq_max_segs = INT_MAX;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
- stream->prepare_write(subreq);
- __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
- netfs_stat(&netfs_n_wh_retry_write_subreq);
+ if (stream->prepare_write) {
+ stream->prepare_write(subreq);
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_stat(&netfs_n_wh_retry_write_subreq);
+ } else {
+ struct iov_iter source;
+
+ netfs_reset_iter(subreq);
+ source = subreq->io_iter;
+ netfs_reissue_write(stream, subreq, &source);
+ }
}
netfs_unbuffered_write_done(wreq);
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 72a435e5fc6d..154a14bb2d7f 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -143,6 +143,47 @@ static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
}
/*
+ * Select the span of a kvec iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. Returns the size of the span
+ * in bytes.
+ */
+static size_t netfs_limit_kvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct kvec *kvecs = iter->kvec;
+ unsigned int nkv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_kvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nkv && skip) {
+ len = kvecs[ix].iov_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nkv) {
+ len = min3(n, kvecs[ix].iov_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
* Select the span of an xarray iterator we're going to use. Limit it by both
* maximum size and maximum number of segments. It is assumed that segments
* can be larger than a page in size, provided they're physically contiguous.
@@ -245,6 +286,8 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
if (iov_iter_is_xarray(iter))
return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_kvec(iter))
+ return netfs_limit_kvec(iter, start_offset, max_size, max_segs);
BUG();
}
EXPORT_SYMBOL(netfs_limit_iter);
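
A hedged usage sketch of the new kvec branch: with the limits below, netfs_limit_kvec() stops after the first segment because max_segs is reached before max_size (buffers and sizes are illustrative only):

	char buf0[8192], buf1[8192];
	struct kvec kv[2] = {
		{ .iov_base = buf0, .iov_len = sizeof(buf0) },
		{ .iov_base = buf1, .iov_len = sizeof(buf1) },
	};
	struct iov_iter iter;
	size_t span;

	iov_iter_kvec(&iter, ITER_SOURCE, kv, 2, sizeof(buf0) + sizeof(buf1));
	span = netfs_limit_iter(&iter, 0 /* start_offset */,
				65536 /* max_size */, 1 /* max_segs */);
	/* span == 8192 here: capped by max_segs, not by max_size. */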
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 137f0e28a44c..e5f6665b3341 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -205,7 +205,8 @@ reassess:
* in progress. The issuer thread may be adding stuff to the tail
* whilst we're doing this.
*/
- front = READ_ONCE(stream->front);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
while (front) {
size_t transferred;
@@ -301,7 +302,6 @@ reassess:
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
- stream->front = front;
spin_unlock(&rreq->lock);
netfs_put_subrequest(remove,
notes & ABANDON_SREQ ?
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index 7793ba5e3e8f..cca9ac43c077 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -93,8 +93,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
from->start, from->transferred, from->len);
if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
- !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) {
+ subreq = from;
goto abandon;
+ }
list_for_each_continue(next, &stream->subrequests) {
subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
@@ -178,6 +180,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
if (subreq == to)
break;
}
+ subreq = NULL;
continue;
}
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index 8e6264f62a8f..d0e23bc42445 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -107,7 +107,6 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
spin_lock(&rreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
trace_netfs_sreq(subreq, netfs_sreq_trace_added);
- stream->front = subreq;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
spin_unlock(&rreq->lock);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 83eb3dc1adf8..b194447f4b11 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -228,7 +228,8 @@ reassess_streams:
if (!smp_load_acquire(&stream->active))
continue;
- front = stream->front;
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
while (front) {
trace_netfs_collect_sreq(wreq, front);
//_debug("sreq [%x] %llx %zx/%zx",
@@ -279,7 +280,6 @@ reassess_streams:
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
- stream->front = front;
spin_unlock(&wreq->lock);
netfs_put_subrequest(remove,
notes & SAW_FAILURE ?
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 437268f65640..2db688f94125 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -206,9 +206,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq,
spin_lock(&wreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
- stream->front = subreq;
if (!stream->active) {
- stream->collected_to = stream->front->start;
+ stream->collected_to = subreq->start;
/* Write list pointers before active flag */
smp_store_release(&stream->active, true);
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 758611ee4475..13cb60b52bd6 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -1146,15 +1146,15 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return -EOVERFLOW;
/*
- * With metacopy disabled, we fsync after final metadata copyup, for
+ * With "fsync=strict", we fsync after final metadata copyup, for
* both regular files and directories to get atomic copyup semantics
* on filesystems that do not use strict metadata ordering (e.g. ubifs).
*
- * With metacopy enabled we want to avoid fsync on all meta copyup
+ * By default, we want to avoid fsync on all meta copyup, because
* that will hurt performance of workloads such as chown -R, so we
* only fsync on data copyup as legacy behavior.
*/
- ctx.metadata_fsync = !OVL_FS(dentry->d_sb)->config.metacopy &&
+ ctx.metadata_fsync = ovl_should_sync_metadata(OVL_FS(dentry->d_sb)) &&
(S_ISREG(ctx.stat.mode) || S_ISDIR(ctx.stat.mode));
ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index cad2055ebf18..63b299bf12f7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -99,6 +99,12 @@ enum {
OVL_VERITY_REQUIRE,
};
+enum {
+ OVL_FSYNC_VOLATILE,
+ OVL_FSYNC_AUTO,
+ OVL_FSYNC_STRICT,
+};
+
/*
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
* where:
@@ -656,6 +662,21 @@ static inline bool ovl_xino_warn(struct ovl_fs *ofs)
return ofs->config.xino == OVL_XINO_ON;
}
+static inline bool ovl_should_sync(struct ovl_fs *ofs)
+{
+ return ofs->config.fsync_mode != OVL_FSYNC_VOLATILE;
+}
+
+static inline bool ovl_should_sync_metadata(struct ovl_fs *ofs)
+{
+ return ofs->config.fsync_mode == OVL_FSYNC_STRICT;
+}
+
+static inline bool ovl_is_volatile(struct ovl_config *config)
+{
+ return config->fsync_mode == OVL_FSYNC_VOLATILE;
+}
+
/*
* To avoid regressions in existing setups with overlay lower offline changes,
* we allow lower changes only if none of the new features are used.
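
Taken together, the two helpers encode a three-level policy. A hedged sketch of how a sync path is expected to consult them (ovl_skip_fsync() is illustrative, not an actual in-tree helper):

	static bool ovl_skip_fsync(struct ovl_fs *ofs, bool metadata_only)
	{
		if (!ovl_should_sync(ofs))		/* fsync=volatile */
			return true;
		if (metadata_only && !ovl_should_sync_metadata(ofs))
			return true;			/* fsync=auto (default) */
		return false;				/* fsync=strict */
	}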
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 1d4828dbcf7a..80cad4ea96a3 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -18,7 +18,7 @@ struct ovl_config {
int xino;
bool metacopy;
bool userxattr;
- bool ovl_volatile;
+ int fsync_mode;
};
struct ovl_sb {
@@ -120,11 +120,6 @@ static inline struct ovl_fs *OVL_FS(struct super_block *sb)
return (struct ovl_fs *)sb->s_fs_info;
}
-static inline bool ovl_should_sync(struct ovl_fs *ofs)
-{
- return !ofs->config.ovl_volatile;
-}
-
static inline unsigned int ovl_numlower(struct ovl_entry *oe)
{
return oe ? oe->__numlower : 0;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 8111b437ae5d..c93fcaa45d4a 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -58,6 +58,7 @@ enum ovl_opt {
Opt_xino,
Opt_metacopy,
Opt_verity,
+ Opt_fsync,
Opt_volatile,
Opt_override_creds,
};
@@ -140,6 +141,23 @@ static int ovl_verity_mode_def(void)
return OVL_VERITY_OFF;
}
+static const struct constant_table ovl_parameter_fsync[] = {
+ { "volatile", OVL_FSYNC_VOLATILE },
+ { "auto", OVL_FSYNC_AUTO },
+ { "strict", OVL_FSYNC_STRICT },
+ {}
+};
+
+static const char *ovl_fsync_mode(struct ovl_config *config)
+{
+ return ovl_parameter_fsync[config->fsync_mode].name;
+}
+
+static int ovl_fsync_mode_def(void)
+{
+ return OVL_FSYNC_AUTO;
+}
+
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
fsparam_file_or_string("lowerdir+", Opt_lowerdir_add),
@@ -155,6 +173,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_enum("xino", Opt_xino, ovl_parameter_xino),
fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool),
fsparam_enum("verity", Opt_verity, ovl_parameter_verity),
+ fsparam_enum("fsync", Opt_fsync, ovl_parameter_fsync),
fsparam_flag("volatile", Opt_volatile),
fsparam_flag_no("override_creds", Opt_override_creds),
{}
@@ -665,8 +684,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_verity:
config->verity_mode = result.uint_32;
break;
+ case Opt_fsync:
+ config->fsync_mode = result.uint_32;
+ break;
case Opt_volatile:
- config->ovl_volatile = true;
+ config->fsync_mode = OVL_FSYNC_VOLATILE;
break;
case Opt_userxattr:
config->userxattr = true;
@@ -800,6 +822,7 @@ int ovl_init_fs_context(struct fs_context *fc)
ofs->config.nfs_export = ovl_nfs_export_def;
ofs->config.xino = ovl_xino_def();
ofs->config.metacopy = ovl_metacopy_def;
+ ofs->config.fsync_mode = ovl_fsync_mode_def();
fc->s_fs_info = ofs;
fc->fs_private = ctx;
@@ -870,9 +893,9 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
config->index = false;
}
- if (!config->upperdir && config->ovl_volatile) {
+ if (!config->upperdir && ovl_is_volatile(config)) {
pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n");
- config->ovl_volatile = false;
+ config->fsync_mode = ovl_fsync_mode_def();
}
if (!config->upperdir && config->uuid == OVL_UUID_ON) {
@@ -1070,8 +1093,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config));
if (ofs->config.metacopy != ovl_metacopy_def)
seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy));
- if (ofs->config.ovl_volatile)
- seq_puts(m, ",volatile");
+ if (ofs->config.fsync_mode != ovl_fsync_mode_def())
+ seq_printf(m, ",fsync=%s", ovl_fsync_mode(&ofs->config));
if (ofs->config.userxattr)
seq_puts(m, ",userxattr");
if (ofs->config.verity_mode != ovl_verity_mode_def())
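
Userspace selects the mode either as fsync=volatile|auto|strict or via the legacy volatile flag. A hedged sketch using the new mount API (glibc >= 2.36 wrappers assumed; paths illustrative, error handling elided):

	#include <sys/mount.h>
	#include <fcntl.h>

	int fsfd = fsopen("overlay", FSOPEN_CLOEXEC);

	fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/lower", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/work", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "fsync", "strict", 0);
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	move_mount(mfd, "", AT_FDCWD, "/mnt/merged", MOVE_MOUNT_F_EMPTY_PATH);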
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index d4c12feec039..0822987cfb51 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -776,7 +776,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
 * For volatile mount, create an incompat/volatile/dirty file to keep
* track of it.
*/
- if (ofs->config.ovl_volatile) {
+ if (ovl_is_volatile(&ofs->config)) {
err = ovl_create_volatile_dirty(ofs);
if (err < 0) {
pr_err("Failed to create volatile/dirty file.\n");
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 3f1b763a8bb4..2ea769f311c3 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -85,7 +85,10 @@ int ovl_can_decode_fh(struct super_block *sb)
if (!exportfs_can_decode_fh(sb->s_export_op))
return 0;
- return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
+ if (sb->s_export_op->encode_fh == generic_encode_ino32_fh)
+ return FILEID_INO32_GEN;
+
+ return -1;
}
struct dentry *ovl_indexdir(struct super_block *sb)
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index fa7638b81246..383050e7fdf5 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -338,5 +338,6 @@ struct super_block {
#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
+#define SB_I_NO_DATA_INTEGRITY 0x00008000 /* fs cannot guarantee data persistence on sync */
#endif /* _LINUX_FS_SUPER_TYPES_H */
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index a9ff94b744f2..05673d3529e7 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry)
return swp_offset(entry) & SWP_PFN_MASK;
}
+static inline void softleaf_migration_sync(softleaf_t entry,
+ struct folio *folio)
+{
+ /*
+ * Ensure we do not race with a split, which might turn tail pages into
+ * new folios and thus result in observing an unlocked folio.
+ * This matches the write barrier in __split_folio_to_order().
+ */
+ smp_rmb();
+
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding folio is locked.
+ */
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+}
+
/**
* softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry.
* @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
@@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry)
struct page *page = pfn_to_page(softleaf_to_pfn(entry));
VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
- /*
- * Any use of migration entries may only occur while the
- * corresponding page is locked
- */
- VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page));
+ if (softleaf_is_migration(entry))
+ softleaf_migration_sync(entry, page_folio(page));
return page;
}
@@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry)
struct folio *folio = pfn_folio(softleaf_to_pfn(entry));
VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
- /*
- * Any use of migration entries may only occur while the
- * corresponding folio is locked.
- */
- VM_WARN_ON_ONCE(softleaf_is_migration(entry) &&
- !folio_test_locked(folio));
+ if (softleaf_is_migration(entry))
+ softleaf_migration_sync(entry, folio);
return folio;
}
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 0fe96f3ab3ef..65c732d440d2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -55,6 +55,7 @@ struct mempolicy {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
} w;
+ struct rcu_head rcu;
};
/*
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 72ee7d210a74..ba17ac5bf356 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -140,7 +140,6 @@ struct netfs_io_stream {
void (*issue_write)(struct netfs_io_subrequest *subreq);
/* Collection tracking */
struct list_head subrequests; /* Contributory I/O operations */
- struct netfs_io_subrequest *front; /* Op being collected */
unsigned long long collected_to; /* Position we've collected results to */
size_t transferred; /* The amount transferred from this stream */
unsigned short error; /* Aggregate error for the stream */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ec442af3f886..31a848485ad9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -210,7 +210,6 @@ enum mapping_flags {
AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
account usage to user cgroups */
- AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -346,16 +345,6 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
-static inline void mapping_set_no_data_integrity(struct address_space *mapping)
-{
- set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags);
-}
-
-static inline bool mapping_no_data_integrity(const struct address_space *mapping)
-{
- return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags);
-}
-
static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
return mapping->gfp_mask;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 125bdc166bfe..0864700f76e0 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -769,12 +769,15 @@ TRACE_EVENT(btrfs_sync_file,
),
TP_fast_assign(
- const struct dentry *dentry = file->f_path.dentry;
- const struct inode *inode = d_inode(dentry);
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = file_inode(file);
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *parent_inode = d_inode(parent);
- TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
+ TP_fast_assign_fsid(btrfs_sb(inode->i_sb));
 __entry->ino = btrfs_ino(BTRFS_I(inode));
- __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
+ __entry->parent = btrfs_ino(BTRFS_I(parent_inode));
+ dput(parent); /* parent_inode must not be used past this point */
__entry->datasync = datasync;
__entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root);
),
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 2d366be46a1c..cbe28211106c 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -740,19 +740,19 @@ TRACE_EVENT(netfs_collect_stream,
__field(unsigned int, wreq)
__field(unsigned char, stream)
__field(unsigned long long, collected_to)
- __field(unsigned long long, front)
+ __field(unsigned long long, issued_to)
),
TP_fast_assign(
__entry->wreq = wreq->debug_id;
__entry->stream = stream->stream_nr;
__entry->collected_to = stream->collected_to;
- __entry->front = stream->front ? stream->front->start : UINT_MAX;
+ __entry->issued_to = atomic64_read(&wreq->issued_to);
),
- TP_printk("R=%08x[%x:] cto=%llx frn=%llx",
+ TP_printk("R=%08x[%x:] cto=%llx ito=%llx",
__entry->wreq, __entry->stream,
- __entry->collected_to, __entry->front)
+ __entry->collected_to, __entry->issued_to)
);
TRACE_EVENT(netfs_folioq,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cf7e610eac42..31e83a09789e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
if (!vma)
return FUTEX_NO_NODE;
- mpol = vma_policy(vma);
+ mpol = READ_ONCE(vma->vm_policy);
if (!mpol)
return FUTEX_NO_NODE;
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index bc1f7e83a37e..7808068fa59e 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -918,7 +918,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
+ struct task_struct *exiting;
struct rt_mutex_waiter rt_waiter;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
@@ -933,6 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
+ exiting = NULL;
ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 743c7a728237..77ad9691f6a6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue,
if (ret)
return ret;
+ /*
+ * For now mandate both flags are identical, like the sys_futex()
+ * interface has. If/when we merge the variable sized futex support,
+ * that patch can modify this test to allow a difference in size.
+ */
+ if (futexes[0].w.flags != futexes[1].w.flags)
+ return -EINVAL;
+
cmpval = futexes[0].w.val;
return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 069d93bfb0c7..b64db405ba5c 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -540,7 +540,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return alarm_forward(alarm, timr->it_interval, now);
+ return alarm_forward(alarm, now, timr->it_interval);
}
/**
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d5230b759a2d..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
static struct llist_head trigger_data_free_list;
static DEFINE_MUTEX(trigger_data_kthread_mutex);
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_create_kthread_locked(void)
+{
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ if (!trigger_kthread) {
+ struct task_struct *kthread;
+
+ kthread = kthread_create(trigger_kthread_fn, NULL,
+ "trigger_data_free");
+ if (!IS_ERR(kthread))
+ WRITE_ONCE(trigger_kthread, kthread);
+ }
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ lockdep_assert_held(&trigger_data_kthread_mutex);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+ if (!llnodes)
+ return;
+
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+}
+
/* Bulk garbage collection of event_trigger_data elements */
static int trigger_kthread_fn(void *ignore)
{
@@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data)
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
+ /*
+ * Trigger data can be freed before kthread creation is possible during
+ * early boot. Keep the deferred-free semantics in that window and let
+ * late init start the kthread to drain the list.
+ */
+ if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+ llist_add(&data->llist, &trigger_data_free_list);
+ return;
+ }
+
if (unlikely(!trigger_kthread)) {
guard(mutex)(&trigger_data_kthread_mutex);
+
+ trigger_create_kthread_locked();
/* Check again after taking mutex */
if (!trigger_kthread) {
- struct task_struct *kthread;
-
- kthread = kthread_create(trigger_kthread_fn, NULL,
- "trigger_data_free");
- if (!IS_ERR(kthread))
- WRITE_ONCE(trigger_kthread, kthread);
+ llist_add(&data->llist, &trigger_data_free_list);
+ /* Drain the queued frees synchronously if creation failed. */
+ trigger_data_free_queued_locked();
+ return;
}
}
- if (!trigger_kthread) {
- /* Do it the slow way */
- tracepoint_synchronize_unregister();
- kfree(data);
- return;
- }
-
llist_add(&data->llist, &trigger_data_free_list);
wake_up_process(trigger_kthread);
}
+static int __init trigger_data_free_init(void)
+{
+ guard(mutex)(&trigger_data_kthread_mutex);
+
+ if (llist_empty(&trigger_data_free_list))
+ return 0;
+
+ trigger_create_kthread_locked();
+ if (trigger_kthread)
+ wake_up_process(trigger_kthread);
+ else
+ trigger_data_free_queued_locked();
+
+ return 0;
+}
+late_initcall(trigger_data_free_init);
+
static inline void data_ops_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index dee610e465b9..be6cf0bb3c03 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
if (!osnoise_has_registered_instances())
return;
- guard(mutex)(&interface_lock);
guard(cpus_read_lock)();
+ guard(mutex)(&interface_lock);
if (!cpu_online(cpu))
return;
@@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* avoid CPU hotplug operations that might read options.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
retval = cnt;
@@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
clear_bit(option, &osnoise_options);
}
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
@@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
if (running)
stop_per_cpu_kthreads();
- mutex_lock(&interface_lock);
/*
* osnoise_cpumask is read by CPU hotplug operations.
*/
cpus_read_lock();
+ mutex_lock(&interface_lock);
cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
- cpus_read_unlock();
mutex_unlock(&interface_lock);
+ cpus_read_unlock();
if (running)
start_per_cpu_kthreads();
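
All three paths now use the same nesting, which is the point of the reordering. The resulting invariant, as a hedged sketch:

	cpus_read_lock();		/* outer: CPU hotplug read lock */
	mutex_lock(&interface_lock);	/* inner: osnoise interface state */
	/* ... touch osnoise_options / osnoise_cpumask ... */
	mutex_unlock(&interface_lock);
	cpus_read_unlock();

Since osnoise_hotplug_workfn() already takes the hotplug lock first, any path that took interface_lock first would permit an ABBA deadlock.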
diff --git a/lib/bug.c b/lib/bug.c
index 623c467a8b76..aab9e6a40c5f 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -173,10 +173,8 @@ struct bug_entry *find_bug(unsigned long bugaddr)
return module_find_bug(bugaddr);
}
-__diag_push();
-__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
- "Not a valid __printf() conversion candidate.");
-static void __warn_printf(const char *fmt, struct pt_regs *regs)
+static __printf(1, 0)
+void __warn_printf(const char *fmt, struct pt_regs *regs)
{
if (!fmt)
return;
@@ -195,7 +193,6 @@ static void __warn_printf(const char *fmt, struct pt_regs *regs)
printk("%s", fmt);
}
-__diag_pop();
static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs)
{
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 576d1ddd736b..6a44a2f3d8fc 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1524,8 +1524,10 @@ static int damon_sysfs_commit_input(void *data)
if (IS_ERR(param_ctx))
return PTR_ERR(param_ctx);
test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx);
- if (!test_ctx)
+ if (!test_ctx) {
+ damon_destroy_ctx(param_ctx);
return -ENOMEM;
+ }
err = damon_commit_ctx(test_ctx, param_ctx);
if (err)
goto out;
@@ -1618,9 +1620,12 @@ static int damon_sysfs_repeat_call_fn(void *data)
if (!mutex_trylock(&damon_sysfs_lock))
return 0;
+ if (sysfs_kdamond->contexts->nr != 1)
+ goto out;
damon_sysfs_upd_tuned_intervals(sysfs_kdamond);
damon_sysfs_upd_schemes_stats(sysfs_kdamond);
damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond);
+out:
mutex_unlock(&damon_sysfs_lock);
return 0;
}
@@ -1747,6 +1752,9 @@ static int damon_sysfs_update_schemes_tried_regions(
static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
struct damon_sysfs_kdamond *kdamond)
{
+ if (cmd != DAMON_SYSFS_CMD_OFF && kdamond->contexts->nr != 1)
+ return -EINVAL;
+
switch (cmd) {
case DAMON_SYSFS_CMD_ON:
return damon_sysfs_turn_damon_on(kdamond);
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..c65e82c86fed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6815,11 +6815,16 @@ retry:
pudp = pud_offset(p4dp, address);
pud = pudp_get(pudp);
- if (pud_none(pud))
+ if (!pud_present(pud))
goto out;
if (pud_leaf(pud)) {
lock = pud_lock(mm, pudp);
- if (!unlikely(pud_leaf(pud))) {
+ pud = pudp_get(pudp);
+
+ if (unlikely(!pud_present(pud))) {
+ spin_unlock(lock);
+ goto out;
+ } else if (unlikely(!pud_leaf(pud))) {
spin_unlock(lock);
goto retry;
}
@@ -6831,9 +6836,16 @@ retry:
pmdp = pmd_offset(pudp, address);
pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ goto out;
if (pmd_leaf(pmd)) {
lock = pmd_lock(mm, pmdp);
- if (!unlikely(pmd_leaf(pmd))) {
+ pmd = pmdp_get(pmdp);
+
+ if (unlikely(!pmd_present(pmd))) {
+ spin_unlock(lock);
+ goto out;
+ } else if (unlikely(!pmd_leaf(pmd))) {
spin_unlock(lock);
goto retry;
}
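
Both hunks apply the same discipline: an entry sampled locklessly must be re-read once the page-table lock is held, because it may have been zapped (bail out) or split (retry) in between. Distilled as a generic sketch (hypothetical helper, not the actual function, which also handles the PUD level):

	static int follow_pmd_stable(struct mm_struct *mm, pmd_t *pmdp,
				     spinlock_t **lockp)
	{
		pmd_t pmd;

	retry:
		pmd = pmdp_get_lockless(pmdp);	/* step 1: lockless sample */
		if (!pmd_present(pmd))
			return -ENOENT;		/* nothing mapped */
		if (pmd_leaf(pmd)) {
			*lockp = pmd_lock(mm, pmdp);
			pmd = pmdp_get(pmdp);	/* step 2: re-read under lock */
			if (unlikely(!pmd_present(pmd))) {
				spin_unlock(*lockp);
				return -ENOENT;	/* zapped while unlocked */
			}
			if (unlikely(!pmd_leaf(pmd))) {
				spin_unlock(*lockp);
				goto retry;	/* split while unlocked */
			}
			return 0;	/* step 3: leaf stable until unlock */
		}
		return -EAGAIN;	/* points to a PTE table; walk lower level */
	}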
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e5175f1c767..cf92bd6a8226 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -487,7 +487,13 @@ void __mpol_put(struct mempolicy *pol)
{
if (!atomic_dec_and_test(&pol->refcnt))
return;
- kmem_cache_free(policy_cache, pol);
+ /*
+ * Required to allow mmap_lock_speculative*() access, see for example
+ * futex_key_to_node_opt(). All accesses are serialized by mmap_lock;
+ * however, a speculative lock section is not bounded by the normal
+ * lock boundaries, so the policy must be freed via RCU.
+ */
+ kfree_rcu(pol, rcu);
}
EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
@@ -1020,7 +1026,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_lock */
+ WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */
mpol_put(old);
return 0;
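
Combined with the kfree_rcu() change in __mpol_put() and the READ_ONCE() in the futex code earlier in this series, this enables lockless readers of the following shape (hedged sketch; the helper and field use are illustrative):

	static int vma_policy_mode_speculative(struct vm_area_struct *vma)
	{
		struct mempolicy *mpol;
		int mode = -1;

		rcu_read_lock();
		mpol = READ_ONCE(vma->vm_policy); /* pairs with WRITE_ONCE() above */
		if (mpol)
			mode = mpol->mode;	/* safe until rcu_read_unlock() */
		rcu_read_unlock();
		return mode;
	}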
diff --git a/mm/mseal.c b/mm/mseal.c
index 316b5e1dec78..ac58643181f7 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -56,7 +56,6 @@ static int mseal_apply(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct vm_area_struct *vma, *prev;
- unsigned long curr_start = start;
VMA_ITERATOR(vmi, mm, start);
/* We know there are no gaps so this will be non-NULL. */
@@ -66,6 +65,7 @@ static int mseal_apply(struct mm_struct *mm,
prev = vma;
for_each_vma_range(vmi, vma, end) {
+ const unsigned long curr_start = MAX(vma->vm_start, start);
const unsigned long curr_end = MIN(vma->vm_end, end);
if (!(vma->vm_flags & VM_SEALED)) {
@@ -79,7 +79,6 @@ static int mseal_apply(struct mm_struct *mm,
}
prev = vma;
- curr_start = curr_end;
}
return 0;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index a94c401ab2cf..4e7bcd975c54 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -97,6 +97,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ pud_t pudval = pudp_get(pud);
pmd_t *pmd;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
@@ -105,6 +106,24 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
int err = 0;
int depth = real_depth(3);
+ /*
+ * For PTE handling, pte_offset_map_lock() takes care of checking
+ * whether there actually is a page table. But it also has to be
+ * very careful about concurrent page table reclaim.
+ *
+ * Similarly, we have to be careful here - a PUD entry that points
+ * to a PMD table cannot go away, so we can just walk it. But if
+ * it's something else, we may have raced with a concurrent change,
+ * so we need to retry.
+ *
+ * A pertinent example of this is a PUD refault after PUD split -
+ * we will need to split again or risk accessing invalid memory.
+ */
+ if (!pud_present(pudval) || pud_leaf(pudval)) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
pmd = pmd_offset(pud, addr);
do {
again:
@@ -218,12 +237,12 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
else if (pud_leaf(*pud) || !pud_present(*pud))
continue; /* Nothing to do. */
- if (pud_none(*pud))
- goto again;
-
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
} while (pud++, addr = next, addr != end);
return err;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d0eef7470be..48aff2c917c0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -494,6 +494,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
+
+ if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
+ goto failed;
+
for (;;) {
ret = swap_cache_add_folio(folio, entry, &shadow);
if (!ret)
@@ -514,11 +518,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
goto failed;
}
- if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
- swap_cache_del_folio(folio);
- goto failed;
- }
-
memcg1_swapin(entry, folio_nr_pages(folio));
if (shadow)
workingset_refault(folio, shadow);
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 7aec3ae82a44..c6dafb3cc116 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -1020,7 +1020,7 @@ FIXTURE_SETUP(mount_setattr_idmapped)
"size=100000,mode=700"), 0);
ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV,
- "size=2m,mode=700"), 0);
+ "size=256m,mode=700"), 0);
ASSERT_EQ(mkdir("/mnt/A", 0777), 0);