104 files changed, 1501 insertions, 602 deletions
@@ -316,6 +316,7 @@ Hans Verkuil <hverkuil@kernel.org> <hverkuil-cisco@xs4all.nl>
 Hans Verkuil <hverkuil@kernel.org> <hansverk@cisco.com>
 Hao Ge <hao.ge@linux.dev> <gehao@kylinos.cn>
 Harry Yoo <harry.yoo@oracle.com> <42.hyeyoo@gmail.com>
+Harry Yoo <harry@kernel.org> <harry.yoo@oracle.com>
 Heiko Carstens <hca@linux.ibm.com> <h.carstens@de.ibm.com>
 Heiko Carstens <hca@linux.ibm.com> <heiko.carstens@de.ibm.com>
 Heiko Stuebner <heiko@sntech.de> <heiko.stuebner@bqreaders.com>
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index af5a69f87da4..eb846518e6ac 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -783,6 +783,56 @@ controlled by the "uuid" mount option, which supports these values:
     mounted with "uuid=on".
 
+Durability and copy up
+----------------------
+
+The fsync(2) system call ensures that the data and metadata of a file
+are safely written to the backing storage, which is expected to
+guarantee the existence of the information post system crash.
+
+Without an fsync(2) call, there is no guarantee that the observed
+data after a system crash will be either the old or the new data, but
+in practice, the observed data after crash is often the old or new data
+or a mix of both.
+
+When an overlayfs file is modified for the first time, copy up will
+create a copy of the lower file and its parent directories in the upper
+layer. Since the Linux filesystem API does not enforce any particular
+ordering on storing changes without explicit fsync(2) calls, in case
+of a system crash, the upper file could end up with no data at all
+(i.e. zeros), which would be an unusual outcome. To avoid this
+experience, overlayfs calls fsync(2) on the upper file before completing
+data copy up with rename(2) or link(2) to make the copy up "atomic".
+
+By default, overlayfs does not explicitly call fsync(2) on copied up
+directories or on metadata-only copy up, so it provides no guarantee to
+persist the user's modification unless the user calls fsync(2).
+The fsync during copy up only guarantees that if a copy up is observed
+after a crash, the observed data is not zeroes or intermediate values
+from the copy up staging area.
+
+On traditional local filesystems with a single journal (e.g. ext4, xfs),
+fsync on a file also persists the parent directory changes, because they
+are usually modified in the same transaction, so metadata durability during
+data copy up effectively comes for free. Overlayfs further limits risk by
+disallowing network filesystems as upper layer.
+
+Overlayfs can be tuned to prefer performance or durability when storing
+to the underlying upper layer. This is controlled by the "fsync" mount
+option, which supports these values:
+
+- "auto": (default)
+    Call fsync(2) on upper file before completion of data copy up.
+    No explicit fsync(2) on directory or metadata-only copy up.
+- "strict":
+    Call fsync(2) on upper file and directories before completion of any
+    copy up.
+- "volatile": [*]
+    Prefer performance over durability (see `Volatile mount`_)
+
+[*] The mount option "volatile" is an alias to "fsync=volatile".
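[The copy up sequence described above is an instance of the classic "write to a staging file, fsync, then atomically publish with rename" durability pattern. A minimal userspace sketch of the same idea (illustrative only, not the overlayfs implementation; the helper name is invented):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Write buf to a staging path, make it durable, then atomically
     * publish it as dst. After a crash, dst is either the old file or
     * the complete new one, never a half-written or zero-filled copy. */
    static int durable_replace(const char *staging, const char *dst,
                               const void *buf, size_t len)
    {
            int fd = open(staging, O_WRONLY | O_CREAT | O_TRUNC, 0600);

            if (fd < 0)
                    return -1;
            if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
                    close(fd);
                    return -1;
            }
            if (close(fd) != 0)
                    return -1;
            return rename(staging, dst);  /* the atomic "copy up done" step */
    }

With "fsync=strict", overlayfs additionally fsyncs the copied-up directories, which on a single-journal filesystem would usually ride along with the file's fsync anyway; a hypothetical mount line: mount -t overlay overlay -o lowerdir=/l,upperdir=/u,workdir=/w,fsync=strict /merged.]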
+
+
 Volatile mount
 --------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 23d88c825175..c3fe46d7c4bc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9619,7 +9619,12 @@ F:	include/linux/ext2*
 EXT4 FILE SYSTEM
 M:	"Theodore Ts'o" <tytso@mit.edu>
-M:	Andreas Dilger <adilger.kernel@dilger.ca>
+R:	Andreas Dilger <adilger.kernel@dilger.ca>
+R:	Baokun Li <libaokun@linux.alibaba.com>
+R:	Jan Kara <jack@suse.cz>
+R:	Ojaswin Mujoo <ojaswin@linux.ibm.com>
+R:	Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+R:	Zhang Yi <yi.zhang@huawei.com>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
 W:	http://ext4.wiki.kernel.org
@@ -12015,7 +12020,6 @@ I2C SUBSYSTEM
 M:	Wolfram Sang <wsa+renesas@sang-engineering.com>
 L:	linux-i2c@vger.kernel.org
 S:	Maintained
-W:	https://i2c.wiki.kernel.org/
 Q:	https://patchwork.ozlabs.org/project/linux-i2c/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git
 F:	Documentation/i2c/
@@ -12041,7 +12045,6 @@ I2C SUBSYSTEM HOST DRIVERS
 M:	Andi Shyti <andi.shyti@kernel.org>
 L:	linux-i2c@vger.kernel.org
 S:	Maintained
-W:	https://i2c.wiki.kernel.org/
 Q:	https://patchwork.ozlabs.org/project/linux-i2c/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git
 F:	Documentation/devicetree/bindings/i2c/
@@ -16883,7 +16886,7 @@ M:	Lorenzo Stoakes <ljs@kernel.org>
 R:	Rik van Riel <riel@surriel.com>
 R:	Liam R. Howlett <Liam.Howlett@oracle.com>
 R:	Vlastimil Babka <vbabka@kernel.org>
-R:	Harry Yoo <harry.yoo@oracle.com>
+R:	Harry Yoo <harry@kernel.org>
 R:	Jann Horn <jannh@google.com>
 L:	linux-mm@kvack.org
 S:	Maintained
@@ -24349,7 +24352,7 @@ F:	drivers/nvmem/layouts/sl28vpd.c
 SLAB ALLOCATOR
 M:	Vlastimil Babka <vbabka@kernel.org>
-M:	Harry Yoo <harry.yoo@oracle.com>
+M:	Harry Yoo <harry@kernel.org>
 M:	Andrew Morton <akpm@linux-foundation.org>
 R:	Hao Li <hao.li@linux.dev>
 R:	Christoph Lameter <cl@gentwo.org>
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@ VERSION = 7
 PATCHLEVEL = 0
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Baby Opossum Posse
 
 # *DOCUMENTATION*
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 670404d4fa44..7b8d70fe406d 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -135,32 +135,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt
 }
 
 /**
- * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
- * @crstep: Pointer to the CRST entry
- * @new: Replacement entry.
- * @gfn: The affected guest address.
- * @asce: The ASCE of the address space.
- *
- * Context: This function is assumed to be called with kvm->mmu_lock held.
- */
-void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
-{
-	if (crstep->h.i) {
-		WRITE_ONCE(*crstep, new);
-		return;
-	} else if (cpu_has_edat2()) {
-		crdte_crste(crstep, *crstep, new, gfn, asce);
-		return;
-	}
-
-	if (machine_has_tlb_guest())
-		idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
-	else
-		idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
-	WRITE_ONCE(*crstep, new);
-}
-
-/**
  * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
  * @crstep: Pointer to the CRST entry.
  * @old: Expected old value.
@@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce
  *
  * Return: %true if the exchange was successful.
  */
-bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
-			    union asce asce)
+bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+					 gfn_t gfn, union asce asce)
 {
 	if (old.h.i)
 		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
@@ -292,6 +266,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g
 			pt->ptes[i].val = init.val | i * PAGE_SIZE;
 		/* No need to take locks as the page table is not installed yet. */
 		pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+		pgste_init.vsie_notif = old.s.fc1.vsie_notif;
 		pgste_init.pcl = uses_skeys && init.h.i;
 		dat_init_pgstes(pt, pgste_init.val);
 	} else {
@@ -893,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d
 
 	/* This table entry needs to be updated. */
 	if (walk->start <= gfn && walk->end >= next) {
-		dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+		if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+			return -EINVAL;
 		/* A lower level table was present, needs to be freed. */
 		if (!crste.h.fc && !crste.h.i) {
 			if (is_pmd(crste))
@@ -1021,67 +997,21 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
 	return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
 }
 
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
-	     bool uses_skeys, struct guest_fault *f)
-{
-	union crste oldval, newval;
-	union pte newpte, oldpte;
-	union pgste pgste;
-	int rc = 0;
-
-	rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
-	if (rc == -EINVAL || rc == -ENOMEM)
-		return rc;
-	if (rc)
-		return -EAGAIN;
-
-	if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
-		return -EINVAL;
-
-	if (f->ptep) {
-		pgste = pgste_get_lock(f->ptep);
-		oldpte = *f->ptep;
-		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
-		newpte.s.sd = oldpte.s.sd;
-		oldpte.s.sd = 0;
-		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
-			pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
-			if (f->callback)
-				f->callback(f);
-		} else {
-			rc = -EAGAIN;
-		}
-		pgste_set_unlock(f->ptep, pgste);
-	} else {
-		oldval = READ_ONCE(*f->crstep);
-		newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
-				    f->write_attempt | oldval.s.fc1.d);
-		newval.s.fc1.sd = oldval.s.fc1.sd;
-		if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
-		    crste_origin_large(oldval) != crste_origin_large(newval))
-			return -EAGAIN;
-		if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
-			return -EAGAIN;
-		if (f->callback)
-			f->callback(f);
-	}
-
-	return rc;
-}
-
 static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
 {
-	union crste crste = READ_ONCE(*crstep);
+	union crste newcrste, oldcrste;
 	int *n = walk->priv;
 
-	if (!crste.h.fc || crste.h.i || crste.h.p)
-		return 0;
-
+	do {
+		oldcrste = READ_ONCE(*crstep);
+		if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
+			return 0;
+		if (oldcrste.s.fc1.prefix_notif)
+			break;
+		newcrste = oldcrste;
+		newcrste.s.fc1.prefix_notif = 1;
+	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
 	*n = 2;
-	if (crste.s.fc1.prefix_notif)
-		return 0;
-	crste.s.fc1.prefix_notif = 1;
-	dat_crstep_xchg(crstep, crste, gfn, walk->asce);
 	return 0;
 }
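[Throughout this series, call sites of the removed dat_crstep_xchg() are converted to the optimistic read/modify/compare-exchange retry loop seen in dat_set_pn_crste() above. A standalone sketch of that pattern in plain C11 atomics, standing in for the kernel's READ_ONCE()/arch_try_cmpxchg(); the names are illustrative:

    #include <stdatomic.h>

    /* Set a flag bit in *entry without taking a lock: re-read, re-derive
     * the new value, and retry until no concurrent writer interferes. */
    static void entry_set_flag(_Atomic unsigned long *entry, unsigned long flag)
    {
            unsigned long old, new;

            do {
                    old = atomic_load(entry);  /* fresh sample on each retry */
                    if (old & flag)            /* already set: nothing to do */
                            return;
                    new = old | flag;
            } while (!atomic_compare_exchange_strong(entry, &old, new));
    }

Marking dat_crstep_xchg_atomic() __must_check forces every caller to handle the failure case, which is exactly what the new do/while loops and KVM_BUG_ON() checks in this diff do.]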
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 123e11dcd70d..874cc962e196 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -160,14 +160,14 @@ union pmd {
 		unsigned long		:44;	/* HW */
 		unsigned long		: 3;	/* Unused */
 		unsigned long		: 1;	/* HW */
+		unsigned long s		: 1;	/* Special */
 		unsigned long w		: 1;	/* Writable soft-bit */
 		unsigned long r		: 1;	/* Readable soft-bit */
 		unsigned long d		: 1;	/* Dirty */
 		unsigned long y		: 1;	/* Young */
-		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long		: 3;	/* HW */
+		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long vsie_notif : 1;	/* Referenced in a shadow table */
-		unsigned long		: 1;	/* Unused */
 		unsigned long		: 4;	/* HW */
 		unsigned long sd	: 1;	/* Soft-Dirty */
 		unsigned long pr	: 1;	/* Present */
@@ -183,14 +183,14 @@ union pud {
 		unsigned long		:33;	/* HW */
 		unsigned long		:14;	/* Unused */
 		unsigned long		: 1;	/* HW */
+		unsigned long s		: 1;	/* Special */
 		unsigned long w		: 1;	/* Writable soft-bit */
 		unsigned long r		: 1;	/* Readable soft-bit */
 		unsigned long d		: 1;	/* Dirty */
 		unsigned long y		: 1;	/* Young */
-		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long		: 3;	/* HW */
+		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long vsie_notif : 1;	/* Referenced in a shadow table */
-		unsigned long		: 1;	/* Unused */
 		unsigned long		: 4;	/* HW */
 		unsigned long sd	: 1;	/* Soft-Dirty */
 		unsigned long pr	: 1;	/* Present */
@@ -254,14 +254,14 @@ union crste {
 	struct {
 		unsigned long		:47;
 		unsigned long		: 1;	/* HW (should be 0) */
+		unsigned long s		: 1;	/* Special */
 		unsigned long w		: 1;	/* Writable */
 		unsigned long r		: 1;	/* Readable */
 		unsigned long d		: 1;	/* Dirty */
 		unsigned long y		: 1;	/* Young */
-		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long		: 3;	/* HW */
+		unsigned long prefix_notif : 1;	/* Guest prefix invalidation notification */
 		unsigned long vsie_notif : 1;	/* Referenced in a shadow table */
-		unsigned long		: 1;
 		unsigned long		: 4;	/* HW */
 		unsigned long sd	: 1;	/* Soft-Dirty */
 		unsigned long pr	: 1;	/* Present */
@@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf
 		 u16 type, u16 param);
 int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
 bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
-	     bool uses_skeys, struct guest_fault *f);
 int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state,
 		     bool *dirty);
 long dat_reset_cmma(union asce asce, gfn_t start_gfn);
@@ -938,11 +936,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu
 	return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
 }
 
-static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
+static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
 {
-	union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
+	union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);
 
-	dat_crstep_xchg(crstep, newcrste, gfn, asce);
+	do {
+		oldcrste = READ_ONCE(*crstep);
+	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
+	return oldcrste;
 }
 
 static inline int get_level(union crste *crstep, union pte *ptep)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index a9da9390867d..53a8550e7102 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1436,13 +1436,21 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
 	if (!pgste_get_trylock(ptep_h, &pgste))
 		return -EAGAIN;
 
-	newpte = _pte(f->pfn, f->writable, !p, 0);
-	newpte.s.d |= ptep->s.d;
-	newpte.s.sd |= ptep->s.sd;
-	newpte.h.p &= ptep->h.p;
-	pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
-	pgste.vsie_notif = 1;
+	newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
+	newpte.s.d |= ptep_h->s.d;
+	newpte.s.sd |= ptep_h->s.sd;
+	newpte.h.p &= ptep_h->h.p;
+	if (!newpte.h.p && !f->writable) {
+		rc = -EOPNOTSUPP;
+	} else {
+		pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
+		pgste.vsie_notif = 1;
+	}
 	pgste_set_unlock(ptep_h, pgste);
+	if (rc)
+		return rc;
+	if (!sg->parent)
+		return -EAGAIN;
 
 	newpte = _pte(f->pfn, 0, !p, 0);
 	if (!pgste_get_trylock(ptep, &pgste))
@@ -1456,7 +1464,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
 static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
 			    struct guest_fault *f, bool p)
 {
-	union crste newcrste;
+	union crste newcrste, oldcrste;
 	gfn_t gfn;
 	int rc;
 
@@ -1469,16 +1477,28 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
 	if (rc)
 		return rc;
 
-	newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
-	newcrste.s.fc1.d |= host->s.fc1.d;
-	newcrste.s.fc1.sd |= host->s.fc1.sd;
-	newcrste.h.p &= host->h.p;
-	newcrste.s.fc1.vsie_notif = 1;
-	newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
-	_gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
+	do {
+		/* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
+		if (!sg->parent)
+			return -EAGAIN;
+		oldcrste = READ_ONCE(*host);
+		newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
+		newcrste.s.fc1.d |= oldcrste.s.fc1.d;
+		newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
+		newcrste.h.p &= oldcrste.h.p;
+		newcrste.s.fc1.vsie_notif = 1;
+		newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+		newcrste.s.fc1.s = oldcrste.s.fc1.s;
+		if (!newcrste.h.p && !f->writable)
+			return -EOPNOTSUPP;
+	} while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
+	if (!sg->parent)
+		return -EAGAIN;
 
-	newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
-	dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+	newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
+	gfn = gpa_to_gfn(raddr);
+	while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+		;
 
 	return 0;
 }
@@ -1502,21 +1522,31 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
 	if (rc)
 		return rc;
 
-	/* A race occourred. The shadow mapping is already valid, nothing to do */
-	if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+	/* A race occurred. The shadow mapping is already valid, nothing to do */
+	if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
+	    (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
 		return 0;
 
 	gl = get_level(table, ptep);
 
+	/* In case of a real address space */
+	if (w->level <= LEVEL_MEM) {
+		l = TABLE_TYPE_PAGE_TABLE;
+		hl = TABLE_TYPE_REGION1;
+		goto real_address_space;
+	}
+
 	/*
 	 * Skip levels that are already protected. For each level, protect
 	 * only the page containing the entry, not the whole table.
 	 */
 	for (i = gl ; i >= w->level; i--) {
-		rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
-				       entries[i - 1].pfn, i, entries[i - 1].writable);
+		rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
+				       entries[i].pfn, i + 1, entries[i].writable);
 		if (rc)
 			return rc;
+		if (!sg->parent)
+			return -EAGAIN;
 	}
 
 	rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
@@ -1528,6 +1558,7 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
 
 	/* Get the smallest granularity */
 	l = min3(gl, hl, w->level);
+real_address_space:
 	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
 	/* If necessary, create the shadow mapping */
 	if (l < gl) {
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index ef0c6ebfdde2..645c32c767d2 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
 	struct clear_young_pte_priv *priv = walk->priv;
 	union crste crste, new;
 
-	crste = READ_ONCE(*crstep);
+	do {
+		crste = READ_ONCE(*crstep);
+
+		if (!crste.h.fc)
+			return 0;
+		if (!crste.s.fc1.y && crste.h.i)
+			return 0;
+		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
+			break;
 
-	if (!crste.h.fc)
-		return 0;
-	if (!crste.s.fc1.y && crste.h.i)
-		return 0;
-	if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
 		new = crste;
 		new.h.i = 1;
 		new.s.fc1.y = 0;
@@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
 			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
 		new.s.fc1.d = 0;
 		new.h.p = 1;
-		dat_crstep_xchg(crstep, new, gfn, walk->asce);
-	}
+	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
+
 	priv->young = 1;
 	return 0;
 }
@@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
 {
 	struct gmap_unmap_priv *priv = walk->priv;
 	struct folio *folio = NULL;
+	union crste old = *crstep;
 
-	if (crstep->h.fc) {
-		if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
-			folio = phys_to_folio(crste_origin_large(*crstep));
-		gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
-		if (folio)
-			uv_convert_from_secure_folio(folio);
-	}
+	if (!old.h.fc)
+		return 0;
+
+	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+		folio = phys_to_folio(crste_origin_large(old));
+	/* No races should happen because kvm->mmu_lock is held in write mode */
+	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
+		   priv->gmap->kvm);
+	if (folio)
+		uv_convert_from_secure_folio(folio);
 	return 0;
 }
@@ -474,23 +481,24 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t
 	if (fatal_signal_pending(current))
 		return 1;
 
-	crste = READ_ONCE(*table);
-	if (!crste.h.fc)
-		return 0;
-	if (crste.h.p && !crste.s.fc1.sd)
-		return 0;
+	do {
+		crste = READ_ONCE(*table);
+		if (!crste.h.fc)
+			return 0;
+		if (crste.h.p && !crste.s.fc1.sd)
+			return 0;
 
-	/*
-	 * If this large page contains one or more prefixes of vCPUs that are
-	 * currently running, do not reset the protection, leave it marked as
-	 * dirty.
-	 */
-	if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
+		/*
+		 * If this large page contains one or more prefixes of vCPUs that are
+		 * currently running, do not reset the protection, leave it marked as
+		 * dirty.
+		 */
+		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
+			break;
 		new = crste;
 		new.h.p = 1;
 		new.s.fc1.sd = 0;
-		gmap_crstep_xchg(gmap, table, new, gfn);
-	}
+	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
 
 	for ( ; gfn < end; gfn++)
 		mark_page_dirty(gmap->kvm, gfn);
@@ -511,7 +519,7 @@ void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
 	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
 }
 
-static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
+static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
 {
 	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
 
@@ -536,10 +544,8 @@ static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
 			newcrste.s.fc1.d = 1;
 			newcrste.s.fc1.sd = 1;
 		}
-		if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
-			SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
 		/* In case of races, let the slow path deal with it. */
-		return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
+		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
 	}
 	/* Trying to write on a read-only page, let the slow path deal with it. */
 	return 1;
@@ -568,8 +574,6 @@ static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
 		newpte.s.d = 1;
 		newpte.s.sd = 1;
 	}
-	if (!oldpte.s.d && newpte.s.d)
-		SetPageDirty(pfn_to_page(newpte.h.pfra));
 
 	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
 	return 0;
@@ -606,7 +610,7 @@ int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
 			fault->callback(fault);
 		pgste_set_unlock(fault->ptep, pgste);
 	} else {
-		rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
+		rc = gmap_handle_minor_crste_fault(gmap, fault);
 		if (!rc && fault->callback)
 			fault->callback(fault);
 	}
@@ -623,10 +627,61 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
 }
 
+static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
+		      struct guest_fault *f)
+{
+	union crste oldval, newval;
+	union pte newpte, oldpte;
+	union pgste pgste;
+	int rc = 0;
+
+	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
+			    &f->crstep, &f->ptep);
+	if (rc == -ENOMEM)
+		return rc;
+	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
+		return rc;
+	if (rc)
+		return -EAGAIN;
+	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
+		return -EINVAL;
+
+	if (f->ptep) {
+		pgste = pgste_get_lock(f->ptep);
+		oldpte = *f->ptep;
+		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+		newpte.s.sd = oldpte.s.sd;
+		oldpte.s.sd = 0;
+		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
+			if (f->callback)
+				f->callback(f);
+		} else {
+			rc = -EAGAIN;
+		}
+		pgste_set_unlock(f->ptep, pgste);
+	} else {
+		do {
+			oldval = READ_ONCE(*f->crstep);
+			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+					    f->write_attempt | oldval.s.fc1.d);
+			newval.s.fc1.s = !f->page;
+			newval.s.fc1.sd = oldval.s.fc1.sd;
+			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+			    crste_origin_large(oldval) != crste_origin_large(newval))
+				return -EAGAIN;
+		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
+		if (f->callback)
+			f->callback(f);
+	}
+
+	return rc;
+}
+
 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
 {
 	unsigned int order;
-	int rc, level;
+	int level;
 
 	lockdep_assert_held(&gmap->kvm->mmu_lock);
 
@@ -638,16 +693,14 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
 		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
 			level = TABLE_TYPE_SEGMENT;
 	}
-	rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
-	KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
-	return rc;
+	return _gmap_link(mc, gmap, level, f);
 }
 
 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gfn_t p_gfn,
 			     gfn_t c_gfn, bool force_alloc)
 {
+	union crste newcrste, oldcrste;
 	struct page_table *pt;
-	union crste newcrste;
 	union crste *crstep;
 	union pte *ptep;
 	int rc;
@@ -673,7 +726,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
 			      &crstep, &ptep);
 	if (rc)
 		return rc;
-	dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
+	do {
+		oldcrste = READ_ONCE(*crstep);
+		if (oldcrste.val == newcrste.val)
+			break;
+	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
 
 	return 0;
 }
@@ -777,8 +834,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
 	int rc;
 
 	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
-	if (!rc)
-		dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
+	if (rc)
+		return;
+	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
+		;
 }
 
 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
@@ -1017,8 +1076,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
 		dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
 		return;
 	}
-	crste = READ_ONCE(*crstep);
-	dat_crstep_clear(crstep, r_gfn, sg->asce);
+
+	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
 	if (crste_leaf(crste) || crste.h.i)
 		return;
 	if (is_pmd(crste))
@@ -1101,6 +1160,7 @@ struct gmap_protect_asce_top_level {
 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
 						struct gmap_protect_asce_top_level *context)
 {
+	struct gmap *parent;
 	int rc, i;
 
 	guard(write_lock)(&sg->kvm->mmu_lock);
@@ -1108,7 +1168,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s
 	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
 		return -EAGAIN;
 
-	scoped_guard(spinlock, &sg->parent->children_lock) {
+	parent = READ_ONCE(sg->parent);
+	if (!parent)
+		return -EAGAIN;
+	scoped_guard(spinlock, &parent->children_lock) {
+		if (READ_ONCE(sg->parent) != parent)
+			return -EAGAIN;
 		for (i = 0; i < CRST_TABLE_PAGES; i++) {
 			if (!context->f[i].valid)
 				continue;
@@ -1191,6 +1256,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare
 	struct gmap *sg, *new;
 	int rc;
 
+	if (WARN_ON(!parent))
+		return ERR_PTR(-EINVAL);
+
 	scoped_guard(spinlock, &parent->children_lock) {
 		sg = gmap_find_shadow(parent, asce, edat_level);
 		if (sg) {
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index ccb5cd751e31..579399ef5480 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -185,6 +185,8 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
 		else
 			_gmap_handle_vsie_unshadow_event(gmap, gfn);
 	}
+	if (!ptep->s.d && newpte.s.d && !newpte.s.s)
+		SetPageDirty(pfn_to_page(newpte.h.pfra));
 	return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
 }
 
@@ -194,35 +196,42 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
 	return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
 }
 
-static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
-				     gfn_t gfn, bool needs_lock)
+static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+							 union crste oldcrste, union crste newcrste,
+							 gfn_t gfn, bool needs_lock)
 {
-	unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
+	unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;
+
+	if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
+		return true;
 
 	lockdep_assert_held(&gmap->kvm->mmu_lock);
 	if (!needs_lock)
 		lockdep_assert_held(&gmap->children_lock);
 	gfn = ALIGN_DOWN(gfn, align);
-	if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
-		ne.s.fc1.prefix_notif = 0;
+	if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
+		newcrste.s.fc1.prefix_notif = 0;
 		gmap_unmap_prefix(gmap, gfn, gfn + align);
 	}
-	if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
-	    (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
-		ne.s.fc1.vsie_notif = 0;
+	if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
+	    (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
+		newcrste.s.fc1.vsie_notif = 0;
 		if (needs_lock)
 			gmap_handle_vsie_unshadow_event(gmap, gfn);
 		else
 			_gmap_handle_vsie_unshadow_event(gmap, gfn);
 	}
-	dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
+	if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
+		SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
+	return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
 }
 
-static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
-				    gfn_t gfn)
+static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+							union crste oldcrste, union crste newcrste,
+							gfn_t gfn)
 {
-	return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
+	return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
 }
 
 /**
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b2c01fa7b852..d7838334a338 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5520,9 +5520,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 #endif
 	case KVM_S390_VCPU_FAULT: {
-		idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = vcpu_dat_fault_handler(vcpu, arg, 0);
-		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		gpa_t gaddr = arg;
+
+		scoped_guard(srcu, &vcpu->kvm->srcu) {
+			r = vcpu_ucontrol_translate(vcpu, &gaddr);
+			if (r)
+				break;
+
+			r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false);
+			if (r == PGM_ADDRESSING)
+				r = -EFAULT;
+			if (r <= 0)
+				break;
+			r = -EIO;
+			KVM_BUG_ON(r, vcpu->kvm);
+		}
 		break;
 	}
 	case KVM_ENABLE_CAP:
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 0330829b4046..72895dddc39a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1328,7 +1328,7 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-	struct gmap *sg;
+	struct gmap *sg = NULL;
 	int rc = 0;
 
 	while (1) {
@@ -1368,6 +1368,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		sg = gmap_put(sg);
 		cond_resched();
 	}
+	if (sg)
+		sg = gmap_put(sg);
 
 	if (rc == -EFAULT) {
 		/*
diff --git a/arch/x86/coco/sev/noinstr.c b/arch/x86/coco/sev/noinstr.c
index 9d94aca4a698..5afd663a1c21 100644
--- a/arch/x86/coco/sev/noinstr.c
+++ b/arch/x86/coco/sev/noinstr.c
@@ -121,6 +121,9 @@ noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
 
 	WARN_ON(!irqs_disabled());
 
+	if (!sev_cfg.ghcbs_initialized)
+		return boot_ghcb;
+
 	data = this_cpu_read(runtime_data);
 	ghcb = &data->ghcb_page;
 
@@ -164,6 +167,9 @@ noinstr void __sev_put_ghcb(struct ghcb_state *state)
 
 	WARN_ON(!irqs_disabled());
 
+	if (!sev_cfg.ghcbs_initialized)
+		return;
+
 	data = this_cpu_read(runtime_data);
 	ghcb = &data->ghcb_page;
 
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 88c757ac8ccd..fbe2d10dd737 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -177,6 +177,16 @@ static noinstr void fred_extint(struct pt_regs *regs)
 	}
 }
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+noinstr void exc_vmm_communication(struct pt_regs *regs, unsigned long error_code)
+{
+	if (user_mode(regs))
+		return user_exc_vmm_communication(regs, error_code);
+	else
+		return kernel_exc_vmm_communication(regs, error_code);
+}
+#endif
+
 static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 {
 	/* Optimize for #PF. That's the only exception which matters performance wise */
@@ -207,6 +217,10 @@ static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 #ifdef CONFIG_X86_CET
 	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
 #endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	case X86_TRAP_VC: return exc_vmm_communication(regs, error_code);
+#endif
+
 	default: return fred_bad_type(regs, error_code);
 	}
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a8ff4376c286..ec0670114efa 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -433,7 +433,20 @@ static __always_inline void setup_lass(struct cpuinfo_x86 *c)
 
 /* These bits should not change their value after CPU init is finished. */
 static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
-					     X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
+					     X86_CR4_FSGSBASE | X86_CR4_CET;
+
+/*
+ * The CR pinning protects against ROP on the 'mov %reg, %CRn' instruction(s).
+ * Since you can ROP directly to these instructions (barring shadow stack),
+ * any protection must follow immediately and unconditionally after that.
+ *
+ * Specifically, the CR[04] write functions below will have the value
+ * validation controlled by the @cr_pinning static_branch which is
+ * __ro_after_init, just like the cr4_pinned_bits value.
+ *
+ * Once set, an attacker will have to defeat page-tables to get around these
+ * restrictions. Which is a much bigger ask than 'simple' ROP.
+ */
 static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
 static unsigned long cr4_pinned_bits __ro_after_init;
 
@@ -2050,12 +2063,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	setup_umip(c);
 	setup_lass(c);
 
-	/* Enable FSGSBASE instructions if available. */
-	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
-		cr4_set_bits(X86_CR4_FSGSBASE);
-		elf_hwcap2 |= HWCAP2_FSGSBASE;
-	}
-
 	/*
 	 * The vendor-specific functions might have changed features.
 	 * Now we do "generic changes."
@@ -2416,6 +2423,18 @@ void cpu_init_exception_handling(bool boot_cpu)
 	/* GHCB needs to be setup to handle #VC. */
 	setup_ghcb();
 
+	/*
+	 * On CPUs with FSGSBASE support, paranoid_entry() uses
+	 * ALTERNATIVE-patched RDGSBASE/WRGSBASE instructions. Secondary CPUs
+	 * boot after alternatives are patched globally, so early exceptions
+	 * execute patched code that depends on FSGSBASE. Enable the feature
+	 * before any exceptions occur.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_FSGSBASE)) {
+		cr4_set_bits(X86_CR4_FSGSBASE);
+		elf_hwcap2 |= HWCAP2_FSGSBASE;
+	}
+
 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
 		/* The boot CPU has enabled FRED during early boot */
 		if (!boot_cpu)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b922a8b00057..dd06453d5b72 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3044,12 +3044,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 	bool prefetch = !fault || fault->prefetch;
 	bool write_fault = fault && fault->write;
 
-	if (unlikely(is_noslot_pfn(pfn))) {
-		vcpu->stat.pf_mmio_spte_created++;
-		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
-		return RET_PF_EMULATE;
-	}
-
 	if (is_shadow_present_pte(*sptep)) {
 		if (prefetch && is_last_spte(*sptep, level) &&
 		    pfn == spte_to_pfn(*sptep))
@@ -3066,13 +3060,22 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 			child = spte_to_child_sp(pte);
 			drop_parent_pte(vcpu->kvm, child, sptep);
 			flush = true;
-		} else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
+		} else if (pfn != spte_to_pfn(*sptep)) {
+			WARN_ON_ONCE(vcpu->arch.mmu->root_role.direct);
 			drop_spte(vcpu->kvm, sptep);
 			flush = true;
 		} else
 			was_rmapped = 1;
 	}
 
+	if (unlikely(is_noslot_pfn(pfn))) {
+		vcpu->stat.pf_mmio_spte_created++;
+		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+		if (flush)
+			kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
+		return RET_PF_EMULATE;
+	}
+
 	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
 			   false, host_writable, &spte);
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index e7d698b352d3..5397dbda4f72 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -844,6 +844,7 @@ static int dw_edma_irq_request(struct dw_edma *dw,
 {
 	struct dw_edma_chip *chip = dw->chip;
 	struct device *dev = dw->chip->dev;
+	struct msi_desc *msi_desc;
 	u32 wr_mask = 1;
 	u32 rd_mask = 1;
 	int i, err = 0;
@@ -895,9 +896,12 @@ static int dw_edma_irq_request(struct dw_edma *dw,
 					  &dw->irq[i]);
 			if (err)
 				goto err_irq_free;
-
-			if (irq_get_msi_desc(irq))
+			msi_desc = irq_get_msi_desc(irq);
+			if (msi_desc) {
 				get_cached_msi_msg(irq, &dw->irq[i].msi);
+				if (!msi_desc->pci.msi_attrib.is_msix)
+					dw->irq[i].msi.data = dw->irq[0].msi.data + i;
+			}
 		}
 
 		dw->nr_irqs = i;
diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c
index e3f8db4fe909..ce8f7254bab2 100644
--- a/drivers/dma/dw-edma/dw-hdma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c
@@ -252,10 +252,10 @@ static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 			  lower_32_bits(chunk->ll_region.paddr));
 		SET_CH_32(dw, chan->dir, chan->id, llp.msb,
 			  upper_32_bits(chunk->ll_region.paddr));
+		/* Set consumer cycle */
+		SET_CH_32(dw, chan->dir, chan->id, cycle_sync,
+			  HDMA_V0_CONSUMER_CYCLE_STAT | HDMA_V0_CONSUMER_CYCLE_BIT);
 	}
-	/* Set consumer cycle */
-	SET_CH_32(dw, chan->dir, chan->id, cycle_sync,
-		  HDMA_V0_CONSUMER_CYCLE_STAT | HDMA_V0_CONSUMER_CYCLE_BIT);
 
 	dw_hdma_v0_sync_ll_data(chunk);
diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c
index dbcdd1e68319..b596baa0a182 100644
--- a/drivers/dma/fsl-edma-main.c
+++ b/drivers/dma/fsl-edma-main.c
@@ -317,10 +317,8 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec,
 			return NULL;
 
 		i = fsl_chan - fsl_edma->chans;
-		fsl_chan->priority = dma_spec->args[1];
-		fsl_chan->is_rxchan = dma_spec->args[2] & FSL_EDMA_RX;
-		fsl_chan->is_remote = dma_spec->args[2] & FSL_EDMA_REMOTE;
-		fsl_chan->is_multi_fifo = dma_spec->args[2] & FSL_EDMA_MULTI_FIFO;
+		if (!b_chmux && i != dma_spec->args[0])
+			continue;
 
 		if ((dma_spec->args[2] & FSL_EDMA_EVEN_CH) && (i & 0x1))
 			continue;
@@ -328,17 +326,15 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec,
 		if ((dma_spec->args[2] & FSL_EDMA_ODD_CH) && !(i & 0x1))
 			continue;
 
-		if (!b_chmux && i == dma_spec->args[0]) {
-			chan = dma_get_slave_channel(chan);
-			chan->device->privatecnt++;
-			return chan;
-		} else if (b_chmux && !fsl_chan->srcid) {
-			/* if controller support channel mux, choose a free channel */
-			chan = dma_get_slave_channel(chan);
-			chan->device->privatecnt++;
-			fsl_chan->srcid = dma_spec->args[0];
-			return chan;
-		}
+		fsl_chan->srcid = dma_spec->args[0];
+		fsl_chan->priority = dma_spec->args[1];
+		fsl_chan->is_rxchan = dma_spec->args[2] & FSL_EDMA_RX;
+		fsl_chan->is_remote = dma_spec->args[2] & FSL_EDMA_REMOTE;
+		fsl_chan->is_multi_fifo = dma_spec->args[2] & FSL_EDMA_MULTI_FIFO;
+
+		chan = dma_get_slave_channel(chan);
+		chan->device->privatecnt++;
+		return chan;
 	}
 	return NULL;
 }
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c37d233535f9..0366c7cf3502 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -158,11 +158,7 @@ static const struct device_type idxd_cdev_file_type = {
 static void idxd_cdev_dev_release(struct device *dev)
 {
 	struct idxd_cdev *idxd_cdev = dev_to_cdev(dev);
-	struct idxd_cdev_context *cdev_ctx;
-	struct idxd_wq *wq = idxd_cdev->wq;
 
-	cdev_ctx = &ictx[wq->idxd->data->type];
-	ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor);
 	kfree(idxd_cdev);
 }
 
@@ -582,11 +578,15 @@ int idxd_wq_add_cdev(struct idxd_wq *wq)
 
 void idxd_wq_del_cdev(struct idxd_wq *wq)
 {
+	struct idxd_cdev_context *cdev_ctx;
 	struct idxd_cdev *idxd_cdev;
 
 	idxd_cdev = wq->idxd_cdev;
 	wq->idxd_cdev = NULL;
 	cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev));
+
+	cdev_ctx = &ictx[wq->idxd->data->type];
+	ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor);
 	put_device(cdev_dev(idxd_cdev));
 }
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index c26128529ff4..131138483b87 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -175,6 +175,7 @@ void idxd_wq_free_resources(struct idxd_wq *wq)
 	free_descs(wq);
 	dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr);
 	sbitmap_queue_free(&wq->sbq);
+	wq->type = IDXD_WQT_NONE;
 }
 EXPORT_SYMBOL_NS_GPL(idxd_wq_free_resources, "IDXD");
 
@@ -382,7 +383,6 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq)
 	lockdep_assert_held(&wq->wq_lock);
 	wq->state = IDXD_WQ_DISABLED;
 	memset(wq->wqcfg, 0, idxd->wqcfg_size);
-	wq->type = IDXD_WQT_NONE;
 	wq->threshold = 0;
 	wq->priority = 0;
 	wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
@@ -831,8 +831,7 @@ static void idxd_device_evl_free(struct idxd_device *idxd)
 	struct device *dev = &idxd->pdev->dev;
 	struct idxd_evl *evl = idxd->evl;
 
-	gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET);
-	if (!gencfg.evl_en)
+	if (!evl)
 		return;
 
 	mutex_lock(&evl->lock);
@@ -1125,7 +1124,11 @@ int idxd_device_config(struct idxd_device *idxd)
 {
 	int rc;
 
-	lockdep_assert_held(&idxd->dev_lock);
+	guard(spinlock)(&idxd->dev_lock);
+
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+		return 0;
+
 	rc = idxd_wqs_setup(idxd);
 	if (rc < 0)
 		return rc;
@@ -1332,6 +1335,11 @@ void idxd_wq_free_irq(struct idxd_wq *wq)
 
 	free_irq(ie->vector, ie);
 	idxd_flush_pending_descs(ie);
+
+	/* The interrupt might have been already released by FLR */
+	if (ie->int_handle == INVALID_INT_HANDLE)
+		return;
+
 	if (idxd->request_int_handles)
 		idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX);
 	idxd_device_clear_perm_entry(idxd, ie);
@@ -1340,6 +1348,23 @@ void idxd_wq_free_irq(struct idxd_wq *wq)
 	ie->pasid = IOMMU_PASID_INVALID;
 }
 
+void idxd_wq_flush_descs(struct idxd_wq *wq)
+{
+	struct idxd_irq_entry *ie = &wq->ie;
+	struct idxd_device *idxd = wq->idxd;
+
+	guard(mutex)(&wq->wq_lock);
+
+	if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL)
+		return;
+
+	idxd_flush_pending_descs(ie);
+	if (idxd->request_int_handles)
+		idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX);
+	idxd_device_clear_perm_entry(idxd, ie);
+	ie->int_handle = INVALID_INT_HANDLE;
+}
+
 int idxd_wq_request_irq(struct idxd_wq *wq)
 {
 	struct idxd_device *idxd = wq->idxd;
@@ -1454,11 +1479,7 @@ int idxd_drv_enable_wq(struct idxd_wq *wq)
 		}
 	}
 
-	rc = 0;
-	spin_lock(&idxd->dev_lock);
-	if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
-		rc = idxd_device_config(idxd);
-	spin_unlock(&idxd->dev_lock);
+	rc = idxd_device_config(idxd);
 	if (rc < 0) {
 		dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc);
 		goto err;
@@ -1533,7 +1554,6 @@ void idxd_drv_disable_wq(struct idxd_wq *wq)
 	idxd_wq_reset(wq);
 	idxd_wq_free_resources(wq);
 	percpu_ref_exit(&wq->wq_active);
-	wq->type = IDXD_WQT_NONE;
 	wq->client_count = 0;
 }
 EXPORT_SYMBOL_NS_GPL(idxd_drv_disable_wq, "IDXD");
@@ -1554,10 +1574,7 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev)
 	}
 
 	/* Device configuration */
-	spin_lock(&idxd->dev_lock);
-	if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
-		rc = idxd_device_config(idxd);
-	spin_unlock(&idxd->dev_lock);
+	rc = idxd_device_config(idxd);
 	if (rc < 0)
 		return -ENXIO;
diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c
index dbecd699237e..9937b671f637 100644
--- a/drivers/dma/idxd/dma.c
+++ b/drivers/dma/idxd/dma.c
@@ -194,6 +194,22 @@ static void idxd_dma_release(struct dma_device *device)
 	kfree(idxd_dma);
 }
 
+static int idxd_dma_terminate_all(struct dma_chan *c)
+{
+	struct idxd_wq *wq = to_idxd_wq(c);
+
+	idxd_wq_flush_descs(wq);
+
+	return 0;
+}
+
+static void idxd_dma_synchronize(struct dma_chan *c)
+{
+	struct idxd_wq *wq = to_idxd_wq(c);
+
+	idxd_wq_drain(wq);
+}
+
 int idxd_register_dma_device(struct idxd_device *idxd)
 {
 	struct idxd_dma_dev *idxd_dma;
@@ -224,6 +240,8 @@ int idxd_register_dma_device(struct idxd_device *idxd)
 	dma->device_issue_pending = idxd_dma_issue_pending;
 	dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources;
 	dma->device_free_chan_resources = idxd_dma_free_chan_resources;
+	dma->device_terminate_all = idxd_dma_terminate_all;
+	dma->device_synchronize = idxd_dma_synchronize;
 
 	rc = dma_async_device_register(dma);
 	if (rc < 0) {
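[The new device_terminate_all/device_synchronize hooks wire idxd into the standard dmaengine termination contract. A sketch of the client-side sequence that ends up in these callbacks (chan would be a channel obtained earlier, e.g. from dma_request_chan(); error handling elided):

    #include <linux/dmaengine.h>

    /* Abort all in-flight descriptors on chan, then wait until their
     * completion callbacks have finished before reusing or freeing
     * anything they touch. */
    static void stop_channel(struct dma_chan *chan)
    {
            dmaengine_terminate_async(chan); /* invokes device_terminate_all() */
            dmaengine_synchronize(chan);     /* invokes device_synchronize() */
    }

dmaengine_terminate_sync() performs both steps in one call.]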
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index ea8c4daed38d..ce78b9a7c641 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -803,6 +803,7 @@ void idxd_wq_quiesce(struct idxd_wq *wq);
 int idxd_wq_init_percpu_ref(struct idxd_wq *wq);
 void idxd_wq_free_irq(struct idxd_wq *wq);
 int idxd_wq_request_irq(struct idxd_wq *wq);
+void idxd_wq_flush_descs(struct idxd_wq *wq);
 
 /* submission */
 int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index fb80803d5b57..f1cfc7790d95 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -973,7 +973,8 @@ static void idxd_device_config_restore(struct idxd_device *idxd,
 	idxd->rdbuf_limit = idxd_saved->saved_idxd.rdbuf_limit;
 
-	idxd->evl->size = saved_evl->size;
+	if (idxd->evl)
+		idxd->evl->size = saved_evl->size;
 
 	for (i = 0; i < idxd->max_groups; i++) {
 		struct idxd_group *saved_group, *group;
@@ -1104,12 +1105,10 @@ static void idxd_reset_done(struct pci_dev *pdev)
 	idxd_device_config_restore(idxd, idxd->idxd_saved);
 
 	/* Re-configure IDXD device if allowed. */
-	if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) {
-		rc = idxd_device_config(idxd);
-		if (rc < 0) {
-			dev_err(dev, "HALT: %s config fails\n", idxd_name);
-			goto out;
-		}
+	rc = idxd_device_config(idxd);
+	if (rc < 0) {
+		dev_err(dev, "HALT: %s config fails\n", idxd_name);
+		goto out;
 	}
 
 	/* Bind IDXD device to driver. */
@@ -1147,6 +1146,7 @@ static void idxd_reset_done(struct pci_dev *pdev)
 	}
 out:
 	kfree(idxd->idxd_saved);
+	idxd->idxd_saved = NULL;
 }
 
 static const struct pci_error_handlers idxd_error_handler = {
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 7782f8c51c32..6a25e1fd0e62 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -397,6 +397,17 @@ static void idxd_device_flr(struct work_struct *work)
 		dev_err(&idxd->pdev->dev, "FLR failed\n");
 }
 
+static void idxd_wqs_flush_descs(struct idxd_device *idxd)
+{
+	int i;
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		struct idxd_wq *wq = idxd->wqs[i];
+
+		idxd_wq_flush_descs(wq);
+	}
+}
+
 static irqreturn_t idxd_halt(struct idxd_device *idxd)
 {
 	union gensts_reg gensts;
@@ -415,6 +426,11 @@ static irqreturn_t idxd_halt(struct idxd_device *idxd)
 		} else if (gensts.reset_type == IDXD_DEVICE_RESET_FLR) {
 			idxd->state = IDXD_DEV_HALTED;
 			idxd_mask_error_interrupts(idxd);
+			/* Flush all pending descriptors, and disable
+			 * interrupts, they will be re-enabled when FLR
+			 * concludes.
+			 */
+			idxd_wqs_flush_descs(idxd);
 			dev_dbg(&idxd->pdev->dev,
 				"idxd halted, doing FLR. After FLR, configs are restored\n");
 			INIT_WORK(&idxd->work, idxd_device_flr);
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index 6db1c5fcedc5..03217041b8b3 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -138,7 +138,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
 	 */
 	list_for_each_entry_safe(d, t, &flist, list) {
 		list_del_init(&d->list);
-		idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, true,
+		idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true,
 				      NULL, NULL);
 	}
 }
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index cc2c83d7f710..6d251095c350 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1836,6 +1836,7 @@ static void idxd_conf_device_release(struct device *dev)
 {
 	struct idxd_device *idxd = confdev_to_idxd(dev);
 
+	destroy_workqueue(idxd->wq);
 	kfree(idxd->groups);
 	bitmap_free(idxd->wq_enable_map);
 	kfree(idxd->wqs);
diff --git a/drivers/dma/sh/rz-dmac.c b/drivers/dma/sh/rz-dmac.c
index d84ca551b2bf..f30bdf69c740 100644
--- a/drivers/dma/sh/rz-dmac.c
+++ b/drivers/dma/sh/rz-dmac.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/cleanup.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
 #include <linux/interrupt.h>
@@ -296,13 +297,10 @@ static void rz_dmac_disable_hw(struct rz_dmac_chan *channel)
 {
 	struct dma_chan *chan = &channel->vc.chan;
 	struct rz_dmac *dmac = to_rz_dmac(chan->device);
-	unsigned long flags;
 
 	dev_dbg(dmac->dev, "%s channel %d\n", __func__, channel->index);
 
-	local_irq_save(flags);
 	rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
-	local_irq_restore(flags);
 }
 
 static void rz_dmac_set_dmars_register(struct rz_dmac *dmac, int nr, u32 dmars)
@@ -447,6 +445,7 @@ static int rz_dmac_alloc_chan_resources(struct dma_chan *chan)
 		if (!desc)
 			break;
 
+		/* No need to lock. This is called only for the 1st client. */
 		list_add_tail(&desc->node, &channel->ld_free);
 		channel->descs_allocated++;
 	}
@@ -502,18 +501,21 @@ rz_dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	dev_dbg(dmac->dev, "%s channel: %d src=0x%pad dst=0x%pad len=%zu\n",
 		__func__, channel->index, &src, &dest, len);
 
-	if (list_empty(&channel->ld_free))
-		return NULL;
+	scoped_guard(spinlock_irqsave, &channel->vc.lock) {
+		if (list_empty(&channel->ld_free))
+			return NULL;
 
-	desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
+		desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
 
-	desc->type = RZ_DMAC_DESC_MEMCPY;
-	desc->src = src;
-	desc->dest = dest;
-	desc->len = len;
-	desc->direction = DMA_MEM_TO_MEM;
+		desc->type = RZ_DMAC_DESC_MEMCPY;
+		desc->src = src;
+		desc->dest = dest;
+		desc->len = len;
+		desc->direction = DMA_MEM_TO_MEM;
+
+		list_move_tail(channel->ld_free.next, &channel->ld_queue);
+	}
 
-	list_move_tail(channel->ld_free.next, &channel->ld_queue);
 	return vchan_tx_prep(&channel->vc, &desc->vd, flags);
 }
 
@@ -529,27 +531,29 @@ rz_dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	int dma_length = 0;
 	int i = 0;
 
-	if (list_empty(&channel->ld_free))
-		return NULL;
+	scoped_guard(spinlock_irqsave, &channel->vc.lock) {
+		if (list_empty(&channel->ld_free))
+			return NULL;
 
-	desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
+		desc = list_first_entry(&channel->ld_free, struct rz_dmac_desc, node);
 
-	for_each_sg(sgl, sg, sg_len, i) {
-		dma_length += sg_dma_len(sg);
-	}
+		for_each_sg(sgl, sg, sg_len, i)
+			dma_length += sg_dma_len(sg);
 
-	desc->type = RZ_DMAC_DESC_SLAVE_SG;
-	desc->sg = sgl;
-	desc->sgcount = sg_len;
-	desc->len = dma_length;
-	desc->direction = direction;
+		desc->type = RZ_DMAC_DESC_SLAVE_SG;
+		desc->sg = sgl;
+		desc->sgcount = sg_len;
+		desc->len = dma_length;
+		desc->direction = direction;
 
-	if (direction == DMA_DEV_TO_MEM)
-		desc->src = channel->src_per_address;
-	else
-		desc->dest = channel->dst_per_address;
+		if (direction == DMA_DEV_TO_MEM)
+			desc->src = channel->src_per_address;
+		else
+			desc->dest = channel->dst_per_address;
+
+		list_move_tail(channel->ld_free.next, &channel->ld_queue);
+	}
 
-	list_move_tail(channel->ld_free.next, &channel->ld_queue);
 	return vchan_tx_prep(&channel->vc, &desc->vd, flags);
 }
 
@@ -561,8 +565,8 @@ static int rz_dmac_terminate_all(struct dma_chan *chan)
 	unsigned int i;
 	LIST_HEAD(head);
 
-	rz_dmac_disable_hw(channel);
 	spin_lock_irqsave(&channel->vc.lock, flags);
+	rz_dmac_disable_hw(channel);
 
 	for (i = 0; i < DMAC_NR_LMDESC; i++)
 		lmdesc[i].header = 0;
@@ -699,7 +703,9 @@ static void rz_dmac_irq_handle_channel(struct rz_dmac_chan *channel)
 	if (chstat & CHSTAT_ER) {
 		dev_err(dmac->dev, "DMAC err CHSTAT_%d = %08X\n",
 			channel->index, chstat);
-		rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
+
+		scoped_guard(spinlock_irqsave, &channel->vc.lock)
+			rz_dmac_ch_writel(channel, CHCTRL_DEFAULT, CHCTRL, 1);
 		goto done;
 	}
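[The rz-dmac changes above convert open-coded locking to the scoped guards from <linux/cleanup.h>: the lock is released automatically on every path out of the braced block, so the early "return NULL" cases in the prep routines cannot leak it. A minimal sketch of the pattern (hypothetical helper, not driver code):

    #include <linux/cleanup.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    /* Detach the first node of a locked list; returns false if empty.
     * scoped_guard() drops the spinlock on both exit paths. */
    static bool pop_first(spinlock_t *lock, struct list_head *list,
                          struct list_head **out)
    {
            scoped_guard(spinlock_irqsave, lock) {
                    if (list_empty(list))
                            return false;
                    *out = list->next;
                    list_del(*out);
            }
            return true;
    }]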
b/drivers/dma/xilinx/xilinx_dma.c index b53292e02448..e3a18ee42aa2 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -997,16 +997,16 @@ static u32 xilinx_dma_get_residue(struct xilinx_dma_chan *chan, struct xilinx_cdma_tx_segment, node); cdma_hw = &cdma_seg->hw; - residue += (cdma_hw->control - cdma_hw->status) & - chan->xdev->max_buffer_len; + residue += (cdma_hw->control & chan->xdev->max_buffer_len) - + (cdma_hw->status & chan->xdev->max_buffer_len); } else if (chan->xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) { axidma_seg = list_entry(entry, struct xilinx_axidma_tx_segment, node); axidma_hw = &axidma_seg->hw; - residue += (axidma_hw->control - axidma_hw->status) & - chan->xdev->max_buffer_len; + residue += (axidma_hw->control & chan->xdev->max_buffer_len) - + (axidma_hw->status & chan->xdev->max_buffer_len); } else { aximcdma_seg = list_entry(entry, @@ -1014,8 +1014,8 @@ static u32 xilinx_dma_get_residue(struct xilinx_dma_chan *chan, node); aximcdma_hw = &aximcdma_seg->hw; residue += - (aximcdma_hw->control - aximcdma_hw->status) & - chan->xdev->max_buffer_len; + (aximcdma_hw->control & chan->xdev->max_buffer_len) - + (aximcdma_hw->status & chan->xdev->max_buffer_len); } } @@ -1235,14 +1235,6 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) dma_cookie_init(dchan); - if (chan->xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) { - /* For AXI DMA resetting once channel will reset the - * other channel as well so enable the interrupts here. - */ - dma_ctrl_set(chan, XILINX_DMA_REG_DMACR, - XILINX_DMA_DMAXR_ALL_IRQ_MASK); - } - if ((chan->xdev->dma_config->dmatype == XDMA_TYPE_CDMA) && chan->has_sg) dma_ctrl_set(chan, XILINX_DMA_REG_DMACR, XILINX_CDMA_CR_SGMODE); @@ -1564,8 +1556,29 @@ static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan) if (chan->err) return; - if (list_empty(&chan->pending_list)) + if (list_empty(&chan->pending_list)) { + if (chan->cyclic) { + struct xilinx_dma_tx_descriptor *desc; + struct list_head *entry; + + desc = list_last_entry(&chan->done_list, + struct xilinx_dma_tx_descriptor, node); + list_for_each(entry, &desc->segments) { + struct xilinx_axidma_tx_segment *axidma_seg; + struct xilinx_axidma_desc_hw *axidma_hw; + axidma_seg = list_entry(entry, + struct xilinx_axidma_tx_segment, + node); + axidma_hw = &axidma_seg->hw; + axidma_hw->status = 0; + } + + list_splice_tail_init(&chan->done_list, &chan->active_list); + chan->desc_pendingcount = 0; + chan->idle = false; + } return; + } if (!chan->idle) return; @@ -1591,6 +1604,7 @@ static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan) head_desc->async_tx.phys); reg &= ~XILINX_DMA_CR_DELAY_MAX; reg |= chan->irq_delay << XILINX_DMA_CR_DELAY_SHIFT; + reg |= XILINX_DMA_DMAXR_ALL_IRQ_MASK; dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg); xilinx_dma_start(chan); @@ -3024,7 +3038,7 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, return -EINVAL; } - xdev->common.directions |= chan->direction; + xdev->common.directions |= BIT(chan->direction); /* Request the interrupt */ chan->irq = of_irq_get(node, chan->tdest); diff --git a/drivers/i2c/busses/i2c-designware-amdisp.c b/drivers/i2c/busses/i2c-designware-amdisp.c index c48728ad9f6f..9f0ec0fae6f2 100644 --- a/drivers/i2c/busses/i2c-designware-amdisp.c +++ b/drivers/i2c/busses/i2c-designware-amdisp.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/platform_device.h> +#include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/soc/amd/isp4_misc.h> @@ 
-76,22 +77,20 @@ static int amd_isp_dw_i2c_plat_probe(struct platform_device *pdev) device_enable_async_suspend(&pdev->dev); - pm_runtime_enable(&pdev->dev); - pm_runtime_get_sync(&pdev->dev); - + dev_pm_genpd_resume(&pdev->dev); ret = i2c_dw_probe(isp_i2c_dev); if (ret) { dev_err_probe(&pdev->dev, ret, "i2c_dw_probe failed\n"); goto error_release_rpm; } - - pm_runtime_put_sync(&pdev->dev); + dev_pm_genpd_suspend(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); + pm_runtime_enable(&pdev->dev); return 0; error_release_rpm: amd_isp_dw_i2c_plat_pm_cleanup(isp_i2c_dev); - pm_runtime_put_sync(&pdev->dev); return ret; } diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 85f554044cf1..452d120a210b 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1018,8 +1018,9 @@ static inline int i2c_imx_isr_read(struct imx_i2c_struct *i2c_imx) return 0; } -static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx) +static inline enum imx_i2c_state i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx) { + enum imx_i2c_state next_state = IMX_I2C_STATE_READ_CONTINUE; unsigned int temp; if ((i2c_imx->msg->len - 1) == i2c_imx->msg_buf_idx) { @@ -1033,18 +1034,20 @@ static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx) i2c_imx->stopped = 1; temp &= ~(I2CR_MSTA | I2CR_MTX); imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); - } else { - /* - * For i2c master receiver repeat restart operation like: - * read -> repeat MSTA -> read/write - * The controller must set MTX before read the last byte in - * the first read operation, otherwise the first read cost - * one extra clock cycle. - */ - temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); - temp |= I2CR_MTX; - imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); + + return IMX_I2C_STATE_DONE; } + /* + * For i2c master receiver repeat restart operation like: + * read -> repeat MSTA -> read/write + * The controller must set MTX before read the last byte in + * the first read operation, otherwise the first read cost + * one extra clock cycle. 
+ */ + temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); + temp |= I2CR_MTX; + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); + next_state = IMX_I2C_STATE_DONE; } else if (i2c_imx->msg_buf_idx == (i2c_imx->msg->len - 2)) { temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); temp |= I2CR_TXAK; @@ -1052,6 +1055,7 @@ static inline void i2c_imx_isr_read_continue(struct imx_i2c_struct *i2c_imx) } i2c_imx->msg->buf[i2c_imx->msg_buf_idx++] = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2DR); + return next_state; } static inline void i2c_imx_isr_read_block_data_len(struct imx_i2c_struct *i2c_imx) @@ -1088,11 +1092,9 @@ static irqreturn_t i2c_imx_master_isr(struct imx_i2c_struct *i2c_imx, unsigned i break; case IMX_I2C_STATE_READ_CONTINUE: - i2c_imx_isr_read_continue(i2c_imx); - if (i2c_imx->msg_buf_idx == i2c_imx->msg->len) { - i2c_imx->state = IMX_I2C_STATE_DONE; + i2c_imx->state = i2c_imx_isr_read_continue(i2c_imx); + if (i2c_imx->state == IMX_I2C_STATE_DONE) wake_up(&i2c_imx->queue); - } break; case IMX_I2C_STATE_READ_BLOCK_DATA: @@ -1490,6 +1492,7 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs, bool is_lastmsg) { int block_data = msgs->flags & I2C_M_RECV_LEN; + int ret = 0; dev_dbg(&i2c_imx->adapter.dev, "<%s> write slave address: addr=0x%x\n", @@ -1522,10 +1525,20 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs, dev_err(&i2c_imx->adapter.dev, "<%s> read timedout\n", __func__); return -ETIMEDOUT; } - if (!i2c_imx->stopped) - return i2c_imx_bus_busy(i2c_imx, 0, false); + if (i2c_imx->is_lastmsg) { + if (!i2c_imx->stopped) + ret = i2c_imx_bus_busy(i2c_imx, 0, false); + /* + * Only read the last byte of the last message after the bus is + * not busy. Else the controller generates another clock which + * might confuse devices. + */ + if (!ret) + i2c_imx->msg->buf[i2c_imx->msg_buf_idx++] = imx_i2c_read_reg(i2c_imx, + IMX_I2C_I2DR); + } - return 0; + return ret; } static int i2c_imx_xfer_common(struct i2c_adapter *adapter, diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c index 83f31ea657b7..181320528a47 100644 --- a/drivers/irqchip/irq-qcom-mpm.c +++ b/drivers/irqchip/irq-qcom-mpm.c @@ -306,6 +306,8 @@ static int mpm_pd_power_off(struct generic_pm_domain *genpd) if (ret < 0) return ret; + mbox_client_txdone(priv->mbox_chan, 0); + return 0; } @@ -434,6 +436,7 @@ static int qcom_mpm_probe(struct platform_device *pdev, struct device_node *pare } priv->mbox_client.dev = dev; + priv->mbox_client.knows_txdone = true; priv->mbox_chan = mbox_request_channel(&priv->mbox_client, 0); if (IS_ERR(priv->mbox_chan)) { ret = PTR_ERR(priv->mbox_chan); diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index da2bc43a0e12..03e93b061edd 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -621,7 +621,7 @@ static int rzv2h_icu_probe_common(struct platform_device *pdev, struct device_no return 0; pm_put: - pm_runtime_put(&pdev->dev); + pm_runtime_put_sync(&pdev->dev); return ret; } diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig index 02467dfd4fb0..1875d5b784f6 100644 --- a/drivers/phy/Kconfig +++ b/drivers/phy/Kconfig @@ -6,7 +6,7 @@ menu "PHY Subsystem" config PHY_COMMON_PROPS - bool + bool "PHY common properties" if KUNIT_ALL_TESTS help This parses properties common between generic PHYs and Ethernet PHYs. 
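The i2c-imx rework above turns `i2c_imx_isr_read_continue()` from a void helper, whose caller re-derived completion by comparing `msg_buf_idx` against the message length, into one that reports the next state, so the ISR has a single source of truth for when to wake the waiter. A minimal sketch of that shape, with illustrative names rather than the driver's:

    #include <stdio.h>

    enum rx_state { RX_CONTINUE, RX_DONE };

    struct rx { const char *buf; int len; int idx; char out[16]; };

    /* Consume one byte and report the next state, instead of leaving
     * the caller to second-guess progress from the index. */
    static enum rx_state rx_step(struct rx *r)
    {
            enum rx_state next = RX_CONTINUE;

            if (r->idx == r->len - 1)
                    next = RX_DONE;         /* last byte: tell the caller */
            r->out[r->idx] = r->buf[r->idx];
            r->idx++;
            return next;
    }

    int main(void)
    {
            struct rx r = { .buf = "abc", .len = 3 };
            enum rx_state st;

            do {
                    st = rx_step(&r);       /* ISR: state = step(...) */
            } while (st != RX_DONE);        /* caller wakes waiter here */
            printf("received %d bytes\n", r.idx);
            return 0;
    }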
@@ -16,8 +16,7 @@ config PHY_COMMON_PROPS config PHY_COMMON_PROPS_TEST tristate "KUnit tests for PHY common props" if !KUNIT_ALL_TESTS - select PHY_COMMON_PROPS - depends on KUNIT + depends on KUNIT && PHY_COMMON_PROPS default KUNIT_ALL_TESTS help This builds KUnit tests for the PHY common property API. diff --git a/drivers/phy/freescale/phy-fsl-lynx-28g.c b/drivers/phy/freescale/phy-fsl-lynx-28g.c index 2b0fd95ba62f..63427fc34e26 100644 --- a/drivers/phy/freescale/phy-fsl-lynx-28g.c +++ b/drivers/phy/freescale/phy-fsl-lynx-28g.c @@ -1069,6 +1069,8 @@ static void lynx_28g_cdr_lock_check(struct work_struct *work) for (i = 0; i < LYNX_28G_NUM_LANE; i++) { lane = &priv->lane[i]; + if (!lane->phy) + continue; mutex_lock(&lane->phy->mutex); diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c index df138a5442eb..771bc7c2ab50 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c @@ -990,6 +990,7 @@ static const struct qmp_phy_init_tbl sm8650_ufsphy_pcs[] = { QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x02), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_MID_TERM_CTRL1, 0x43), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PCS_CTRL1, 0xc1), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x33), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_LARGE_AMP_DRV_LVL, 0x0f), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_SIGDET_CTRL2, 0x68), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_POST_EMP_LVL_S4, 0x0e), @@ -999,13 +1000,11 @@ static const struct qmp_phy_init_tbl sm8650_ufsphy_pcs[] = { }; static const struct qmp_phy_init_tbl sm8650_ufsphy_g4_pcs[] = { - QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x13), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x04), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x04), }; static const struct qmp_phy_init_tbl sm8650_ufsphy_g5_pcs[] = { - QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x33), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x05), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x05), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HS_G5_SYNC_LENGTH_CAPABILITY, 0x4d), diff --git a/drivers/phy/spacemit/phy-k1-usb2.c b/drivers/phy/spacemit/phy-k1-usb2.c index 342061380012..9215d0b223b2 100644 --- a/drivers/phy/spacemit/phy-k1-usb2.c +++ b/drivers/phy/spacemit/phy-k1-usb2.c @@ -48,6 +48,9 @@ #define PHY_CLK_HSTXP_EN BIT(3) /* clock hstxp enable */ #define PHY_HSTXP_MODE BIT(4) /* 0: force en_txp to be 1; 1: no force */ +#define PHY_K1_HS_HOST_DISC 0x40 +#define PHY_K1_HS_HOST_DISC_CLR BIT(0) + #define PHY_PLL_DIV_CFG 0x98 #define PHY_FDIV_FRACT_8_15 GENMASK(7, 0) #define PHY_FDIV_FRACT_16_19 GENMASK(11, 8) @@ -142,9 +145,20 @@ static int spacemit_usb2phy_exit(struct phy *phy) return 0; } +static int spacemit_usb2phy_disconnect(struct phy *phy, int port) +{ + struct spacemit_usb2phy *sphy = phy_get_drvdata(phy); + + regmap_update_bits(sphy->regmap_base, PHY_K1_HS_HOST_DISC, + PHY_K1_HS_HOST_DISC_CLR, PHY_K1_HS_HOST_DISC_CLR); + + return 0; +} + static const struct phy_ops spacemit_usb2phy_ops = { .init = spacemit_usb2phy_init, .exit = spacemit_usb2phy_exit, + .disconnect = spacemit_usb2phy_disconnect, .owner = THIS_MODULE, }; diff --git a/drivers/phy/ti/phy-j721e-wiz.c b/drivers/phy/ti/phy-j721e-wiz.c index 6e9ecb88dc8b..6b584706b913 100644 --- a/drivers/phy/ti/phy-j721e-wiz.c +++ b/drivers/phy/ti/phy-j721e-wiz.c @@ -1425,6 +1425,7 @@ static int wiz_get_lane_phy_types(struct device *dev, struct wiz *wiz) dev_err(dev, "%s: Reading \"reg\" from \"%s\" failed: %d\n", __func__, subnode->name, ret); + 
of_node_put(serdes); return ret; } of_property_read_u32(subnode, "cdns,num-lanes", &num_lanes); @@ -1439,6 +1440,7 @@ static int wiz_get_lane_phy_types(struct device *dev, struct wiz *wiz) } } + of_node_put(serdes); return 0; } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index bbf9ee21306c..15ba592236e8 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1765,6 +1765,9 @@ err_privcmdbuf: static void __exit privcmd_exit(void) { + if (!xen_initial_domain()) + unregister_xenstore_notifier(&xenstore_notifier); + privcmd_ioeventfd_exit(); privcmd_irqfd_exit(); misc_deregister(&privcmd_dev); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ebf5079096af..c0d17a369bda 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4583,7 +4583,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { if (space_info->sub_group[i]) { check_removing_space_info(space_info->sub_group[i]); - kfree(space_info->sub_group[i]); + btrfs_sysfs_remove_space_info(space_info->sub_group[i]); space_info->sub_group[i] = NULL; } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 01f2dbb69832..1b0eb246b714 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2531,8 +2531,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + btrfs_err(fs_info, "super offset mismatch %llu != %llu", + btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); ret = -EINVAL; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab0d460c7139..ac871efb9763 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, bool log_inode_only, u64 logged_isize) { + u64 gen = BTRFS_I(inode)->generation; u64 flags; if (log_inode_only) { - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' + /* + * Set the generation to zero so the recover code can tell the + * difference between a logging just to say 'this inode exists' + * and a logging to say 'update this inode with these values'. + * But only if the inode was not already logged before. + * We access ->logged_trans directly since it was already set + * up in the call chain by btrfs_log_inode(), and data_race() + * to avoid false alerts from KCSAN and since it was set already + * and one can set it to 0 since that only happens on eviction + * and we are holding a ref on the inode. 
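The `fill_inode_item()` comment in the tree-log hunk above leans on `data_race()`: the field is read without the lock, the race is known to be benign (the value can only move one way while a reference is held), and the annotation documents that for KCSAN rather than hiding it behind a lock. A self-contained sketch of the idiom; in-tree the macro comes from <linux/compiler.h>, here it is stubbed so the example stands alone:

    #include <stdio.h>

    /* Stand-in for the kernel macro: evaluates the expression while
     * telling the race checker this access is intentionally unlocked. */
    #define data_race(expr) (expr)

    struct inode_log {
            unsigned long long logged_trans; /* written under a log lock */
    };

    /* Lockless reader: a conservative answer is acceptable, so an
     * annotated racy read is both safe and self-documenting. */
    static int logged_in_transaction(const struct inode_log *il,
                                     unsigned long long transid)
    {
            return data_race(il->logged_trans) == transid;
    }

    int main(void)
    {
            struct inode_log il = { .logged_trans = 42 };

            printf("%d\n", logged_in_transaction(&il, 42)); /* 1 */
            return 0;
    }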
*/ - btrfs_set_inode_generation(leaf, item, 0); + ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); + if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) + gen = 0; + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); btrfs_set_inode_size(leaf, item, inode->i_size); } + btrfs_set_inode_generation(leaf, item, gen); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); @@ -5448,42 +5459,63 @@ process: return 0; } -static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, - struct btrfs_path *path, u64 *size_ret) +static int get_inode_size_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 *size_ret) { struct btrfs_key key; + struct btrfs_inode_item *item; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - *size_ret = 0; - } else { - struct btrfs_inode_item *item; + /* + * Our caller called inode_logged(), so logged_trans is up to date. + * Use data_race() to silence any warning from KCSAN. Once logged_trans + * is set, it can only be reset to 0 after inode eviction. + */ + if (data_race(inode->logged_trans) == trans->transid) { + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + } else if (inode->generation < trans->transid) { + path->search_commit_root = true; + path->skip_locking = true; + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); + path->search_commit_root = false; + path->skip_locking = false; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - *size_ret = btrfs_inode_size(path->nodes[0], item); - /* - * If the in-memory inode's i_size is smaller then the inode - * size stored in the btree, return the inode's i_size, so - * that we get a correct inode size after replaying the log - * when before a power failure we had a shrinking truncate - * followed by addition of a new name (rename / new hard link). - * Otherwise return the inode size from the btree, to avoid - * data loss when replaying a log due to previously doing a - * write that expands the inode's size and logging a new name - * immediately after. - */ - if (*size_ret > inode->vfs_inode.i_size) - *size_ret = inode->vfs_inode.i_size; + } else { + *size_ret = 0; + return 0; } + /* + * If the inode was logged before or is from a past transaction, then + * its inode item must exist in the log root or in the commit root. + */ + ASSERT(ret <= 0); + if (WARN_ON_ONCE(ret > 0)) + ret = -ENOENT; + + if (ret < 0) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode size stored + * in the btree, return the inode's i_size, so that we get a correct + * inode size after replaying the log when before a power failure we had + * a shrinking truncate followed by addition of a new name (rename / new + * hard link). Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a write that + * expands the inode's size and logging a new name immediately after. 
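The comment above encodes a small replay rule worth spelling out: start from the size already recorded on disk (or in the log), but clamp it to the in-memory i_size when that is smaller, which covers both a shrinking truncate followed by a new name and an expanding write followed by one. The arithmetic, as a sketch:

    #include <stdio.h>

    typedef unsigned long long u64;

    /* Size to write into the log item, per the rule in the hunk above:
     * on-disk size, but never larger than the current in-memory size. */
    static u64 size_to_log(u64 disk_size, u64 mem_size)
    {
            return disk_size > mem_size ? mem_size : disk_size;
    }

    int main(void)
    {
            /* Shrinking truncate (16K -> 4K) then rename: log 4K, or
             * replay would resurrect the stale, larger length. */
            printf("%llu\n", size_to_log(16384, 4096));  /* 4096 */
            /* Expanding write in-memory (4K -> 16K) then rename: keep
             * the on-disk 4K, since the larger size is only valid with
             * the data a full log of the inode would carry. */
            printf("%llu\n", size_to_log(4096, 16384));  /* 4096 */
            return 0;
    }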
+ */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; + btrfs_release_path(path); return 0; } @@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = drop_inode_items(trans, log, path, inode, BTRFS_XATTR_ITEM_KEY); } else { - if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { + if (inode_only == LOG_INODE_EXISTS) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - ret = logged_inode_size(log, inode, path, &logged_isize); + ret = get_inode_size_to_log(trans, inode, path, &logged_isize); if (ret) goto out_unlock; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab51e6ecfdef..6b8e810a35ce 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8099,8 +8099,9 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) smp_rmb(); ret = update_dev_stat_item(trans, device); - if (!ret) - atomic_sub(stats_cnt, &device->dev_stats_ccnt); + if (ret) + break; + atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 1b7544aa88a2..c676e715b4f8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -308,7 +308,9 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Queue the remaining part of the folio. */ if (workspace->strm.total_out > bio->bi_iter.bi_size) { - u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); + const u32 cur_len = workspace->strm.total_out - bio->bi_iter.bi_size; + + ASSERT(cur_len <= folio_size(out_folio)); if (!bio_add_folio(bio, out_folio, cur_len, 0)) { ret = -E2BIG; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 72206a292676..3baee4e7c1cf 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -14,7 +14,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-inode-test-objs += inode-test.o -obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o +ext4-test-objs += inode-test.o mballoc-test.o \ + extents-test.o +obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o ext4-$(CONFIG_FS_VERITY) += verity.o ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index cf0a0970c095..f41f320f4437 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -163,10 +163,17 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, */ if (handle) { + /* + * Since the inode is new it is ok to pass the + * XATTR_CREATE flag. This is necessary to match the + * remaining journal credits check in the set_handle + * function with the credits allocated for the new + * inode. 
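In the btrfs zlib hunk above, `offset_in_folio(out_folio, strm.total_out)` is replaced by `strm.total_out - bio->bi_iter.bi_size`: "bytes produced but not yet queued" is a subtraction, not an offset within the folio, and the two expressions agree only when the bytes already in the bio happen to end exactly at a folio boundary of the output. A numeric sketch (the 16 KiB output folio is an assumption for illustration):

    #include <stdio.h>

    typedef unsigned int u32;

    int main(void)
    {
            const u32 folio_size = 16384;   /* assumed large output folio */
            u32 total_out = 5000;           /* bytes zlib has produced */
            u32 bi_size   = 4096;           /* bytes already in the bio */

            /* Old expression: offset of total_out within the folio. */
            u32 old_len = total_out % folio_size;   /* 5000: too much */
            /* Fixed expression: produced minus already queued. */
            u32 new_len = total_out - bi_size;      /* 904: correct */

            printf("old=%u new=%u\n", old_len, new_len);
            return 0;
    }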
+ */ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); + ctx, len, XATTR_CREATE); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 293f698b7042..7617e2d454ea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1570,6 +1570,7 @@ struct ext4_sb_info { struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; + struct mutex s_error_notify_mutex; /* protects sysfs_notify vs kobject_del */ struct super_block *s_sb; struct buffer_head *s_mmp_bh; @@ -3944,6 +3945,11 @@ static inline bool ext4_inode_can_atomic_write(struct inode *inode) extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); + +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +#define EXPORT_SYMBOL_FOR_EXT4_TEST(sym) \ + EXPORT_SYMBOL_FOR_MODULES(sym, "ext4-test") +#endif #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index c484125d963f..ebaf7cc42430 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -264,5 +264,17 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +extern int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path); +extern int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex); +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +extern int ext4_ext_space_root_idx_test(struct inode *inode, int check); +extern struct ext4_ext_path *ext4_split_convert_extents_test( + handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags, unsigned int *allocated); +#endif #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 7c4690eb7dad..5496b2c8e2cd 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -142,8 +142,10 @@ static struct file_system_type ext_fs_type = { static void extents_kunit_exit(struct kunit *test) { - struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; + struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb; + struct ext4_sb_info *sbi = sb->s_fs_info; + ext4_es_unregister_shrinker(sbi); kfree(sbi); kfree(k_ctx.k_ei); kfree(k_ctx.k_data); @@ -280,8 +282,8 @@ static int extents_kunit_init(struct kunit *test) eh->eh_depth = 0; eh->eh_entries = cpu_to_le16(1); eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = - cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root_idx_test( + &k_ctx.k_ei->vfs_inode, 0)); eh->eh_generation = 0; /* @@ -384,8 +386,8 @@ static void test_split_convert(struct kunit *test) switch (param->type) { case TEST_SPLIT_CONVERT: - path = ext4_split_convert_extents(NULL, inode, &map, path, - param->split_flags, NULL); + path = ext4_split_convert_extents_test(NULL, inode, &map, + path, param->split_flags, NULL); break; case TEST_CREATE_BLOCKS: ext4_map_create_blocks_helper(test, inode, &map, param->split_flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae3804f36535..8cce1479be6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -184,9 +184,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) 
+int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -1736,6 +1736,13 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; + if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { + EXT4_ERROR_INODE(inode, + "path[%d].p_idx %p > EXT_LAST_INDEX %p", + k, path[k].p_idx, + EXT_LAST_INDEX(path[k].p_hdr)); + return -EFSCORRUPTED; + } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) @@ -1748,6 +1755,14 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; + if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { + EXT4_ERROR_INODE(inode, + "path[%d].p_idx %p > EXT_LAST_INDEX %p", + k, path[k].p_idx, + EXT_LAST_INDEX(path[k].p_hdr)); + err = -EFSCORRUPTED; + goto clean; + } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) @@ -3144,7 +3159,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) } /* FIXME!! we need to try to merge to left or right after zero-out */ -static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; @@ -3239,6 +3254,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, insert_err = PTR_ERR(path); err = 0; + if (insert_err != -ENOSPC && insert_err != -EDQUOT && + insert_err != -ENOMEM) + goto out_path; /* * Get a new path to try to zeroout or fix the extent length. @@ -3255,13 +3273,20 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, goto out_path; } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, + "bad extent address lblock: %lu, depth: %d pblock %llu", + (unsigned long)ee_block, depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - depth = ext_depth(inode); - ex = path[depth].p_ext; - fix_extent_len: ex->ee_len = orig_ex.ee_len; err = ext4_ext_dirty(handle, inode, path + path->p_depth); @@ -3363,7 +3388,7 @@ static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, ext4_ext_mark_initialized(ex); - ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + depth); if (err) return err; @@ -4457,9 +4482,13 @@ got_allocated_blocks: path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (IS_ERR(path)) { err = PTR_ERR(path); - if (allocated_clusters) { + /* + * Gracefully handle out of space conditions. If the filesystem + * is inconsistent, we'll just leak allocated blocks to avoid + * causing even more damage. + */ + if (allocated_clusters && (err == -EDQUOT || err == -ENOSPC)) { int fb_flags = 0; - /* * free data blocks we just allocated. 
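The `ext4_split_extent_at()` change above narrows the zeroout/fixup fallback to resource-style failures: only -ENOSPC, -EDQUOT and -ENOMEM are worth retrying against a fresh path, while corruption or I/O errors must propagate untouched, and the later hunk applies the same triage before freeing just-allocated blocks. The shape of that triage, sketched with userspace errno names:

    #include <errno.h>
    #include <stdio.h>

    /* Is this failure transient resource pressure (worth a fallback),
     * or structural (must propagate as-is)? */
    static int worth_retrying(int err)
    {
            switch (err) {
            case ENOSPC:
            case EDQUOT:
            case ENOMEM:
                    return 1;
            default:
                    return 0;
            }
    }

    int main(void)
    {
            int samples[] = { ENOSPC, EIO, ENOMEM, EROFS };

            for (unsigned int i = 0;
                 i < sizeof(samples) / sizeof(samples[0]); i++)
                    printf("err=%d retry=%d\n", samples[i],
                           worth_retrying(samples[i]));
            return 0;
    }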
* not a good idea to call discard here directly, @@ -6238,6 +6267,33 @@ out: return 0; } -#ifdef CONFIG_EXT4_KUNIT_TESTS -#include "extents-test.c" +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +int ext4_ext_space_root_idx_test(struct inode *inode, int check) +{ + return ext4_ext_space_root_idx(inode, check); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_space_root_idx_test); + +struct ext4_ext_path *ext4_split_convert_extents_test(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int *allocated) +{ + return ext4_split_convert_extents(handle, inode, map, path, + flags, allocated); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_split_convert_extents_test); + +EXPORT_SYMBOL_FOR_EXT4_TEST(__ext4_ext_dirty); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_zeroout); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_register_shrinker); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_unregister_shrinker); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_create_blocks); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_init_tree); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_lookup_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_insert_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_insert_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_find_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_issue_zeroout); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_query_blocks); #endif diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f575751f1cae..2f0057e04934 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -975,13 +975,13 @@ static int ext4_fc_flush_data(journal_t *journal) int ret = 0; list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ret = jbd2_submit_inode_data(journal, ei->jinode); + ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ret = jbd2_wait_inode_data(journal, ei->jinode); + ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } @@ -1613,19 +1613,21 @@ static int ext4_fc_replay_inode(struct super_block *sb, /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) - goto out; + goto out_brelse; ret = sync_dirty_buffer(iloc.bh); if (ret) - goto out; + goto out_brelse; ret = ext4_mark_inode_used(sb, ino); if (ret) - goto out; + goto out_brelse; /* Given that we just wrote the inode on disk, this SHOULD succeed. 
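The fast-commit replay fix visible here is a cleanup-ladder repair: several early `goto out` paths skipped `brelse(iloc.bh)`, and a failed `ext4_iget()` returned without releasing the buffer head at all. The reliable shape is one label per resource, released in reverse order of acquisition; a distilled, runnable sketch:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { int dummy; };

    static struct buf *get_buf(void)   { return malloc(sizeof(struct buf)); }
    static void put_buf(struct buf *b) { free(b); }  /* brelse() analogue */
    static int step(int fail)          { return fail ? -1 : 0; }

    static int replay(int fail_at)
    {
            struct buf *bh;
            int ret;

            bh = get_buf();
            if (!bh)
                    return -1;

            ret = step(fail_at == 1);
            if (ret)
                    goto out_put;      /* must NOT skip put_buf() */
            ret = step(fail_at == 2);
            if (ret)
                    goto out_put;

    out_put:
            put_buf(bh);               /* single release point */
            return ret;
    }

    int main(void)
    {
            printf("%d %d %d\n", replay(0), replay(1), replay(2));
            return 0;
    }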
*/ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); - return -EFSCORRUPTED; + inode = NULL; + ret = -EFSCORRUPTED; + goto out_brelse; } /* @@ -1642,13 +1644,14 @@ static int ext4_fc_replay_inode(struct super_block *sb, ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); +out_brelse: brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); - return 0; + return ret; } /* diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e476c6de3074..bd8f230fa507 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -83,11 +83,23 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, int datasync, bool *needs_barrier) { struct inode *inode = file->f_inode; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; int ret; ret = generic_buffers_fsync_noflush(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); + if (ret) + return ret; + + /* Force writeout of inode table buffer to disk */ + ret = ext4_write_inode(inode, &wbc); + if (ret) + return ret; + + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) *needs_barrier = true; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b20a1bf866ab..b1bc1950c9f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -686,6 +686,12 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (unlikely(!gdp)) return 0; + /* Inode was never used in this filesystem? */ + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT) || + ino >= EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp))) + return 0; + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1f6bc05593df..408677fa8196 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -522,7 +522,15 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - BUG_ON(len > PAGE_SIZE); + + if (len > PAGE_SIZE) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "inline size %zu exceeds PAGE_SIZE", len); + ret = -EFSCORRUPTED; + brelse(iloc.bh); + goto out; + } + kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); kaddr = folio_zero_tail(folio, len, kaddr + len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..1123d995494b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -128,6 +128,8 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode); + trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for @@ -135,10 +137,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. 
*/ - if (!EXT4_I(inode)->jinode) + if (!jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), - EXT4_I(inode)->jinode, + jinode, new_size); } @@ -184,6 +186,14 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { + /* + * If there's dirty page will lead to data loss, user + * could see stale data. + */ + if (unlikely(!ext4_emergency_state(inode->i_sb) && + mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY))) + ext4_warning_inode(inode, "data will be lost"); + truncate_inode_pages_final(&inode->i_data); goto no_delete; @@ -4451,8 +4461,13 @@ int ext4_inode_attach_jinode(struct inode *inode) spin_unlock(&inode->i_lock); return -ENOMEM; } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); + jbd2_journal_init_jbd_inode(jinode, inode); + /* + * Publish ->jinode only after it is fully initialized so that + * readers never observe a partially initialized jbd2_inode. + */ + smp_wmb(); + WRITE_ONCE(ei->jinode, jinode); jinode = NULL; } spin_unlock(&inode->i_lock); @@ -5401,18 +5416,36 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - if (inode->i_size == 0 || - inode->i_size >= sizeof(ei->i_data) || - strnlen((char *)ei->i_data, inode->i_size + 1) != - inode->i_size) { - ext4_error_inode(inode, function, line, 0, - "invalid fast symlink length %llu", - (unsigned long long)inode->i_size); - ret = -EFSCORRUPTED; - goto bad_inode; + + /* + * Orphan cleanup can see inodes with i_size == 0 + * and i_data uninitialized. Skip size checks in + * that case. This is safe because the first thing + * ext4_evict_inode() does for fast symlinks is + * clearing of i_data and i_size. + */ + if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if (inode->i_nlink != 0) { + ext4_error_inode(inode, function, line, 0, + "invalid orphan symlink nlink %d", + inode->i_nlink); + ret = -EFSCORRUPTED; + goto bad_inode; + } + } else { + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } - inode_set_cached_link(inode, (char *)ei->i_data, - inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } @@ -5849,6 +5882,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (attr->ia_size == inode->i_size) inc_ivers = false; + /* + * If file has inline data but new size exceeds inline capacity, + * convert to extent-based storage first to prevent inconsistent + * state (inline flag set but size exceeds inline capacity). 
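The `ext4_inode_attach_jinode()` hunk above is the publication side of the `READ_ONCE()` readers added earlier in this series: initialize the object fully, order those stores with `smp_wmb()`, then publish the pointer with `WRITE_ONCE()`, so a lockless reader can never observe a half-built `jbd2_inode`. The portable C11 analogue is a release store paired with an acquire load (slightly stronger than the kernel's address-dependency-ordered read); a self-contained sketch:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct jnode { int a, b; };                 /* stand-in payload */

    static _Atomic(struct jnode *) published;   /* like ei->jinode */

    static void attach(void)
    {
            struct jnode *j = malloc(sizeof(*j));

            j->a = 1;                           /* fully initialize ... */
            j->b = 2;
            /* ... then publish; release orders the init before the
             * store (the kernel spells this smp_wmb() + WRITE_ONCE()). */
            atomic_store_explicit(&published, j, memory_order_release);
    }

    static void reader(void)
    {
            /* Acquire pairs with the release store above. */
            struct jnode *j = atomic_load_explicit(&published,
                                                   memory_order_acquire);
            if (j)
                    printf("%d %d\n", j->a, j->b);  /* never half-built */
    }

    int main(void)
    {
            attach();
            reader();
            return 0;
    }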
+ */ + if (ext4_has_inline_data(inode) && + attr->ia_size > EXT4_I(inode)->i_inline_size) { + error = ext4_convert_inline_data(inode); + if (error) + goto err_out; + } + if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 9fbdf6a09489..6f5bfbb0e8a4 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -8,6 +8,7 @@ #include <linux/random.h> #include "ext4.h" +#include "mballoc.h" struct mbt_grp_ctx { struct buffer_head bitmap_bh; @@ -336,7 +337,7 @@ ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, if (state) mb_set_bits(bitmap_bh->b_data, blkoff, len); else - mb_clear_bits(bitmap_bh->b_data, blkoff, len); + mb_clear_bits_test(bitmap_bh->b_data, blkoff, len); return 0; } @@ -413,14 +414,14 @@ static void test_new_blocks_simple(struct kunit *test) /* get block at goal */ ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, "failed to alloc block at goal, expected %llu found %llu", ar.goal, found); /* get block after goal in goal group */ ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, "failed to alloc block after goal in goal group, expected %llu found %llu", ar.goal + 1, found); @@ -428,7 +429,7 @@ static void test_new_blocks_simple(struct kunit *test) /* get block after goal group */ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ext4_group_first_block_no(sb, goal_group + 1), found, "failed to alloc block after goal group, expected %llu found %llu", @@ -438,7 +439,7 @@ static void test_new_blocks_simple(struct kunit *test) for (i = goal_group; i < ext4_get_groups_count(sb); i++) mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, "failed to alloc block before goal group, expected %llu found %llu", @@ -448,7 +449,7 @@ static void test_new_blocks_simple(struct kunit *test) for (i = 0; i < ext4_get_groups_count(sb); i++) mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_NE_MSG(test, err, 0, "unexpectedly get block when no block is available"); } @@ -492,16 +493,16 @@ validate_free_blocks_simple(struct kunit *test, struct super_block *sb, continue; bitmap = mbt_ctx_bitmap(sb, i); - bit = mb_find_next_zero_bit(bitmap, max, 0); + bit = mb_find_next_zero_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ_MSG(test, bit, max, "free block on unexpected group %d", i); } bitmap = mbt_ctx_bitmap(sb, goal_group); - bit = mb_find_next_zero_bit(bitmap, max, 0); + bit = mb_find_next_zero_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ(test, bit, start); - bit = mb_find_next_bit(bitmap, max, bit + 1); + bit = mb_find_next_bit_test(bitmap, max, bit + 1); 
KUNIT_ASSERT_EQ(test, bit, start + len); } @@ -524,7 +525,7 @@ test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, block = ext4_group_first_block_no(sb, goal_group) + EXT4_C2B(sbi, start); - ext4_free_blocks_simple(inode, block, len); + ext4_free_blocks_simple_test(inode, block, len); validate_free_blocks_simple(test, sb, goal_group, start, len); mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); } @@ -566,15 +567,15 @@ test_mark_diskspace_used_range(struct kunit *test, bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); memset(bitmap, 0, sb->s_blocksize); - ret = ext4_mb_mark_diskspace_used(ac, NULL); + ret = ext4_mb_mark_diskspace_used_test(ac, NULL); KUNIT_ASSERT_EQ(test, ret, 0); max = EXT4_CLUSTERS_PER_GROUP(sb); - i = mb_find_next_bit(bitmap, max, 0); + i = mb_find_next_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ(test, i, start); - i = mb_find_next_zero_bit(bitmap, max, i + 1); + i = mb_find_next_zero_bit_test(bitmap, max, i + 1); KUNIT_ASSERT_EQ(test, i, start + len); - i = mb_find_next_bit(bitmap, max, i + 1); + i = mb_find_next_bit_test(bitmap, max, i + 1); KUNIT_ASSERT_EQ(test, max, i); } @@ -617,54 +618,54 @@ static void mbt_generate_buddy(struct super_block *sb, void *buddy, max = EXT4_CLUSTERS_PER_GROUP(sb); bb_h = buddy + sbi->s_mb_offsets[1]; - off = mb_find_next_zero_bit(bb, max, 0); + off = mb_find_next_zero_bit_test(bb, max, 0); grp->bb_first_free = off; while (off < max) { grp->bb_counters[0]++; grp->bb_free++; - if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) { grp->bb_free++; grp->bb_counters[0]--; - mb_clear_bit(off >> 1, bb_h); + mb_clear_bit_test(off >> 1, bb_h); grp->bb_counters[1]++; grp->bb_largest_free_order = 1; off++; } - off = mb_find_next_zero_bit(bb, max, off + 1); + off = mb_find_next_zero_bit_test(bb, max, off + 1); } for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { bb = buddy + sbi->s_mb_offsets[order]; bb_h = buddy + sbi->s_mb_offsets[order + 1]; max = max >> 1; - off = mb_find_next_zero_bit(bb, max, 0); + off = mb_find_next_zero_bit_test(bb, max, 0); while (off < max) { - if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) { mb_set_bits(bb, off, 2); grp->bb_counters[order] -= 2; - mb_clear_bit(off >> 1, bb_h); + mb_clear_bit_test(off >> 1, bb_h); grp->bb_counters[order + 1]++; grp->bb_largest_free_order = order + 1; off++; } - off = mb_find_next_zero_bit(bb, max, off + 1); + off = mb_find_next_zero_bit_test(bb, max, off + 1); } } max = EXT4_CLUSTERS_PER_GROUP(sb); - off = mb_find_next_zero_bit(bitmap, max, 0); + off = mb_find_next_zero_bit_test(bitmap, max, 0); while (off < max) { grp->bb_fragments++; - off = mb_find_next_bit(bitmap, max, off + 1); + off = mb_find_next_bit_test(bitmap, max, off + 1); if (off + 1 >= max) break; - off = mb_find_next_zero_bit(bitmap, max, off + 1); + off = mb_find_next_zero_bit_test(bitmap, max, off + 1); } } @@ -706,7 +707,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, /* needed by validation in ext4_mb_generate_buddy */ ext4_grp->bb_free = mbt_grp->bb_free; memset(ext4_buddy, 0xff, sb->s_blocksize); - ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, ext4_grp); KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), @@ -760,7 +761,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, ex.fe_group = TEST_GOAL_GROUP; 
ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(e4b, &ex); + mb_mark_used_test(e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); mb_set_bits(bitmap, start, len); @@ -769,7 +770,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, memset(buddy, 0xff, sb->s_blocksize); for (i = 0; i < MB_NUM_ORDERS(sb); i++) grp->bb_counters[i] = 0; - ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), 0); @@ -798,7 +799,7 @@ static void test_mb_mark_used(struct kunit *test) bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); @@ -809,7 +810,7 @@ static void test_mb_mark_used(struct kunit *test) test_mb_mark_used_range(test, &e4b, ranges[i].start, ranges[i].len, bitmap, buddy, grp); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } static void @@ -825,16 +826,16 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, return; ext4_lock_group(sb, e4b->bd_group); - mb_free_blocks(NULL, e4b, start, len); + mb_free_blocks_test(NULL, e4b, start, len); ext4_unlock_group(sb, e4b->bd_group); - mb_clear_bits(bitmap, start, len); + mb_clear_bits_test(bitmap, start, len); /* bypass bb_free validatoin in ext4_mb_generate_buddy */ grp->bb_free += len; memset(buddy, 0xff, sb->s_blocksize); for (i = 0; i < MB_NUM_ORDERS(sb); i++) grp->bb_counters[i] = 0; - ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), 0); @@ -865,7 +866,7 @@ static void test_mb_free_blocks(struct kunit *test) bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); ex.fe_start = 0; @@ -873,7 +874,7 @@ static void test_mb_free_blocks(struct kunit *test) ex.fe_group = TEST_GOAL_GROUP; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(&e4b, &ex); + mb_mark_used_test(&e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); grp->bb_free = 0; @@ -886,7 +887,7 @@ static void test_mb_free_blocks(struct kunit *test) test_mb_free_blocks_range(test, &e4b, ranges[i].start, ranges[i].len, bitmap, buddy, grp); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } #define COUNT_FOR_ESTIMATE 100000 @@ -904,7 +905,7 @@ static void test_mb_mark_used_cost(struct kunit *test) if (sb->s_blocksize > PAGE_SIZE) kunit_skip(test, "blocksize exceeds pagesize"); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); ex.fe_group = TEST_GOAL_GROUP; @@ -918,7 +919,7 @@ static void test_mb_mark_used_cost(struct kunit *test) ex.fe_start = ranges[i].start; ex.fe_len = ranges[i].len; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(&e4b, &ex); + mb_mark_used_test(&e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); } end = jiffies; @@ -929,14 +930,14 @@ static void test_mb_mark_used_cost(struct kunit *test) continue; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_free_blocks(NULL, &e4b, ranges[i].start, + mb_free_blocks_test(NULL, &e4b, ranges[i].start, ranges[i].len); 
ext4_unlock_group(sb, TEST_GOAL_GROUP); } } kunit_info(test, "costed jiffies %lu\n", all); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } static const struct mbt_ext4_block_layout mbt_test_layouts[] = { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..bb58eafb87bc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1199,6 +1199,8 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; + if (start >= ngroups) + start = 0; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; @@ -2443,8 +2445,12 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); - if (err) + if (err) { + if (EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info) && + !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return 0; return err; + } ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) @@ -3580,9 +3586,7 @@ err_freebuddy: rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: - rcu_read_lock(); - kvfree(rcu_dereference(sbi->s_group_info)); - rcu_read_unlock(); + kvfree(rcu_access_pointer(sbi->s_group_info)); return -ENOMEM; } @@ -3889,15 +3893,14 @@ void ext4_mb_release(struct super_block *sb) struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; - if (test_opt(sb, DISCARD)) { - /* - * wait the discard work to drain all of ext4_free_data - */ - flush_work(&sbi->s_discard_work); - WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); - } + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); - if (sbi->s_group_info) { + group_info = rcu_access_pointer(sbi->s_group_info); + if (group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); @@ -3915,12 +3918,9 @@ void ext4_mb_release(struct super_block *sb) num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - rcu_read_lock(); - group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); - rcu_read_unlock(); } ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); @@ -4084,7 +4084,7 @@ void ext4_exit_mballoc(void) #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 -static int +int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) @@ -7188,6 +7188,102 @@ out_unload: return error; } -#ifdef CONFIG_EXT4_KUNIT_TESTS -#include "mballoc-test.c" +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +void mb_clear_bits_test(void *bm, int cur, int len) +{ + mb_clear_bits(bm, cur, len); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test); + +ext4_fsblk_t +ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar, + int *errp) +{ + return ext4_mb_new_blocks_simple(ar, errp); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_new_blocks_simple_test); + +int mb_find_next_zero_bit_test(void *addr, int max, int start) +{ + return mb_find_next_zero_bit(addr, max, start); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_zero_bit_test); + +int mb_find_next_bit_test(void *addr, int max, int start) +{ + return mb_find_next_bit(addr, max, start); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_bit_test); + +void 
mb_clear_bit_test(int bit, void *addr) +{ + mb_clear_bit(bit, addr); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bit_test); + +int mb_test_bit_test(int bit, void *addr) +{ + return mb_test_bit(bit, addr); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_test_bit_test); + +int ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac, + handle_t *handle) +{ + return ext4_mb_mark_diskspace_used(ac, handle); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_diskspace_used_test); + +int mb_mark_used_test(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + return mb_mark_used(e4b, ex); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_mark_used_test); + +void ext4_mb_generate_buddy_test(struct super_block *sb, void *buddy, + void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) +{ + ext4_mb_generate_buddy(sb, buddy, bitmap, group, grp); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_generate_buddy_test); + +int ext4_mb_load_buddy_test(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy(sb, group, e4b); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_load_buddy_test); + +void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b) +{ + ext4_mb_unload_buddy(e4b); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_unload_buddy_test); + +void mb_free_blocks_test(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + mb_free_blocks(inode, e4b, first, count); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_free_blocks_test); + +void ext4_free_blocks_simple_test(struct inode *inode, ext4_fsblk_t block, + unsigned long count) +{ + return ext4_free_blocks_simple(inode, block, count); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_blocks_simple_test); + +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_wait_block_bitmap); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_init); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_desc); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_count_free_clusters); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_info); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_group_clusters_set); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_release); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_read_block_bitmap_nowait); +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_set_bits); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_fc_init_inode); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_context); #endif diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 15a049f05d04..39333ce72cbd 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -270,4 +270,34 @@ ext4_mballoc_query_range( ext4_mballoc_query_range_fn formatter, void *priv); +extern int ext4_mb_mark_context(handle_t *handle, + struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed); +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +extern void mb_clear_bits_test(void *bm, int cur, int len); +extern ext4_fsblk_t +ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar, + int *errp); +extern int mb_find_next_zero_bit_test(void *addr, int max, int start); +extern int mb_find_next_bit_test(void *addr, int max, int start); +extern void mb_clear_bit_test(int bit, void *addr); +extern int mb_test_bit_test(int bit, void *addr); +extern int +ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac, + handle_t *handle); +extern int mb_mark_used_test(struct ext4_buddy *e4b, + struct ext4_free_extent *ex); +extern void ext4_mb_generate_buddy_test(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp); +extern int ext4_mb_load_buddy_test(struct super_block *sb, + ext4_group_t group, 
struct ext4_buddy *e4b); +extern void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b); +extern void mb_free_blocks_test(struct inode *inode, + struct ext4_buddy *e4b, int first, int count); +extern void ext4_free_blocks_simple_test(struct inode *inode, + ext4_fsblk_t block, unsigned long count); +#endif #endif diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a8c95eee91b7..39fe50b3c662 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -524,9 +524,15 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Nothing to submit? Just unlock the folio... */ - if (!nr_to_submit) + if (!nr_to_submit) { + /* + * We have nothing to submit. Just cycle the folio through + * writeback state to properly update xarray tags. + */ + __folio_start_writeback(folio, keep_towrite); + folio_end_writeback(folio); return 0; + } bh = head = folio_buffers(folio); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 43f680c750ae..a34efb44e73d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1254,12 +1254,10 @@ static void ext4_group_desc_free(struct ext4_sb_info *sbi) struct buffer_head **group_desc; int i; - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); + group_desc = rcu_access_pointer(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); - rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) @@ -1267,14 +1265,12 @@ static void ext4_flex_groups_free(struct ext4_sb_info *sbi) struct flex_groups **flex_groups; int i; - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); + flex_groups = rcu_access_pointer(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } - rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) @@ -1527,6 +1523,27 @@ void ext4_clear_inode(struct inode *inode) invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); + /* + * We must remove the inode from the hash before ext4_free_inode() + * clears the bit in inode bitmap as otherwise another process reusing + * the inode will block in insert_inode_hash() waiting for inode + * eviction to complete while holding transaction handle open, but + * ext4_evict_inode() still running for that inode could block waiting + * for transaction commit if the inode is marked as IS_SYNC => deadlock. + * + * Removing the inode from the hash here is safe. There are two cases + * to consider: + * 1) The inode still has references to it (i_nlink > 0). In that case + * we are keeping the inode and once we remove the inode from the hash, + * iget() can create the new inode structure for the same inode number + * and we are fine with that as all IO on behalf of the inode is + * finished. + * 2) We are deleting the inode (i_nlink == 0). In that case inode + * number cannot be reused until ext4_free_inode() clears the bit in + * the inode bitmap, at which point all IO is done and reuse is fine + * again. 
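The `*_test` prototypes just above, together with the wrapper definitions in the mballoc hunks, are the glue that lets the KUnit tests move out of `#include "mballoc-test.c"` and into a standalone `ext4-test` module: each static helper gains a thin wrapper whose export is scoped to that one module. A compilable sketch of the idiom; the stubs stand in for <linux/kconfig.h> and the `EXPORT_SYMBOL_FOR_EXT4_TEST` macro from the ext4.h hunk, and the helper itself is invented for illustration:

    #include <stdio.h>

    /* Stand-ins so the sketch compiles on its own; in-tree these come
     * from <linux/kconfig.h> and the export macro in ext4.h. */
    #define IS_ENABLED(option) 1
    #define EXPORT_SYMBOL_FOR_EXT4_TEST(sym) extern __typeof__(sym) sym

    /* The real logic stays static and private to the subsystem... */
    static int mb_core_helper(int x)
    {
            return x * 2;
    }

    #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
    /* ...and only a trivial wrapper is exposed, and only to the test
     * module, so it never becomes kernel-wide API. */
    int mb_core_helper_test(int x)
    {
            return mb_core_helper(x);
    }
    EXPORT_SYMBOL_FOR_EXT4_TEST(mb_core_helper_test);
    #endif

    int main(void)
    {
            printf("%d\n", mb_core_helper_test(21)); /* 42 */
            return 0;
    }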
+ */ + remove_inode_hash(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -3633,6 +3650,13 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) "extents feature\n"); return 0; } + if (ext4_has_feature_bigalloc(sb) && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_msg(sb, KERN_WARNING, + "bad geometry: bigalloc file system with non-zero " + "first_data_block\n"); + return 0; + } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || @@ -5403,6 +5427,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); + mutex_init(&sbi->s_error_notify_mutex); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b87d7bdab06a..923b375e017f 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -597,7 +597,10 @@ static const struct kobj_type ext4_feat_ktype = { void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) { - sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); + mutex_lock(&sbi->s_error_notify_mutex); + if (sbi->s_kobj.state_in_sysfs) + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); + mutex_unlock(&sbi->s_error_notify_mutex); } static struct kobject *ext4_root; @@ -610,8 +613,10 @@ int ext4_register_sysfs(struct super_block *sb) int err; init_completion(&sbi->s_kobj_unregister); + mutex_lock(&sbi->s_error_notify_mutex); err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); + mutex_unlock(&sbi->s_error_notify_mutex); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); @@ -644,7 +649,10 @@ void ext4_unregister_sysfs(struct super_block *sb) if (sbi->s_proc) remove_proc_subtree(sb->s_id, ext4_proc_root); + + mutex_lock(&sbi->s_error_notify_mutex); kobject_del(&sbi->s_kobj); + mutex_unlock(&sbi->s_error_notify_mutex); } int __init ext4_init_sysfs(void) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7c75ed7e8979..3c75ee025bda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1711,6 +1711,19 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, } } +static bool __sync_lazytime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode_state_read(inode) & I_DIRTY_TIME)) { + spin_unlock(&inode->i_lock); + return false; + } + inode_state_clear(inode, I_DIRTY_TIME); + spin_unlock(&inode->i_lock); + inode->i_op->sync_lazytime(inode); + return true; +} + bool sync_lazytime(struct inode *inode) { if (!(inode_state_read_once(inode) & I_DIRTY_TIME)) @@ -1718,9 +1731,8 @@ bool sync_lazytime(struct inode *inode) trace_writeback_lazytime(inode); if (inode->i_op->sync_lazytime) - inode->i_op->sync_lazytime(inode); - else - mark_inode_dirty_sync(inode); + return __sync_lazytime(inode); + mark_inode_dirty_sync(inode); return true; } @@ -2775,13 +2787,8 @@ static void wait_sb_inodes(struct super_block *sb) * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. - * - * If the mapping does not have data integrity semantics, - * there's no need to wait for the writeout to complete, as the - * mapping cannot guarantee that data is persistently stored. 
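The ext4 sysfs hunks above serialize `sysfs_notify()` against kobject registration and deletion with `s_error_notify_mutex`, checking `state_in_sysfs` under the lock so an error reported during mount or unmount never pokes a dead kobject. A userspace sketch of the same guard, with a pthread mutex standing in for the kernel mutex:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int registered;              /* like kobj.state_in_sysfs */

    static void notify_error(void)
    {
            pthread_mutex_lock(&lock);
            if (registered)             /* never poke a dead kobject */
                    printf("sysfs_notify(errors_count)\n");
            pthread_mutex_unlock(&lock);
    }

    static void register_obj(void)     /* kobject_init_and_add() */
    {
            pthread_mutex_lock(&lock);
            registered = 1;
            pthread_mutex_unlock(&lock);
    }

    static void unregister_obj(void)   /* kobject_del() */
    {
            pthread_mutex_lock(&lock);
            registered = 0;
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            notify_error();             /* early error: safely skipped */
            register_obj();
            notify_error();             /* delivered */
            unregister_obj();
            notify_error();             /* skipped, no use-after-del */
            return 0;
    }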
*/ - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) || - mapping_no_data_integrity(mapping)) + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); @@ -2916,6 +2923,17 @@ void sync_inodes_sb(struct super_block *sb) */ if (bdi == &noop_backing_dev_info) return; + + /* + * If the superblock has SB_I_NO_DATA_INTEGRITY set, there's no need to + * wait for the writeout to complete, as the filesystem cannot guarantee + * data persistence on sync. Just kick off writeback and return. + */ + if (sb->s_iflags & SB_I_NO_DATA_INTEGRITY) { + wakeup_flusher_threads_bdi(bdi, WB_REASON_SYNC); + return; + } + WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b1bb7153cb78..676fd9856bfb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3201,10 +3201,8 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; - if (fc->writeback_cache) { + if (fc->writeback_cache) mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); - mapping_set_no_data_integrity(&inode->i_data); - } INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e57b8af06be9..c795abe47a4f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1709,6 +1709,7 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; sb->s_iflags |= SB_I_NOIDMAP; + sb->s_iflags |= SB_I_NO_DATA_INTEGRITY; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index fc045f2e4c45..edd908183058 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -8,7 +8,10 @@ #include "internal.h" #include "trace.h" -static void iomap_read_end_io(struct bio *bio) +static DEFINE_SPINLOCK(failed_read_lock); +static struct bio_list failed_read_list = BIO_EMPTY_LIST; + +static void __iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; @@ -18,6 +21,52 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } +static void +iomap_fail_reads( + struct work_struct *work) +{ + struct bio *bio; + struct bio_list tmp = BIO_EMPTY_LIST; + unsigned long flags; + + spin_lock_irqsave(&failed_read_lock, flags); + bio_list_merge_init(&tmp, &failed_read_list); + spin_unlock_irqrestore(&failed_read_lock, flags); + + while ((bio = bio_list_pop(&tmp)) != NULL) { + __iomap_read_end_io(bio); + cond_resched(); + } +} + +static DECLARE_WORK(failed_read_work, iomap_fail_reads); + +static void iomap_fail_buffered_read(struct bio *bio) +{ + unsigned long flags; + + /* + * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions + * in the fserror code. The caller no longer owns the bio reference + * after the spinlock drops. 
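`iomap_fail_buffered_read()` above, whose body continues in the next stretch of the hunk, batches failed bios on a lock-protected list and schedules the worker only on the empty-to-nonempty transition, so a burst of failing completions costs a single work item; the transition test must sit under the same lock as the insertion. A distilled sketch of that scheduling rule, with a push-to-front list as a stand-in for the kernel's bio_list:

    #include <stdio.h>

    struct item { struct item *next; };

    static struct item *head;           /* failed list */
    static int scheduled;               /* schedule_work() counter */

    /* In real code, call with the list lock held. */
    static void add_failed(struct item *it)
    {
            if (!head) {                /* empty -> nonempty: kick worker */
                    scheduled++;
                    printf("schedule_work()\n");
            }
            it->next = head;            /* bio_list_add() analogue */
            head = it;
    }

    int main(void)
    {
            struct item a = {0}, b = {0}, c = {0};

            add_failed(&a);             /* schedules */
            add_failed(&b);             /* batched, no new schedule */
            add_failed(&c);             /* batched */
            printf("work scheduled %d time(s) for 3 failures\n",
                   scheduled);
            return 0;
    }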
+ */ + spin_lock_irqsave(&failed_read_lock, flags); + if (bio_list_empty(&failed_read_list)) + WARN_ON_ONCE(!schedule_work(&failed_read_work)); + bio_list_add(&failed_read_list, bio); + spin_unlock_irqrestore(&failed_read_lock, flags); +} + +static void iomap_read_end_io(struct bio *bio) +{ + if (bio->bi_status) { + iomap_fail_buffered_read(bio); + return; + } + + __iomap_read_end_io(bio); +} + static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 00f0efaf12b2..92a831cf4bf1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -514,6 +514,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; size_t folio_len = folio_size(folio); + struct iomap_folio_state *ifs; size_t poff, plen; loff_t pos_diff; int ret; @@ -525,7 +526,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, return iomap_iter_advance(iter, length); } - ifs_alloc(iter->inode, folio, iter->flags); + ifs = ifs_alloc(iter->inode, folio, iter->flags); length = min_t(loff_t, length, folio_len - offset_in_folio(folio, pos)); while (length) { @@ -560,11 +561,15 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, *bytes_submitted += plen; /* - * If the entire folio has been read in by the IO - * helper, then the helper owns the folio and will end - * the read on it. + * Hand off folio ownership to the IO helper when: + * 1) The entire folio has been submitted for IO, or + * 2) There is no ifs attached to the folio + * + * Case (2) occurs when 1 << i_blkbits matches the folio + * size but the underlying filesystem or block device + * uses a smaller granularity for IO. */ - if (*bytes_submitted == folio_len) + if (*bytes_submitted == folio_len || !ifs) ctx->cur_folio = NULL; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index de89c5bef607..1508e2f54462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -267,7 +267,15 @@ restart: */ BUFFER_TRACE(bh, "queue"); get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); + if (WARN_ON_ONCE(buffer_jwrite(bh))) { + put_bh(bh); /* drop the ref we just took */ + spin_unlock(&journal->j_list_lock); + /* Clean up any previously batched buffers */ + if (batch_count) + __flush_batch(journal, &batch_count); + jbd2_journal_abort(journal, -EFSCORRUPTED); + return -EFSCORRUPTED; + } journal->j_chkpt_bhs[batch_count++] = bh; transaction->t_chp_stats.cs_written++; transaction->t_checkpoint_list = jh->b_cpnext; @@ -325,7 +333,10 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - J_ASSERT(blocknr != 0); + if (WARN_ON_ONCE(blocknr == 0)) { + jbd2_journal_abort(journal, -EFSCORRUPTED); + return -EFSCORRUPTED; + } /* * We need to make sure that any blocks that were recently written out diff --git a/fs/namei.c b/fs/namei.c index 58f715f7657e..9e5500dad14f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2437,8 +2437,14 @@ inside: EXPORT_SYMBOL(hashlen_string); /* - * Calculate the length and hash of the path component, and - * return the length as the result. 
+ * hash_name - Calculate the length and hash of the path component + * @nd: the path resolution state + * @name: the pathname to read the component from + * @lastword: if the component fits in a single word, set to LAST_WORD_IS_DOT + * when the component is '.', to LAST_WORD_IS_DOTDOT when it is '..', and to + * some other value otherwise; set to 0 if the component does not fit in a + * single word. + * + * Returns: a pointer to the terminating '/' or NUL character in @name. + */ static inline const char *hash_name(struct nameidata *nd, const char *name, diff --git a/fs/netfs/buffered_read.c index 88a0d801525f..a8c0d86118c5 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -171,9 +171,8 @@ static void netfs_queue_read(struct netfs_io_request *rreq, spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/fs/netfs/direct_read.c index a498ee8d6674..f72e6da88cca 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -71,9 +71,8 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/fs/netfs/direct_write.c index dd1451bf7543..f9ab69de3e29 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -111,7 +111,6 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred); subreq = stream->construct; stream->construct = NULL; - stream->front = NULL; } /* Check if (re-)preparation failed. */ @@ -186,10 +185,18 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) stream->sreq_max_segs = INT_MAX; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - stream->prepare_write(subreq); - __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_stat(&netfs_n_wh_retry_write_subreq); + if (stream->prepare_write) { + stream->prepare_write(subreq); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); + } else { + struct iov_iter source; + + netfs_reset_iter(subreq); + source = subreq->io_iter; + netfs_reissue_write(stream, subreq, &source); + } } netfs_unbuffered_write_done(wreq); diff --git a/fs/netfs/iterator.c index 72a435e5fc6d..154a14bb2d7f 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -143,6 +143,47 @@ static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, } /* + * Select the span of a kvec iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. Returns the size of the span + * in bytes.
+ */ +static size_t netfs_limit_kvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct kvec *kvecs = iter->kvec; + unsigned int nkv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_kvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nkv && skip) { + len = kvecs[ix].iov_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nkv) { + len = min3(n, kvecs[ix].iov_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + +/* * Select the span of an xarray iterator we're going to use. Limit it by both * maximum size and maximum number of segments. It is assumed that segments * can be larger than a page in size, provided they're physically contiguous. @@ -245,6 +286,8 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, return netfs_limit_bvec(iter, start_offset, max_size, max_segs); if (iov_iter_is_xarray(iter)) return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + if (iov_iter_is_kvec(iter)) + return netfs_limit_kvec(iter, start_offset, max_size, max_segs); BUG(); } EXPORT_SYMBOL(netfs_limit_iter); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 137f0e28a44c..e5f6665b3341 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -205,7 +205,8 @@ reassess: * in progress. The issuer thread may be adding stuff to the tail * whilst we're doing this. */ - front = READ_ONCE(stream->front); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); while (front) { size_t transferred; @@ -301,7 +302,6 @@ reassess: list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); - stream->front = front; spin_unlock(&rreq->lock); netfs_put_subrequest(remove, notes & ABANDON_SREQ ? 
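The netfs_limit_kvec() helper added above walks the kvec array in two phases: it first consumes whole segments covered by the skip offset, then accumulates span bytes segment by segment until either limit bites. Below is a minimal standalone sketch of that walk in userspace C; struct kvec and limit_kvec() are local re-declarations for illustration, not the kernel's types, and the sizes in main() are arbitrary:

/* Userspace sketch of the kvec span limiting above; "struct kvec" and
 * the helper are re-declared locally, not taken from kernel headers. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct kvec { void *iov_base; size_t iov_len; };

static size_t limit_kvec(const struct kvec *kvecs, unsigned int nkv,
			 size_t count, size_t skip,
			 size_t max_size, size_t max_segs)
{
	unsigned int ix = 0, nsegs = 0;
	size_t len, span = 0, n = count;

	/* Phase one: consume whole segments covered by the skip offset. */
	while (n && ix < nkv && skip) {
		len = kvecs[ix].iov_len;
		if (skip < len)
			break;
		skip -= len;
		n -= len;
		ix++;
	}

	/* Phase two: accumulate the span, segment by segment. */
	while (n && ix < nkv) {
		len = n < kvecs[ix].iov_len - skip ? n : kvecs[ix].iov_len - skip;
		if (len > max_size)
			len = max_size;
		span += len;
		nsegs++;
		ix++;
		if (span >= max_size || nsegs >= max_segs)
			break;
		skip = 0;
		n -= len;
	}

	return span < max_size ? span : max_size;
}

int main(void)
{
	struct kvec kv[3] = {
		{ NULL, 4096 }, { NULL, 4096 }, { NULL, 4096 },
	};

	/* No limit bites: the whole 12KiB is one span. */
	assert(limit_kvec(kv, 3, 12288, 0, 65536, 16) == 12288);
	/* max_size clamps the span mid-segment. */
	assert(limit_kvec(kv, 3, 12288, 0, 6000, 16) == 6000);
	/* max_segs stops the walk after two segments. */
	assert(limit_kvec(kv, 3, 12288, 0, 65536, 2) == 8192);
	/* Skip swallows the first segment and half of the second. */
	assert(limit_kvec(kv, 3, 12288, 6144, 65536, 16) == 6144);
	printf("all spans ok\n");
	return 0;
}

Note the order of operations in the second loop: the limit check sits before skip is cleared and n is decremented, so a span that overshoots max_size mid-segment is clamped only by the final min-style return.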
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 7793ba5e3e8f..cca9ac43c077 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -93,8 +93,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) from->start, from->transferred, from->len); if (test_bit(NETFS_SREQ_FAILED, &from->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) { + subreq = from; goto abandon; + } list_for_each_continue(next, &stream->subrequests) { subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); @@ -178,6 +180,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (subreq == to) break; } + subreq = NULL; continue; } diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 8e6264f62a8f..d0e23bc42445 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -107,7 +107,6 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); trace_netfs_sreq(subreq, netfs_sreq_trace_added); - stream->front = subreq; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); spin_unlock(&rreq->lock); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 83eb3dc1adf8..b194447f4b11 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -228,7 +228,8 @@ reassess_streams: if (!smp_load_acquire(&stream->active)) continue; - front = stream->front; + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); while (front) { trace_netfs_collect_sreq(wreq, front); //_debug("sreq [%x] %llx %zx/%zx", @@ -279,7 +280,6 @@ reassess_streams: list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); - stream->front = front; spin_unlock(&wreq->lock); netfs_put_subrequest(remove, notes & SAW_FAILURE ? diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 437268f65640..2db688f94125 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -206,9 +206,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq, spin_lock(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Write list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 758611ee4475..13cb60b52bd6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -1146,15 +1146,15 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return -EOVERFLOW; /* - * With metacopy disabled, we fsync after final metadata copyup, for + * With "fsync=strict", we fsync after final metadata copyup, for * both regular files and directories to get atomic copyup semantics * on filesystems that do not use strict metadata ordering (e.g. ubifs). * - * With metacopy enabled we want to avoid fsync on all meta copyup + * By default, we want to avoid fsync on all meta copyup, because * that will hurt performance of workloads such as chown -R, so we * only fsync on data copyup as legacy behavior. 
*/ - ctx.metadata_fsync = !OVL_FS(dentry->d_sb)->config.metacopy && + ctx.metadata_fsync = ovl_should_sync_metadata(OVL_FS(dentry->d_sb)) && (S_ISREG(ctx.stat.mode) || S_ISDIR(ctx.stat.mode)); ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index cad2055ebf18..63b299bf12f7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -99,6 +99,12 @@ enum { OVL_VERITY_REQUIRE, }; +enum { + OVL_FSYNC_VOLATILE, + OVL_FSYNC_AUTO, + OVL_FSYNC_STRICT, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -656,6 +662,21 @@ static inline bool ovl_xino_warn(struct ovl_fs *ofs) return ofs->config.xino == OVL_XINO_ON; } +static inline bool ovl_should_sync(struct ovl_fs *ofs) +{ + return ofs->config.fsync_mode != OVL_FSYNC_VOLATILE; +} + +static inline bool ovl_should_sync_metadata(struct ovl_fs *ofs) +{ + return ofs->config.fsync_mode == OVL_FSYNC_STRICT; +} + +static inline bool ovl_is_volatile(struct ovl_config *config) +{ + return config->fsync_mode == OVL_FSYNC_VOLATILE; +} + /* * To avoid regressions in existing setups with overlay lower offline changes, * we allow lower changes only if none of the new features are used. diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 1d4828dbcf7a..80cad4ea96a3 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -18,7 +18,7 @@ struct ovl_config { int xino; bool metacopy; bool userxattr; - bool ovl_volatile; + int fsync_mode; }; struct ovl_sb { @@ -120,11 +120,6 @@ static inline struct ovl_fs *OVL_FS(struct super_block *sb) return (struct ovl_fs *)sb->s_fs_info; } -static inline bool ovl_should_sync(struct ovl_fs *ofs) -{ - return !ofs->config.ovl_volatile; -} - static inline unsigned int ovl_numlower(struct ovl_entry *oe) { return oe ? 
oe->__numlower : 0; diff --git a/fs/overlayfs/params.c index 8111b437ae5d..c93fcaa45d4a 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -58,6 +58,7 @@ enum ovl_opt { Opt_xino, Opt_metacopy, Opt_verity, + Opt_fsync, Opt_volatile, Opt_override_creds, }; @@ -140,6 +141,23 @@ static int ovl_verity_mode_def(void) return OVL_VERITY_OFF; } +static const struct constant_table ovl_parameter_fsync[] = { + { "volatile", OVL_FSYNC_VOLATILE }, + { "auto", OVL_FSYNC_AUTO }, + { "strict", OVL_FSYNC_STRICT }, + {} +}; + +static const char *ovl_fsync_mode(struct ovl_config *config) +{ + return ovl_parameter_fsync[config->fsync_mode].name; +} + +static int ovl_fsync_mode_def(void) +{ + return OVL_FSYNC_AUTO; +} + const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), fsparam_file_or_string("lowerdir+", Opt_lowerdir_add), @@ -155,6 +173,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_enum("xino", Opt_xino, ovl_parameter_xino), fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), + fsparam_enum("fsync", Opt_fsync, ovl_parameter_fsync), fsparam_flag("volatile", Opt_volatile), fsparam_flag_no("override_creds", Opt_override_creds), {} @@ -665,8 +684,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_verity: config->verity_mode = result.uint_32; break; + case Opt_fsync: + config->fsync_mode = result.uint_32; + break; case Opt_volatile: - config->ovl_volatile = true; + config->fsync_mode = OVL_FSYNC_VOLATILE; break; case Opt_userxattr: config->userxattr = true; @@ -800,6 +822,7 @@ int ovl_init_fs_context(struct fs_context *fc) ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; + ofs->config.fsync_mode = ovl_fsync_mode_def(); fc->s_fs_info = ofs; fc->fs_private = ctx; @@ -870,9 +893,9 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->index = false; } - if (!config->upperdir && config->ovl_volatile) { + if (!config->upperdir && ovl_is_volatile(config)) { pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); - config->ovl_volatile = false; + config->fsync_mode = ovl_fsync_mode_def(); } if (!config->upperdir && config->uuid == OVL_UUID_ON) { @@ -1070,8 +1093,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy)); - if (ofs->config.ovl_volatile) - seq_puts(m, ",volatile"); + if (ofs->config.fsync_mode != ovl_fsync_mode_def()) + seq_printf(m, ",fsync=%s", ovl_fsync_mode(&ofs->config)); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); if (ofs->config.verity_mode != ovl_verity_mode_def())
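The params.c hunk above keeps ovl_parameter_fsync[] in the same order as the OVL_FSYNC_* enum, because ovl_fsync_mode() prints the option name by indexing the table with the mode value. A small userspace sketch of that correspondence follows; the constant_table struct and the lookup helper are mocked locally and are not the kernel's fs_parameter machinery:

/* Standalone sketch of the table<->enum correspondence; the kernel's
 * fsparam_enum()/constant_table machinery is mocked with plain C. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

enum { OVL_FSYNC_VOLATILE, OVL_FSYNC_AUTO, OVL_FSYNC_STRICT };

struct constant_table { const char *name; int value; };

/* Table order matches the enum, so a name lookup by value is an index. */
static const struct constant_table ovl_parameter_fsync[] = {
	{ "volatile", OVL_FSYNC_VOLATILE },
	{ "auto",     OVL_FSYNC_AUTO },
	{ "strict",   OVL_FSYNC_STRICT },
	{ NULL, 0 }
};

static int lookup_constant(const struct constant_table *tbl, const char *name)
{
	for (; tbl->name; tbl++)
		if (!strcmp(tbl->name, name))
			return tbl->value;
	return -1;
}

int main(void)
{
	/* Parsing maps option strings to modes by scanning the table... */
	assert(lookup_constant(ovl_parameter_fsync, "strict") == OVL_FSYNC_STRICT);
	/* ...while the ovl_fsync_mode()-style reverse direction is a bare
	 * array index, safe only while table order tracks the enum. */
	assert(!strcmp(ovl_parameter_fsync[OVL_FSYNC_AUTO].name, "auto"));
	printf("fsync=%s\n", ovl_parameter_fsync[OVL_FSYNC_AUTO].name);
	return 0;
}

The name-to-value lookup scans the table, but the value-to-name direction is a plain array index, so adding a mode to the enum without keeping the table in step would break the string shown for the mount option.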
diff --git a/fs/overlayfs/super.c index d4c12feec039..0822987cfb51 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -776,7 +776,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, * For volatile mount, create an incompat/volatile/dirty file to keep * track of it. */ - if (ofs->config.ovl_volatile) { + if (ovl_is_volatile(&ofs->config)) { err = ovl_create_volatile_dirty(ofs); if (err < 0) { pr_err("Failed to create volatile/dirty file.\n"); diff --git a/fs/overlayfs/util.c index 3f1b763a8bb4..2ea769f311c3 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -85,7 +85,10 @@ int ovl_can_decode_fh(struct super_block *sb) if (!exportfs_can_decode_fh(sb->s_export_op)) return 0; - return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; + if (sb->s_export_op->encode_fh == generic_encode_ino32_fh) + return FILEID_INO32_GEN; + + return -1; } struct dentry *ovl_indexdir(struct super_block *sb) diff --git a/include/linux/fs/super_types.h index fa7638b81246..383050e7fdf5 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -338,5 +338,6 @@ struct super_block { #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ +#define SB_I_NO_DATA_INTEGRITY 0x00008000 /* fs cannot guarantee data persistence on sync */ #endif /* _LINUX_FS_SUPER_TYPES_H */ diff --git a/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might turn tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order(). + */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. @@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked.
- */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0fe96f3ab3ef..65c732d440d2 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -55,6 +55,7 @@ struct mempolicy { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; + struct rcu_head rcu; }; /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 72ee7d210a74..ba17ac5bf356 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -140,7 +140,6 @@ struct netfs_io_stream { void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ struct list_head subrequests; /* Contributory I/O operations */ - struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ unsigned short error; /* Aggregate error for the stream */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ec442af3f886..31a848485ad9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,7 +210,6 @@ enum mapping_flags { AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ - AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -346,16 +345,6 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline void mapping_set_no_data_integrity(struct address_space *mapping) -{ - set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - -static inline bool mapping_no_data_integrity(const struct address_space *mapping) -{ - return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 125bdc166bfe..0864700f76e0 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -769,12 +769,15 @@ TRACE_EVENT(btrfs_sync_file, ), TP_fast_assign( - const struct dentry *dentry = file->f_path.dentry; - const struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct dentry *parent = dget_parent(dentry); + struct inode *parent_inode = d_inode(parent); - TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb)); + dput(parent); + TP_fast_assign_fsid(btrfs_sb(inode->i_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent))); + __entry->parent = btrfs_ino(BTRFS_I(parent_inode)); __entry->datasync = datasync; __entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); ), diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 2d366be46a1c..cbe28211106c 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -740,19 +740,19 @@ TRACE_EVENT(netfs_collect_stream, __field(unsigned int, wreq) __field(unsigned char, stream) __field(unsigned long long, collected_to) - __field(unsigned long long, front) + __field(unsigned long long, issued_to) ), 
TP_fast_assign( __entry->wreq = wreq->debug_id; __entry->stream = stream->stream_nr; __entry->collected_to = stream->collected_to; - __entry->front = stream->front ? stream->front->start : UINT_MAX; + __entry->issued_to = atomic64_read(&wreq->issued_to); ), - TP_printk("R=%08x[%x:] cto=%llx frn=%llx", + TP_printk("R=%08x[%x:] cto=%llx ito=%llx", __entry->wreq, __entry->stream, - __entry->collected_to, __entry->front) + __entry->collected_to, __entry->issued_to) ); TRACE_EVENT(netfs_folioq, diff --git a/kernel/futex/core.c index cf7e610eac42..31e83a09789e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) if (!vma) return FUTEX_NO_NODE; - mpol = vma_policy(vma); + mpol = READ_ONCE(vma->vm_policy); if (!mpol) return FUTEX_NO_NODE; diff --git a/kernel/futex/pi.c index bc1f7e83a37e..7808068fa59e 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -918,7 +918,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct task_struct *exiting = NULL; + struct task_struct *exiting; struct rt_mutex_waiter rt_waiter; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); @@ -933,6 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: + exiting = NULL; ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/futex/syscalls.c index 743c7a728237..77ad9691f6a6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue, if (ret) return ret; + /* + * For now, mandate that both flags are identical, as the sys_futex() + * interface does. If/when we merge the variable sized futex support, + * that patch can modify this test to allow a difference in size.
+ */ + if (futexes[0].w.flags != futexes[1].w.flags) + return -EINVAL; + cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, diff --git a/kernel/time/alarmtimer.c index 069d93bfb0c7..b64db405ba5c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -540,7 +540,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, now, timr->it_interval); } /** diff --git a/kernel/trace/trace_events_trigger.c index d5230b759a2d..655db2e82513 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread; static struct llist_head trigger_data_free_list; static DEFINE_MUTEX(trigger_data_kthread_mutex); +static int trigger_kthread_fn(void *ignore); + +static void trigger_create_kthread_locked(void) +{ + lockdep_assert_held(&trigger_data_kthread_mutex); + + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } +} + +static void trigger_data_free_queued_locked(void) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + lockdep_assert_held(&trigger_data_kthread_mutex); + + llnodes = llist_del_all(&trigger_data_free_list); + if (!llnodes) + return; + + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); +} + /* Bulk garbage collection of event_trigger_data elements */ static int trigger_kthread_fn(void *ignore) { @@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); + /* + * Boot-time trigger registration can fail before kthread creation is + * available. Keep the deferred-free semantics during boot and let late + * init start the kthread to drain the list. + */ + if (system_state == SYSTEM_BOOTING && !trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + return; + } + if (unlikely(!trigger_kthread)) { guard(mutex)(&trigger_data_kthread_mutex); + + trigger_create_kthread_locked(); /* Check again after taking mutex */ if (!trigger_kthread) { - struct task_struct *kthread; - - kthread = kthread_create(trigger_kthread_fn, NULL, - "trigger_data_free"); - if (!IS_ERR(kthread)) - WRITE_ONCE(trigger_kthread, kthread); + llist_add(&data->llist, &trigger_data_free_list); + /* Drain the queued frees synchronously if creation failed.
*/ + trigger_data_free_queued_locked(); + return; } } - if (!trigger_kthread) { - /* Do it the slow way */ - tracepoint_synchronize_unregister(); - kfree(data); - return; - } - llist_add(&data->llist, &trigger_data_free_list); wake_up_process(trigger_kthread); } +static int __init trigger_data_free_init(void) +{ + guard(mutex)(&trigger_data_kthread_mutex); + + if (llist_empty(&trigger_data_free_list)) + return 0; + + trigger_create_kthread_locked(); + if (trigger_kthread) + wake_up_process(trigger_kthread); + else + trigger_data_free_queued_locked(); + + return 0; +} +late_initcall(trigger_data_free_init); + static inline void data_ops_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index dee610e465b9..be6cf0bb3c03 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) if (!osnoise_has_registered_instances()) return; - guard(mutex)(&interface_lock); guard(cpus_read_lock)(); + guard(mutex)(&interface_lock); if (!cpu_online(cpu)) return; @@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * avoid CPU hotplug operations that might read options. */ cpus_read_lock(); + mutex_lock(&interface_lock); retval = cnt; @@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, clear_bit(option, &osnoise_options); } - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * osnoise_cpumask is read by CPU hotplug operations. 
*/ cpus_read_lock(); + mutex_lock(&interface_lock); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); diff --git a/lib/bug.c b/lib/bug.c index 623c467a8b76..aab9e6a40c5f 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -173,10 +173,8 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); } -__diag_push(); -__diag_ignore(GCC, all, "-Wsuggest-attribute=format", - "Not a valid __printf() conversion candidate."); -static void __warn_printf(const char *fmt, struct pt_regs *regs) +static __printf(1, 0) +void __warn_printf(const char *fmt, struct pt_regs *regs) { if (!fmt) return; @@ -195,7 +193,6 @@ static void __warn_printf(const char *fmt, struct pt_regs *regs) printk("%s", fmt); } -__diag_pop(); static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs) { diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 576d1ddd736b..6a44a2f3d8fc 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1524,8 +1524,10 @@ static int damon_sysfs_commit_input(void *data) if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); - if (!test_ctx) + if (!test_ctx) { + damon_destroy_ctx(param_ctx); return -ENOMEM; + } err = damon_commit_ctx(test_ctx, param_ctx); if (err) goto out; @@ -1618,9 +1620,12 @@ static int damon_sysfs_repeat_call_fn(void *data) if (!mutex_trylock(&damon_sysfs_lock)) return 0; + if (sysfs_kdamond->contexts->nr != 1) + goto out; damon_sysfs_upd_tuned_intervals(sysfs_kdamond); damon_sysfs_upd_schemes_stats(sysfs_kdamond); damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); +out: mutex_unlock(&damon_sysfs_lock); return 0; } @@ -1747,6 +1752,9 @@ static int damon_sysfs_update_schemes_tried_regions( static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + if (cmd != DAMON_SYSFS_CMD_OFF && kdamond->contexts->nr != 1) + return -EINVAL; + switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..c65e82c86fed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,11 +6815,16 @@ retry: pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); - if (pud_none(pud)) + if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); - if (!unlikely(pud_leaf(pud))) { + pud = pudp_get(pudp); + + if (unlikely(!pud_present(pud))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pud_leaf(pud))) { spin_unlock(lock); goto retry; } @@ -6831,9 +6836,16 @@ retry: pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + goto out; if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); - if (!unlikely(pmd_leaf(pmd))) { + pmd = pmdp_get(pmdp); + + if (unlikely(!pmd_present(pmd))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e5175f1c767..cf92bd6a8226 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -487,7 +487,13 @@ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, pol); + /* + * Required to allow mmap_lock_speculative*() access, see for example + * futex_key_to_node_opt(). 
All accesses are serialized by mmap_lock, + * however the speculative lock section is not bound by the normal lock + * boundaries, so RCU freeing is required. + */ + kfree_rcu(pol, rcu); } EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); @@ -1020,7 +1026,7 @@ static int vma_replace_policy(struct vm_area_struct *vma, } old = vma->vm_policy; - vma->vm_policy = new; /* protected by mmap_lock */ + WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */ mpol_put(old); return 0; diff --git a/mm/mseal.c index 316b5e1dec78..ac58643181f7 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -56,7 +56,6 @@ static int mseal_apply(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma, *prev; - unsigned long curr_start = start; VMA_ITERATOR(vmi, mm, start); /* We know there are no gaps so this will be non-NULL. */ @@ -66,6 +65,7 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { + const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { @@ -79,7 +79,6 @@ static int mseal_apply(struct mm_struct *mm, } prev = vma; - curr_start = curr_end; } return 0; diff --git a/mm/pagewalk.c index a94c401ab2cf..4e7bcd975c54 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -97,6 +97,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pud_t pudval = pudp_get(pud); pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; @@ -105,6 +106,24 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, int err = 0; int depth = real_depth(3); + /* + * For PTE handling, pte_offset_map_lock() takes care of checking + * whether there actually is a page table. But it also has to be + * very careful about concurrent page table reclaim. + * + * Similarly, we have to be careful here - a PUD entry that points + * to a PMD table cannot go away, so we can just walk it. But if + * it's something else, we may have raced with a concurrent change, + * so we need to retry. + * + * A pertinent example of this is a PUD refault after PUD split - + * we will need to split again or risk accessing invalid memory. + */ + if (!pud_present(pudval) || pud_leaf(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + pmd = pmd_offset(pud, addr); do { again: @@ -218,12 +237,12 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do.
*/ - if (pud_none(*pud)) - goto again; - err = walk_pmd_range(pud, addr, next, walk); if (err) break; + + if (walk->action == ACTION_AGAIN) + goto again; } while (pud++, addr = next, addr != end); return err; diff --git a/mm/swap_state.c index 6d0eef7470be..48aff2c917c0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -494,6 +494,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, __folio_set_locked(folio); __folio_set_swapbacked(folio); + + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) + goto failed; + for (;;) { ret = swap_cache_add_folio(folio, entry, &shadow); if (!ret) break; @@ -514,11 +518,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, goto failed; } - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - swap_cache_del_folio(folio); - goto failed; - } - memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 7aec3ae82a44..c6dafb3cc116 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1020,7 +1020,7 @@ FIXTURE_SETUP(mount_setattr_idmapped) "size=100000,mode=700"), 0); ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV, - "size=2m,mode=700"), 0); + "size=256m,mode=700"), 0); ASSERT_EQ(mkdir("/mnt/A", 0777), 0);
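The fs/iomap/bio.c hunk earlier in this diff defers failed read completions to process context: bios are queued on a spinlock-protected list, and the work item is scheduled only when the list goes from empty to non-empty, since a scheduled worker drains whatever has accumulated. Below is a runnable pthreads sketch of that idiom; every type and name in it is illustrative rather than kernel API, and a LIFO push keeps the sketch short where the kernel's bio_list is FIFO:

/* Userspace sketch of the "queue under a lock, kick the worker only on
 * the empty->non-empty transition" idiom of iomap_fail_buffered_read(). */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static struct item *list;
static int done;

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || list) {
		/* Splice the whole list out, like bio_list_merge_init(). */
		struct item *batch = list;
		list = NULL;
		pthread_mutex_unlock(&lock);
		while (batch) {
			struct item *it = batch;
			batch = it->next;
			printf("completing failed read %d in process context\n",
			       it->id);
			free(it);
		}
		pthread_mutex_lock(&lock);
		if (!list && !done)
			pthread_cond_wait(&kick, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void defer_failure(int id)
{
	struct item *it = malloc(sizeof(*it));

	it->id = id;
	pthread_mutex_lock(&lock);
	/* Only the empty->non-empty transition needs to wake the worker;
	 * later additions are drained by the batch already scheduled. */
	if (!list)
		pthread_cond_signal(&kick);
	it->next = list;
	list = it;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		defer_failure(i);
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}

As in the kernel version, the consumer splices the whole list out under the lock and completes the batch outside it, so producers are never blocked behind the drain.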

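Several netfs hunks in this diff delete the cached stream->front pointer and read the head of stream->subrequests with list_first_entry_or_null() instead, so there is no second copy of the truth to keep in sync when subrequests are retired. A self-contained sketch of that design choice follows, using a minimal local list rather than <linux/list.h>:

/* Sketch of replacing a cached "front" pointer with a lookup of the
 * first list entry; the tiny list here mimics <linux/list.h> locally. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

struct subrequest {
	struct list_head link;
	long long start;
};

/* One accessor instead of a second copy of the truth: the front of the
 * stream is simply the first queued subrequest, or NULL when empty. */
static struct subrequest *stream_front(struct list_head *subrequests)
{
	if (subrequests->next == subrequests)
		return NULL;
	return container_of(subrequests->next, struct subrequest, link);
}

int main(void)
{
	struct list_head subrequests = LIST_HEAD_INIT(subrequests);
	struct subrequest a = { .start = 0 }, b = { .start = 4096 };

	assert(stream_front(&subrequests) == NULL);
	list_add_tail(&a.link, &subrequests);
	list_add_tail(&b.link, &subrequests);
	assert(stream_front(&subrequests) == &a);
	/* Collection retires the front; no cached pointer to resync. */
	list_del(&a.link);
	assert(stream_front(&subrequests) == &b);
	printf("front tracks the list\n");
	return 0;
}

After the list_del(), the front is correct by construction; the "stream->front = front" resynchronization removed by the hunks above is exactly the bookkeeping this avoids.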