summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs22
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt38
-rw-r--r--Documentation/core-api/mm-api.rst1
-rw-r--r--Documentation/devicetree/bindings/i2c/apple,i2c.yaml5
-rw-r--r--Documentation/filesystems/f2fs.rst6
-rw-r--r--arch/arm/include/asm/cti.h160
-rw-r--r--arch/arm64/mm/mmu.c4
-rw-r--r--arch/x86/kernel/alternative.c3
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c18
-rw-r--r--arch/x86/mm/init.c24
-rw-r--r--drivers/i2c/busses/Kconfig1
-rw-r--r--drivers/i2c/busses/i2c-qcom-geni.c6
-rw-r--r--drivers/i2c/busses/i2c-stm32f7.c32
-rw-r--r--drivers/i2c/busses/i2c-tegra.c64
-rw-r--r--drivers/i2c/i2c-core-acpi.c1
-rw-r--r--drivers/i2c/muxes/i2c-mux-mule.c3
-rw-r--r--fs/exfat/dir.c12
-rw-r--r--fs/exfat/fatent.c10
-rw-r--r--fs/exfat/file.c5
-rw-r--r--fs/exfat/namei.c5
-rw-r--r--fs/exfat/super.c32
-rw-r--r--fs/f2fs/checkpoint.c8
-rw-r--r--fs/f2fs/compress.c120
-rw-r--r--fs/f2fs/data.c183
-rw-r--r--fs/f2fs/debug.c21
-rw-r--r--fs/f2fs/dir.c4
-rw-r--r--fs/f2fs/extent_cache.c10
-rw-r--r--fs/f2fs/f2fs.h151
-rw-r--r--fs/f2fs/file.c107
-rw-r--r--fs/f2fs/gc.c54
-rw-r--r--fs/f2fs/gc.h5
-rw-r--r--fs/f2fs/inline.c20
-rw-r--r--fs/f2fs/inode.c84
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/node.c261
-rw-r--r--fs/f2fs/node.h77
-rw-r--r--fs/f2fs/recovery.c116
-rw-r--r--fs/f2fs/segment.c62
-rw-r--r--fs/f2fs/segment.h59
-rw-r--r--fs/f2fs/super.c2111
-rw-r--r--fs/f2fs/sysfs.c48
-rw-r--r--include/linux/compiler-gcc.h2
-rw-r--r--include/linux/execmem.h54
-rw-r--r--include/linux/f2fs_fs.h2
-rw-r--r--include/linux/fscrypt.h10
-rw-r--r--include/linux/io-mapping.h3
-rw-r--r--include/linux/mm.h6
-rw-r--r--include/linux/mmap_lock.h30
-rw-r--r--include/linux/page-flags.h2
-rw-r--r--include/linux/pgtable.h45
-rw-r--r--include/linux/rmap.h22
-rw-r--r--include/linux/sprintf.h2
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/module/main.c13
-rw-r--r--kernel/printk/internal.h2
-rw-r--r--kernel/printk/nbcon.c89
-rw-r--r--kernel/printk/printk.c20
-rw-r--r--lib/vsprintf.c70
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile1
-rw-r--r--mm/damon/vaddr.c4
-rw-r--r--mm/execmem.c206
-rw-r--r--mm/internal.h2
-rw-r--r--mm/io-mapping.c30
-rw-r--r--mm/kasan/common.c25
-rw-r--r--mm/khugepaged.c58
-rw-r--r--mm/madvise.c71
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/mempool.c24
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mmap_lock.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/mseal.c166
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c279
-rw-r--r--mm/slub.c5
-rw-r--r--mm/vma.c4
-rw-r--r--mm/vma.h27
-rw-r--r--tools/testing/selftests/cachestat/test_cachestat.c62
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/process_madv.c344
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh5
-rw-r--r--tools/testing/vma/vma_internal.h6
87 files changed, 3394 insertions, 2314 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index bf03263b9f46..bc0e7fefc39d 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -861,3 +861,25 @@ Description: This is a read-only entry to show the value of sb.s_encoding_flags,
SB_ENC_STRICT_MODE_FL 0x00000001
SB_ENC_NO_COMPAT_FALLBACK_FL 0x00000002
============================ ==========
+
+What: /sys/fs/f2fs/<disk>/reserved_pin_section
+Date: June 2025
+Contact: "Chao Yu" <chao@kernel.org>
+Description: This threshold is used to control triggering garbage collection while
+ fallocating on pinned file, so, it can guarantee there is enough free
+ reserved section before preallocating on pinned file.
+ By default, the value is ovp_sections, especially, for zoned ufs, the
+ value is 1.
+
+What: /sys/fs/f2fs/<disk>/gc_boost_gc_multiple
+Date: June 2025
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Set a multiplier for the background GC migration window when F2FS GC is
+ boosted. The range should be from 1 to the segment count in a section.
+ Default: 5
+
+What: /sys/fs/f2fs/<disk>/gc_boost_gc_greedy
+Date: June 2025
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Control GC algorithm for boost GC. 0: cost benefit, 1: greedy
+ Default: 1
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c1a5cd991f2e..747a55abf494 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1828,6 +1828,27 @@
backtraces on all cpus.
Format: 0 | 1
+ hash_pointers=
+ [KNL,EARLY]
+ By default, when pointers are printed to the console
+ or buffers via the %p format string, that pointer is
+ "hashed", i.e. obscured by hashing the pointer value.
+ This is a security feature that hides actual kernel
+ addresses from unprivileged users, but it also makes
+ debugging the kernel more difficult since unequal
+ pointers can no longer be compared. The choices are:
+ Format: { auto | always | never }
+ Default: auto
+
+ auto - Hash pointers unless slab_debug is enabled.
+ always - Always hash pointers (even if slab_debug is
+ enabled).
+ never - Never hash pointers. This option should only
+ be specified when debugging the kernel. Do
+ not use on production kernels. The boot
+ param "no_hash_pointers" is an alias for
+ this mode.
+
hashdist= [KNL,NUMA] Large hashes allocated during boot
are distributed across NUMA nodes. Defaults on
for 64-bit NUMA, off otherwise.
@@ -4216,18 +4237,7 @@
no_hash_pointers
[KNL,EARLY]
- Force pointers printed to the console or buffers to be
- unhashed. By default, when a pointer is printed via %p
- format string, that pointer is "hashed", i.e. obscured
- by hashing the pointer value. This is a security feature
- that hides actual kernel addresses from unprivileged
- users, but it also makes debugging the kernel more
- difficult since unequal pointers can no longer be
- compared. However, if this command-line option is
- specified, then all normal pointers will have their true
- value printed. This option should only be specified when
- debugging the kernel. Please do not use on production
- kernels.
+ Alias for "hash_pointers=never".
nohibernate [HIBERNATION] Disable hibernation and resume.
@@ -6644,6 +6654,10 @@
Documentation/admin-guide/mm/slab.rst.
(slub_debug legacy name also accepted for now)
+ Using this option implies the "no_hash_pointers"
+ option which can be undone by adding the
+ "hash_pointers=always" option.
+
slab_max_order= [MM]
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 50cfc7842930..5063179cfc70 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -133,4 +133,3 @@ More Memory Management Functions
.. kernel-doc:: mm/mmu_notifier.c
.. kernel-doc:: mm/balloon_compaction.c
.. kernel-doc:: mm/huge_memory.c
-.. kernel-doc:: mm/io-mapping.c
diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
index 077d2a539c83..fed3e1b8c43f 100644
--- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
@@ -22,6 +22,11 @@ properties:
compatible:
items:
- enum:
+ - apple,s5l8960x-i2c
+ - apple,t7000-i2c
+ - apple,s8000-i2c
+ - apple,t8010-i2c
+ - apple,t8015-i2c
- apple,t8103-i2c
- apple,t8112-i2c
- apple,t6000-i2c
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 8eeb7ea14f61..e5bb89452aff 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -238,9 +238,9 @@ usrjquota=<file> Appoint specified file and type during mount, so that quota
grpjquota=<file> information can be properly updated during recovery flow,
prjjquota=<file> <quota file>: must be in root directory;
jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
-offusrjquota Turn off user journalled quota.
-offgrpjquota Turn off group journalled quota.
-offprjjquota Turn off project journalled quota.
+usrjquota= Turn off user journalled quota.
+grpjquota= Turn off group journalled quota.
+prjjquota= Turn off project journalled quota.
quota Enable plain user disk quota accounting.
noquota Disable all plain disk quota option.
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
diff --git a/arch/arm/include/asm/cti.h b/arch/arm/include/asm/cti.h
deleted file mode 100644
index f8500e5d6ea8..000000000000
--- a/arch/arm/include/asm/cti.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASMARM_CTI_H
-#define __ASMARM_CTI_H
-
-#include <asm/io.h>
-#include <asm/hardware/coresight.h>
-
-/* The registers' definition is from section 3.2 of
- * Embedded Cross Trigger Revision: r0p0
- */
-#define CTICONTROL 0x000
-#define CTISTATUS 0x004
-#define CTILOCK 0x008
-#define CTIPROTECTION 0x00C
-#define CTIINTACK 0x010
-#define CTIAPPSET 0x014
-#define CTIAPPCLEAR 0x018
-#define CTIAPPPULSE 0x01c
-#define CTIINEN 0x020
-#define CTIOUTEN 0x0A0
-#define CTITRIGINSTATUS 0x130
-#define CTITRIGOUTSTATUS 0x134
-#define CTICHINSTATUS 0x138
-#define CTICHOUTSTATUS 0x13c
-#define CTIPERIPHID0 0xFE0
-#define CTIPERIPHID1 0xFE4
-#define CTIPERIPHID2 0xFE8
-#define CTIPERIPHID3 0xFEC
-#define CTIPCELLID0 0xFF0
-#define CTIPCELLID1 0xFF4
-#define CTIPCELLID2 0xFF8
-#define CTIPCELLID3 0xFFC
-
-/* The below are from section 3.6.4 of
- * CoreSight v1.0 Architecture Specification
- */
-#define LOCKACCESS 0xFB0
-#define LOCKSTATUS 0xFB4
-
-/**
- * struct cti - cross trigger interface struct
- * @base: mapped virtual address for the cti base
- * @irq: irq number for the cti
- * @trig_out_for_irq: triger out number which will cause
- * the @irq happen
- *
- * cti struct used to operate cti registers.
- */
-struct cti {
- void __iomem *base;
- int irq;
- int trig_out_for_irq;
-};
-
-/**
- * cti_init - initialize the cti instance
- * @cti: cti instance
- * @base: mapped virtual address for the cti base
- * @irq: irq number for the cti
- * @trig_out: triger out number which will cause
- * the @irq happen
- *
- * called by machine code to pass the board dependent
- * @base, @irq and @trig_out to cti.
- */
-static inline void cti_init(struct cti *cti,
- void __iomem *base, int irq, int trig_out)
-{
- cti->base = base;
- cti->irq = irq;
- cti->trig_out_for_irq = trig_out;
-}
-
-/**
- * cti_map_trigger - use the @chan to map @trig_in to @trig_out
- * @cti: cti instance
- * @trig_in: trigger in number
- * @trig_out: trigger out number
- * @channel: channel number
- *
- * This function maps one trigger in of @trig_in to one trigger
- * out of @trig_out using the channel @chan.
- */
-static inline void cti_map_trigger(struct cti *cti,
- int trig_in, int trig_out, int chan)
-{
- void __iomem *base = cti->base;
- unsigned long val;
-
- val = __raw_readl(base + CTIINEN + trig_in * 4);
- val |= BIT(chan);
- __raw_writel(val, base + CTIINEN + trig_in * 4);
-
- val = __raw_readl(base + CTIOUTEN + trig_out * 4);
- val |= BIT(chan);
- __raw_writel(val, base + CTIOUTEN + trig_out * 4);
-}
-
-/**
- * cti_enable - enable the cti module
- * @cti: cti instance
- *
- * enable the cti module
- */
-static inline void cti_enable(struct cti *cti)
-{
- __raw_writel(0x1, cti->base + CTICONTROL);
-}
-
-/**
- * cti_disable - disable the cti module
- * @cti: cti instance
- *
- * enable the cti module
- */
-static inline void cti_disable(struct cti *cti)
-{
- __raw_writel(0, cti->base + CTICONTROL);
-}
-
-/**
- * cti_irq_ack - clear the cti irq
- * @cti: cti instance
- *
- * clear the cti irq
- */
-static inline void cti_irq_ack(struct cti *cti)
-{
- void __iomem *base = cti->base;
- unsigned long val;
-
- val = __raw_readl(base + CTIINTACK);
- val |= BIT(cti->trig_out_for_irq);
- __raw_writel(val, base + CTIINTACK);
-}
-
-/**
- * cti_unlock - unlock cti module
- * @cti: cti instance
- *
- * unlock the cti module, or else any writes to the cti
- * module is not allowed.
- */
-static inline void cti_unlock(struct cti *cti)
-{
- __raw_writel(CS_LAR_KEY, cti->base + LOCKACCESS);
-}
-
-/**
- * cti_lock - lock cti module
- * @cti: cti instance
- *
- * lock the cti module, so any writes to the cti
- * module will be not allowed.
- */
-static inline void cti_lock(struct cti *cti)
-{
- __raw_writel(~CS_LAR_KEY, cti->base + LOCKACCESS);
-}
-#endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index abd9725796e9..34e5d78af076 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -721,7 +721,7 @@ void mark_rodata_ro(void)
static void __init declare_vma(struct vm_struct *vma,
void *va_start, void *va_end,
- vm_flags_t vm_flags)
+ unsigned long vm_flags)
{
phys_addr_t pa_start = __pa_symbol(va_start);
unsigned long size = va_end - va_start;
@@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init);
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
- pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
+ pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9f6b7dab2d9a..7bde68247b5f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -120,7 +120,7 @@ struct its_array its_pages;
static void *__its_alloc(struct its_array *pages)
{
- void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
if (!page)
return NULL;
@@ -237,7 +237,6 @@ static void *its_alloc(void)
if (!page)
return NULL;
- execmem_make_temp_rw(page, PAGE_SIZE);
if (pages == &its_pages)
set_memory_x((unsigned long)page, 1);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 252e82bcfd2f..4450acec9390 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command)
static inline void *alloc_tramp(unsigned long size)
{
- return execmem_alloc(EXECMEM_FTRACE, size);
+ return execmem_alloc_rw(EXECMEM_FTRACE, size);
}
static inline void tramp_free(void *tramp)
{
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 47cb8eb138ba..6079d15dab8c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
return len;
}
-/* Make page to RO mode when allocate it */
-void *alloc_insn_page(void)
-{
- void *page;
-
- page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
- if (!page)
- return NULL;
-
- /*
- * TODO: Once additional kernel code protection mechanisms are set, ensure
- * that the page was not maliciously altered and it is still zeroed.
- */
- set_memory_rox((unsigned long)page, 1);
-
- return page;
-}
-
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7456df985d96..bb57e93b4caf 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void)
static struct execmem_info execmem_info __ro_after_init;
#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
+void execmem_fill_trapping_insns(void *ptr, size_t size)
{
- /* fill memory with INT3 instructions */
- if (writeable)
- memset(ptr, INT3_INSN_OPCODE, size);
- else
- text_poke_set(ptr, INT3_INSN_OPCODE, size);
+ memset(ptr, INT3_INSN_OPCODE, size);
}
#endif
@@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void)
.pgprot = pgprot,
.alignment = MODULE_ALIGN,
},
- [EXECMEM_KPROBES ... EXECMEM_BPF] = {
+ [EXECMEM_KPROBES] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL_ROX,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_FTRACE] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_BPF] = {
.flags = EXECMEM_KASAN_SHADOW,
.start = start,
.end = MODULES_END,
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index c8d115b58e44..070d014fdc5d 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -992,7 +992,6 @@ config I2C_APPLE
tristate "Apple SMBus platform driver"
depends on !I2C_PASEMI
depends on ARCH_APPLE || COMPILE_TEST
- default ARCH_APPLE
help
Say Y here if you want to use the I2C controller present on Apple
Silicon chips such as the M1.
diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c
index 13889f52b6f7..ff2289b52c84 100644
--- a/drivers/i2c/busses/i2c-qcom-geni.c
+++ b/drivers/i2c/busses/i2c-qcom-geni.c
@@ -155,9 +155,9 @@ static const struct geni_i2c_clk_fld geni_i2c_clk_map_19p2mhz[] = {
/* source_clock = 32 MHz */
static const struct geni_i2c_clk_fld geni_i2c_clk_map_32mhz[] = {
- { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 40 },
- { I2C_MAX_FAST_MODE_FREQ, 4, 3, 11, 20 },
- { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 6, 15 },
+ { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 38 },
+ { I2C_MAX_FAST_MODE_FREQ, 4, 3, 9, 19 },
+ { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 5, 15 },
{}
};
diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c
index c8b4b404f6c1..e6815f6cae78 100644
--- a/drivers/i2c/busses/i2c-stm32f7.c
+++ b/drivers/i2c/busses/i2c-stm32f7.c
@@ -742,11 +742,14 @@ static void stm32f7_i2c_dma_callback(void *arg)
{
struct stm32f7_i2c_dev *i2c_dev = arg;
struct stm32_i2c_dma *dma = i2c_dev->dma;
+ struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg;
stm32f7_i2c_disable_dma_req(i2c_dev);
dmaengine_terminate_async(dma->chan_using);
dma_unmap_single(i2c_dev->dev, dma->dma_buf, dma->dma_len,
dma->dma_data_dir);
+ if (!f7_msg->smbus)
+ i2c_put_dma_safe_msg_buf(f7_msg->buf, i2c_dev->msg, true);
complete(&dma->dma_complete);
}
@@ -882,6 +885,7 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev,
{
struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg;
void __iomem *base = i2c_dev->base;
+ u8 *dma_buf;
u32 cr1, cr2;
int ret;
@@ -931,17 +935,23 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev,
/* Configure DMA or enable RX/TX interrupt */
i2c_dev->use_dma = false;
- if (i2c_dev->dma && f7_msg->count >= STM32F7_I2C_DMA_LEN_MIN
- && !i2c_dev->atomic) {
- ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma,
- msg->flags & I2C_M_RD,
- f7_msg->count, f7_msg->buf,
- stm32f7_i2c_dma_callback,
- i2c_dev);
- if (!ret)
- i2c_dev->use_dma = true;
- else
- dev_warn(i2c_dev->dev, "can't use DMA\n");
+ if (i2c_dev->dma && !i2c_dev->atomic) {
+ dma_buf = i2c_get_dma_safe_msg_buf(msg, STM32F7_I2C_DMA_LEN_MIN);
+ if (dma_buf) {
+ f7_msg->buf = dma_buf;
+ ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma,
+ msg->flags & I2C_M_RD,
+ f7_msg->count, f7_msg->buf,
+ stm32f7_i2c_dma_callback,
+ i2c_dev);
+ if (ret) {
+ dev_warn(i2c_dev->dev, "can't use DMA\n");
+ i2c_put_dma_safe_msg_buf(f7_msg->buf, msg, false);
+ f7_msg->buf = msg->buf;
+ } else {
+ i2c_dev->use_dma = true;
+ }
+ }
}
if (!i2c_dev->use_dma) {
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index 4f05afab161f..4eb31b913c1a 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -134,6 +134,8 @@
#define I2C_MST_FIFO_STATUS_TX GENMASK(23, 16)
#define I2C_MST_FIFO_STATUS_RX GENMASK(7, 0)
+#define I2C_MASTER_RESET_CNTRL 0x0a8
+
/* configuration load timeout in microseconds */
#define I2C_CONFIG_LOAD_TIMEOUT 1000000
@@ -184,6 +186,9 @@ enum msg_end_type {
* @has_mst_fifo: The I2C controller contains the new MST FIFO interface that
* provides additional features and allows for longer messages to
* be transferred in one go.
+ * @has_mst_reset: The I2C controller contains MASTER_RESET_CTRL register which
+ * provides an alternative to controller reset when configured as
+ * I2C master
* @quirks: I2C adapter quirks for limiting write/read transfer size and not
* allowing 0 length transfers.
* @supports_bus_clear: Bus Clear support to recover from bus hang during
@@ -213,6 +218,7 @@ struct tegra_i2c_hw_feature {
bool has_multi_master_mode;
bool has_slcg_override_reg;
bool has_mst_fifo;
+ bool has_mst_reset;
const struct i2c_adapter_quirks *quirks;
bool supports_bus_clear;
bool has_apb_dma;
@@ -605,6 +611,26 @@ static int tegra_i2c_wait_for_config_load(struct tegra_i2c_dev *i2c_dev)
return 0;
}
+static int tegra_i2c_master_reset(struct tegra_i2c_dev *i2c_dev)
+{
+ if (!i2c_dev->hw->has_mst_reset)
+ return -EOPNOTSUPP;
+
+ /*
+ * Writing 1 to I2C_MASTER_RESET_CNTRL will reset all internal state of
+ * Master logic including FIFOs. Clear this bit to 0 for normal operation.
+ * SW needs to wait for 2us after assertion and de-assertion of this soft
+ * reset.
+ */
+ i2c_writel(i2c_dev, 0x1, I2C_MASTER_RESET_CNTRL);
+ fsleep(2);
+
+ i2c_writel(i2c_dev, 0x0, I2C_MASTER_RESET_CNTRL);
+ fsleep(2);
+
+ return 0;
+}
+
static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
{
u32 val, clk_divisor, clk_multiplier, tsu_thd, tlow, thigh, non_hs_mode;
@@ -612,6 +638,16 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
int err;
/*
+ * Reset the controller before initializing it.
+ * In case if device_reset() returns -ENOENT, i.e. when the reset is
+ * not available, the internal software reset will be used if it is
+ * supported by the controller.
+ */
+ err = device_reset(i2c_dev->dev);
+ if (err == -ENOENT)
+ err = tegra_i2c_master_reset(i2c_dev);
+
+ /*
* The reset shouldn't ever fail in practice. The failure will be a
* sign of a severe problem that needs to be resolved. Still we don't
* want to fail the initialization completely because this may break
@@ -619,7 +655,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
* emit a noisy warning on error, which won't stay unnoticed and
* won't hose machine entirely.
*/
- err = device_reset(i2c_dev->dev);
WARN_ON_ONCE(err);
if (IS_DVC(i2c_dev))
@@ -1266,17 +1301,9 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (i2c_dev->dma_mode) {
if (i2c_dev->msg_read) {
- dma_sync_single_for_device(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_FROM_DEVICE);
-
err = tegra_i2c_dma_submit(i2c_dev, xfer_size);
if (err)
return err;
- } else {
- dma_sync_single_for_cpu(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_TO_DEVICE);
}
}
@@ -1286,11 +1313,6 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (i2c_dev->dma_mode) {
memcpy(i2c_dev->dma_buf + I2C_PACKET_HEADER_SIZE,
msg->buf, i2c_dev->msg_len);
-
- dma_sync_single_for_device(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_TO_DEVICE);
-
err = tegra_i2c_dma_submit(i2c_dev, xfer_size);
if (err)
return err;
@@ -1331,13 +1353,8 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
return -ETIMEDOUT;
}
- if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE) {
- dma_sync_single_for_cpu(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_FROM_DEVICE);
-
+ if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE)
memcpy(i2c_dev->msg_buf, i2c_dev->dma_buf, i2c_dev->msg_len);
- }
}
time_left = tegra_i2c_wait_completion(i2c_dev, &i2c_dev->msg_complete,
@@ -1468,6 +1485,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = false,
.has_apb_dma = true,
@@ -1492,6 +1510,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = false,
.has_apb_dma = true,
@@ -1516,6 +1535,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1540,6 +1560,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1564,6 +1585,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1588,6 +1610,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = false,
@@ -1612,6 +1635,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = {
.has_multi_master_mode = true,
.has_slcg_override_reg = true,
.has_mst_fifo = true,
+ .has_mst_reset = true,
.quirks = &tegra194_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = false,
diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 3445cc3b476b..ed90858a27b7 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -370,6 +370,7 @@ static const struct acpi_device_id i2c_acpi_force_100khz_device_ids[] = {
* the device works without issues on Windows at what is expected to be
* a 400KHz frequency. The root cause of the issue is not known.
*/
+ { "DLL0945", 0 },
{ "ELAN06FA", 0 },
{}
};
diff --git a/drivers/i2c/muxes/i2c-mux-mule.c b/drivers/i2c/muxes/i2c-mux-mule.c
index 284ff4afeeac..d3b32b794172 100644
--- a/drivers/i2c/muxes/i2c-mux-mule.c
+++ b/drivers/i2c/muxes/i2c-mux-mule.c
@@ -47,7 +47,6 @@ static int mule_i2c_mux_probe(struct platform_device *pdev)
struct mule_i2c_reg_mux *priv;
struct i2c_client *client;
struct i2c_mux_core *muxc;
- struct device_node *dev;
unsigned int readback;
int ndev, ret;
bool old_fw;
@@ -95,7 +94,7 @@ static int mule_i2c_mux_probe(struct platform_device *pdev)
"Failed to register mux remove\n");
/* Create device adapters */
- for_each_child_of_node(mux_dev->of_node, dev) {
+ for_each_child_of_node_scoped(mux_dev->of_node, dev) {
u32 reg;
ret = of_property_read_u32(dev, "reg", &reg);
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 3103b932b674..ee060e26f51d 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -996,6 +996,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int num_entries = exfat_calc_num_entries(p_uniname);
+ unsigned int clu_count = 0;
if (num_entries < 0)
return num_entries;
@@ -1133,6 +1134,10 @@ rewind:
} else {
if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ goto not_found;
}
}
@@ -1195,6 +1200,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
int i, count = 0;
int dentries_per_clu;
unsigned int entry_type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -1227,6 +1233,12 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ if (unlikely(++clu_count > sbi->used_clusters)) {
+ exfat_fs_error(sb, "FAT or bitmap is corrupted");
+ return -EIO;
+ }
+
}
}
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 23065f948ae7..232cc7f8ab92 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -490,5 +490,15 @@ int exfat_count_num_clusters(struct super_block *sb,
}
*ret_count = count;
+
+ /*
+ * since exfat_count_used_clusters() is not called, sbi->used_clusters
+ * cannot be used here.
+ */
+ if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) {
+ exfat_fs_error(sb, "The cluster chain has a loop");
+ return -EIO;
+ }
+
return 0;
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 6b82497572b4..538d2b6ac2ec 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -622,9 +622,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (pos > valid_size)
pos = valid_size;
- if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
- ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
- iocb->ki_flags & IOCB_SYNC);
+ if (iocb->ki_pos > pos) {
+ ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos);
if (err < 0)
return err;
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index fede0283d6e2..f5f1c4e8a29f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -890,6 +890,7 @@ static int exfat_check_dir_empty(struct super_block *sb,
{
int i, dentries_per_clu;
unsigned int type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -926,6 +927,10 @@ static int exfat_check_dir_empty(struct super_block *sb,
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ break;
}
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index ea5c1334a214..8926e63f5bb7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -341,13 +341,12 @@ static void exfat_hash_init(struct super_block *sb)
INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
}
-static int exfat_read_root(struct inode *inode)
+static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct exfat_chain cdir;
- int num_subdirs, num_clu = 0;
+ int num_subdirs;
exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
ei->entry = -1;
@@ -360,12 +359,9 @@ static int exfat_read_root(struct inode *inode)
ei->hint_stat.clu = sbi->root_dir;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- i_size_write(inode, num_clu << sbi->cluster_size_bits);
+ i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi));
- num_subdirs = exfat_count_dir_entries(sb, &cdir);
+ num_subdirs = exfat_count_dir_entries(sb, root_clu);
if (num_subdirs < 0)
return -EIO;
set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR);
@@ -578,7 +574,8 @@ static int exfat_verify_boot_region(struct super_block *sb)
}
/* mount the file system volume */
-static int __exfat_fill_super(struct super_block *sb)
+static int __exfat_fill_super(struct super_block *sb,
+ struct exfat_chain *root_clu)
{
int ret;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -595,6 +592,18 @@ static int __exfat_fill_super(struct super_block *sb)
goto free_bh;
}
+ /*
+ * Call exfat_count_num_cluster() before searching for up-case and
+ * bitmap directory entries to avoid infinite loop if they are missing
+ * and the cluster chain includes a loop.
+ */
+ exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size);
+ if (ret) {
+ exfat_err(sb, "failed to count the number of clusters in root");
+ goto free_bh;
+ }
+
ret = exfat_create_upcase_table(sb);
if (ret) {
exfat_err(sb, "failed to load upcase table");
@@ -627,6 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
struct exfat_sb_info *sbi = sb->s_fs_info;
struct exfat_mount_options *opts = &sbi->options;
struct inode *root_inode;
+ struct exfat_chain root_clu;
int err;
if (opts->allow_utime == (unsigned short)-1)
@@ -645,7 +655,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
- err = __exfat_fill_super(sb);
+ err = __exfat_fill_super(sb, &root_clu);
if (err) {
exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
@@ -680,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
root_inode->i_ino = EXFAT_ROOT_INO;
inode_set_iversion(root_inode, 1);
- err = exfat_read_root(root_inode);
+ err = exfat_read_root(root_inode, &root_clu);
if (err) {
exfat_err(sb, "failed to initialize root inode");
goto put_inode;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f149ec28aefd..db3831f7f2f5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -82,7 +82,7 @@ repeat:
if (folio_test_uptodate(folio))
goto out;
- fio.page = &folio->page;
+ fio.folio = folio;
err = f2fs_submit_page_bio(&fio);
if (err) {
@@ -309,7 +309,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
continue;
}
- fio.page = &folio->page;
+ fio.folio = folio;
err = f2fs_submit_page_bio(&fio);
f2fs_folio_put(folio, err ? true : false);
@@ -485,7 +485,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
return true;
}
return false;
@@ -1045,7 +1045,7 @@ void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
}
void f2fs_remove_dirty_inode(struct inode *inode)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index b3c1df93a163..5c1f47e45dab 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -23,20 +23,18 @@
static struct kmem_cache *cic_entry_slab;
static struct kmem_cache *dic_entry_slab;
-static void *page_array_alloc(struct inode *inode, int nr)
+static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int size = sizeof(struct page *) * nr;
if (likely(size <= sbi->page_array_slab_size))
return f2fs_kmem_cache_alloc(sbi->page_array_slab,
- GFP_F2FS_ZERO, false, F2FS_I_SB(inode));
+ GFP_F2FS_ZERO, false, sbi);
return f2fs_kzalloc(sbi, size, GFP_NOFS);
}
-static void page_array_free(struct inode *inode, void *pages, int nr)
+static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int size = sizeof(struct page *) * nr;
if (!pages)
@@ -73,17 +71,15 @@ static pgoff_t start_idx_of_cluster(struct compress_ctx *cc)
return cc->cluster_idx << cc->log_cluster_size;
}
-bool f2fs_is_compressed_page(struct page *page)
+bool f2fs_is_compressed_page(struct folio *folio)
{
- if (!PagePrivate(page))
- return false;
- if (!page_private(page))
+ if (!folio->private)
return false;
- if (page_private_nonpointer(page))
+ if (folio_test_f2fs_nonpointer(folio))
return false;
- f2fs_bug_on(F2FS_P_SB(page),
- *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
+ f2fs_bug_on(F2FS_F_SB(folio),
+ *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC);
return true;
}
@@ -149,13 +145,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
if (cc->rpages)
return 0;
- cc->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size);
return cc->rpages ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse)
{
- page_array_free(cc->inode, cc->rpages, cc->cluster_size);
+ page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size);
cc->rpages = NULL;
cc->nr_rpages = 0;
cc->nr_cpages = 0;
@@ -216,13 +212,13 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic)
ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
dic->rbuf, &dic->rlen);
if (ret != LZO_E_OK) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lzo decompress failed, ret:%d", ret);
return -EIO;
}
if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lzo invalid rlen:%zu, expected:%lu",
dic->rlen, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -296,13 +292,13 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
dic->clen, dic->rlen);
if (ret < 0) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lz4 decompress failed, ret:%d", ret);
return -EIO;
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lz4 invalid ret:%d, expected:%lu",
ret, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -424,13 +420,13 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
workspace_size = zstd_dstream_workspace_bound(max_window_size);
- workspace = f2fs_vmalloc(F2FS_I_SB(dic->inode), workspace_size);
+ workspace = f2fs_vmalloc(dic->sbi, workspace_size);
if (!workspace)
return -ENOMEM;
stream = zstd_init_dstream(max_window_size, workspace, workspace_size);
if (!stream) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s zstd_init_dstream failed", __func__);
vfree(workspace);
return -EIO;
@@ -466,14 +462,14 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic)
ret = zstd_decompress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s zstd_decompress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
if (dic->rlen != outbuf.pos) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s ZSTD invalid rlen:%zu, expected:%lu",
__func__, dic->rlen,
PAGE_SIZE << dic->log_cluster_size);
@@ -622,6 +618,7 @@ static void *f2fs_vmap(struct page **pages, unsigned int count)
static int f2fs_compress_pages(struct compress_ctx *cc)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct f2fs_inode_info *fi = F2FS_I(cc->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
@@ -642,7 +639,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
cc->valid_nr_cpages = cc->nr_cpages;
- cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages);
+ cc->cpages = page_array_alloc(sbi, cc->nr_cpages);
if (!cc->cpages) {
ret = -ENOMEM;
goto destroy_compress_ctx;
@@ -716,7 +713,7 @@ out_free_cpages:
if (cc->cpages[i])
f2fs_compress_free_page(cc->cpages[i]);
}
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
destroy_compress_ctx:
if (cops->destroy_compress_ctx)
@@ -734,7 +731,7 @@ static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct f2fs_sb_info *sbi = dic->sbi;
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
@@ -796,25 +793,27 @@ out_end_io:
f2fs_decompress_end_io(dic, ret, in_task);
}
+static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+ struct folio *folio, nid_t ino, block_t blkaddr);
+
/*
* This is called when a page of a compressed cluster has been read from disk
* (or failed to be read from disk). It checks whether this page was the last
* page being waited on in the cluster, and if so, it decompresses the cluster
* (or in the case of a failure, cleans up without actually decompressing).
*/
-void f2fs_end_read_compressed_page(struct page *page, bool failed,
+void f2fs_end_read_compressed_page(struct folio *folio, bool failed,
block_t blkaddr, bool in_task)
{
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
- struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct decompress_io_ctx *dic = folio->private;
+ struct f2fs_sb_info *sbi = dic->sbi;
dec_page_count(sbi, F2FS_RD_DATA);
if (failed)
WRITE_ONCE(dic->failed, true);
else if (blkaddr && in_task)
- f2fs_cache_compressed_page(sbi, page,
+ f2fs_cache_compressed_page(sbi, folio,
dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
@@ -1340,7 +1339,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
atomic_set(&cic->pending_pages, cc->valid_nr_cpages);
- cic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ cic->rpages = page_array_alloc(sbi, cc->cluster_size);
if (!cic->rpages)
goto out_put_cic;
@@ -1420,7 +1419,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
(*submitted)++;
unlock_continue:
inode_dec_dirty_pages(cc->inode);
- unlock_page(fio.page);
+ folio_unlock(fio.folio);
}
if (fio.compr_blocks)
@@ -1442,13 +1441,13 @@ unlock_continue:
spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
f2fs_destroy_compress_ctx(cc, false);
return 0;
out_destroy_crypt:
- page_array_free(cc->inode, cic->rpages, cc->cluster_size);
+ page_array_free(sbi, cic->rpages, cc->cluster_size);
for (--i; i >= 0; i--) {
if (!cc->cpages[i])
@@ -1469,18 +1468,18 @@ out_free:
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
return -EAGAIN;
}
-void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
+void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio)
{
+ struct page *page = &folio->page;
struct f2fs_sb_info *sbi = bio->bi_private;
- struct compress_io_ctx *cic =
- (struct compress_io_ctx *)page_private(page);
- enum count_type type = WB_DATA_TYPE(page,
- f2fs_is_compressed_page(page));
+ struct compress_io_ctx *cic = folio->private;
+ enum count_type type = WB_DATA_TYPE(folio,
+ f2fs_is_compressed_page(folio));
int i;
if (unlikely(bio->bi_status != BLK_STS_OK))
@@ -1499,7 +1498,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
end_page_writeback(cic->rpages[i]);
}
- page_array_free(cic->inode, cic->rpages, cic->nr_rpages);
+ page_array_free(sbi, cic->rpages, cic->nr_rpages);
kmem_cache_free(cic_entry_slab, cic);
}
@@ -1633,14 +1632,13 @@ static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi,
static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
bool pre_alloc)
{
- const struct f2fs_compress_ops *cops =
- f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm];
int i;
- if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc))
return 0;
- dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
+ dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size);
if (!dic->tpages)
return -ENOMEM;
@@ -1670,10 +1668,9 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
bool bypass_destroy_callback, bool pre_alloc)
{
- const struct f2fs_compress_ops *cops =
- f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm];
- if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc))
return;
if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
@@ -1700,7 +1697,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
if (!dic)
return ERR_PTR(-ENOMEM);
- dic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ dic->rpages = page_array_alloc(sbi, cc->cluster_size);
if (!dic->rpages) {
kmem_cache_free(dic_entry_slab, dic);
return ERR_PTR(-ENOMEM);
@@ -1708,6 +1705,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
+ dic->sbi = sbi;
+ dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm;
atomic_set(&dic->remaining_pages, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
@@ -1721,7 +1720,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->rpages[i] = cc->rpages[i];
dic->nr_rpages = cc->cluster_size;
- dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages);
+ dic->cpages = page_array_alloc(sbi, dic->nr_cpages);
if (!dic->cpages) {
ret = -ENOMEM;
goto out_free;
@@ -1751,6 +1750,8 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
bool bypass_destroy_callback)
{
int i;
+ /* use sbi in dic to avoid UFA of dic->inode*/
+ struct f2fs_sb_info *sbi = dic->sbi;
f2fs_release_decomp_mem(dic, bypass_destroy_callback, true);
@@ -1762,7 +1763,7 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
continue;
f2fs_compress_free_page(dic->tpages[i]);
}
- page_array_free(dic->inode, dic->tpages, dic->cluster_size);
+ page_array_free(sbi, dic->tpages, dic->cluster_size);
}
if (dic->cpages) {
@@ -1771,10 +1772,10 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
continue;
f2fs_compress_free_page(dic->cpages[i]);
}
- page_array_free(dic->inode, dic->cpages, dic->nr_cpages);
+ page_array_free(sbi, dic->cpages, dic->nr_cpages);
}
- page_array_free(dic->inode, dic->rpages, dic->nr_rpages);
+ page_array_free(sbi, dic->rpages, dic->nr_rpages);
kmem_cache_free(dic_entry_slab, dic);
}
@@ -1793,8 +1794,7 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
f2fs_free_dic(dic, false);
} else {
INIT_WORK(&dic->free_work, f2fs_late_free_dic);
- queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
- &dic->free_work);
+ queue_work(dic->sbi->post_read_wq, &dic->free_work);
}
}
}
@@ -1921,8 +1921,8 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1);
}
-void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
- nid_t ino, block_t blkaddr)
+static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+ struct folio *folio, nid_t ino, block_t blkaddr)
{
struct folio *cfolio;
int ret;
@@ -1953,9 +1953,9 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
return;
}
- set_page_private_data(&cfolio->page, ino);
+ folio_set_f2fs_data(cfolio, ino);
- memcpy(folio_address(cfolio), page_address(page), PAGE_SIZE);
+ memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE);
folio_mark_uptodate(cfolio);
f2fs_folio_put(cfolio, true);
}
@@ -2012,7 +2012,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
continue;
}
- if (ino != get_page_private_data(&folio->page)) {
+ if (ino != folio_get_f2fs_data(folio)) {
folio_unlock(folio);
continue;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 711ad80b38d0..7961e0ddfca3 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -47,14 +47,14 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-bool f2fs_is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(const struct folio *folio)
{
- struct address_space *mapping = page_folio(page)->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode;
struct f2fs_sb_info *sbi;
- if (fscrypt_is_bounce_page(page))
- return page_private_gcing(fscrypt_pagecache_page(page));
+ if (fscrypt_is_bounce_folio(folio))
+ return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio));
inode = mapping->host;
sbi = F2FS_I_SB(inode);
@@ -65,7 +65,7 @@ bool f2fs_is_cp_guaranteed(struct page *page)
return true;
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
- page_private_gcing(page))
+ folio_test_f2fs_gcing(folio))
return true;
return false;
}
@@ -142,9 +142,9 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
- if (f2fs_is_compressed_page(&folio->page)) {
+ if (f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
- f2fs_end_read_compressed_page(&folio->page, true, 0,
+ f2fs_end_read_compressed_page(folio, true, 0,
in_task);
f2fs_put_folio_dic(folio, in_task);
continue;
@@ -181,14 +181,13 @@ static void f2fs_verify_bio(struct work_struct *work)
* as those were handled separately by f2fs_end_read_compressed_page().
*/
if (may_have_compressed_pages) {
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- if (!f2fs_is_compressed_page(page) &&
- !fsverity_verify_page(page)) {
+ if (!f2fs_is_compressed_page(folio) &&
+ !fsverity_verify_page(&folio->page)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
@@ -233,16 +232,15 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task)
static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
bool in_task)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
bool all_compressed = true;
block_t blkaddr = ctx->fs_blkaddr;
- bio_for_each_segment_all(bv, ctx->bio, iter_all) {
- struct page *page = bv->bv_page;
+ bio_for_each_folio_all(fi, ctx->bio) {
+ struct folio *folio = fi.folio;
- if (f2fs_is_compressed_page(page))
- f2fs_end_read_compressed_page(page, false, blkaddr,
+ if (f2fs_is_compressed_page(folio))
+ f2fs_end_read_compressed_page(folio, false, blkaddr,
in_task);
else
all_compressed = false;
@@ -280,9 +278,9 @@ static void f2fs_post_read_work(struct work_struct *work)
static void f2fs_read_end_io(struct bio *bio)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+ struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio));
struct bio_post_read_ctx *ctx;
- bool intask = in_task();
+ bool intask = in_task() && !irqs_disabled();
iostat_update_and_unbind_ctx(bio);
ctx = bio->bi_private;
@@ -339,13 +337,13 @@ static void f2fs_write_end_io(struct bio *bio)
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_is_compressed_page(&folio->page)) {
- f2fs_compress_write_end_io(bio, &folio->page);
+ if (f2fs_is_compressed_page(folio)) {
+ f2fs_compress_write_end_io(bio, folio);
continue;
}
#endif
- type = WB_DATA_TYPE(&folio->page, false);
+ type = WB_DATA_TYPE(folio, false);
if (unlikely(bio->bi_status != BLK_STS_OK)) {
mapping_set_error(folio->mapping, -EIO);
@@ -355,12 +353,12 @@ static void f2fs_write_end_io(struct bio *bio)
}
f2fs_bug_on(sbi, is_node_folio(folio) &&
- folio->index != nid_of_node(&folio->page));
+ folio->index != nid_of_node(folio));
dec_page_count(sbi, type);
if (f2fs_in_warm_node_list(sbi, folio))
f2fs_del_fsync_node_entry(sbi, folio);
- clear_page_private_gcing(&folio->page);
+ folio_clear_f2fs_gcing(folio);
folio_end_writeback(folio);
}
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
@@ -419,7 +417,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
{
unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0);
- struct folio *fio_folio = page_folio(fio->page);
unsigned int fua_flag, meta_flag, io_flag;
blk_opf_t op_flags = 0;
@@ -447,7 +444,7 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
op_flags |= REQ_FUA;
if (fio->type == DATA &&
- F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE)
+ F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE)
op_flags |= REQ_PRIO;
return op_flags;
@@ -546,14 +543,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
}
static bool __has_merged_page(struct bio *bio, struct inode *inode,
- struct page *page, nid_t ino)
+ struct folio *folio, nid_t ino)
{
struct folio_iter fi;
if (!bio)
return false;
- if (!inode && !page && !ino)
+ if (!inode && !folio && !ino)
return true;
bio_for_each_folio_all(fi, bio) {
@@ -564,7 +561,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
if (IS_ERR(target))
continue;
}
- if (f2fs_is_compressed_page(&target->page)) {
+ if (f2fs_is_compressed_page(target)) {
target = f2fs_compress_control_folio(target);
if (IS_ERR(target))
continue;
@@ -572,9 +569,9 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
if (inode && inode == target->mapping->host)
return true;
- if (page && page == &target->page)
+ if (folio && folio == target)
return true;
- if (ino && ino == ino_of_node(&target->page))
+ if (ino && ino == ino_of_node(target))
return true;
}
@@ -641,7 +638,7 @@ unlock_out:
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type, bool force)
{
enum temp_type temp;
@@ -653,7 +650,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
f2fs_down_read(&io->io_rwsem);
- ret = __has_merged_page(io->bio, inode, page, ino);
+ ret = __has_merged_page(io->bio, inode, folio, ino);
f2fs_up_read(&io->io_rwsem);
}
if (ret)
@@ -671,10 +668,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
}
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type)
{
- __submit_merged_write_cond(sbi, inode, page, ino, type, false);
+ __submit_merged_write_cond(sbi, inode, folio, ino, type, false);
}
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
@@ -691,7 +688,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
- struct folio *fio_folio = page_folio(fio->page);
+ struct folio *fio_folio = fio->folio;
struct folio *data_folio = fio->encrypted_page ?
page_folio(fio->encrypted_page) : fio_folio;
@@ -713,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false));
+ __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false));
if (is_read_io(bio_op(bio)))
f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -779,7 +776,7 @@ static void del_bio_entry(struct bio_entry *be)
static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct page *page)
{
- struct folio *fio_folio = page_folio(fio->page);
+ struct folio *fio_folio = fio->folio;
struct f2fs_sb_info *sbi = fio->sbi;
enum temp_type temp;
bool found = false;
@@ -848,7 +845,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
found = (target == be->bio);
else
found = __has_merged_page(be->bio, NULL,
- &folio->page, 0);
+ folio, 0);
if (found)
break;
}
@@ -865,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
found = (target == be->bio);
else
found = __has_merged_page(be->bio, NULL,
- &folio->page, 0);
+ folio, 0);
if (found) {
target = be->bio;
del_bio_entry(be);
@@ -886,15 +883,15 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
int f2fs_merge_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio = *fio->bio;
- struct page *page = fio->encrypted_page ?
- fio->encrypted_page : fio->page;
- struct folio *folio = page_folio(fio->page);
+ struct folio *data_folio = fio->encrypted_page ?
+ page_folio(fio->encrypted_page) : fio->folio;
+ struct folio *folio = fio->folio;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
return -EFSCORRUPTED;
- trace_f2fs_submit_folio_bio(page_folio(page), fio);
+ trace_f2fs_submit_folio_bio(data_folio, fio);
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
fio->new_blkaddr))
@@ -905,16 +902,16 @@ alloc_new:
f2fs_set_bio_crypt_ctx(bio, folio->mapping->host,
folio->index, fio, GFP_NOIO);
- add_bio_entry(fio->sbi, bio, page, fio->temp);
+ add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp);
} else {
- if (add_ipu_page(fio, &bio, page))
+ if (add_ipu_page(fio, &bio, &data_folio->page))
goto alloc_new;
}
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio));
- inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -949,7 +946,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
- struct page *bio_page;
+ struct folio *bio_folio;
enum count_type type;
f2fs_bug_on(sbi, is_read_io(fio->op));
@@ -980,44 +977,44 @@ next:
verify_fio_blkaddr(fio);
if (fio->encrypted_page)
- bio_page = fio->encrypted_page;
+ bio_folio = page_folio(fio->encrypted_page);
else if (fio->compressed_page)
- bio_page = fio->compressed_page;
+ bio_folio = page_folio(fio->compressed_page);
else
- bio_page = fio->page;
+ bio_folio = fio->folio;
/* set submitted = true as a return value */
fio->submitted = 1;
- type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+ type = WB_DATA_TYPE(bio_folio, fio->compressed_page);
inc_page_count(sbi, type);
if (io->bio &&
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
fio->new_blkaddr) ||
!f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio),
- page_folio(bio_page)->index, fio)))
+ bio_folio->index, fio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio),
- page_folio(bio_page)->index, fio, GFP_NOIO);
+ bio_folio->index, fio, GFP_NOIO);
io->fio = *fio;
}
- if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) {
__submit_merged_bio(io);
goto alloc_new;
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
- PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->folio,
+ folio_size(fio->folio));
io->last_block_in_bio = fio->new_blkaddr;
- trace_f2fs_submit_folio_write(page_folio(fio->page), fio);
+ trace_f2fs_submit_folio_write(fio->folio, fio);
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1553,10 +1550,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
unsigned int start_pgofs;
int bidx = 0;
bool is_hole;
+ bool lfs_dio_write;
if (!maxblocks)
return 0;
+ lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ map->m_may_create);
+
if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag))
goto out;
@@ -1572,8 +1573,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
end = pgofs + maxblocks;
next_dnode:
- if (map->m_may_create)
+ if (map->m_may_create) {
+ if (f2fs_lfs_mode(sbi))
+ f2fs_balance_fs(sbi, true);
f2fs_map_lock(sbi, flag);
+ }
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -1589,7 +1593,7 @@ next_dnode:
start_pgofs = pgofs;
prealloc = 0;
last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
next_block:
blkaddr = f2fs_data_blkaddr(&dn);
@@ -1603,7 +1607,7 @@ next_block:
/* use out-place-update for direct IO under LFS mode */
if (map->m_may_create && (is_hole ||
(flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
- !f2fs_is_pinned_file(inode)))) {
+ !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto sync_out;
@@ -1687,10 +1691,15 @@ next_block:
if (map->m_multidev_dio)
map->m_bdev = FDEV(bidx).bdev;
+
+ if (lfs_dio_write)
+ map->m_last_pblk = NULL_ADDR;
} else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) {
ofs++;
map->m_len++;
} else {
+ if (lfs_dio_write && !f2fs_is_pinned_file(inode))
+ map->m_last_pblk = blkaddr;
goto sync_out;
}
@@ -1715,14 +1724,6 @@ skip:
dn.ofs_in_node = end_offset;
}
- if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
- map->m_may_create) {
- /* the next block to be allocated may not be contiguous. */
- if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) ==
- CAP_BLKS_PER_SEC(sbi) - 1)
- goto sync_out;
- }
-
if (pgofs >= end)
goto sync_out;
else if (dn.ofs_in_node < end_offset)
@@ -2303,7 +2304,7 @@ submit_and_realloc:
}
if (!bio) {
- bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
+ bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i,
f2fs_ra_op_flags(rac),
folio->index, for_write);
if (IS_ERR(bio)) {
@@ -2376,6 +2377,14 @@ static int f2fs_mpage_readpages(struct inode *inode,
unsigned max_nr_pages = nr_pages;
int ret = 0;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ index = rac ? readahead_index(rac) : folio->index;
+ max_nr_pages = round_up(index + nr_pages, cc.cluster_size) -
+ round_down(index, cc.cluster_size);
+ }
+#endif
+
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
@@ -2642,7 +2651,7 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio)
int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
- struct folio *folio = page_folio(fio->page);
+ struct folio *folio = fio->folio;
struct inode *inode = folio->mapping->host;
struct dnode_of_data dn;
struct node_info ni;
@@ -2652,7 +2661,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* Use COW inode to make dnode_of_data for atomic write */
atomic_commit = f2fs_is_atomic_file(inode) &&
- page_private_atomic(folio_page(folio, 0));
+ folio_test_f2fs_atomic(folio);
if (atomic_commit)
set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
else
@@ -2683,7 +2692,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
folio_clear_uptodate(folio);
- clear_page_private_gcing(folio_page(folio, 0));
+ folio_clear_f2fs_gcing(folio);
goto out_writepage;
}
got_it:
@@ -2753,7 +2762,7 @@ got_it:
trace_f2fs_do_write_data_page(folio, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
if (atomic_commit)
- clear_page_private_atomic(folio_page(folio, 0));
+ folio_clear_f2fs_atomic(folio);
out_writepage:
f2fs_put_dnode(&dn);
out:
@@ -2771,7 +2780,6 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted,
bool allow_balance)
{
struct inode *inode = folio->mapping->host;
- struct page *page = folio_page(folio, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long)i_size)
@@ -2788,7 +2796,7 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
.old_blkaddr = NULL_ADDR,
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
.submitted = 0,
.compr_blocks = compr_blocks,
@@ -2890,7 +2898,7 @@ out:
inode_dec_dirty_pages(inode);
if (err) {
folio_clear_uptodate(folio);
- clear_page_private_gcing(page);
+ folio_clear_f2fs_gcing(folio);
}
folio_unlock(folio);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
@@ -3376,7 +3384,7 @@ restart:
f2fs_do_read_inline_data(folio, ifolio);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
- set_page_private_inline(&ifolio->page);
+ folio_set_f2fs_inline(ifolio);
goto out;
}
err = f2fs_convert_inline_folio(&dn, folio);
@@ -3698,7 +3706,7 @@ static int f2fs_write_end(const struct kiocb *iocb,
folio_mark_dirty(folio);
if (f2fs_is_atomic_file(inode))
- set_page_private_atomic(folio_page(folio, 0));
+ folio_set_f2fs_atomic(folio);
if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
@@ -3733,7 +3741,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- clear_page_private_all(&folio->page);
+ folio_detach_private(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3742,7 +3750,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- clear_page_private_all(&folio->page);
+ folio_detach_private(folio);
return true;
}
@@ -4160,7 +4168,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
- struct f2fs_map_blocks map = {};
+ struct f2fs_map_blocks map = { NULL, };
pgoff_t next_pgofs = 0;
int err;
@@ -4169,6 +4177,10 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
+ if (flags & IOMAP_WRITE && iomap->private) {
+ map.m_last_pblk = (unsigned long)iomap->private;
+ iomap->private = NULL;
+ }
/*
* If the blocks being overwritten are already allocated,
@@ -4207,6 +4219,9 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->flags |= IOMAP_F_MERGED;
iomap->bdev = map.m_bdev;
iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk);
+
+ if (flags & IOMAP_WRITE && map.m_last_pblk)
+ iomap->private = (void *)map.m_last_pblk;
} else {
if (flags & IOMAP_WRITE)
return -ENOTBLK;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 16c2dfb4f595..43a83bbd3bc5 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
+static DEFINE_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -91,7 +91,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi)
seg_blks = get_seg_entry(sbi, j)->valid_blocks;
/* update segment stats */
- if (IS_CURSEG(sbi, j))
+ if (is_curseg(sbi, j))
dev_stats[i].devstats[0][DEVSTAT_INUSE]++;
else if (seg_blks == BLKS_PER_SEG(sbi))
dev_stats[i].devstats[0][DEVSTAT_FULL]++;
@@ -109,7 +109,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi)
sec_blks = get_sec_entry(sbi, j)->valid_blocks;
/* update section stats */
- if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j)))
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j)))
dev_stats[i].devstats[1][DEVSTAT_INUSE]++;
else if (sec_blks == BLKS_PER_SEC(sbi))
dev_stats[i].devstats[1][DEVSTAT_FULL]++;
@@ -439,9 +439,8 @@ static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
- unsigned long flags;
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
struct f2fs_sb_info *sbi = si->sbi;
@@ -753,7 +752,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
return 0;
}
@@ -765,7 +764,6 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
struct f2fs_dev_stats *dev_stats;
- unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -817,9 +815,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
return 0;
}
@@ -827,11 +825,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned long flags;
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_del(&si->stat_list);
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
kfree(si->dev_stats);
kfree(si);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index c36b3b22bfff..fffd7749d6d1 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -454,7 +454,7 @@ static void init_dent_inode(struct inode *dir, struct inode *inode,
f2fs_folio_wait_writeback(ifolio, NODE, true, true);
/* copy name info. to this inode folio */
- ri = F2FS_INODE(&ifolio->page);
+ ri = F2FS_INODE(ifolio);
ri->i_namelen = cpu_to_le32(fname->disk_name.len);
memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len);
if (IS_ENCRYPTED(dir)) {
@@ -897,7 +897,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio,
f2fs_clear_page_cache_dirty_tag(folio);
folio_clear_dirty_for_io(folio);
folio_clear_uptodate(folio);
- clear_page_private_all(&folio->page);
+ folio_detach_private(folio);
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index cfe925a3d555..199c1e7a83ef 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -19,10 +19,10 @@
#include "node.h"
#include <trace/events/f2fs.h>
-bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
+bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
+ struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext;
struct extent_info ei;
int devi;
@@ -411,10 +411,10 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree_info *eti = &sbi->extent_tree[EX_READ];
- struct f2fs_extent *i_ext = &F2FS_INODE(&ifolio->page)->i_ext;
+ struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext;
struct extent_tree *et;
struct extent_node *en;
- struct extent_info ei;
+ struct extent_info ei = {0};
if (!__may_extent_tree(inode, EX_READ)) {
/* drop largest read extent */
@@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ
if (!__may_extent_tree(dn->inode, type))
return;
- ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) +
+ ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) +
dn->ofs_in_node;
ei.len = 1;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c78464792ceb..46be7560548c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -386,7 +386,7 @@ struct discard_cmd {
struct rb_node rb_node; /* rb node located in rb-tree */
struct discard_info di; /* discard info */
struct list_head list; /* command list */
- struct completion wait; /* compleation */
+ struct completion wait; /* completion */
struct block_device *bdev; /* bdev */
unsigned short ref; /* reference count */
unsigned char state; /* state */
@@ -732,6 +732,7 @@ struct f2fs_map_blocks {
block_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */
pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type;
@@ -875,6 +876,7 @@ struct f2fs_inode_info {
/* linked in global inode list for cache donation */
struct list_head gdonate_list;
pgoff_t donate_start, donate_end; /* inclusive */
+ atomic_t open_count; /* # of open files */
struct task_struct *atomic_write_task; /* store atomic write task */
struct extent_tree *extent_tree[NR_EXTENT_CACHES];
@@ -1123,8 +1125,8 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
-#define WB_DATA_TYPE(p, f) \
- (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(folio, f) \
+ (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -1240,7 +1242,10 @@ struct f2fs_io_info {
blk_opf_t op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
- struct page *page; /* page to be written */
+ union {
+ struct page *page; /* page to be written */
+ struct folio *folio;
+ };
struct page *encrypted_page; /* encrypted page */
struct page *compressed_page; /* compressed page */
struct list_head list; /* serialize IOs */
@@ -1286,7 +1291,7 @@ struct f2fs_bio_info {
struct f2fs_dev_info {
struct file *bdev_file;
struct block_device *bdev;
- char path[MAX_PATH_LEN];
+ char path[MAX_PATH_LEN + 1];
unsigned int total_segments;
block_t start_blk;
block_t end_blk;
@@ -1427,7 +1432,7 @@ enum {
enum {
MEMORY_MODE_NORMAL, /* memory mode for normal devices */
- MEMORY_MODE_LOW, /* memory mode for low memry devices */
+ MEMORY_MODE_LOW, /* memory mode for low memory devices */
};
enum errors_option {
@@ -1491,7 +1496,7 @@ enum compress_flag {
#define COMPRESS_DATA_RESERVED_SIZE 4
struct compress_data {
__le32 clen; /* compressed data size */
- __le32 chksum; /* compressed data chksum */
+ __le32 chksum; /* compressed data checksum */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
@@ -1536,6 +1541,7 @@ struct compress_io_ctx {
struct decompress_io_ctx {
u32 magic; /* magic number to indicate page is compressed */
struct inode *inode; /* inode the context belong to */
+ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
pgoff_t cluster_idx; /* cluster index number */
unsigned int cluster_size; /* page count in cluster */
unsigned int log_cluster_size; /* log of cluster size */
@@ -1576,6 +1582,7 @@ struct decompress_io_ctx {
bool failed; /* IO error occurred before decompression? */
bool need_verity; /* need fs-verity verification after decompression? */
+ unsigned char compress_algorithm; /* backup algorithm type */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
@@ -1724,6 +1731,9 @@ struct f2fs_sb_info {
/* for skip statistic */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
+ /* free sections reserved for pinned file */
+ unsigned int reserved_pin_section;
+
/* threshold for gc trials on pinned files */
unsigned short gc_pin_file_threshold;
struct f2fs_rwsem pin_sem;
@@ -2013,16 +2023,11 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
return F2FS_I_SB(mapping->host);
}
-static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio)
+static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio)
{
return F2FS_M_SB(folio->mapping);
}
-static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
-{
- return F2FS_F_SB(page_folio(page));
-}
-
static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
{
return (struct f2fs_super_block *)(sbi->raw_super);
@@ -2043,14 +2048,14 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
return (struct f2fs_checkpoint *)(sbi->ckpt);
}
-static inline struct f2fs_node *F2FS_NODE(const struct page *page)
+static inline struct f2fs_node *F2FS_NODE(const struct folio *folio)
{
- return (struct f2fs_node *)page_address(page);
+ return (struct f2fs_node *)folio_address(folio);
}
-static inline struct f2fs_inode *F2FS_INODE(struct page *page)
+static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio)
{
- return &((struct f2fs_node *)page_address(page))->i;
+ return &((struct f2fs_node *)folio_address(folio))->i;
}
static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
@@ -2453,6 +2458,13 @@ release_quota:
}
#define PAGE_PRIVATE_GET_FUNC(name, flagname) \
+static inline bool folio_test_f2fs_##name(const struct folio *folio) \
+{ \
+ unsigned long priv = (unsigned long)folio->private; \
+ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \
+ (1UL << PAGE_PRIVATE_##flagname); \
+ return (priv & v) == v; \
+} \
static inline bool page_private_##name(struct page *page) \
{ \
return PagePrivate(page) && \
@@ -2461,6 +2473,17 @@ static inline bool page_private_##name(struct page *page) \
}
#define PAGE_PRIVATE_SET_FUNC(name, flagname) \
+static inline void folio_set_f2fs_##name(struct folio *folio) \
+{ \
+ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \
+ (1UL << PAGE_PRIVATE_##flagname); \
+ if (!folio->private) \
+ folio_attach_private(folio, (void *)v); \
+ else { \
+ v |= (unsigned long)folio->private; \
+ folio->private = (void *)v; \
+ } \
+} \
static inline void set_page_private_##name(struct page *page) \
{ \
if (!PagePrivate(page)) \
@@ -2470,6 +2493,16 @@ static inline void set_page_private_##name(struct page *page) \
}
#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \
+static inline void folio_clear_f2fs_##name(struct folio *folio) \
+{ \
+ unsigned long v = (unsigned long)folio->private; \
+ \
+ v &= ~(1UL << PAGE_PRIVATE_##flagname); \
+ if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \
+ folio_detach_private(folio); \
+ else \
+ folio->private = (void *)v; \
+} \
static inline void clear_page_private_##name(struct page *page) \
{ \
clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
@@ -2492,39 +2525,23 @@ PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
-static inline unsigned long get_page_private_data(struct page *page)
+static inline unsigned long folio_get_f2fs_data(struct folio *folio)
{
- unsigned long data = page_private(page);
+ unsigned long data = (unsigned long)folio->private;
if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data))
return 0;
return data >> PAGE_PRIVATE_MAX;
}
-static inline void set_page_private_data(struct page *page, unsigned long data)
+static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data)
{
- if (!PagePrivate(page))
- attach_page_private(page, (void *)0);
- set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page));
- page_private(page) |= data << PAGE_PRIVATE_MAX;
-}
-
-static inline void clear_page_private_data(struct page *page)
-{
- page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0);
- if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER))
- detach_page_private(page);
-}
+ data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX);
-static inline void clear_page_private_all(struct page *page)
-{
- clear_page_private_data(page);
- clear_page_private_reference(page);
- clear_page_private_gcing(page);
- clear_page_private_inline(page);
- clear_page_private_atomic(page);
-
- f2fs_bug_on(F2FS_P_SB(page), page_private(page));
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, (void *)data);
+ else
+ folio->private = (void *)((unsigned long)folio->private | data);
}
static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
@@ -3011,9 +3028,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
-static inline bool IS_INODE(struct page *page)
+static inline bool IS_INODE(const struct folio *folio)
{
- struct f2fs_node *p = F2FS_NODE(page);
+ struct f2fs_node *p = F2FS_NODE(folio);
return RAW_IS_INODE(p);
}
@@ -3031,20 +3048,20 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
static inline int f2fs_has_extra_attr(struct inode *inode);
static inline unsigned int get_dnode_base(struct inode *inode,
- struct page *node_page)
+ struct folio *node_folio)
{
- if (!IS_INODE(node_page))
+ if (!IS_INODE(node_folio))
return 0;
return inode ? get_extra_isize(inode) :
- offset_in_addr(&F2FS_NODE(node_page)->i);
+ offset_in_addr(&F2FS_NODE(node_folio)->i);
}
static inline __le32 *get_dnode_addr(struct inode *inode,
struct folio *node_folio)
{
- return blkaddr_in_node(F2FS_NODE(&node_folio->page)) +
- get_dnode_base(inode, &node_folio->page);
+ return blkaddr_in_node(F2FS_NODE(node_folio)) +
+ get_dnode_base(inode, node_folio);
}
static inline block_t data_blkaddr(struct inode *inode,
@@ -3366,9 +3383,10 @@ static inline unsigned int addrs_per_page(struct inode *inode,
return addrs;
}
-static inline void *inline_xattr_addr(struct inode *inode, struct folio *folio)
+static inline
+void *inline_xattr_addr(struct inode *inode, const struct folio *folio)
{
- struct f2fs_inode *ri = F2FS_INODE(&folio->page);
+ struct f2fs_inode *ri = F2FS_INODE(folio);
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
get_inline_xattr_addrs(inode)]);
@@ -3628,13 +3646,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc);
*/
void f2fs_set_inode_flags(struct inode *inode);
bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio);
-void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio);
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino);
struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino);
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink);
void f2fs_update_inode(struct inode *inode, struct folio *node_folio);
void f2fs_update_inode_page(struct inode *inode);
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
+void f2fs_remove_donate_inode(struct inode *inode);
void f2fs_evict_inode(struct inode *inode);
void f2fs_handle_failed_inode(struct inode *inode);
@@ -3784,8 +3803,8 @@ void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink);
int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio);
-int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
-int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
+int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio);
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
@@ -3852,7 +3871,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
bool recover_newaddr);
enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi,
enum log_type seg_type);
-int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
@@ -3886,7 +3905,7 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi,
static inline struct inode *fio_inode(struct f2fs_io_info *fio)
{
- return page_folio(fio->page)->mapping->host;
+ return fio->folio->mapping->host;
}
#define DEF_FRAGMENT_SIZE 4
@@ -3953,7 +3972,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-bool f2fs_is_cp_guaranteed(struct page *page);
+bool f2fs_is_cp_guaranteed(const struct folio *folio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -3961,7 +3980,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type);
void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
struct bio **bio, struct folio *folio);
@@ -4303,7 +4322,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
* inline.c
*/
bool f2fs_may_inline_data(struct inode *inode);
-bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage);
+bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio);
bool f2fs_may_inline_dentry(struct inode *inode);
void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio);
void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio,
@@ -4345,7 +4364,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
-bool sanity_check_extent_cache(struct inode *inode, struct page *ipage);
+bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio);
void f2fs_init_extent_tree(struct inode *inode);
void f2fs_drop_extent_tree(struct inode *inode);
void f2fs_destroy_extent_node(struct inode *inode);
@@ -4435,20 +4454,20 @@ enum cluster_check_type {
CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */
CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */
};
-bool f2fs_is_compressed_page(struct page *page);
+bool f2fs_is_compressed_page(struct folio *folio);
struct folio *f2fs_compress_control_folio(struct folio *folio);
int f2fs_prepare_compress_overwrite(struct inode *inode,
struct page **pagep, pgoff_t index, void **fsdata);
bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
pgoff_t index, unsigned copied);
int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock);
-void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
+void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio);
bool f2fs_is_compress_backend_ready(struct inode *inode);
bool f2fs_is_compress_level_valid(int alg, int lvl);
int __init f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
-void f2fs_end_read_compressed_page(struct page *page, bool failed,
+void f2fs_end_read_compressed_page(struct folio *folio, bool failed,
block_t blkaddr, bool in_task);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
@@ -4486,8 +4505,6 @@ void f2fs_destroy_compress_cache(void);
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi);
void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
block_t blkaddr, unsigned int len);
-void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
- nid_t ino, block_t blkaddr);
bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio,
block_t blkaddr);
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
@@ -4504,7 +4521,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
sbi->compr_saved_block += diff; \
} while (0)
#else
-static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
+static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; }
static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
{
if (!f2fs_compressed_file(inode))
@@ -4522,7 +4539,7 @@ static inline int __init f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
bool in_task) { }
-static inline void f2fs_end_read_compressed_page(struct page *page,
+static inline void f2fs_end_read_compressed_page(struct folio *folio,
bool failed, block_t blkaddr, bool in_task)
{
WARN_ON_ONCE(1);
@@ -4542,8 +4559,6 @@ static inline int __init f2fs_init_compress_cache(void) { return 0; }
static inline void f2fs_destroy_compress_cache(void) { }
static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
block_t blkaddr, unsigned int len) { }
-static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
- struct page *page, nid_t ino, block_t blkaddr) { }
static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi,
struct folio *folio, block_t blkaddr) { return false; }
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c677230699fd..42faaed6a02d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
}
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
@@ -629,7 +629,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
- return finish_preallocate_blocks(inode);
+ err = finish_preallocate_blocks(inode);
+ if (!err)
+ atomic_inc(&F2FS_I(inode)->open_count);
+ return err;
}
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
@@ -708,7 +711,7 @@ next:
* once we invalidate valid blkaddr in range [ofs, ofs + count],
* we will invalidate all blkaddr in the whole range.
*/
- fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page),
+ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio),
dn->inode) + ofs;
f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
f2fs_update_age_extent_cache_range(dn, fofs, len);
@@ -815,12 +818,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
goto out;
}
- count = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ count = ADDRS_PER_PAGE(dn.node_folio, inode);
count -= dn.ofs_in_node;
f2fs_bug_on(sbi, count < 0);
- if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) {
+ if (dn.ofs_in_node || IS_INODE(dn.node_folio)) {
f2fs_truncate_data_blocks_range(&dn, count);
free_from += count;
}
@@ -1043,11 +1046,24 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
+ err = setattr_prepare(idmap, dentry, attr);
+ if (err)
+ return err;
+
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
+ err = fsverity_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -1064,20 +1080,19 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
!IS_ALIGNED(attr->ia_size,
F2FS_BLK_TO_BYTES(fi->i_cluster_size)))
return -EINVAL;
+ /*
+ * To prevent scattered pin block generation, we don't allow
+ * smaller/equal size unaligned truncation for pinned file.
+ * We only support overwrite IO to pinned file, so don't
+ * care about larger size truncation.
+ */
+ if (f2fs_is_pinned_file(inode) &&
+ attr->ia_size <= i_size_read(inode) &&
+ !IS_ALIGNED(attr->ia_size,
+ F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi))))
+ return -EINVAL;
}
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
-
- err = fscrypt_prepare_setattr(dentry, attr);
- if (err)
- return err;
-
- err = fsverity_prepare_setattr(dentry, attr);
- if (err)
- return err;
-
if (is_quota_modification(idmap, inode, attr)) {
err = f2fs_dquot_initialize(inode);
if (err)
@@ -1085,12 +1100,11 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (i_uid_needs_update(idmap, attr, inode) ||
i_gid_needs_update(idmap, attr, inode)) {
- f2fs_lock_op(F2FS_I_SB(inode));
+ f2fs_lock_op(sbi);
err = dquot_transfer(idmap, inode, attr);
if (err) {
- set_sbi_flag(F2FS_I_SB(inode),
- SBI_QUOTA_NEED_REPAIR);
- f2fs_unlock_op(F2FS_I_SB(inode));
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ f2fs_unlock_op(sbi);
return err;
}
/*
@@ -1100,7 +1114,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
- f2fs_unlock_op(F2FS_I_SB(inode));
+ f2fs_unlock_op(sbi);
}
if (attr->ia_valid & ATTR_SIZE) {
@@ -1163,7 +1177,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
f2fs_mark_inode_dirty_sync(inode, true);
/* inode change will produce dirty node pages flushed by checkpoint */
- f2fs_balance_fs(F2FS_I_SB(inode), true);
+ f2fs_balance_fs(sbi, true);
return err;
}
@@ -1223,7 +1237,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return err;
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -1322,7 +1336,7 @@ next_dnode:
goto next;
}
- done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) -
+ done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
*blkaddr = f2fs_data_blkaddr(&dn);
@@ -1411,7 +1425,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
ilen = min((pgoff_t)
- ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) -
+ ADDRS_PER_PAGE(dn.node_folio, dst_inode) -
dn.ofs_in_node, len - i);
do {
dn.data_blkaddr = f2fs_data_blkaddr(&dn);
@@ -1453,7 +1467,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE);
folio_mark_dirty(fdst);
- set_page_private_gcing(&fdst->page);
+ folio_set_f2fs_gcing(fdst);
f2fs_folio_put(fdst, true);
f2fs_folio_put(fsrc, true);
@@ -1707,7 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
end = min(pg_end, end_offset - dn.ofs_in_node + index);
ret = f2fs_do_zero_range(&dn, index, end);
@@ -1888,9 +1902,8 @@ next_alloc:
}
}
- if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ?
- ZONED_PIN_SEC_REQUIRED_COUNT :
- GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
+ if (has_not_enough_free_secs(sbi, 0,
+ sbi->reserved_pin_section)) {
f2fs_down_write(&sbi->gc_lock);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
@@ -2028,6 +2041,9 @@ out:
static int f2fs_release_file(struct inode *inode, struct file *filp)
{
+ if (atomic_dec_and_test(&F2FS_I(inode)->open_count))
+ f2fs_remove_donate_inode(inode);
+
/*
* f2fs_release_file is called at every close calls. So we should
* not drop any inmemory pages by close called by other process.
@@ -2978,7 +2994,7 @@ do_map:
f2fs_folio_wait_writeback(folio, DATA, true, true);
folio_mark_dirty(folio);
- set_page_private_gcing(&folio->page);
+ folio_set_f2fs_gcing(folio);
f2fs_folio_put(folio, true);
idx++;
@@ -3876,7 +3892,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
break;
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, fi->i_cluster_size);
@@ -4054,7 +4070,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
break;
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, fi->i_cluster_size);
@@ -4218,7 +4234,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
goto out;
}
- end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - index);
for (i = 0; i < count; i++, index++, dn.ofs_in_node++) {
struct block_device *cur_bdev;
@@ -4415,7 +4431,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
f2fs_folio_wait_writeback(folio, DATA, true, true);
folio_mark_dirty(folio);
- set_page_private_gcing(&folio->page);
+ folio_set_f2fs_gcing(folio);
redirty_idx = folio_next_index(folio);
folio_unlock(folio);
folio_put_refs(folio, 2);
@@ -4825,6 +4841,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
const loff_t pos = iocb->ki_pos;
ssize_t ret;
+ bool dio;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
@@ -4833,12 +4850,15 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos,
iov_iter_count(to), READ);
+ dio = f2fs_should_use_dio(inode, iocb, to);
+
/* In LFS mode, if there is inflight dio, wait for its completion */
if (f2fs_lfs_mode(F2FS_I_SB(inode)) &&
- get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE))
+ get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) &&
+ (!f2fs_is_pinned_file(inode) || !dio))
inode_dio_wait(inode);
- if (f2fs_should_use_dio(inode, iocb, to)) {
+ if (dio) {
ret = f2fs_dio_read_iter(iocb, to);
} else {
ret = filemap_read(iocb, to, 0);
@@ -4846,8 +4866,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_READ_IO, ret);
}
- if (trace_f2fs_dataread_end_enabled())
- trace_f2fs_dataread_end(inode, pos, ret);
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -4870,8 +4889,7 @@ static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos,
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_READ_IO, ret);
- if (trace_f2fs_dataread_end_enabled())
- trace_f2fs_dataread_end(inode, pos, ret);
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -5216,8 +5234,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
f2fs_dio_write_iter(iocb, from, &may_need_sync) :
f2fs_buffered_write_iter(iocb, from);
- if (trace_f2fs_datawrite_end_enabled())
- trace_f2fs_datawrite_end(inode, orig_pos, ret);
+ trace_f2fs_datawrite_end(inode, orig_pos, ret);
}
/* Don't leave any preallocated blocks around past i_size. */
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 3cb5242f4ddf..098e9f71421e 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -141,10 +141,10 @@ do_gc:
FOREGROUND : BACKGROUND);
sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
- gc_control.one_time;
+ (gc_control.one_time && gc_th->boost_gc_greedy);
/* foreground GC was been triggered via f2fs_balance_fs() */
- if (foreground)
+ if (foreground && !f2fs_sb_has_blkzoned(sbi))
sync_mode = false;
gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
@@ -197,6 +197,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO;
+ gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE;
+ gc_th->boost_gc_greedy = GC_GREEDY;
if (f2fs_sb_has_blkzoned(sbi)) {
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED;
@@ -278,12 +280,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- if (p->alloc_mode == SSR) {
- p->gc_mode = GC_GREEDY;
- p->dirty_bitmap = dirty_i->dirty_segmap[type];
- p->max_search = dirty_i->nr_dirty[type];
- p->ofs_unit = 1;
- } else if (p->alloc_mode == AT_SSR) {
+ if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_bitmap = dirty_i->dirty_segmap[type];
p->max_search = dirty_i->nr_dirty[type];
@@ -389,14 +386,15 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
}
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
- unsigned int segno, struct victim_sel_policy *p)
+ unsigned int segno, struct victim_sel_policy *p,
+ unsigned int valid_thresh_ratio)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
- if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >=
- CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio /
- 100))
+ if (p->one_time_gc && (valid_thresh_ratio < 100) &&
+ (get_valid_blocks(sbi, segno, true) >=
+ CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100))
return UINT_MAX;
/* alloc_mode == LFS */
@@ -777,6 +775,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
unsigned int secno, last_victim;
unsigned int last_segment;
unsigned int nsearched;
+ unsigned int valid_thresh_ratio = 100;
bool is_atgc;
int ret = 0;
@@ -786,7 +785,11 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
p.alloc_mode = alloc_mode;
p.age = age;
p.age_threshold = sbi->am.age_threshold;
- p.one_time_gc = one_time;
+ if (one_time) {
+ p.one_time_gc = one_time;
+ if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG))
+ valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio;
+ }
retry:
select_policy(sbi, gc_type, type, &p);
@@ -912,7 +915,7 @@ retry:
goto next;
}
- cost = get_gc_cost(sbi, segno, &p);
+ cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio);
if (p.min_cost > cost) {
p.min_segno = segno;
@@ -1162,8 +1165,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
- if (IS_INODE(&node_folio->page)) {
- base = offset_in_addr(F2FS_INODE(&node_folio->page));
+ if (IS_INODE(node_folio)) {
+ base = offset_in_addr(F2FS_INODE(node_folio));
max_addrs = DEF_ADDRS_PER_INODE;
} else {
base = 0;
@@ -1177,7 +1180,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
- *nofs = ofs_of_node(&node_folio->page);
+ *nofs = ofs_of_node(node_folio);
source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node);
f2fs_folio_put(node_folio, true);
@@ -1249,7 +1252,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
}
got_it:
/* read folio */
- fio.page = &folio->page;
+ fio.folio = folio;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
/*
@@ -1353,7 +1356,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto put_out;
/* read page */
- fio.page = &folio->page;
+ fio.folio = folio;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
@@ -1473,7 +1476,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
}
folio_mark_dirty(folio);
- set_page_private_gcing(&folio->page);
+ folio_set_f2fs_gcing(folio);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
@@ -1483,7 +1486,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
.old_blkaddr = NULL_ADDR,
- .page = &folio->page,
+ .folio = folio,
.encrypted_page = NULL,
.need_lock = LOCK_REQ,
.io_type = FS_GC_DATA_IO,
@@ -1499,11 +1502,11 @@ retry:
f2fs_remove_dirty_inode(inode);
}
- set_page_private_gcing(&folio->page);
+ folio_set_f2fs_gcing(folio);
err = f2fs_do_write_data_page(&fio);
if (err) {
- clear_page_private_gcing(&folio->page);
+ folio_clear_f2fs_gcing(folio);
if (err == -ENOMEM) {
memalloc_retry_wait(GFP_NOFS);
goto retry;
@@ -1749,7 +1752,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
!has_enough_free_blocks(sbi,
sbi->gc_thread->boost_zoned_gc_percent))
window_granularity *=
- BOOST_GC_MULTIPLE;
+ sbi->gc_thread->boost_gc_multiple;
end_segno = start_segno + window_granularity;
}
@@ -1891,6 +1894,7 @@ gc_more:
/* Let's run FG_GC, if we don't have enough space. */
if (has_not_enough_free_secs(sbi, 0, 0)) {
gc_type = FG_GC;
+ gc_control->one_time = false;
/*
* For example, if there are many prefree_segments below given
@@ -2064,7 +2068,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, segno)))
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno)))
continue;
do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5c1eaf55e127..24e8b1c27acc 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -68,6 +68,8 @@ struct f2fs_gc_kthread {
unsigned int no_zoned_gc_percent;
unsigned int boost_zoned_gc_percent;
unsigned int valid_thresh_ratio;
+ unsigned int boost_gc_multiple;
+ unsigned int boost_gc_greedy;
};
struct gc_inode_list {
@@ -194,6 +196,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi)
{
if (f2fs_sb_has_blkzoned(sbi))
- return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC);
+ return !has_enough_free_blocks(sbi,
+ sbi->gc_thread->boost_zoned_gc_percent);
return has_enough_invalid_blocks(sbi);
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 901c630685ce..58ac831ef704 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -33,9 +33,9 @@ bool f2fs_may_inline_data(struct inode *inode)
return !f2fs_post_read_required(inode);
}
-static bool inode_has_blocks(struct inode *inode, struct page *ipage)
+static bool inode_has_blocks(struct inode *inode, struct folio *ifolio)
{
- struct f2fs_inode *ri = F2FS_INODE(ipage);
+ struct f2fs_inode *ri = F2FS_INODE(ifolio);
int i;
if (F2FS_HAS_BLOCKS(inode))
@@ -48,12 +48,12 @@ static bool inode_has_blocks(struct inode *inode, struct page *ipage)
return false;
}
-bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage)
+bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio)
{
if (!f2fs_has_inline_data(inode))
return false;
- if (inode_has_blocks(inode, ipage))
+ if (inode_has_blocks(inode, ifolio))
return false;
if (!support_inline_data(inode))
@@ -150,7 +150,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio)
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_PRIO,
- .page = &folio->page,
+ .folio = folio,
.encrypted_page = NULL,
.io_type = FS_DATA_IO,
};
@@ -206,7 +206,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio)
/* clear inline data and flag after data writeback */
f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0);
- clear_page_private_inline(&dn->inode_folio->page);
+ folio_clear_f2fs_inline(dn->inode_folio);
clear_out:
stat_dec_inline_inode(dn->inode);
clear_inode_flag(dn->inode, FI_INLINE_DATA);
@@ -286,7 +286,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio)
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
- clear_page_private_inline(&ifolio->page);
+ folio_clear_f2fs_inline(ifolio);
f2fs_folio_put(ifolio, 1);
return 0;
}
@@ -305,8 +305,8 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio)
* x o -> remove data blocks, and then recover inline_data
* x x -> recover data blocks
*/
- if (IS_INODE(&nfolio->page))
- ri = F2FS_INODE(&nfolio->page);
+ if (IS_INODE(nfolio))
+ ri = F2FS_INODE(nfolio);
if (f2fs_has_inline_data(inode) &&
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
@@ -825,7 +825,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
byteaddr += (char *)inline_data_addr(inode, ifolio) -
- (char *)F2FS_INODE(&ifolio->page);
+ (char *)F2FS_INODE(ifolio);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
out:
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 083d52a42bfb..8c4eafe9ffac 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -108,7 +108,7 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio)
f2fs_folio_wait_writeback(ifolio, NODE, true, true);
set_inode_flag(inode, FI_DATA_EXIST);
- set_raw_inline(inode, F2FS_INODE(&ifolio->page));
+ set_raw_inline(inode, F2FS_INODE(ifolio));
folio_mark_dirty(ifolio);
return;
}
@@ -116,14 +116,15 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio)
return;
}
-static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+static
+bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+ struct f2fs_inode *ri = &F2FS_NODE(folio)->i;
if (!f2fs_sb_has_inode_chksum(sbi))
return false;
- if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
+ if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR))
return false;
if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize),
@@ -133,9 +134,9 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page
return true;
}
-static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_node *node = F2FS_NODE(page);
+ struct f2fs_node *node = F2FS_NODE(folio);
struct f2fs_inode *ri = &node->i;
__le32 ino = node->footer.ino;
__le32 gen = ri->i_generation;
@@ -164,34 +165,34 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio)
return true;
#ifdef CONFIG_F2FS_CHECK_FS
- if (!f2fs_enable_inode_chksum(sbi, &folio->page))
+ if (!f2fs_enable_inode_chksum(sbi, folio))
#else
- if (!f2fs_enable_inode_chksum(sbi, &folio->page) ||
+ if (!f2fs_enable_inode_chksum(sbi, folio) ||
folio_test_dirty(folio) ||
folio_test_writeback(folio))
#endif
return true;
- ri = &F2FS_NODE(&folio->page)->i;
+ ri = &F2FS_NODE(folio)->i;
provided = le32_to_cpu(ri->i_inode_checksum);
- calculated = f2fs_inode_chksum(sbi, &folio->page);
+ calculated = f2fs_inode_chksum(sbi, folio);
if (provided != calculated)
f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x",
- folio->index, ino_of_node(&folio->page),
+ folio->index, ino_of_node(folio),
provided, calculated);
return provided == calculated;
}
-void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page)
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+ struct f2fs_inode *ri = &F2FS_NODE(folio)->i;
- if (!f2fs_enable_inode_chksum(sbi, page))
+ if (!f2fs_enable_inode_chksum(sbi, folio))
return;
- ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page));
+ ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio));
}
static bool sanity_check_compress_inode(struct inode *inode,
@@ -266,28 +267,28 @@ err_level:
return false;
}
-static bool sanity_check_inode(struct inode *inode, struct page *node_page)
+static bool sanity_check_inode(struct inode *inode, struct folio *node_folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri = F2FS_INODE(node_page);
+ struct f2fs_inode *ri = F2FS_INODE(node_folio);
unsigned long long iblocks;
- iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
+ iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks);
if (!iblocks) {
f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.",
__func__, inode->i_ino, iblocks);
return false;
}
- if (ino_of_node(node_page) != nid_of_node(node_page)) {
+ if (ino_of_node(node_folio) != nid_of_node(node_folio)) {
f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.",
__func__, inode->i_ino,
- ino_of_node(node_page), nid_of_node(node_page));
+ ino_of_node(node_folio), nid_of_node(node_folio));
return false;
}
- if (ino_of_node(node_page) == fi->i_xattr_nid) {
+ if (ino_of_node(node_folio) == fi->i_xattr_nid) {
f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.",
__func__, inode->i_ino, fi->i_xattr_nid);
return false;
@@ -354,7 +355,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
}
- if (f2fs_sanity_check_inline_data(inode, node_page)) {
+ if (f2fs_sanity_check_inline_data(inode, node_folio)) {
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
return false;
@@ -419,7 +420,7 @@ static int do_read_inode(struct inode *inode)
if (IS_ERR(node_folio))
return PTR_ERR(node_folio);
- ri = F2FS_INODE(&node_folio->page);
+ ri = F2FS_INODE(node_folio);
inode->i_mode = le16_to_cpu(ri->i_mode);
i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -469,7 +470,7 @@ static int do_read_inode(struct inode *inode)
fi->i_inline_xattr_size = 0;
}
- if (!sanity_check_inode(inode, &node_folio->page)) {
+ if (!sanity_check_inode(inode, node_folio)) {
f2fs_folio_put(node_folio, true);
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
@@ -481,9 +482,9 @@ static int do_read_inode(struct inode *inode)
__recover_inline_status(inode, node_folio);
/* try to recover cold bit for non-dir inode */
- if (!S_ISDIR(inode->i_mode) && !is_cold_node(&node_folio->page)) {
+ if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) {
f2fs_folio_wait_writeback(node_folio, NODE, true, true);
- set_cold_node(&node_folio->page, false);
+ set_cold_node(node_folio, false);
folio_mark_dirty(node_folio);
}
@@ -531,7 +532,7 @@ static int do_read_inode(struct inode *inode)
init_idisk_time(inode);
- if (!sanity_check_extent_cache(inode, &node_folio->page)) {
+ if (!sanity_check_extent_cache(inode, node_folio)) {
f2fs_folio_put(node_folio, true);
f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
@@ -669,7 +670,7 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio)
f2fs_inode_synced(inode);
- ri = F2FS_INODE(&node_folio->page);
+ ri = F2FS_INODE(node_folio);
ri->i_mode = cpu_to_le16(inode->i_mode);
ri->i_advise = fi->i_advise;
@@ -748,11 +749,11 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio)
/* deleted inode */
if (inode->i_nlink == 0)
- clear_page_private_inline(&node_folio->page);
+ folio_clear_f2fs_inline(node_folio);
init_idisk_time(inode);
#ifdef CONFIG_F2FS_CHECK_FS
- f2fs_inode_chksum_set(F2FS_I_SB(inode), &node_folio->page);
+ f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio);
#endif
}
@@ -820,7 +821,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
}
-static void f2fs_remove_donate_inode(struct inode *inode)
+void f2fs_remove_donate_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -933,6 +934,19 @@ retry:
f2fs_update_inode_page(inode);
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ /*
+ * If both f2fs_truncate() and f2fs_update_inode_page() failed
+ * due to fuzzed corrupted inode, call f2fs_inode_synced() to
+ * avoid triggering later f2fs_bug_on().
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(sbi,
+ "f2fs_evict_inode: inode is dirty, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
}
if (freeze_protected)
sb_end_intwrite(inode->i_sb);
@@ -949,8 +963,12 @@ no_delete:
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
- else
- f2fs_inode_synced(inode);
+
+ /*
+ * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META]
+ * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint.
+ */
+ f2fs_inode_synced(inode);
/* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */
if (inode->i_ino)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 07e333ee21b7..b882771e4699 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1298,19 +1298,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page;
+ struct folio *folio;
const char *target;
if (!dentry)
return ERR_PTR(-ECHILD);
- page = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
+ folio = read_mapping_folio(inode->i_mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
- target = fscrypt_get_symlink(inode, page_address(page),
+ target = fscrypt_get_symlink(inode, folio_address(folio),
inode->i_sb->s_blocksize, done);
- put_page(page);
+ folio_put(folio);
return target;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index bfe104db284e..27743b93e186 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -135,7 +135,7 @@ static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid)
return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid));
}
-static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid)
{
struct folio *src_folio;
struct folio *dst_folio;
@@ -149,7 +149,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
/* get current nat block page with lock */
src_folio = get_current_nat_folio(sbi, nid);
if (IS_ERR(src_folio))
- return &src_folio->page;
+ return src_folio;
dst_folio = f2fs_grab_meta_folio(sbi, dst_off);
f2fs_bug_on(sbi, folio_test_dirty(src_folio));
@@ -161,7 +161,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
set_to_next_nat(nm_i, nid);
- return &dst_folio->page;
+ return dst_folio;
}
static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
@@ -185,7 +185,7 @@ static void __free_nat_entry(struct nat_entry *e)
/* must be locked by nat_tree_lock */
static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
+ struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty)
{
if (no_fail)
f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
@@ -195,6 +195,12 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
if (raw_ne)
node_info_from_raw_nat(&ne->ni, raw_ne);
+ if (init_dirty) {
+ INIT_LIST_HEAD(&ne->list);
+ nm_i->nat_cnt[TOTAL_NAT]++;
+ return ne;
+ }
+
spin_lock(&nm_i->nat_list_lock);
list_add_tail(&ne->list, &nm_i->nat_entries);
spin_unlock(&nm_i->nat_list_lock);
@@ -204,14 +210,17 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
return ne;
}
-static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty)
{
struct nat_entry *ne;
ne = radix_tree_lookup(&nm_i->nat_root, n);
- /* for recent accessed nat entry, move it to tail of lru list */
- if (ne && !get_nat_flag(ne, IS_DIRTY)) {
+ /*
+ * for recent accessed nat entry which will not be dirtied soon
+ * later, move it to tail of lru list.
+ */
+ if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) {
spin_lock(&nm_i->nat_list_lock);
if (!list_empty(&ne->list))
list_move_tail(&ne->list, &nm_i->nat_entries);
@@ -256,7 +265,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
}
static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry *ne, bool init_dirty)
{
struct nat_entry_set *head;
bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
@@ -279,7 +288,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
goto refresh_list;
nm_i->nat_cnt[DIRTY_NAT]++;
- nm_i->nat_cnt[RECLAIMABLE_NAT]--;
+ if (!init_dirty)
+ nm_i->nat_cnt[RECLAIMABLE_NAT]--;
set_nat_flag(ne, IS_DIRTY, true);
refresh_list:
spin_lock(&nm_i->nat_list_lock);
@@ -312,8 +322,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio)
{
- return is_node_folio(folio) && IS_DNODE(&folio->page) &&
- is_cold_node(&folio->page);
+ return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio);
}
void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
@@ -384,7 +393,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
bool need = false;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
@@ -401,7 +410,7 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
bool is_cp = true;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
f2fs_up_read(&nm_i->nat_tree_lock);
@@ -415,7 +424,7 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
bool need_update = true;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ino);
+ e = __lookup_nat_cache(nm_i, ino, false);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
@@ -440,9 +449,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
return;
f2fs_down_write(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (!e)
- e = __init_nat_entry(nm_i, new, ne, false);
+ e = __init_nat_entry(nm_i, new, ne, false, false);
else
f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
nat_get_blkaddr(e) !=
@@ -459,11 +468,13 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
+ bool init_dirty = false;
f2fs_down_write(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ni->nid);
+ e = __lookup_nat_cache(nm_i, ni->nid, true);
if (!e) {
- e = __init_nat_entry(nm_i, new, NULL, true);
+ init_dirty = true;
+ e = __init_nat_entry(nm_i, new, NULL, true, true);
copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
@@ -499,11 +510,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
nat_set_blkaddr(e, new_blkaddr);
if (!__is_valid_data_blkaddr(new_blkaddr))
set_nat_flag(e, IS_CHECKPOINTED, false);
- __set_nat_cache_dirty(nm_i, e);
+ __set_nat_cache_dirty(nm_i, e, init_dirty);
/* update fsync_mark if its inode nat entry is still alive */
if (ni->nid != ni->ino)
- e = __lookup_nat_cache(nm_i, ni->ino);
+ e = __lookup_nat_cache(nm_i, ni->ino, false);
if (e) {
if (fsync_done && ni->nid == ni->ino)
set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@ -555,20 +566,24 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nat_entry ne;
struct nat_entry *e;
pgoff_t index;
- block_t blkaddr;
int i;
+ bool need_cache = true;
ni->flag = 0;
ni->nid = nid;
retry:
/* Check nat cache */
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
f2fs_up_read(&nm_i->nat_tree_lock);
+ if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) {
+ need_cache = false;
+ goto sanity_check;
+ }
return 0;
}
@@ -594,7 +609,7 @@ retry:
up_read(&curseg->journal_rwsem);
if (i >= 0) {
f2fs_up_read(&nm_i->nat_tree_lock);
- goto cache;
+ goto sanity_check;
}
/* Fill node_info from nat page */
@@ -609,14 +624,23 @@ retry:
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
f2fs_folio_put(folio, true);
-cache:
- blkaddr = le32_to_cpu(ne.block_addr);
- if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
- return -EFAULT;
+sanity_check:
+ if (__is_valid_data_blkaddr(ni->blk_addr) &&
+ !f2fs_is_valid_blkaddr(sbi, ni->blk_addr,
+ DATA_GENERIC_ENHANCE)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_get_node_info of %pS: inconsistent nat entry, "
+ "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u",
+ __builtin_return_address(0),
+ ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT);
+ return -EFSCORRUPTED;
+ }
/* cache nat entry */
- cache_nat_entry(sbi, nid, &ne);
+ if (need_cache)
+ cache_nat_entry(sbi, nid, &ne);
return 0;
}
@@ -636,7 +660,7 @@ static void f2fs_ra_node_pages(struct folio *parent, int start, int n)
end = start + n;
end = min(end, (int)NIDS_PER_BLOCK);
for (i = start; i < end; i++) {
- nid = get_nid(&parent->page, i, false);
+ nid = get_nid(parent, i, false);
f2fs_ra_node_page(sbi, nid);
}
@@ -795,7 +819,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
parent = nfolio[0];
if (level != 0)
- nids[1] = get_nid(&parent->page, offset[0], true);
+ nids[1] = get_nid(parent, offset[0], true);
dn->inode_folio = nfolio[0];
dn->inode_folio_locked = true;
@@ -803,6 +827,16 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
for (i = 1; i <= level; i++) {
bool done = false;
+ if (nids[i] && nids[i] == dn->inode->i_ino) {
+ err = -EFSCORRUPTED;
+ f2fs_err_ratelimited(sbi,
+ "inode mapping table is corrupted, run fsck to fix it, "
+ "ino:%lu, nid:%u, level:%d, offset:%d",
+ dn->inode->i_ino, nids[i], level, offset[level]);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto release_pages;
+ }
+
if (!nids[i] && mode == ALLOC_NODE) {
/* alloc new node */
if (!f2fs_alloc_nid(sbi, &(nids[i]))) {
@@ -846,7 +880,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
if (i < level) {
parent = nfolio[i];
- nids[i + 1] = get_nid(&parent->page, offset[i], false);
+ nids[i + 1] = get_nid(parent, offset[i], false);
}
}
dn->nid = nids[level];
@@ -961,9 +995,9 @@ static int truncate_dnode(struct dnode_of_data *dn)
else if (IS_ERR(folio))
return PTR_ERR(folio);
- if (IS_INODE(&folio->page) || ino_of_node(&folio->page) != dn->inode->i_ino) {
+ if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) {
f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u",
- dn->inode->i_ino, dn->nid, ino_of_node(&folio->page));
+ dn->inode->i_ino, dn->nid, ino_of_node(folio));
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE);
f2fs_folio_put(folio, true);
@@ -1007,7 +1041,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK);
- rn = F2FS_NODE(&folio->page);
+ rn = F2FS_NODE(folio);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
child_nid = le32_to_cpu(rn->in.nid[i]);
@@ -1070,7 +1104,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
int i;
int idx = depth - 2;
- nid[0] = get_nid(&dn->inode_folio->page, offset[0], true);
+ nid[0] = get_nid(dn->inode_folio, offset[0], true);
if (!nid[0])
return 0;
@@ -1083,14 +1117,14 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
idx = i - 1;
goto fail;
}
- nid[i + 1] = get_nid(&folios[i]->page, offset[i + 1], false);
+ nid[i + 1] = get_nid(folios[i], offset[i + 1], false);
}
f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK);
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
- child_nid = get_nid(&folios[idx]->page, i, false);
+ child_nid = get_nid(folios[idx], i, false);
if (!child_nid)
continue;
dn->nid = child_nid;
@@ -1159,7 +1193,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
set_new_dnode(&dn, inode, folio, NULL, 0);
folio_unlock(folio);
- ri = F2FS_INODE(&folio->page);
+ ri = F2FS_INODE(folio);
switch (level) {
case 0:
case 1:
@@ -1188,7 +1222,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
skip_partial:
while (cont) {
- dn.nid = get_nid(&folio->page, offset[0], true);
+ dn.nid = get_nid(folio, offset[0], true);
switch (offset[0]) {
case NODE_DIR1_BLOCK:
case NODE_DIR2_BLOCK:
@@ -1220,7 +1254,7 @@ skip_partial:
}
if (err < 0)
goto fail;
- if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) {
+ if (offset[1] == 0 && get_nid(folio, offset[0], true)) {
folio_lock(folio);
BUG_ON(!is_node_folio(folio));
set_nid(folio, offset[0], 0, true);
@@ -1367,8 +1401,8 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs)
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
f2fs_folio_wait_writeback(folio, NODE, true, true);
- fill_node_footer(&folio->page, dn->nid, dn->inode->i_ino, ofs, true);
- set_cold_node(&folio->page, S_ISDIR(dn->inode->i_mode));
+ fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true);
+ set_cold_node(folio, S_ISDIR(dn->inode->i_mode));
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
if (folio_mark_dirty(folio))
@@ -1400,7 +1434,7 @@ static int read_node_folio(struct folio *folio, blk_opf_t op_flags)
.type = NODE,
.op = REQ_OP_READ,
.op_flags = op_flags,
- .page = &folio->page,
+ .folio = folio,
.encrypted_page = NULL,
};
int err;
@@ -1462,17 +1496,15 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
struct folio *folio, pgoff_t nid,
enum node_type ntype)
{
- struct page *page = &folio->page;
-
- if (unlikely(nid != nid_of_node(page) ||
- (ntype == NODE_TYPE_INODE && !IS_INODE(page)) ||
+ if (unlikely(nid != nid_of_node(folio) ||
+ (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) ||
(ntype == NODE_TYPE_XATTR &&
- !f2fs_has_xattr_block(ofs_of_node(page))) ||
+ !f2fs_has_xattr_block(ofs_of_node(folio))) ||
time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
"node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- ntype, nid, nid_of_node(page), ino_of_node(page),
- ofs_of_node(page), cpver_of_node(page),
+ ntype, nid, nid_of_node(folio), ino_of_node(folio),
+ ofs_of_node(folio), cpver_of_node(folio),
next_blkaddr_of_node(folio));
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
@@ -1553,7 +1585,7 @@ struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid)
static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_F_SB(parent);
- nid_t nid = get_nid(&parent->page, start, false);
+ nid_t nid = get_nid(parent, start, false);
return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR);
}
@@ -1618,9 +1650,9 @@ static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
return ERR_PTR(-EIO);
}
- if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page))
+ if (!IS_DNODE(folio) || !is_cold_node(folio))
continue;
- if (ino_of_node(&folio->page) != ino)
+ if (ino_of_node(folio) != ino)
continue;
folio_lock(folio);
@@ -1630,7 +1662,7 @@ continue_unlock:
folio_unlock(folio);
continue;
}
- if (ino_of_node(&folio->page) != ino)
+ if (ino_of_node(folio) != ino)
goto continue_unlock;
if (!folio_test_dirty(folio)) {
@@ -1660,11 +1692,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
struct node_info ni;
struct f2fs_io_info fio = {
.sbi = sbi,
- .ino = ino_of_node(&folio->page),
+ .ino = ino_of_node(folio),
.type = NODE,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
- .page = &folio->page,
+ .folio = folio,
.encrypted_page = NULL,
.submitted = 0,
.io_type = io_type,
@@ -1689,11 +1721,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
wbc->sync_mode == WB_SYNC_NONE &&
- IS_DNODE(&folio->page) && is_cold_node(&folio->page))
+ IS_DNODE(folio) && is_cold_node(folio))
goto redirty_out;
/* get old block addr of this node page */
- nid = nid_of_node(&folio->page);
+ nid = nid_of_node(folio);
f2fs_bug_on(sbi, folio->index != nid);
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
@@ -1731,7 +1763,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
- set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(&folio->page));
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio));
dec_page_count(sbi, F2FS_DIRTY_NODES);
f2fs_up_read(&sbi->node_write);
@@ -1827,9 +1859,9 @@ retry:
goto out;
}
- if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page))
+ if (!IS_DNODE(folio) || !is_cold_node(folio))
continue;
- if (ino_of_node(&folio->page) != ino)
+ if (ino_of_node(folio) != ino)
continue;
folio_lock(folio);
@@ -1839,7 +1871,7 @@ continue_unlock:
folio_unlock(folio);
continue;
}
- if (ino_of_node(&folio->page) != ino)
+ if (ino_of_node(folio) != ino)
goto continue_unlock;
if (!folio_test_dirty(folio) && folio != last_folio) {
@@ -1849,17 +1881,17 @@ continue_unlock:
f2fs_folio_wait_writeback(folio, NODE, true, true);
- set_fsync_mark(&folio->page, 0);
- set_dentry_mark(&folio->page, 0);
+ set_fsync_mark(folio, 0);
+ set_dentry_mark(folio, 0);
if (!atomic || folio == last_folio) {
- set_fsync_mark(&folio->page, 1);
+ set_fsync_mark(folio, 1);
percpu_counter_inc(&sbi->rf_node_block_count);
- if (IS_INODE(&folio->page)) {
+ if (IS_INODE(folio)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
f2fs_update_inode(inode, folio);
- set_dentry_mark(&folio->page,
+ set_dentry_mark(folio,
f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
@@ -1935,7 +1967,7 @@ static bool flush_dirty_inode(struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
struct inode *inode;
- nid_t ino = ino_of_node(&folio->page);
+ nid_t ino = ino_of_node(folio);
inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
if (!inode)
@@ -1964,7 +1996,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- if (!IS_INODE(&folio->page))
+ if (!IS_INODE(folio))
continue;
folio_lock(folio);
@@ -1975,10 +2007,10 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
goto unlock;
/* flush inline_data, if it's async context. */
- if (page_private_inline(&folio->page)) {
- clear_page_private_inline(&folio->page);
+ if (folio_test_f2fs_inline(folio)) {
+ folio_clear_f2fs_inline(folio);
folio_unlock(folio);
- flush_inline_data(sbi, ino_of_node(&folio->page));
+ flush_inline_data(sbi, ino_of_node(folio));
continue;
}
unlock:
@@ -2027,13 +2059,13 @@ next_step:
* 1. dentry dnodes
* 2. file dnodes
*/
- if (step == 0 && IS_DNODE(&folio->page))
+ if (step == 0 && IS_DNODE(folio))
continue;
- if (step == 1 && (!IS_DNODE(&folio->page) ||
- is_cold_node(&folio->page)))
+ if (step == 1 && (!IS_DNODE(folio) ||
+ is_cold_node(folio)))
continue;
- if (step == 2 && (!IS_DNODE(&folio->page) ||
- !is_cold_node(&folio->page)))
+ if (step == 2 && (!IS_DNODE(folio) ||
+ !is_cold_node(folio)))
continue;
lock_node:
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -2057,15 +2089,15 @@ continue_unlock:
goto write_node;
/* flush inline_data */
- if (page_private_inline(&folio->page)) {
- clear_page_private_inline(&folio->page);
+ if (folio_test_f2fs_inline(folio)) {
+ folio_clear_f2fs_inline(folio);
folio_unlock(folio);
- flush_inline_data(sbi, ino_of_node(&folio->page));
+ flush_inline_data(sbi, ino_of_node(folio));
goto lock_node;
}
/* flush dirty inode */
- if (IS_INODE(&folio->page) && flush_dirty_inode(folio))
+ if (IS_INODE(folio) && flush_dirty_inode(folio))
goto lock_node;
write_node:
f2fs_folio_wait_writeback(folio, NODE, true, true);
@@ -2073,8 +2105,8 @@ write_node:
if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- set_fsync_mark(&folio->page, 0);
- set_dentry_mark(&folio->page, 0);
+ set_fsync_mark(folio, 0);
+ set_dentry_mark(folio, 0);
if (!__write_node_folio(folio, false, &submitted,
wbc, do_balance, io_type, NULL)) {
@@ -2201,12 +2233,12 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
#ifdef CONFIG_F2FS_CHECK_FS
- if (IS_INODE(&folio->page))
- f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
+ if (IS_INODE(folio))
+ f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio);
#endif
if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
return true;
}
return false;
@@ -2351,7 +2383,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
* - __remove_nid_from_list(PREALLOC_NID)
* - __insert_nid_to_list(FREE_NID)
*/
- ne = __lookup_nat_cache(nm_i, nid);
+ ne = __lookup_nat_cache(nm_i, nid, false);
if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
goto err_out;
@@ -2714,7 +2746,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio)
if (IS_ERR(ifolio))
return PTR_ERR(ifolio);
- ri = F2FS_INODE(&folio->page);
+ ri = F2FS_INODE(folio);
if (ri->i_inline & F2FS_INLINE_XATTR) {
if (!f2fs_has_inline_xattr(inode)) {
set_inode_flag(inode, FI_INLINE_XATTR);
@@ -2740,7 +2772,7 @@ update_inode:
return 0;
}
-int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
+int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
@@ -2778,8 +2810,8 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- if (page) {
- memcpy(F2FS_NODE(&xfolio->page), F2FS_NODE(page),
+ if (folio) {
+ memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio),
VALID_XATTR_BLOCK_SIZE);
folio_mark_dirty(xfolio);
}
@@ -2788,10 +2820,10 @@ recover_xnid:
return 0;
}
-int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio)
{
struct f2fs_inode *src, *dst;
- nid_t ino = ino_of_node(page);
+ nid_t ino = ino_of_node(folio);
struct node_info old_ni, new_ni;
struct folio *ifolio;
int err;
@@ -2814,11 +2846,11 @@ retry:
if (!folio_test_uptodate(ifolio))
folio_mark_uptodate(ifolio);
- fill_node_footer(&ifolio->page, ino, ino, 0, true);
- set_cold_node(&ifolio->page, false);
+ fill_node_footer(ifolio, ino, ino, 0, true);
+ set_cold_node(ifolio, false);
- src = F2FS_INODE(page);
- dst = F2FS_INODE(&ifolio->page);
+ src = F2FS_INODE(folio);
+ dst = F2FS_INODE(ifolio);
memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
dst->i_size = 0;
@@ -2884,7 +2916,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
if (IS_ERR(folio))
return PTR_ERR(folio);
- rn = F2FS_NODE(&folio->page);
+ rn = F2FS_NODE(folio);
sum_entry->nid = rn->footer.nid;
sum_entry->version = 0;
sum_entry->ofs_in_node = 0;
@@ -2904,6 +2936,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_journal *journal = curseg->journal;
int i;
+ bool init_dirty;
down_write(&curseg->journal_rwsem);
for (i = 0; i < nats_in_cursum(journal); i++) {
@@ -2914,12 +2947,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
if (f2fs_check_nid_range(sbi, nid))
continue;
+ init_dirty = false;
+
raw_ne = nat_in_journal(journal, i);
- ne = __lookup_nat_cache(nm_i, nid);
+ ne = __lookup_nat_cache(nm_i, nid, true);
if (!ne) {
+ init_dirty = true;
ne = __alloc_nat_entry(sbi, nid, true);
- __init_nat_entry(nm_i, ne, &raw_ne, true);
+ __init_nat_entry(nm_i, ne, &raw_ne, true, true);
}
/*
@@ -2934,7 +2970,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
}
- __set_nat_cache_dirty(nm_i, ne);
+ __set_nat_cache_dirty(nm_i, ne, init_dirty);
}
update_nats_in_cursum(journal, -i);
up_write(&curseg->journal_rwsem);
@@ -2959,11 +2995,10 @@ add_out:
}
static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
- struct page *page)
+ const struct f2fs_nat_block *nat_blk)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
- struct f2fs_nat_block *nat_blk = page_address(page);
int valid = 0;
int i = 0;
@@ -3000,7 +3035,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
- struct page *page = NULL;
+ struct folio *folio = NULL;
/*
* there are two steps to flush nat entries:
@@ -3014,11 +3049,11 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
down_write(&curseg->journal_rwsem);
} else {
- page = get_next_nat_page(sbi, start_nid);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = get_next_nat_folio(sbi, start_nid);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- nat_blk = page_address(page);
+ nat_blk = folio_address(folio);
f2fs_bug_on(sbi, !nat_blk);
}
@@ -3054,8 +3089,8 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
- __update_nat_bits(sbi, start_nid, page);
- f2fs_put_page(page, 1);
+ __update_nat_bits(sbi, start_nid, nat_blk);
+ f2fs_folio_put(folio, true);
}
/* Allow dirty nats by node block allocation in write_begin */
@@ -3395,10 +3430,10 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
}
kvfree(nm_i->free_nid_count);
- kvfree(nm_i->nat_bitmap);
+ kfree(nm_i->nat_bitmap);
kvfree(nm_i->nat_bits);
#ifdef CONFIG_F2FS_CHECK_FS
- kvfree(nm_i->nat_bitmap_mir);
+ kfree(nm_i->nat_bitmap_mir);
#endif
sbi->nm_info = NULL;
kfree(nm_i);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 1446c433b3ec..030390543b54 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,7 +31,7 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
-/* control total # of node writes used for roll-fowrad recovery */
+/* control total # of node writes used for roll-forward recovery */
#define DEF_RF_NODE_BLOCKS 0
/* vector size for gang look-up from nat cache that consists of radix tree */
@@ -243,41 +243,41 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
#endif
}
-static inline nid_t ino_of_node(struct page *node_page)
+static inline nid_t ino_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.ino);
}
-static inline nid_t nid_of_node(struct page *node_page)
+static inline nid_t nid_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.nid);
}
-static inline unsigned int ofs_of_node(const struct page *node_page)
+static inline unsigned int ofs_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
unsigned flag = le32_to_cpu(rn->footer.flag);
return flag >> OFFSET_BIT_SHIFT;
}
-static inline __u64 cpver_of_node(struct page *node_page)
+static inline __u64 cpver_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le64_to_cpu(rn->footer.cp_ver);
}
-static inline block_t next_blkaddr_of_node(struct folio *node_folio)
+static inline block_t next_blkaddr_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(&node_folio->page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.next_blkaddr);
}
-static inline void fill_node_footer(struct page *page, nid_t nid,
+static inline void fill_node_footer(const struct folio *folio, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int old_flag = 0;
if (reset)
@@ -293,17 +293,18 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
(old_flag & OFFSET_BIT_MASK));
}
-static inline void copy_node_footer(struct page *dst, struct page *src)
+static inline void copy_node_footer(const struct folio *dst,
+ const struct folio *src)
{
struct f2fs_node *src_rn = F2FS_NODE(src);
struct f2fs_node *dst_rn = F2FS_NODE(dst);
memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
}
-static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr)
{
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio));
+ struct f2fs_node *rn = F2FS_NODE(folio);
__u64 cp_ver = cur_cp_version(ckpt);
if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
@@ -313,19 +314,19 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
-static inline bool is_recoverable_dnode(struct page *page)
+static inline bool is_recoverable_dnode(const struct folio *folio)
{
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio));
__u64 cp_ver = cur_cp_version(ckpt);
/* Don't care crc part, if fsck.f2fs sets it. */
if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG))
- return (cp_ver << 32) == (cpver_of_node(page) << 32);
+ return (cp_ver << 32) == (cpver_of_node(folio) << 32);
if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
cp_ver |= (cur_cp_crc(ckpt) << 32);
- return cp_ver == cpver_of_node(page);
+ return cp_ver == cpver_of_node(folio);
}
/*
@@ -349,9 +350,9 @@ static inline bool is_recoverable_dnode(struct page *page)
* `- indirect node ((6 + 2N) + (N - 1)(N + 1))
* `- direct node
*/
-static inline bool IS_DNODE(const struct page *node_page)
+static inline bool IS_DNODE(const struct folio *node_folio)
{
- unsigned int ofs = ofs_of_node(node_page);
+ unsigned int ofs = ofs_of_node(node_folio);
if (f2fs_has_xattr_block(ofs))
return true;
@@ -369,7 +370,7 @@ static inline bool IS_DNODE(const struct page *node_page)
static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i)
{
- struct f2fs_node *rn = F2FS_NODE(&folio->page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
f2fs_folio_wait_writeback(folio, NODE, true, true);
@@ -380,9 +381,9 @@ static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i)
return folio_mark_dirty(folio);
}
-static inline nid_t get_nid(struct page *p, int off, bool i)
+static inline nid_t get_nid(const struct folio *folio, int off, bool i)
{
- struct f2fs_node *rn = F2FS_NODE(p);
+ struct f2fs_node *rn = F2FS_NODE(folio);
if (i)
return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
@@ -396,19 +397,19 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold data pages in page cache
*/
-static inline int is_node(const struct page *page, int type)
+static inline int is_node(const struct folio *folio, int type)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
return le32_to_cpu(rn->footer.flag) & BIT(type);
}
-#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
-#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
-#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
+#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT)
+#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT)
-static inline void set_cold_node(struct page *page, bool is_dir)
+static inline void set_cold_node(const struct folio *folio, bool is_dir)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (is_dir)
@@ -418,9 +419,9 @@ static inline void set_cold_node(struct page *page, bool is_dir)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_mark(struct page *page, int mark, int type)
+static inline void set_mark(struct folio *folio, int mark, int type)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
flag |= BIT(type);
@@ -429,8 +430,8 @@ static inline void set_mark(struct page *page, int mark, int type)
rn->footer.flag = cpu_to_le32(flag);
#ifdef CONFIG_F2FS_CHECK_FS
- f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+ f2fs_inode_chksum_set(F2FS_F_SB(folio), folio);
#endif
}
-#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
-#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
+#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 51ebed4e1521..4cb3a91801b4 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -157,10 +157,10 @@ static int init_recovered_filename(const struct inode *dir,
return 0;
}
-static int recover_dentry(struct inode *inode, struct page *ipage,
+static int recover_dentry(struct inode *inode, struct folio *ifolio,
struct list_head *dir_list)
{
- struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
+ struct f2fs_inode *raw_inode = F2FS_INODE(ifolio);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
struct f2fs_filename fname;
@@ -233,14 +233,14 @@ out:
else
name = raw_inode->i_name;
f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d",
- __func__, ino_of_node(ipage), name,
+ __func__, ino_of_node(ifolio), name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
-static int recover_quota_data(struct inode *inode, struct page *page)
+static int recover_quota_data(struct inode *inode, struct folio *folio)
{
- struct f2fs_inode *raw = F2FS_INODE(page);
+ struct f2fs_inode *raw = F2FS_INODE(folio);
struct iattr attr;
uid_t i_uid = le32_to_cpu(raw->i_uid);
gid_t i_gid = le32_to_cpu(raw->i_gid);
@@ -277,16 +277,16 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri)
clear_inode_flag(inode, FI_DATA_EXIST);
}
-static int recover_inode(struct inode *inode, struct page *page)
+static int recover_inode(struct inode *inode, struct folio *folio)
{
- struct f2fs_inode *raw = F2FS_INODE(page);
+ struct f2fs_inode *raw = F2FS_INODE(folio);
struct f2fs_inode_info *fi = F2FS_I(inode);
char *name;
int err;
inode->i_mode = le16_to_cpu(raw->i_mode);
- err = recover_quota_data(inode, page);
+ err = recover_quota_data(inode, folio);
if (err)
return err;
@@ -333,10 +333,10 @@ static int recover_inode(struct inode *inode, struct page *page)
if (file_enc_name(inode))
name = "<encrypted>";
else
- name = F2FS_INODE(page)->i_name;
+ name = F2FS_INODE(folio)->i_name;
f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x",
- ino_of_node(page), name, raw->i_inline);
+ ino_of_node(folio), name, raw->i_inline);
return 0;
}
@@ -375,7 +375,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr,
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (!is_recoverable_dnode(&folio->page)) {
+ if (!is_recoverable_dnode(folio)) {
f2fs_folio_put(folio, true);
*is_detecting = false;
return 0;
@@ -424,22 +424,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
break;
}
- if (!is_recoverable_dnode(&folio->page)) {
+ if (!is_recoverable_dnode(folio)) {
f2fs_folio_put(folio, true);
break;
}
- if (!is_fsync_dnode(&folio->page))
+ if (!is_fsync_dnode(folio))
goto next;
- entry = get_fsync_inode(head, ino_of_node(&folio->page));
+ entry = get_fsync_inode(head, ino_of_node(folio));
if (!entry) {
bool quota_inode = false;
if (!check_only &&
- IS_INODE(&folio->page) &&
- is_dent_dnode(&folio->page)) {
- err = f2fs_recover_inode_page(sbi, &folio->page);
+ IS_INODE(folio) &&
+ is_dent_dnode(folio)) {
+ err = f2fs_recover_inode_page(sbi, folio);
if (err) {
f2fs_folio_put(folio, true);
break;
@@ -451,7 +451,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry = add_fsync_inode(sbi, head, ino_of_node(&folio->page),
+ entry = add_fsync_inode(sbi, head, ino_of_node(folio),
quota_inode);
if (IS_ERR(entry)) {
err = PTR_ERR(entry);
@@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
}
entry->blkaddr = blkaddr;
- if (IS_INODE(&folio->page) && is_dent_dnode(&folio->page))
+ if (IS_INODE(folio) && is_dent_dnode(folio))
entry->last_dentry = blkaddr;
next:
/* check next segment */
@@ -527,7 +527,7 @@ got_it:
nid = le32_to_cpu(sum.nid);
ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode);
+ max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode);
if (ofs_in_node >= max_addrs) {
f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
ofs_in_node, dn->inode->i_ino, nid, max_addrs);
@@ -552,8 +552,8 @@ got_it:
if (IS_ERR(node_folio))
return PTR_ERR(node_folio);
- offset = ofs_of_node(&node_folio->page);
- ino = ino_of_node(&node_folio->page);
+ offset = ofs_of_node(node_folio);
+ ino = ino_of_node(node_folio);
f2fs_folio_put(node_folio, true);
if (ino != dn->inode->i_ino) {
@@ -624,16 +624,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
{
struct dnode_of_data dn;
struct node_info ni;
- unsigned int start, end;
+ unsigned int start = 0, end = 0, index;
int err = 0, recovered = 0;
/* step 1: recover xattr */
- if (IS_INODE(&folio->page)) {
+ if (IS_INODE(folio)) {
err = f2fs_recover_inline_xattr(inode, folio);
if (err)
goto out;
- } else if (f2fs_has_xattr_block(ofs_of_node(&folio->page))) {
- err = f2fs_recover_xattr_data(inode, &folio->page);
+ } else if (f2fs_has_xattr_block(ofs_of_node(folio))) {
+ err = f2fs_recover_xattr_data(inode, folio);
if (!err)
recovered++;
goto out;
@@ -648,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* step 3: recover data indices */
- start = f2fs_start_bidx_of_node(ofs_of_node(&folio->page), inode);
- end = start + ADDRS_PER_PAGE(&folio->page, inode);
+ start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode);
+ end = start + ADDRS_PER_PAGE(folio, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
retry_dn:
@@ -668,18 +668,18 @@ retry_dn:
if (err)
goto err;
- f2fs_bug_on(sbi, ni.ino != ino_of_node(&folio->page));
+ f2fs_bug_on(sbi, ni.ino != ino_of_node(folio));
- if (ofs_of_node(&dn.node_folio->page) != ofs_of_node(&folio->page)) {
+ if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) {
f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u",
- inode->i_ino, ofs_of_node(&dn.node_folio->page),
- ofs_of_node(&folio->page));
+ inode->i_ino, ofs_of_node(dn.node_folio),
+ ofs_of_node(folio));
err = -EFSCORRUPTED;
f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
goto err;
}
- for (; start < end; start++, dn.ofs_in_node++) {
+ for (index = start; index < end; index++, dn.ofs_in_node++) {
block_t src, dest;
src = f2fs_data_blkaddr(&dn);
@@ -708,9 +708,9 @@ retry_dn:
}
if (!file_keep_isize(inode) &&
- (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT)))
+ (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT)))
f2fs_i_size_write(inode,
- (loff_t)(start + 1) << PAGE_SHIFT);
+ (loff_t)(index + 1) << PAGE_SHIFT);
/*
* dest is reserved block, invalidate src block
@@ -758,16 +758,18 @@ retry_prev:
}
}
- copy_node_footer(&dn.node_folio->page, &folio->page);
- fill_node_footer(&dn.node_folio->page, dn.nid, ni.ino,
- ofs_of_node(&folio->page), false);
+ copy_node_footer(dn.node_folio, folio);
+ fill_node_footer(dn.node_folio, dn.nid, ni.ino,
+ ofs_of_node(folio), false);
folio_mark_dirty(dn.node_folio);
err:
f2fs_put_dnode(&dn);
out:
- f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
- inode->i_ino, file_keep_isize(inode) ? "keep" : "recover",
- recovered, err);
+ f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), "
+ "range (%u, %u), recovered = %d, err = %d",
+ inode->i_ino, nid_of_node(folio),
+ file_keep_isize(inode) ? "keep" : "recover",
+ start, end, recovered, err);
return err;
}
@@ -778,6 +780,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
int err = 0;
block_t blkaddr;
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
+ unsigned int recoverable_dnode = 0;
+ unsigned int fsynced_dnode = 0;
+ unsigned int total_dnode = 0;
+ unsigned int recovered_inode = 0;
+ unsigned int recovered_dentry = 0;
+ unsigned int recovered_dnode = 0;
+
+ f2fs_notice(sbi, "do_recover_data: start to recover dnode");
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -796,38 +806,43 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
break;
}
- if (!is_recoverable_dnode(&folio->page)) {
+ if (!is_recoverable_dnode(folio)) {
f2fs_folio_put(folio, true);
break;
}
+ recoverable_dnode++;
- entry = get_fsync_inode(inode_list, ino_of_node(&folio->page));
+ entry = get_fsync_inode(inode_list, ino_of_node(folio));
if (!entry)
goto next;
+ fsynced_dnode++;
/*
* inode(x) | CP | inode(x) | dnode(F)
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (IS_INODE(&folio->page)) {
- err = recover_inode(entry->inode, &folio->page);
+ if (IS_INODE(folio)) {
+ err = recover_inode(entry->inode, folio);
if (err) {
f2fs_folio_put(folio, true);
break;
}
+ recovered_inode++;
}
if (entry->last_dentry == blkaddr) {
- err = recover_dentry(entry->inode, &folio->page, dir_list);
+ err = recover_dentry(entry->inode, folio, dir_list);
if (err) {
f2fs_folio_put(folio, true);
break;
}
+ recovered_dentry++;
}
err = do_recover_data(sbi, entry->inode, folio);
if (err) {
f2fs_folio_put(folio, true);
break;
}
+ recovered_dnode++;
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
@@ -840,9 +855,15 @@ next:
f2fs_folio_put(folio, true);
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
+ total_dnode++;
}
if (!err)
err = f2fs_allocate_new_segments(sbi);
+
+ f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, "
+ "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d",
+ recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode,
+ recovered_dentry, recovered_dnode, err);
return err;
}
@@ -855,6 +876,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+ f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, "
+ "check_only: %d", check_only);
+
if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
f2fs_info(sbi, "recover fsync data on readonly fs");
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index ae1223ef648f..cc82d42ef14c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
goto next;
}
- blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode),
+ blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode),
len);
index = off;
for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
@@ -455,7 +455,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
} else {
struct f2fs_gc_control gc_control = {
.victim_segno = NULL_SEGNO,
- .init_gc_type = BG_GC,
+ .init_gc_type = f2fs_sb_has_blkzoned(sbi) ?
+ FG_GC : BG_GC,
.no_bg_gc = true,
.should_migrate_blocks = false,
.err_gc_skipped = false,
@@ -772,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
/* need not be added */
- if (IS_CURSEG(sbi, segno))
+ if (is_curseg(sbi, segno))
return;
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
@@ -799,7 +800,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
!valid_blocks) ||
valid_blocks == CAP_BLKS_PER_SEC(sbi));
- if (!IS_CURSEC(sbi, secno))
+ if (!is_cursec(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
}
}
@@ -838,7 +839,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
return;
}
- if (!IS_CURSEC(sbi, secno))
+ if (!is_cursec(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
}
}
@@ -855,7 +856,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned short valid_blocks, ckpt_valid_blocks;
unsigned int usable_blocks;
- if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+ if (segno == NULL_SEGNO || is_curseg(sbi, segno))
return;
usable_blocks = f2fs_usable_blks_in_seg(sbi, segno);
@@ -888,7 +889,7 @@ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
if (get_valid_blocks(sbi, segno, false))
continue;
- if (IS_CURSEG(sbi, segno))
+ if (is_curseg(sbi, segno))
continue;
__locate_dirty_segment(sbi, segno, PRE);
__remove_dirty_segment(sbi, segno, DIRTY);
@@ -2107,7 +2108,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
if (!force) {
if (!f2fs_realtime_discard_enable(sbi) ||
(!se->valid_blocks &&
- !IS_CURSEG(sbi, cpc->trim_start)) ||
+ !is_curseg(sbi, cpc->trim_start)) ||
SM_I(sbi)->dcc_info->nr_discards >=
SM_I(sbi)->dcc_info->max_discards)
return false;
@@ -2235,7 +2236,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
next:
secno = GET_SEC_FROM_SEG(sbi, start);
start_segno = GET_SEG_FROM_SEC(sbi, secno);
- if (!IS_CURSEC(sbi, secno) &&
+ if (!is_cursec(sbi, secno) &&
!get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
BLKS_PER_SEC(sbi));
@@ -3619,7 +3620,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio)
else
return CURSEG_COLD_DATA;
} else {
- if (IS_DNODE(fio->page) && is_cold_node(fio->page))
+ if (IS_DNODE(fio->folio) && is_cold_node(fio->folio))
return CURSEG_WARM_NODE;
else
return CURSEG_COLD_NODE;
@@ -3665,8 +3666,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
- type = __get_age_segment_type(inode,
- page_folio(fio->page)->index);
+ type = __get_age_segment_type(inode, fio->folio->index);
if (type != NO_CHECK_TYPE)
return type;
@@ -3677,8 +3677,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
} else {
- if (IS_DNODE(fio->page))
- return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
+ if (IS_DNODE(fio->folio))
+ return is_cold_node(fio->folio) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
return CURSEG_COLD_NODE;
}
@@ -3746,7 +3746,7 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
-int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio)
@@ -3850,10 +3850,10 @@ skip_new_segment:
up_write(&sit_i->sentry_lock);
- if (page && IS_NODESEG(curseg->seg_type)) {
- fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+ if (folio && IS_NODESEG(curseg->seg_type)) {
+ fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg));
- f2fs_inode_chksum_set(sbi, page);
+ f2fs_inode_chksum_set(sbi, folio);
}
if (fio) {
@@ -3931,7 +3931,7 @@ static int log_type_to_seg_type(enum log_type type)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
- struct folio *folio = page_folio(fio->page);
+ struct folio *folio = fio->folio;
enum log_type type = __get_segment_type(fio);
int seg_type = log_type_to_seg_type(type);
bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
@@ -3940,15 +3940,21 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
- if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio)) {
if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
folio_end_writeback(folio);
if (f2fs_in_warm_node_list(fio->sbi, folio))
f2fs_del_fsync_node_entry(fio->sbi, folio);
+ f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi,
+ CP_ERROR_FLAG));
goto out;
}
+
+ f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi,
+ fio->new_blkaddr, DATA_GENERIC_ENHANCE));
+
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1);
@@ -3972,7 +3978,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = folio->index,
.new_blkaddr = folio->index,
- .page = folio_page(folio, 0),
+ .folio = folio,
.encrypted_page = NULL,
.in_list = 0,
};
@@ -4100,14 +4106,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (!recover_curseg) {
/* for recovery flow */
- if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) {
if (old_blkaddr == NULL_ADDR)
type = CURSEG_COLD_DATA;
else
type = CURSEG_WARM_DATA;
}
} else {
- if (IS_CURSEG(sbi, segno)) {
+ if (is_curseg(sbi, segno)) {
/* se->type is volatile as SSR allocation */
type = __f2fs_get_curseg(sbi, segno);
f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
@@ -4191,7 +4197,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
/* submit cached LFS IO */
- f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type);
+ f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type);
/* submit cached IPU IO */
f2fs_submit_merged_ipu_write(sbi, NULL, folio);
if (ordered) {
@@ -5143,7 +5149,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))
continue;
- if (IS_CURSEC(sbi, secno))
+ if (is_cursec(sbi, secno))
continue;
set_bit(secno, dirty_i->dirty_secmap);
}
@@ -5279,7 +5285,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* Get # of valid block of the zone.
*/
valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
- if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
zone_segno, valid_block_cnt,
blk_zone_cond_str(zone->cond));
@@ -5806,9 +5812,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
- kvfree(sit_i->sit_bitmap);
+ kfree(sit_i->sit_bitmap);
#ifdef CONFIG_F2FS_CHECK_FS
- kvfree(sit_i->sit_bitmap_mir);
+ kfree(sit_i->sit_bitmap_mir);
kvfree(sit_i->invalid_segmap);
#endif
kfree(sit_i);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index db619fd2f51a..5e2ee5c686b1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -34,34 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
}
-#define IS_CURSEG(sbi, seg) \
- (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno))
-
-#define IS_CURSEC(sbi, secno) \
- (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
- SEGS_PER_SEC(sbi)))
-
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr))
@@ -318,6 +290,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
+static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+ if (segno == CURSEG_I(sbi, i)->segno)
+ return true;
+ }
+ return false;
+}
+
+static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+ if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno))
+ return true;
+ }
+ return false;
+}
+
static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
@@ -509,7 +503,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
free_i->free_segments++;
- if (!inmem && IS_CURSEC(sbi, secno))
+ if (!inmem && is_cursec(sbi, secno))
goto unlock_out;
/* check large section */
@@ -674,8 +668,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
unsigned int data_blocks = 0;
- if (f2fs_lfs_mode(sbi) &&
- unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (f2fs_lfs_mode(sbi)) {
total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
@@ -684,7 +677,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
if (lower_p)
*lower_p = node_secs + dent_secs + data_secs;
if (upper_p)
- *upper_p = node_secs + dent_secs +
+ *upper_p = node_secs + dent_secs + data_secs +
(node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
(data_blocks ? 1 : 0);
if (curseg_p)
@@ -986,7 +979,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
{
- if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+ if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno))
return true;
return false;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bbf1dad6843f..e16c4e2830c2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -27,6 +27,8 @@
#include <linux/part_stat.h>
#include <linux/zstd.h>
#include <linux/lz4.h>
+#include <linux/ctype.h>
+#include <linux/fs_parser.h>
#include "f2fs.h"
#include "node.h"
@@ -125,29 +127,20 @@ enum {
Opt_disable_roll_forward,
Opt_norecovery,
Opt_discard,
- Opt_nodiscard,
Opt_noheap,
Opt_heap,
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_active_logs,
Opt_disable_ext_identify,
Opt_inline_xattr,
- Opt_noinline_xattr,
Opt_inline_xattr_size,
Opt_inline_data,
Opt_inline_dentry,
- Opt_noinline_dentry,
Opt_flush_merge,
- Opt_noflush_merge,
Opt_barrier,
- Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
- Opt_noextent_cache,
- Opt_noinline_data,
Opt_data_flush,
Opt_reserve_root,
Opt_resgid,
@@ -156,21 +149,13 @@ enum {
Opt_fault_injection,
Opt_fault_type,
Opt_lazytime,
- Opt_nolazytime,
Opt_quota,
- Opt_noquota,
Opt_usrquota,
Opt_grpquota,
Opt_prjquota,
Opt_usrjquota,
Opt_grpjquota,
Opt_prjjquota,
- Opt_offusrjquota,
- Opt_offgrpjquota,
- Opt_offprjjquota,
- Opt_jqfmt_vfsold,
- Opt_jqfmt_vfsv0,
- Opt_jqfmt_vfsv1,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -180,107 +165,209 @@ enum {
Opt_checkpoint_disable_cap_perc,
Opt_checkpoint_enable,
Opt_checkpoint_merge,
- Opt_nocheckpoint_merge,
Opt_compress_algorithm,
Opt_compress_log_size,
- Opt_compress_extension,
Opt_nocompress_extension,
+ Opt_compress_extension,
Opt_compress_chksum,
Opt_compress_mode,
Opt_compress_cache,
Opt_atgc,
Opt_gc_merge,
- Opt_nogc_merge,
Opt_discard_unit,
Opt_memory_mode,
Opt_age_extent_cache,
Opt_errors,
Opt_nat_bits,
+ Opt_jqfmt,
+ Opt_checkpoint,
Opt_err,
};
-static match_table_t f2fs_tokens = {
- {Opt_gc_background, "background_gc=%s"},
- {Opt_disable_roll_forward, "disable_roll_forward"},
- {Opt_norecovery, "norecovery"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_noheap, "no_heap"},
- {Opt_heap, "heap"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_active_logs, "active_logs=%u"},
- {Opt_disable_ext_identify, "disable_ext_identify"},
- {Opt_inline_xattr, "inline_xattr"},
- {Opt_noinline_xattr, "noinline_xattr"},
- {Opt_inline_xattr_size, "inline_xattr_size=%u"},
- {Opt_inline_data, "inline_data"},
- {Opt_inline_dentry, "inline_dentry"},
- {Opt_noinline_dentry, "noinline_dentry"},
- {Opt_flush_merge, "flush_merge"},
- {Opt_noflush_merge, "noflush_merge"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_fastboot, "fastboot"},
- {Opt_extent_cache, "extent_cache"},
- {Opt_noextent_cache, "noextent_cache"},
- {Opt_noinline_data, "noinline_data"},
- {Opt_data_flush, "data_flush"},
- {Opt_reserve_root, "reserve_root=%u"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_mode, "mode=%s"},
- {Opt_fault_injection, "fault_injection=%u"},
- {Opt_fault_type, "fault_type=%u"},
- {Opt_lazytime, "lazytime"},
- {Opt_nolazytime, "nolazytime"},
- {Opt_quota, "quota"},
- {Opt_noquota, "noquota"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_prjquota, "prjquota"},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_prjjquota, "prjjquota=%s"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_offprjjquota, "prjjquota="},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_alloc, "alloc_mode=%s"},
- {Opt_fsync, "fsync_mode=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption"},
- {Opt_inlinecrypt, "inlinecrypt"},
- {Opt_checkpoint_disable, "checkpoint=disable"},
- {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
- {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
- {Opt_checkpoint_enable, "checkpoint=enable"},
- {Opt_checkpoint_merge, "checkpoint_merge"},
- {Opt_nocheckpoint_merge, "nocheckpoint_merge"},
- {Opt_compress_algorithm, "compress_algorithm=%s"},
- {Opt_compress_log_size, "compress_log_size=%u"},
- {Opt_compress_extension, "compress_extension=%s"},
- {Opt_nocompress_extension, "nocompress_extension=%s"},
- {Opt_compress_chksum, "compress_chksum"},
- {Opt_compress_mode, "compress_mode=%s"},
- {Opt_compress_cache, "compress_cache"},
- {Opt_atgc, "atgc"},
- {Opt_gc_merge, "gc_merge"},
- {Opt_nogc_merge, "nogc_merge"},
- {Opt_discard_unit, "discard_unit=%s"},
- {Opt_memory_mode, "memory=%s"},
- {Opt_age_extent_cache, "age_extent_cache"},
- {Opt_errors, "errors=%s"},
- {Opt_nat_bits, "nat_bits"},
+static const struct constant_table f2fs_param_background_gc[] = {
+ {"on", BGGC_MODE_ON},
+ {"off", BGGC_MODE_OFF},
+ {"sync", BGGC_MODE_SYNC},
+ {}
+};
+
+static const struct constant_table f2fs_param_mode[] = {
+ {"adaptive", FS_MODE_ADAPTIVE},
+ {"lfs", FS_MODE_LFS},
+ {"fragment:segment", FS_MODE_FRAGMENT_SEG},
+ {"fragment:block", FS_MODE_FRAGMENT_BLK},
+ {}
+};
+
+static const struct constant_table f2fs_param_jqfmt[] = {
+ {"vfsold", QFMT_VFS_OLD},
+ {"vfsv0", QFMT_VFS_V0},
+ {"vfsv1", QFMT_VFS_V1},
+ {}
+};
+
+static const struct constant_table f2fs_param_alloc_mode[] = {
+ {"default", ALLOC_MODE_DEFAULT},
+ {"reuse", ALLOC_MODE_REUSE},
+ {}
+};
+static const struct constant_table f2fs_param_fsync_mode[] = {
+ {"posix", FSYNC_MODE_POSIX},
+ {"strict", FSYNC_MODE_STRICT},
+ {"nobarrier", FSYNC_MODE_NOBARRIER},
+ {}
+};
+
+static const struct constant_table f2fs_param_compress_mode[] = {
+ {"fs", COMPR_MODE_FS},
+ {"user", COMPR_MODE_USER},
+ {}
+};
+
+static const struct constant_table f2fs_param_discard_unit[] = {
+ {"block", DISCARD_UNIT_BLOCK},
+ {"segment", DISCARD_UNIT_SEGMENT},
+ {"section", DISCARD_UNIT_SECTION},
+ {}
+};
+
+static const struct constant_table f2fs_param_memory_mode[] = {
+ {"normal", MEMORY_MODE_NORMAL},
+ {"low", MEMORY_MODE_LOW},
+ {}
+};
+
+static const struct constant_table f2fs_param_errors[] = {
+ {"remount-ro", MOUNT_ERRORS_READONLY},
+ {"continue", MOUNT_ERRORS_CONTINUE},
+ {"panic", MOUNT_ERRORS_PANIC},
+ {}
+};
+
+static const struct fs_parameter_spec f2fs_param_specs[] = {
+ fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc),
+ fsparam_flag("disable_roll_forward", Opt_disable_roll_forward),
+ fsparam_flag("norecovery", Opt_norecovery),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_flag("no_heap", Opt_noheap),
+ fsparam_flag("heap", Opt_heap),
+ fsparam_flag_no("user_xattr", Opt_user_xattr),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_s32("active_logs", Opt_active_logs),
+ fsparam_flag("disable_ext_identify", Opt_disable_ext_identify),
+ fsparam_flag_no("inline_xattr", Opt_inline_xattr),
+ fsparam_s32("inline_xattr_size", Opt_inline_xattr_size),
+ fsparam_flag_no("inline_data", Opt_inline_data),
+ fsparam_flag_no("inline_dentry", Opt_inline_dentry),
+ fsparam_flag_no("flush_merge", Opt_flush_merge),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("fastboot", Opt_fastboot),
+ fsparam_flag_no("extent_cache", Opt_extent_cache),
+ fsparam_flag("data_flush", Opt_data_flush),
+ fsparam_u32("reserve_root", Opt_reserve_root),
+ fsparam_gid("resgid", Opt_resgid),
+ fsparam_uid("resuid", Opt_resuid),
+ fsparam_enum("mode", Opt_mode, f2fs_param_mode),
+ fsparam_s32("fault_injection", Opt_fault_injection),
+ fsparam_u32("fault_type", Opt_fault_type),
+ fsparam_flag_no("lazytime", Opt_lazytime),
+ fsparam_flag_no("quota", Opt_quota),
+ fsparam_flag("usrquota", Opt_usrquota),
+ fsparam_flag("grpquota", Opt_grpquota),
+ fsparam_flag("prjquota", Opt_prjquota),
+ fsparam_string_empty("usrjquota", Opt_usrjquota),
+ fsparam_string_empty("grpjquota", Opt_grpjquota),
+ fsparam_string_empty("prjjquota", Opt_prjjquota),
+ fsparam_flag("nat_bits", Opt_nat_bits),
+ fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt),
+ fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode),
+ fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode),
+ fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_flag("inlinecrypt", Opt_inlinecrypt),
+ fsparam_string("checkpoint", Opt_checkpoint),
+ fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge),
+ fsparam_string("compress_algorithm", Opt_compress_algorithm),
+ fsparam_u32("compress_log_size", Opt_compress_log_size),
+ fsparam_string("compress_extension", Opt_compress_extension),
+ fsparam_string("nocompress_extension", Opt_nocompress_extension),
+ fsparam_flag("compress_chksum", Opt_compress_chksum),
+ fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode),
+ fsparam_flag("compress_cache", Opt_compress_cache),
+ fsparam_flag("atgc", Opt_atgc),
+ fsparam_flag_no("gc_merge", Opt_gc_merge),
+ fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit),
+ fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode),
+ fsparam_flag("age_extent_cache", Opt_age_extent_cache),
+ fsparam_enum("errors", Opt_errors, f2fs_param_errors),
+ {}
+};
+
+/* Resort to a match_table for this interestingly formatted option */
+static match_table_t f2fs_checkpoint_tokens = {
+ {Opt_checkpoint_disable, "disable"},
+ {Opt_checkpoint_disable_cap, "disable:%u"},
+ {Opt_checkpoint_disable_cap_perc, "disable:%u%%"},
+ {Opt_checkpoint_enable, "enable"},
{Opt_err, NULL},
};
+#define F2FS_SPEC_background_gc (1 << 0)
+#define F2FS_SPEC_inline_xattr_size (1 << 1)
+#define F2FS_SPEC_active_logs (1 << 2)
+#define F2FS_SPEC_reserve_root (1 << 3)
+#define F2FS_SPEC_resgid (1 << 4)
+#define F2FS_SPEC_resuid (1 << 5)
+#define F2FS_SPEC_mode (1 << 6)
+#define F2FS_SPEC_fault_injection (1 << 7)
+#define F2FS_SPEC_fault_type (1 << 8)
+#define F2FS_SPEC_jqfmt (1 << 9)
+#define F2FS_SPEC_alloc_mode (1 << 10)
+#define F2FS_SPEC_fsync_mode (1 << 11)
+#define F2FS_SPEC_checkpoint_disable_cap (1 << 12)
+#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13)
+#define F2FS_SPEC_compress_level (1 << 14)
+#define F2FS_SPEC_compress_algorithm (1 << 15)
+#define F2FS_SPEC_compress_log_size (1 << 16)
+#define F2FS_SPEC_compress_extension (1 << 17)
+#define F2FS_SPEC_nocompress_extension (1 << 18)
+#define F2FS_SPEC_compress_chksum (1 << 19)
+#define F2FS_SPEC_compress_mode (1 << 20)
+#define F2FS_SPEC_discard_unit (1 << 21)
+#define F2FS_SPEC_memory_mode (1 << 22)
+#define F2FS_SPEC_errors (1 << 23)
+
+struct f2fs_fs_context {
+ struct f2fs_mount_info info;
+ unsigned int opt_mask; /* Bits changed */
+ unsigned int spec_mask;
+ unsigned short qname_mask;
+};
+
+#define F2FS_CTX_INFO(ctx) ((ctx)->info)
+
+static inline void ctx_set_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ ctx->info.opt |= flag;
+ ctx->opt_mask |= flag;
+}
+
+static inline void ctx_clear_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ ctx->info.opt &= ~flag;
+ ctx->opt_mask |= flag;
+}
+
+static inline bool ctx_test_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ return ctx->info.opt & flag;
+}
+
void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
- const char *fmt, ...)
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -292,11 +379,19 @@ void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
if (limit_rate)
- printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (sbi)
+ printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk_ratelimited("%c%cF2FS-fs: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
else
- printk("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (sbi)
+ printk("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk("%c%cF2FS-fs: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
@@ -390,159 +485,90 @@ static void init_once(void *foo)
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
-static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype,
- substring_t *args)
+/*
+ * Note the name of the specified quota file.
+ */
+static int f2fs_note_qf_name(struct fs_context *fc, int qtype,
+ struct fs_parameter *param)
{
- struct super_block *sb = sbi->sb;
+ struct f2fs_fs_context *ctx = fc->fs_private;
char *qname;
- int ret = -EINVAL;
- if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) {
- f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
+ if (param->size < 1) {
+ f2fs_err(NULL, "Missing quota name");
return -EINVAL;
}
- if (f2fs_sb_has_quota_ino(sbi)) {
- f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name");
+ if (strchr(param->string, '/')) {
+ f2fs_err(NULL, "quotafile must be on filesystem root");
+ return -EINVAL;
+ }
+ if (ctx->info.s_qf_names[qtype]) {
+ if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) {
+ f2fs_err(NULL, "Quota file already specified");
+ return -EINVAL;
+ }
return 0;
}
- qname = match_strdup(args);
+ qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
if (!qname) {
- f2fs_err(sbi, "Not enough memory for storing quotafile name");
+ f2fs_err(NULL, "Not enough memory for storing quotafile name");
return -ENOMEM;
}
- if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
- if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
- ret = 0;
- else
- f2fs_err(sbi, "%s quota file already specified",
- QTYPE2NAME(qtype));
- goto errout;
- }
- if (strchr(qname, '/')) {
- f2fs_err(sbi, "quotafile must be on filesystem root");
- goto errout;
- }
- F2FS_OPTION(sbi).s_qf_names[qtype] = qname;
- set_opt(sbi, QUOTA);
+ F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname;
+ ctx->qname_mask |= 1 << qtype;
return 0;
-errout:
- kfree(qname);
- return ret;
}
-static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype)
+/*
+ * Clear the name of the specified quota file.
+ */
+static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype)
{
- struct super_block *sb = sbi->sb;
+ struct f2fs_fs_context *ctx = fc->fs_private;
- if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
- f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
- return -EINVAL;
- }
- kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
- F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
+ kfree(ctx->info.s_qf_names[qtype]);
+ ctx->info.s_qf_names[qtype] = NULL;
+ ctx->qname_mask |= 1 << qtype;
return 0;
}
-static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
+static void f2fs_unnote_qf_name_all(struct fs_context *fc)
{
- /*
- * We do the test below only for project quotas. 'usrquota' and
- * 'grpquota' mount options are allowed even without quota feature
- * to support legacy quotas in quota files.
- */
- if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) {
- f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement.");
- return -1;
- }
- if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
- F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
- F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) {
- if (test_opt(sbi, USRQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
- clear_opt(sbi, USRQUOTA);
-
- if (test_opt(sbi, GRPQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
- clear_opt(sbi, GRPQUOTA);
-
- if (test_opt(sbi, PRJQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
- clear_opt(sbi, PRJQUOTA);
-
- if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
- test_opt(sbi, PRJQUOTA)) {
- f2fs_err(sbi, "old and new quota format mixing");
- return -1;
- }
-
- if (!F2FS_OPTION(sbi).s_jquota_fmt) {
- f2fs_err(sbi, "journaled quota format not specified");
- return -1;
- }
- }
+ int i;
- if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) {
- f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt");
- F2FS_OPTION(sbi).s_jquota_fmt = 0;
- }
- return 0;
+ for (i = 0; i < MAXQUOTAS; i++)
+ f2fs_unnote_qf_name(fc, i);
}
#endif
-static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi,
- const char *opt,
- const substring_t *arg,
- bool is_remount)
+static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct f2fs_fs_context *ctx)
{
- struct fs_parameter param = {
- .type = fs_value_is_string,
- .string = arg->from ? arg->from : "",
- };
- struct fscrypt_dummy_policy *policy =
- &F2FS_OPTION(sbi).dummy_enc_policy;
int err;
if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
- f2fs_warn(sbi, "test_dummy_encryption option not supported");
+ f2fs_warn(NULL, "test_dummy_encryption option not supported");
return -EINVAL;
}
-
- if (!f2fs_sb_has_encrypt(sbi)) {
- f2fs_err(sbi, "Encrypt feature is off");
- return -EINVAL;
- }
-
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if (is_remount && !fscrypt_is_dummy_policy_set(policy)) {
- f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
- return -EINVAL;
- }
-
- err = fscrypt_parse_test_dummy_encryption(&param, policy);
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->info.dummy_enc_policy);
if (err) {
- if (err == -EEXIST)
- f2fs_warn(sbi,
- "Can't change test_dummy_encryption on remount");
- else if (err == -EINVAL)
- f2fs_warn(sbi, "Value of option \"%s\" is unrecognized",
- opt);
+ if (err == -EINVAL)
+ f2fs_warn(NULL, "Value of option \"%s\" is unrecognized",
+ param->key);
+ else if (err == -EEXIST)
+ f2fs_warn(NULL, "Conflicting test_dummy_encryption options");
else
- f2fs_warn(sbi, "Error processing option \"%s\" [%d]",
- opt, err);
+ f2fs_warn(NULL, "Error processing option \"%s\" [%d]",
+ param->key, err);
return -EINVAL;
}
- f2fs_warn(sbi, "Test dummy encryption mode enabled");
return 0;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
-static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
+static bool is_compress_extension_exist(struct f2fs_mount_info *info,
const char *new_ext, bool is_ext)
{
unsigned char (*ext)[F2FS_EXTENSION_LEN];
@@ -550,11 +576,11 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
int i;
if (is_ext) {
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ ext = info->extensions;
+ ext_cnt = info->compress_ext_cnt;
} else {
- ext = F2FS_OPTION(sbi).noextensions;
- ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ ext = info->noextensions;
+ ext_cnt = info->nocompress_ext_cnt;
}
for (i = 0; i < ext_cnt; i++) {
@@ -572,28 +598,28 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
* extension will be treated as special cases and will not be compressed.
* 3. Don't allow the non-compress extension specifies all files.
*/
-static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi)
+static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN],
+ int noext_cnt,
+ unsigned char (*ext)[F2FS_EXTENSION_LEN],
+ int ext_cnt)
{
- unsigned char (*ext)[F2FS_EXTENSION_LEN];
- unsigned char (*noext)[F2FS_EXTENSION_LEN];
- int ext_cnt, noext_cnt, index = 0, no_index = 0;
-
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
- noext = F2FS_OPTION(sbi).noextensions;
- noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ int index = 0, no_index = 0;
if (!noext_cnt)
return 0;
for (no_index = 0; no_index < noext_cnt; no_index++) {
+ if (strlen(noext[no_index]) == 0)
+ continue;
if (!strcasecmp("*", noext[no_index])) {
- f2fs_info(sbi, "Don't allow the nocompress extension specifies all files");
+ f2fs_info(NULL, "Don't allow the nocompress extension specifies all files");
return -EINVAL;
}
for (index = 0; index < ext_cnt; index++) {
+ if (strlen(ext[index]) == 0)
+ continue;
if (!strcasecmp(ext[index], noext[no_index])) {
- f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension",
+ f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension",
ext[index]);
return -EINVAL;
}
@@ -603,58 +629,62 @@ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_F2FS_FS_LZ4
-static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str)
{
#ifdef CONFIG_F2FS_FS_LZ4HC
unsigned int level;
if (strlen(str) == 3) {
- F2FS_OPTION(sbi).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
str += 3;
if (str[0] != ':') {
- f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
if (kstrtouint(str + 1, 10, &level))
return -EINVAL;
if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) {
- f2fs_info(sbi, "invalid lz4hc compress level: %d", level);
+ f2fs_info(NULL, "invalid lz4hc compress level: %d", level);
return -EINVAL;
}
- F2FS_OPTION(sbi).compress_level = level;
+ F2FS_CTX_INFO(ctx).compress_level = level;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
#else
if (strlen(str) == 3) {
- F2FS_OPTION(sbi).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
- f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+ f2fs_info(NULL, "kernel doesn't support lz4hc compression");
return -EINVAL;
#endif
}
#endif
#ifdef CONFIG_F2FS_FS_ZSTD
-static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str)
{
int level;
int len = 4;
if (strlen(str) == len) {
- F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
str += len;
if (str[0] != ':') {
- f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
if (kstrtoint(str + 1, 10, &level))
@@ -662,685 +692,750 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
/* f2fs does not support negative compress level now */
if (level < 0) {
- f2fs_info(sbi, "do not support negative compress level: %d", level);
+ f2fs_info(NULL, "do not support negative compress level: %d", level);
return -ERANGE;
}
if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) {
- f2fs_info(sbi, "invalid zstd compress level: %d", level);
+ f2fs_info(NULL, "invalid zstd compress level: %d", level);
return -EINVAL;
}
- F2FS_OPTION(sbi).compress_level = level;
+ F2FS_CTX_INFO(ctx).compress_level = level;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
#endif
#endif
-static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount)
+static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- substring_t args[MAX_OPT_ARGS];
+ struct f2fs_fs_context *ctx = fc->fs_private;
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned char (*ext)[F2FS_EXTENSION_LEN];
unsigned char (*noext)[F2FS_EXTENSION_LEN];
int ext_cnt, noext_cnt;
+ char *name;
#endif
- char *p, *name;
- int arg = 0;
- kuid_t uid;
- kgid_t gid;
- int ret;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+ substring_t args[MAX_OPT_ARGS];
+ struct fs_parse_result result;
+ int token, ret, arg;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
+ token = fs_parse(fc, f2fs_param_specs, param, &result);
+ if (token < 0)
+ return token;
- switch (token) {
- case Opt_gc_background:
- name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "on")) {
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- } else if (!strcmp(name, "off")) {
- if (f2fs_sb_has_blkzoned(sbi)) {
- f2fs_warn(sbi, "zoned devices need bggc");
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
- } else if (!strcmp(name, "sync")) {
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_norecovery:
- /* requires ro mount, checked in f2fs_default_check */
- set_opt(sbi, NORECOVERY);
- break;
- case Opt_discard:
- if (!f2fs_hw_support_discard(sbi)) {
- f2fs_warn(sbi, "device does not support discard");
- break;
- }
- set_opt(sbi, DISCARD);
- break;
- case Opt_nodiscard:
- if (f2fs_hw_should_discard(sbi)) {
- f2fs_warn(sbi, "discard is required for zoned block devices");
- return -EINVAL;
- }
- clear_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- case Opt_heap:
- f2fs_warn(sbi, "heap/no_heap options were deprecated");
- break;
+ switch (token) {
+ case Opt_gc_background:
+ F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_background_gc;
+ break;
+ case Opt_disable_roll_forward:
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_norecovery:
+ /* requires ro mount, checked in f2fs_validate_options */
+ ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY);
+ break;
+ case Opt_discard:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_DISCARD);
+ break;
+ case Opt_noheap:
+ case Opt_heap:
+ f2fs_warn(NULL, "heap/no_heap options were deprecated");
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_user_xattr:
- set_opt(sbi, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
- case Opt_inline_xattr:
- set_opt(sbi, INLINE_XATTR);
- break;
- case Opt_noinline_xattr:
- clear_opt(sbi, INLINE_XATTR);
- break;
- case Opt_inline_xattr_size:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- set_opt(sbi, INLINE_XATTR_SIZE);
- F2FS_OPTION(sbi).inline_xattr_size = arg;
- break;
+ case Opt_user_xattr:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER);
+ break;
+ case Opt_inline_xattr:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR);
+ break;
+ case Opt_inline_xattr_size:
+ if (result.int_32 < MIN_INLINE_XATTR_SIZE ||
+ result.int_32 > MAX_INLINE_XATTR_SIZE) {
+ f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u",
+ (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE);
+ return -EINVAL;
+ }
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE);
+ F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_inline_xattr_size;
+ break;
#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- case Opt_inline_xattr:
- case Opt_noinline_xattr:
- case Opt_inline_xattr_size:
- f2fs_info(sbi, "xattr options not supported");
- break;
+ case Opt_user_xattr:
+ case Opt_inline_xattr:
+ case Opt_inline_xattr_size:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sbi, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
+ case Opt_acl:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL);
+ break;
#else
- case Opt_acl:
- case Opt_noacl:
- f2fs_info(sbi, "acl options not supported");
- break;
+ case Opt_acl:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 &&
- arg != NR_CURSEG_PERSIST_TYPE)
- return -EINVAL;
- F2FS_OPTION(sbi).active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- case Opt_inline_data:
- set_opt(sbi, INLINE_DATA);
- break;
- case Opt_inline_dentry:
- set_opt(sbi, INLINE_DENTRY);
- break;
- case Opt_noinline_dentry:
- clear_opt(sbi, INLINE_DENTRY);
- break;
- case Opt_flush_merge:
- set_opt(sbi, FLUSH_MERGE);
- break;
- case Opt_noflush_merge:
- clear_opt(sbi, FLUSH_MERGE);
- break;
- case Opt_nobarrier:
- set_opt(sbi, NOBARRIER);
- break;
- case Opt_barrier:
- clear_opt(sbi, NOBARRIER);
- break;
- case Opt_fastboot:
- set_opt(sbi, FASTBOOT);
- break;
- case Opt_extent_cache:
- set_opt(sbi, READ_EXTENT_CACHE);
- break;
- case Opt_noextent_cache:
- if (f2fs_sb_has_device_alias(sbi)) {
- f2fs_err(sbi, "device aliasing requires extent cache");
- return -EINVAL;
- }
- clear_opt(sbi, READ_EXTENT_CACHE);
- break;
- case Opt_noinline_data:
- clear_opt(sbi, INLINE_DATA);
- break;
- case Opt_data_flush:
- set_opt(sbi, DATA_FLUSH);
- break;
- case Opt_reserve_root:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (test_opt(sbi, RESERVE_ROOT)) {
- f2fs_info(sbi, "Preserve previous reserve_root=%u",
- F2FS_OPTION(sbi).root_reserved_blocks);
- } else {
- F2FS_OPTION(sbi).root_reserved_blocks = arg;
- set_opt(sbi, RESERVE_ROOT);
- }
- break;
- case Opt_resuid:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), arg);
- if (!uid_valid(uid)) {
- f2fs_err(sbi, "Invalid uid value %d", arg);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).s_resuid = uid;
- break;
- case Opt_resgid:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), arg);
- if (!gid_valid(gid)) {
- f2fs_err(sbi, "Invalid gid value %d", arg);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).s_resgid = gid;
- break;
- case Opt_mode:
- name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "adaptive")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
- } else if (!strcmp(name, "lfs")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
- } else if (!strcmp(name, "fragment:segment")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG;
- } else if (!strcmp(name, "fragment:block")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
+ case Opt_active_logs:
+ if (result.int_32 != 2 && result.int_32 != 4 &&
+ result.int_32 != NR_CURSEG_PERSIST_TYPE)
+ return -EINVAL;
+ ctx->spec_mask |= F2FS_SPEC_active_logs;
+ F2FS_CTX_INFO(ctx).active_logs = result.int_32;
+ break;
+ case Opt_disable_ext_identify:
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY);
+ break;
+ case Opt_inline_data:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA);
+ break;
+ case Opt_inline_dentry:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY);
+ break;
+ case Opt_flush_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE);
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER);
+ else
+ ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER);
+ break;
+ case Opt_fastboot:
+ ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT);
+ break;
+ case Opt_extent_cache:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE);
+ break;
+ case Opt_data_flush:
+ ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH);
+ break;
+ case Opt_reserve_root:
+ ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
+ F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_reserve_root;
+ break;
+ case Opt_resuid:
+ F2FS_CTX_INFO(ctx).s_resuid = result.uid;
+ ctx->spec_mask |= F2FS_SPEC_resuid;
+ break;
+ case Opt_resgid:
+ F2FS_CTX_INFO(ctx).s_resgid = result.gid;
+ ctx->spec_mask |= F2FS_SPEC_resgid;
+ break;
+ case Opt_mode:
+ F2FS_CTX_INFO(ctx).fs_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_mode;
+ break;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- case Opt_fault_injection:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (f2fs_build_fault_attr(sbi, arg, 0, FAULT_RATE))
- return -EINVAL;
- set_opt(sbi, FAULT_INJECTION);
- break;
+ case Opt_fault_injection:
+ F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_fault_injection;
+ ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION);
+ break;
- case Opt_fault_type:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (f2fs_build_fault_attr(sbi, 0, arg, FAULT_TYPE))
- return -EINVAL;
- set_opt(sbi, FAULT_INJECTION);
- break;
+ case Opt_fault_type:
+ if (result.uint_32 > BIT(FAULT_MAX))
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_fault_type;
+ ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION);
+ break;
#else
- case Opt_fault_injection:
- case Opt_fault_type:
- f2fs_info(sbi, "fault injection options not supported");
- break;
+ case Opt_fault_injection:
+ case Opt_fault_type:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
- case Opt_lazytime:
- set_opt(sbi, LAZYTIME);
- break;
- case Opt_nolazytime:
- clear_opt(sbi, LAZYTIME);
- break;
+ case Opt_lazytime:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME);
+ break;
#ifdef CONFIG_QUOTA
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sbi, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sbi, GRPQUOTA);
- break;
- case Opt_prjquota:
- set_opt(sbi, PRJQUOTA);
- break;
- case Opt_usrjquota:
- ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_grpjquota:
- ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_prjjquota:
- ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_offusrjquota:
- ret = f2fs_clear_qf_name(sbi, USRQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_offgrpjquota:
- ret = f2fs_clear_qf_name(sbi, GRPQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_offprjjquota:
- ret = f2fs_clear_qf_name(sbi, PRJQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_jqfmt_vfsold:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD;
- break;
- case Opt_jqfmt_vfsv0:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0;
- break;
- case Opt_jqfmt_vfsv1:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1;
- break;
- case Opt_noquota:
- clear_opt(sbi, QUOTA);
- clear_opt(sbi, USRQUOTA);
- clear_opt(sbi, GRPQUOTA);
- clear_opt(sbi, PRJQUOTA);
- break;
+ case Opt_quota:
+ if (result.negated) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ } else
+ ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ break;
+ case Opt_usrquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ break;
+ case Opt_grpquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ break;
+ case Opt_prjquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ break;
+ case Opt_usrjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, USRQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, USRQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_grpjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, GRPQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, GRPQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_prjjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, PRJQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, PRJQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_jqfmt:
+ F2FS_CTX_INFO(ctx).s_jquota_fmt = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_jqfmt;
+ break;
#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_prjquota:
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_prjjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_offprjjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- case Opt_jqfmt_vfsv1:
- case Opt_noquota:
- f2fs_info(sbi, "quota operations not supported");
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_prjquota:
+ case Opt_usrjquota:
+ case Opt_grpjquota:
+ case Opt_prjjquota:
+ f2fs_info(NULL, "quota operations not supported");
+ break;
#endif
- case Opt_alloc:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
-
- if (!strcmp(name, "default")) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
- } else if (!strcmp(name, "reuse")) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_fsync:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "posix")) {
- F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
- } else if (!strcmp(name, "strict")) {
- F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT;
- } else if (!strcmp(name, "nobarrier")) {
- F2FS_OPTION(sbi).fsync_mode =
- FSYNC_MODE_NOBARRIER;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_test_dummy_encryption:
- ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0],
- is_remount);
- if (ret)
- return ret;
- break;
- case Opt_inlinecrypt:
+ case Opt_alloc:
+ F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_alloc_mode;
+ break;
+ case Opt_fsync:
+ F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_fsync_mode;
+ break;
+ case Opt_test_dummy_encryption:
+ ret = f2fs_parse_test_dummy_encryption(param, ctx);
+ if (ret)
+ return ret;
+ break;
+ case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- set_opt(sbi, INLINECRYPT);
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT);
#else
- f2fs_info(sbi, "inline encryption not supported");
+ f2fs_info(NULL, "inline encryption not supported");
#endif
- break;
+ break;
+ case Opt_checkpoint:
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].from = args[0].to = NULL;
+ arg = 0;
+
+ /* revert to match_table for checkpoint= options */
+ token = match_token(param->string, f2fs_checkpoint_tokens, args);
+ switch (token) {
case Opt_checkpoint_disable_cap_perc:
if (args->from && match_int(args, &arg))
return -EINVAL;
if (arg < 0 || arg > 100)
return -EINVAL;
- F2FS_OPTION(sbi).unusable_cap_perc = arg;
- set_opt(sbi, DISABLE_CHECKPOINT);
+ F2FS_CTX_INFO(ctx).unusable_cap_perc = arg;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc;
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable_cap:
if (args->from && match_int(args, &arg))
return -EINVAL;
- F2FS_OPTION(sbi).unusable_cap = arg;
- set_opt(sbi, DISABLE_CHECKPOINT);
+ F2FS_CTX_INFO(ctx).unusable_cap = arg;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap;
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable:
- set_opt(sbi, DISABLE_CHECKPOINT);
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_enable:
- clear_opt(sbi, DISABLE_CHECKPOINT);
- break;
- case Opt_checkpoint_merge:
- set_opt(sbi, MERGE_CHECKPOINT);
- break;
- case Opt_nocheckpoint_merge:
- clear_opt(sbi, MERGE_CHECKPOINT);
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case Opt_checkpoint_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT);
+ break;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- case Opt_compress_algorithm:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "lzo")) {
+ case Opt_compress_algorithm:
+ name = param->string;
+ if (!strcmp(name, "lzo")) {
#ifdef CONFIG_F2FS_FS_LZO
- F2FS_OPTION(sbi).compress_level = 0;
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZO;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lzo compression");
+ f2fs_info(NULL, "kernel doesn't support lzo compression");
#endif
- } else if (!strncmp(name, "lz4", 3)) {
+ } else if (!strncmp(name, "lz4", 3)) {
#ifdef CONFIG_F2FS_FS_LZ4
- ret = f2fs_set_lz4hc_level(sbi, name);
- if (ret) {
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZ4;
+ ret = f2fs_set_lz4hc_level(ctx, name);
+ if (ret)
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lz4 compression");
+ f2fs_info(NULL, "kernel doesn't support lz4 compression");
#endif
- } else if (!strncmp(name, "zstd", 4)) {
+ } else if (!strncmp(name, "zstd", 4)) {
#ifdef CONFIG_F2FS_FS_ZSTD
- ret = f2fs_set_zstd_level(sbi, name);
- if (ret) {
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_ZSTD;
+ ret = f2fs_set_zstd_level(ctx, name);
+ if (ret)
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support zstd compression");
+ f2fs_info(NULL, "kernel doesn't support zstd compression");
#endif
- } else if (!strcmp(name, "lzo-rle")) {
+ } else if (!strcmp(name, "lzo-rle")) {
#ifdef CONFIG_F2FS_FS_LZORLE
- F2FS_OPTION(sbi).compress_level = 0;
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZORLE;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lzorle compression");
+ f2fs_info(NULL, "kernel doesn't support lzorle compression");
#endif
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
+ } else
+ return -EINVAL;
+ break;
+ case Opt_compress_log_size:
+ if (result.uint_32 < MIN_COMPRESS_LOG_SIZE ||
+ result.uint_32 > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_err(NULL,
+ "Compress cluster log size is out of range");
+ return -EINVAL;
+ }
+ F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_compress_log_size;
+ break;
+ case Opt_compress_extension:
+ name = param->string;
+ ext = F2FS_CTX_INFO(ctx).extensions;
+ ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ ext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(NULL, "invalid extension length/number");
+ return -EINVAL;
+ }
+
+ if (is_compress_extension_exist(&ctx->info, name, true))
break;
- case Opt_compress_log_size:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg < MIN_COMPRESS_LOG_SIZE ||
- arg > MAX_COMPRESS_LOG_SIZE) {
- f2fs_err(sbi,
- "Compress cluster log size is out of range");
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_log_size = arg;
+
+ ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN);
+ if (ret < 0)
+ return ret;
+ F2FS_CTX_INFO(ctx).compress_ext_cnt++;
+ ctx->spec_mask |= F2FS_SPEC_compress_extension;
+ break;
+ case Opt_nocompress_extension:
+ name = param->string;
+ noext = F2FS_CTX_INFO(ctx).noextensions;
+ noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ noext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(NULL, "invalid extension length/number");
+ return -EINVAL;
+ }
+
+ if (is_compress_extension_exist(&ctx->info, name, false))
break;
- case Opt_compress_extension:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN);
+ if (ret < 0)
+ return ret;
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt++;
+ ctx->spec_mask |= F2FS_SPEC_nocompress_extension;
+ break;
+ case Opt_compress_chksum:
+ F2FS_CTX_INFO(ctx).compress_chksum = true;
+ ctx->spec_mask |= F2FS_SPEC_compress_chksum;
+ break;
+ case Opt_compress_mode:
+ F2FS_CTX_INFO(ctx).compress_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_compress_mode;
+ break;
+ case Opt_compress_cache:
+ ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE);
+ break;
+#else
+ case Opt_compress_algorithm:
+ case Opt_compress_log_size:
+ case Opt_compress_extension:
+ case Opt_nocompress_extension:
+ case Opt_compress_chksum:
+ case Opt_compress_mode:
+ case Opt_compress_cache:
+ f2fs_info(NULL, "compression options not supported");
+ break;
+#endif
+ case Opt_atgc:
+ ctx_set_opt(ctx, F2FS_MOUNT_ATGC);
+ break;
+ case Opt_gc_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE);
+ break;
+ case Opt_discard_unit:
+ F2FS_CTX_INFO(ctx).discard_unit = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_discard_unit;
+ break;
+ case Opt_memory_mode:
+ F2FS_CTX_INFO(ctx).memory_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_memory_mode;
+ break;
+ case Opt_age_extent_cache:
+ ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE);
+ break;
+ case Opt_errors:
+ F2FS_CTX_INFO(ctx).errors = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_errors;
+ break;
+ case Opt_nat_bits:
+ ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS);
+ break;
+ }
+ return 0;
+}
- if (strlen(name) >= F2FS_EXTENSION_LEN ||
- ext_cnt >= COMPRESS_EXT_NUM) {
- f2fs_err(sbi,
- "invalid extension length/number");
- kfree(name);
- return -EINVAL;
- }
+/*
+ * Check quota settings consistency.
+ */
+static int f2fs_check_quota_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ #ifdef CONFIG_QUOTA
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ bool quota_feature = f2fs_sb_has_quota_ino(sbi);
+ bool quota_turnon = sb_any_quota_loaded(sb);
+ char *old_qname, *new_qname;
+ bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota;
+ int i;
- if (is_compress_extension_exist(sbi, name, true)) {
- kfree(name);
- break;
- }
+ /*
+ * We do the test below only for project quotas. 'usrquota' and
+ * 'grpquota' mount options are allowed even without quota feature
+ * to support legacy quotas in quota files.
+ */
+ if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) &&
+ !f2fs_sb_has_project_quota(sbi)) {
+ f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement.");
+ return -EINVAL;
+ }
- ret = strscpy(ext[ext_cnt], name);
- if (ret < 0) {
- kfree(name);
- return ret;
- }
- F2FS_OPTION(sbi).compress_ext_cnt++;
- kfree(name);
- break;
- case Opt_nocompress_extension:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
+ if (ctx->qname_mask) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(ctx->qname_mask & (1 << i)))
+ continue;
- noext = F2FS_OPTION(sbi).noextensions;
- noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ old_qname = F2FS_OPTION(sbi).s_qf_names[i];
+ new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i];
+ if (quota_turnon &&
+ !!old_qname != !!new_qname)
+ goto err_jquota_change;
- if (strlen(name) >= F2FS_EXTENSION_LEN ||
- noext_cnt >= COMPRESS_EXT_NUM) {
- f2fs_err(sbi,
- "invalid extension length/number");
- kfree(name);
- return -EINVAL;
+ if (old_qname) {
+ if (strcmp(old_qname, new_qname) == 0) {
+ ctx->qname_mask &= ~(1 << i);
+ continue;
+ }
+ goto err_jquota_specified;
}
- if (is_compress_extension_exist(sbi, name, false)) {
- kfree(name);
- break;
+ if (quota_feature) {
+ f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name");
+ ctx->qname_mask &= ~(1 << i);
+ kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]);
+ F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL;
}
+ }
+ }
+
+ /* Make sure we don't mix old and new quota format */
+ usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA];
+ grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA];
+ prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA];
+ usrquota = test_opt(sbi, USRQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ grpquota = test_opt(sbi, GRPQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ prjquota = test_opt(sbi, PRJQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+
+ if (usr_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ usrquota = false;
+ }
+ if (grp_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ grpquota = false;
+ }
+ if (prj_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ prjquota = false;
+ }
+ if (usr_qf_name || grp_qf_name || prj_qf_name) {
+ if (grpquota || usrquota || prjquota) {
+ f2fs_err(sbi, "old and new quota format mixing");
+ return -EINVAL;
+ }
+ if (!(ctx->spec_mask & F2FS_SPEC_jqfmt ||
+ F2FS_OPTION(sbi).s_jquota_fmt)) {
+ f2fs_err(sbi, "journaled quota format not specified");
+ return -EINVAL;
+ }
+ }
+ return 0;
+
+err_jquota_change:
+ f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
+ return -EINVAL;
+err_jquota_specified:
+ f2fs_err(sbi, "%s quota file already specified",
+ QTYPE2NAME(i));
+ return -EINVAL;
- ret = strscpy(noext[noext_cnt], name);
- if (ret < 0) {
- kfree(name);
- return ret;
- }
- F2FS_OPTION(sbi).nocompress_ext_cnt++;
- kfree(name);
- break;
- case Opt_compress_chksum:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- F2FS_OPTION(sbi).compress_chksum = true;
- break;
- case Opt_compress_mode:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "fs")) {
- F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
- } else if (!strcmp(name, "user")) {
- F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_compress_cache:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- set_opt(sbi, COMPRESS_CACHE);
- break;
#else
- case Opt_compress_algorithm:
- case Opt_compress_log_size:
- case Opt_compress_extension:
- case Opt_nocompress_extension:
- case Opt_compress_chksum:
- case Opt_compress_mode:
- case Opt_compress_cache:
- f2fs_info(sbi, "compression options not supported");
- break;
+ if (f2fs_readonly(sbi->sb))
+ return 0;
+ if (f2fs_sb_has_quota_ino(sbi)) {
+ f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+ if (f2fs_sb_has_project_quota(sbi)) {
+ f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+
+ return 0;
#endif
- case Opt_atgc:
- set_opt(sbi, ATGC);
- break;
- case Opt_gc_merge:
- set_opt(sbi, GC_MERGE);
- break;
- case Opt_nogc_merge:
- clear_opt(sbi, GC_MERGE);
- break;
- case Opt_discard_unit:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "block")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_BLOCK;
- } else if (!strcmp(name, "segment")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SEGMENT;
- } else if (!strcmp(name, "section")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SECTION;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_memory_mode:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "normal")) {
- F2FS_OPTION(sbi).memory_mode =
- MEMORY_MODE_NORMAL;
- } else if (!strcmp(name, "low")) {
- F2FS_OPTION(sbi).memory_mode =
- MEMORY_MODE_LOW;
- } else {
- kfree(name);
- return -EINVAL;
+}
+
+static int f2fs_check_test_dummy_encryption(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy))
+ return 0;
+
+ if (!f2fs_sb_has_encrypt(sbi)) {
+ f2fs_err(sbi, "Encrypt feature is off");
+ return -EINVAL;
+ }
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy,
+ &F2FS_CTX_INFO(ctx).dummy_enc_policy))
+ return 0;
+ f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static inline bool test_compression_spec(unsigned int mask)
+{
+ return mask & (F2FS_SPEC_compress_algorithm
+ | F2FS_SPEC_compress_log_size
+ | F2FS_SPEC_compress_extension
+ | F2FS_SPEC_nocompress_extension
+ | F2FS_SPEC_compress_chksum
+ | F2FS_SPEC_compress_mode);
+}
+
+static inline void clear_compression_spec(struct f2fs_fs_context *ctx)
+{
+ ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm
+ | F2FS_SPEC_compress_log_size
+ | F2FS_SPEC_compress_extension
+ | F2FS_SPEC_nocompress_extension
+ | F2FS_SPEC_compress_chksum
+ | F2FS_SPEC_compress_mode);
+}
+
+static int f2fs_check_compression(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i, cnt;
+
+ if (!f2fs_sb_has_compression(sbi)) {
+ if (test_compression_spec(ctx->spec_mask) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE))
+ f2fs_info(sbi, "Image doesn't support compression");
+ clear_compression_spec(ctx);
+ ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE;
+ return 0;
+ }
+ if (ctx->spec_mask & F2FS_SPEC_compress_extension) {
+ cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+ for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) {
+ if (is_compress_extension_exist(&F2FS_OPTION(sbi),
+ F2FS_CTX_INFO(ctx).extensions[i], true)) {
+ F2FS_CTX_INFO(ctx).extensions[i][0] = '\0';
+ cnt--;
}
- kfree(name);
- break;
- case Opt_age_extent_cache:
- set_opt(sbi, AGE_EXTENT_CACHE);
- break;
- case Opt_errors:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "remount-ro")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_READONLY;
- } else if (!strcmp(name, "continue")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_CONTINUE;
- } else if (!strcmp(name, "panic")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_PANIC;
- } else {
- kfree(name);
- return -EINVAL;
+ }
+ if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) {
+ f2fs_err(sbi, "invalid extension length/number");
+ return -EINVAL;
+ }
+ }
+ if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) {
+ cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+ for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) {
+ if (is_compress_extension_exist(&F2FS_OPTION(sbi),
+ F2FS_CTX_INFO(ctx).noextensions[i], false)) {
+ F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0';
+ cnt--;
}
- kfree(name);
- break;
- case Opt_nat_bits:
- set_opt(sbi, NAT_BITS);
- break;
- default:
- f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
- p);
+ }
+ if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) {
+ f2fs_err(sbi, "invalid noextension length/number");
return -EINVAL;
}
}
+
+ if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions,
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt,
+ F2FS_CTX_INFO(ctx).extensions,
+ F2FS_CTX_INFO(ctx).compress_ext_cnt)) {
+ f2fs_err(sbi, "new noextensions conflicts with new extensions");
+ return -EINVAL;
+ }
+ if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions,
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt,
+ F2FS_OPTION(sbi).extensions,
+ F2FS_OPTION(sbi).compress_ext_cnt)) {
+ f2fs_err(sbi, "new noextensions conflicts with old extensions");
+ return -EINVAL;
+ }
+ if (f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions,
+ F2FS_OPTION(sbi).nocompress_ext_cnt,
+ F2FS_CTX_INFO(ctx).extensions,
+ F2FS_CTX_INFO(ctx).compress_ext_cnt)) {
+ f2fs_err(sbi, "new extensions conflicts with old noextensions");
+ return -EINVAL;
+ }
+#endif
return 0;
}
-static int f2fs_default_check(struct f2fs_sb_info *sbi)
+static int f2fs_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb)
{
-#ifdef CONFIG_QUOTA
- if (f2fs_check_quota_options(sbi))
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err;
+
+ if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb))
return -EINVAL;
-#else
- if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) {
- f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+
+ if (f2fs_hw_should_discard(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
+ f2fs_warn(sbi, "discard is required for zoned block devices");
return -EINVAL;
}
- if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) {
- f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+
+ if (!f2fs_hw_support_discard(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
+ f2fs_warn(sbi, "device does not support discard");
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
+ ctx->opt_mask &= ~F2FS_MOUNT_DISCARD;
+ }
+
+ if (f2fs_sb_has_device_alias(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) &&
+ !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) {
+ f2fs_err(sbi, "device aliasing requires extent cache");
return -EINVAL;
}
-#endif
+
+ if (test_opt(sbi, RESERVE_ROOT) &&
+ (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) {
+ f2fs_info(sbi, "Preserve previous reserve_root=%u",
+ F2FS_OPTION(sbi).root_reserved_blocks);
+ ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
+ ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT;
+ }
+
+ err = f2fs_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
+
+ err = f2fs_check_compression(fc, sb);
+ if (err)
+ return err;
+
+ err = f2fs_check_quota_consistency(fc, sb);
+ if (err)
+ return err;
if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
@@ -1354,15 +1449,19 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi)
* devices, but mandatory for host-managed zoned block devices.
*/
if (f2fs_sb_has_blkzoned(sbi)) {
+ if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) {
+ f2fs_warn(sbi, "zoned devices need bggc");
+ return -EINVAL;
+ }
#ifdef CONFIG_BLK_DEV_ZONED
- if (F2FS_OPTION(sbi).discard_unit !=
- DISCARD_UNIT_SECTION) {
+ if ((ctx->spec_mask & F2FS_SPEC_discard_unit) &&
+ F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) {
f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default");
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SECTION;
+ F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION;
}
- if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) {
+ if ((ctx->spec_mask & F2FS_SPEC_mode) &&
+ F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) {
f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature");
return -EINVAL;
}
@@ -1372,43 +1471,25 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi)
#endif
}
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_test_compress_extension(sbi)) {
- f2fs_err(sbi, "invalid compress or nocompress extension");
- return -EINVAL;
- }
-#endif
-
- if (test_opt(sbi, INLINE_XATTR_SIZE)) {
- int min_size, max_size;
-
+ if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) {
if (!f2fs_sb_has_extra_attr(sbi) ||
!f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off");
return -EINVAL;
}
- if (!test_opt(sbi, INLINE_XATTR)) {
+ if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) {
f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option");
return -EINVAL;
}
-
- min_size = MIN_INLINE_XATTR_SIZE;
- max_size = MAX_INLINE_XATTR_SIZE;
-
- if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
- F2FS_OPTION(sbi).inline_xattr_size > max_size) {
- f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d",
- min_size, max_size);
- return -EINVAL;
- }
}
- if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
+ if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) &&
+ F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) {
f2fs_err(sbi, "LFS is not compatible with ATGC");
return -EINVAL;
}
- if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) {
+ if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) {
f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode");
return -EINVAL;
}
@@ -1417,12 +1498,190 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
}
+ return 0;
+}
+
+static void f2fs_apply_quota_options(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ bool quota_feature = f2fs_sb_has_quota_ino(sbi);
+ char *qname;
+ int i;
+
+ if (quota_feature)
+ return;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(ctx->qname_mask & (1 << i)))
+ continue;
+
+ qname = F2FS_CTX_INFO(ctx).s_qf_names[i];
+ if (qname) {
+ qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i],
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_opt(sbi, QUOTA);
+ }
+ F2FS_OPTION(sbi).s_qf_names[i] = qname;
+ }
+
+ if (ctx->spec_mask & F2FS_SPEC_jqfmt)
+ F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt;
+
+ if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) {
+ f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt");
+ F2FS_OPTION(sbi).s_jquota_fmt = 0;
+ }
+#endif
+}
+
+static void f2fs_apply_test_dummy_encryption(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy))
+ return;
+ swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy);
+ f2fs_warn(sbi, "Test dummy encryption mode enabled");
+}
- if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) {
- f2fs_err(sbi, "norecovery requires readonly mount");
+static void f2fs_apply_compression(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN];
+ unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN];
+ int ctx_cnt, sbi_cnt, i;
+
+ if (ctx->spec_mask & F2FS_SPEC_compress_level)
+ F2FS_OPTION(sbi).compress_level =
+ F2FS_CTX_INFO(ctx).compress_level;
+ if (ctx->spec_mask & F2FS_SPEC_compress_algorithm)
+ F2FS_OPTION(sbi).compress_algorithm =
+ F2FS_CTX_INFO(ctx).compress_algorithm;
+ if (ctx->spec_mask & F2FS_SPEC_compress_log_size)
+ F2FS_OPTION(sbi).compress_log_size =
+ F2FS_CTX_INFO(ctx).compress_log_size;
+ if (ctx->spec_mask & F2FS_SPEC_compress_chksum)
+ F2FS_OPTION(sbi).compress_chksum =
+ F2FS_CTX_INFO(ctx).compress_chksum;
+ if (ctx->spec_mask & F2FS_SPEC_compress_mode)
+ F2FS_OPTION(sbi).compress_mode =
+ F2FS_CTX_INFO(ctx).compress_mode;
+ if (ctx->spec_mask & F2FS_SPEC_compress_extension) {
+ ctx_ext = F2FS_CTX_INFO(ctx).extensions;
+ ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+ sbi_ext = F2FS_OPTION(sbi).extensions;
+ sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ for (i = 0; i < ctx_cnt; i++) {
+ if (strlen(ctx_ext[i]) == 0)
+ continue;
+ strscpy(sbi_ext[sbi_cnt], ctx_ext[i]);
+ sbi_cnt++;
+ }
+ F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt;
+ }
+ if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) {
+ ctx_ext = F2FS_CTX_INFO(ctx).noextensions;
+ ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+ sbi_ext = F2FS_OPTION(sbi).noextensions;
+ sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ for (i = 0; i < ctx_cnt; i++) {
+ if (strlen(ctx_ext[i]) == 0)
+ continue;
+ strscpy(sbi_ext[sbi_cnt], ctx_ext[i]);
+ sbi_cnt++;
+ }
+ F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt;
+ }
+#endif
+}
+
+static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ F2FS_OPTION(sbi).opt &= ~ctx->opt_mask;
+ F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt;
+
+ if (ctx->spec_mask & F2FS_SPEC_background_gc)
+ F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode;
+ if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size)
+ F2FS_OPTION(sbi).inline_xattr_size =
+ F2FS_CTX_INFO(ctx).inline_xattr_size;
+ if (ctx->spec_mask & F2FS_SPEC_active_logs)
+ F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs;
+ if (ctx->spec_mask & F2FS_SPEC_reserve_root)
+ F2FS_OPTION(sbi).root_reserved_blocks =
+ F2FS_CTX_INFO(ctx).root_reserved_blocks;
+ if (ctx->spec_mask & F2FS_SPEC_resgid)
+ F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid;
+ if (ctx->spec_mask & F2FS_SPEC_resuid)
+ F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid;
+ if (ctx->spec_mask & F2FS_SPEC_mode)
+ F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (ctx->spec_mask & F2FS_SPEC_fault_injection)
+ (void)f2fs_build_fault_attr(sbi,
+ F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE);
+ if (ctx->spec_mask & F2FS_SPEC_fault_type)
+ (void)f2fs_build_fault_attr(sbi, 0,
+ F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE);
+#endif
+ if (ctx->spec_mask & F2FS_SPEC_alloc_mode)
+ F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode;
+ if (ctx->spec_mask & F2FS_SPEC_fsync_mode)
+ F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode;
+ if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap)
+ F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap;
+ if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc)
+ F2FS_OPTION(sbi).unusable_cap_perc =
+ F2FS_CTX_INFO(ctx).unusable_cap_perc;
+ if (ctx->spec_mask & F2FS_SPEC_discard_unit)
+ F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit;
+ if (ctx->spec_mask & F2FS_SPEC_memory_mode)
+ F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode;
+ if (ctx->spec_mask & F2FS_SPEC_errors)
+ F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors;
+
+ f2fs_apply_compression(fc, sb);
+ f2fs_apply_test_dummy_encryption(fc, sb);
+ f2fs_apply_quota_options(fc, sb);
+}
+
+static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount)
+{
+ if (f2fs_sb_has_device_alias(sbi) &&
+ !test_opt(sbi, READ_EXTENT_CACHE)) {
+ f2fs_err(sbi, "device aliasing requires extent cache");
return -EINVAL;
}
+ if (!remount)
+ return 0;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ return -EINVAL;
+ }
+#endif
+ if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) {
+ f2fs_warn(sbi, "LFS is not compatible with IPU");
+ return -EINVAL;
+ }
return 0;
}
@@ -1442,6 +1701,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
+ atomic_set(&fi->open_count, 0);
init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
@@ -1718,7 +1978,7 @@ static void f2fs_put_super(struct super_block *sb)
destroy_percpu_info(sbi);
f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
- kvfree(sbi->write_io[i]);
+ kfree(sbi->write_io[i]);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
@@ -2329,11 +2589,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_flush_ckpt_thread(sbi);
}
-static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct f2fs_mount_info org_mount_opt;
unsigned long old_sb_flags;
+ unsigned int flags = fc->sb_flags;
int err;
bool need_restart_gc = false, need_stop_gc = false;
bool need_restart_flush = false, need_stop_flush = false;
@@ -2379,7 +2640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
#endif
/* recover superblocks we couldn't write due to previous RO mount */
- if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
err = f2fs_commit_super(sbi, false);
f2fs_info(sbi, "Try to recover all the superblocks, ret: %d",
err);
@@ -2389,23 +2650,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
default_options(sbi, true);
- /* parse mount options */
- err = parse_options(sbi, data, true);
+ err = f2fs_check_opt_consistency(fc, sb);
if (err)
goto restore_opts;
-#ifdef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sbi) &&
- sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
- f2fs_err(sbi,
- "zoned: max open zones %u is too small, need at least %u open zones",
- sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
- err = -EINVAL;
- goto restore_opts;
- }
-#endif
+ f2fs_apply_options(fc, sb);
- err = f2fs_default_check(sbi);
+ err = f2fs_sanity_check_options(sbi, true);
if (err)
goto restore_opts;
@@ -2416,20 +2667,20 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* Previous and new state of filesystem is RO,
* so skip checking GC and FLUSH_MERGE conditions.
*/
- if (f2fs_readonly(sb) && (*flags & SB_RDONLY))
+ if (f2fs_readonly(sb) && (flags & SB_RDONLY))
goto skip;
- if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) {
+ if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) {
err = -EROFS;
goto restore_opts;
}
#ifdef CONFIG_QUOTA
- if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) {
+ if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) {
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
- } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) {
+ } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) {
/* dquot_resume needs RW */
sb->s_flags &= ~SB_RDONLY;
if (sb_any_quota_suspended(sb)) {
@@ -2441,12 +2692,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
- if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) {
- err = -EINVAL;
- f2fs_warn(sbi, "LFS is not compatible with IPU");
- goto restore_opts;
- }
-
/* disallow enable atgc dynamically */
if (no_atgc == !!test_opt(sbi, ATGC)) {
err = -EINVAL;
@@ -2485,7 +2730,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
+ if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
goto restore_opts;
@@ -2496,7 +2741,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & SB_RDONLY) ||
+ if ((flags & SB_RDONLY) ||
(F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF &&
!test_opt(sbi, GC_MERGE))) {
if (sbi->gc_thread) {
@@ -2510,7 +2755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY) {
+ if (flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2523,7 +2768,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
*/
- if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+ if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
need_restart_flush = true;
@@ -2565,11 +2810,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* triggered while remount and we need to take care of it before
* returning from remount.
*/
- if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
!test_opt(sbi, MERGE_CHECKPOINT)) {
f2fs_stop_ckpt_thread(sbi);
} else {
- /* Flush if the prevous checkpoint, if exists. */
+ /* Flush if the previous checkpoint, if exists. */
f2fs_flush_ckpt_thread(sbi);
err = f2fs_start_ckpt_thread(sbi);
@@ -2592,7 +2837,7 @@ skip:
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
limit_reserve_root(sbi);
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
+ fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
sbi->umount_lock_holder = NULL;
return 0;
@@ -3263,7 +3508,6 @@ static const struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
- .remount_fs = f2fs_remount,
.shutdown = f2fs_shutdown,
};
@@ -3451,6 +3695,7 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio,
f2fs_bug_on(sbi, 1);
ret = submit_bio_wait(bio);
+ bio_put(bio);
folio_end_writeback(folio);
return ret;
@@ -4522,14 +4767,14 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
sbi->readdir_ra = true;
}
-static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct f2fs_fs_context *ctx = fc->fs_private;
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
struct inode *root;
int err;
bool skip_recovery = false, need_fsck = false;
- char *options = NULL;
int recovery, i, valid_super_block;
struct curseg_info *seg_i;
int retry_cnt = 1;
@@ -4592,18 +4837,14 @@ try_onemore:
sizeof(raw_super->uuid));
default_options(sbi, false);
- /* parse mount options */
- options = kstrdup((const char *)data, GFP_KERNEL);
- if (data && !options) {
- err = -ENOMEM;
- goto free_sb_buf;
- }
- err = parse_options(sbi, options, false);
+ err = f2fs_check_opt_consistency(fc, sb);
if (err)
- goto free_options;
+ goto free_sb_buf;
+
+ f2fs_apply_options(fc, sb);
- err = f2fs_default_check(sbi);
+ err = f2fs_sanity_check_options(sbi, false);
if (err)
goto free_options;
@@ -4770,6 +5011,10 @@ try_onemore:
/* get segno of first zoned block device */
sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi);
+ sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ?
+ ZONED_PIN_SEC_REQUIRED_COUNT :
+ GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi));
+
/* Read accumulated write IO statistics if exists */
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
if (__exist_node_summaries(sbi))
@@ -4930,7 +5175,6 @@ reset_checkpoint:
if (err)
goto sync_free_meta;
}
- kvfree(options);
/* recover broken superblock */
if (recovery) {
@@ -5013,7 +5257,7 @@ free_iostat:
f2fs_destroy_iostat(sbi);
free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
- kvfree(sbi->write_io[i]);
+ kfree(sbi->write_io[i]);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -5024,8 +5268,8 @@ free_options:
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
- kvfree(options);
+ /* no need to free dummy_enc_policy, we just keep it in ctx when failed */
+ swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy);
free_sb_buf:
kfree(raw_super);
free_sbi:
@@ -5041,12 +5285,39 @@ free_sbi:
return err;
}
-static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int f2fs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+ return get_tree_bdev(fc, f2fs_fill_super);
}
+static int f2fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+
+ return __f2fs_remount(fc, sb);
+}
+
+static void f2fs_fc_free(struct fs_context *fc)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+
+ if (!ctx)
+ return;
+
+#ifdef CONFIG_QUOTA
+ f2fs_unnote_qf_name_all(fc);
+#endif
+ fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations f2fs_context_ops = {
+ .parse_param = f2fs_parse_param,
+ .get_tree = f2fs_get_tree,
+ .reconfigure = f2fs_reconfigure,
+ .free = f2fs_fc_free,
+};
+
static void kill_f2fs_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -5088,10 +5359,24 @@ static void kill_f2fs_super(struct super_block *sb)
}
}
+static int f2fs_init_fs_context(struct fs_context *fc)
+{
+ struct f2fs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &f2fs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type f2fs_fs_type = {
.owner = THIS_MODULE,
.name = "f2fs",
- .mount = f2fs_mount,
+ .init_fs_context = f2fs_init_fs_context,
.kill_sb = kill_f2fs_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 75134d69a0bd..f736052dea50 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -628,6 +628,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
@@ -824,6 +845,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "reserved_pin_section")) {
+ if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) {
+ if (t < 1 || t > SEGS_PER_SEC(sbi))
+ return -EINVAL;
+ sbi->gc_thread->boost_gc_multiple = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) {
+ if (t > GC_GREEDY)
+ return -EINVAL;
+ sbi->gc_thread->boost_gc_greedy = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -1050,6 +1092,8 @@ GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent);
GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent);
GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio);
+GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple);
+GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy);
/* SM_INFO ATTR */
SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments);
@@ -1130,6 +1174,7 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
+F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1220,6 +1265,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_no_zoned_gc_percent),
ATTR_LIST(gc_boost_zoned_gc_percent),
ATTR_LIST(gc_valid_thresh_ratio),
+ ATTR_LIST(gc_boost_gc_multiple),
+ ATTR_LIST(gc_boost_gc_greedy),
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
@@ -1323,6 +1370,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(last_age_weight),
ATTR_LIST(max_read_extent_count),
ATTR_LIST(carve_out),
+ ATTR_LIST(reserved_pin_section),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 32048052c64a..5d07c469b571 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -127,6 +127,8 @@
#define __diag_GCC_8(s)
#endif
+#define __diag_GCC_all(s) __diag(s)
+
#define __diag_ignore_all(option, comment) \
__diag(__diag_GCC_ignore option)
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 3be35680a54f..7de229134e30 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -60,27 +60,11 @@ enum execmem_range_flags {
* will trap
* @ptr: pointer to memory to fill
* @size: size of the range to fill
- * @writable: is the memory poited by @ptr is writable or ROX
*
* A hook for architecures to fill execmem ranges with invalid instructions.
* Architectures that use EXECMEM_ROX_CACHE must implement this.
*/
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
-
-/**
- * execmem_make_temp_rw - temporarily remap region with read-write
- * permissions
- * @ptr: address of the region to remap
- * @size: size of the region to remap
- *
- * Remaps a part of the cached large page in the ROX cache in the range
- * [@ptr, @ptr + @size) as writable and not executable. The caller must
- * have exclusive ownership of this range and ensure nothing will try to
- * execute code in this range.
- *
- * Return: 0 on success or negative error code on failure.
- */
-int execmem_make_temp_rw(void *ptr, size_t size);
+void execmem_fill_trapping_insns(void *ptr, size_t size);
/**
* execmem_restore_rox - restore read-only-execute permissions
@@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size);
*/
int execmem_restore_rox(void *ptr, size_t size);
#else
-static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
#endif
@@ -166,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void);
void *execmem_alloc(enum execmem_type type, size_t size);
/**
+ * execmem_alloc_rw - allocate writable executable memory
+ * @type: type of the allocation
+ * @size: how many bytes of memory are required
+ *
+ * Allocates memory that will contain executable code, either generated or
+ * loaded from kernel modules.
+ *
+ * Allocates memory that will contain data coupled with executable code,
+ * like data sections in kernel modules.
+ *
+ * Forces writable permissions on the allocated memory and the caller is
+ * responsible to manage the permissions afterwards.
+ *
+ * For architectures that use ROX cache the permissions will be set to R+W.
+ * For architectures that don't use ROX cache the default permissions for @type
+ * will be used as they must be writable.
+ *
+ * Return: a pointer to the allocated memory or %NULL
+ */
+void *execmem_alloc_rw(enum execmem_type type, size_t size);
+
+/**
* execmem_free - free executable memory
* @ptr: pointer to the memory that should be freed
*/
@@ -186,19 +191,6 @@ struct vm_struct *execmem_vmap(size_t size);
#endif
/**
- * execmem_update_copy - copy an update to executable memory
- * @dst: destination address to update
- * @src: source address containing the data
- * @size: how many bytes of memory shold be copied
- *
- * Copy @size bytes from @src to @dst using text poking if the memory at
- * @dst is read-only.
- *
- * Return: a pointer to @dst or NULL on error
- */
-void *execmem_update_copy(void *dst, const void *src, size_t size);
-
-/**
* execmem_is_rox - check if execmem is read-only
* @type - the execmem type to check
*
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 5206d63b3386..2f8b8bfc0e73 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -268,7 +268,7 @@ struct node_footer {
/* Node IDs in an Indirect Block */
#define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
-#define ADDRS_PER_PAGE(page, inode) (addrs_per_page(inode, IS_INODE(page)))
+#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(folio)))
#define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1)
#define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2)
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 8d0e3ad89b94..10dd161690a2 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -332,12 +332,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
return (struct page *)page_private(bounce_page);
}
-static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+static inline bool fscrypt_is_bounce_folio(const struct folio *folio)
{
return folio->mapping == NULL;
}
-static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+static inline
+struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio)
{
return bounce_folio->private;
}
@@ -517,12 +518,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
return ERR_PTR(-EINVAL);
}
-static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+static inline bool fscrypt_is_bounce_folio(const struct folio *folio)
{
return false;
}
-static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+static inline
+struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio)
{
WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 7376c1df9c90..c16353cc6e3c 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap)
kfree(iomap);
}
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size);
-
#endif /* _LINUX_IO_MAPPING_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 349f0d9aad22..1ae97a0b8ec7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
/* Bits set in the VMA until the stack is in its final location */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 1f4f44951abe..11a078de9150 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w);
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>
+#include <linux/sched/mm.h>
#define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
* reused and attached to a different mm before we lock it.
* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
* detached.
+ *
+ * WARNING! The vma passed to this function cannot be used if the function
+ * fails to lock it because in certain cases RCU lock is dropped and then
+ * reacquired. Once RCU lock is dropped the vma can be concurently freed.
*/
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
struct vm_area_struct *vma)
@@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
}
rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ if (unlikely(vma->vm_mm != mm)) {
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ struct mm_struct *other_mm = vma->vm_mm;
+
+ /*
+ * __mmdrop() is a heavy operation and we don't need RCU
+ * protection here. Release RCU lock during these operations.
+ * We reinstate the RCU read lock as the caller expects it to
+ * be held when this function returns even on error.
+ */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+ rcu_read_lock();
+ return NULL;
+ }
+
/*
* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8e4d6eda8a8d..8d3fa3a91ce4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -837,8 +837,6 @@ void set_page_writeback(struct page *page);
#define folio_start_writeback(folio) \
__folio_start_writeback(folio, false)
-#define folio_start_writeback_keepwrite(folio) \
- __folio_start_writeback(folio, true)
static __always_inline bool folio_test_head(const struct folio *folio)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e3b99920be05..4c035637eeb7 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
}
#endif
+/**
+ * get_and_clear_ptes - Clear present PTEs that map consecutive pages of
+ * the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of get_and_clear_full_ptes() if it is known that we don't
+ * need to clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ return get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
#ifndef clear_full_ptes
/**
* clear_full_ptes - Clear present PTEs that map consecutive pages of the same
@@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
}
#endif
+/**
+ * clear_ptes - Clear present PTEs that map consecutive pages of the same folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of clear_full_ptes() if it is known that we don't need to
+ * clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
/*
* If two threads concurrently fault at the same page, the thread that
* won the race updates the PTE and its local TLB/Cache. The other thread
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 20803fcb49a7..6cd020eea37a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
default:
VM_WARN_ON_ONCE(true);
}
+
+ /*
+ * Anon folios must have an associated live anon_vma as long as they're
+ * mapped into userspace.
+ * Note that the atomic_read() mainly does two things:
+ *
+ * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
+ * check that the associated anon_vma has not yet been freed (subject
+ * to KASAN's usual limitations). This check will pass if the
+ * anon_vma's refcount has already dropped to 0 but an RCU grace
+ * period hasn't passed since then.
+ * 2. If the anon_vma has not yet been freed, it checks that the
+ * anon_vma still has a nonzero refcount (as opposed to being in the
+ * middle of an RCU delay for getting freed).
+ */
+ if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
+ unsigned long mapping = (unsigned long)folio->mapping;
+ struct anon_vma *anon_vma;
+
+ anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
+ VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
+ }
}
/*
diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h
index 876130091384..f06f7b785091 100644
--- a/include/linux/sprintf.h
+++ b/include/linux/sprintf.h
@@ -23,7 +23,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list);
/* These are for specific cases, do not use without real need */
extern bool no_hash_pointers;
-int no_hash_pointers_enable(char *str);
+void hash_pointers_finalize(bool slub_debug);
/* Used for Rust formatting ('%pA') */
char *rust_fmt_argument(char *buf, char *end, const void *ptr);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ce93fd20f82..c4ada32598bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
}
if (mm_pgtables_bytes(mm))
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7f8bb51aedd4..c66b26184936 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1322,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
else
execmem_type = EXECMEM_MODULE_TEXT;
- ptr = execmem_alloc(execmem_type, size);
+ ptr = execmem_alloc_rw(execmem_type, size);
if (!ptr)
return -ENOMEM;
- if (execmem_is_rox(execmem_type)) {
- int err = execmem_make_temp_rw(ptr, size);
-
- if (err) {
- execmem_free(ptr);
- return -ENOMEM;
- }
-
- mod->mem[type].is_rox = true;
- }
+ mod->mem[type].is_rox = execmem_is_rox(execmem_type);
/*
* The pointer to these blocks of memory are stored on the module
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index bbed41ad29cf..ef282001f200 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -64,6 +64,7 @@ struct dev_printk_info;
extern struct printk_ringbuffer *prb;
extern bool printk_kthreads_running;
+extern bool printk_kthreads_ready;
extern bool debug_non_panic_cpus;
__printf(4, 0)
@@ -179,6 +180,7 @@ static inline void nbcon_kthread_wake(struct console *con)
#define PRINTKRB_RECORD_MAX 0
#define printk_kthreads_running (false)
+#define printk_kthreads_ready (false)
/*
* In !PRINTK builds we still export console_sem
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fd12efcc4aed..646801813415 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
/**
* nbcon_context_try_acquire_direct - Try to acquire directly
- * @ctxt: The context of the caller
- * @cur: The current console state
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ * @is_reacquire: This acquire is a reacquire
*
* Acquire the console when it is released. Also acquire the console when
* the current owner has a lower priority and the console is in a safe state.
@@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
*
* Errors:
*
- * -EPERM: A panic is in progress and this is not the panic CPU.
- * Or the current owner or waiter has the same or higher
- * priority. No acquire method can be successful in
- * this case.
+ * -EPERM: A panic is in progress and this is neither the panic
+ * CPU nor is this a reacquire. Or the current owner or
+ * waiter has the same or higher priority. No acquire
+ * method can be successful in these cases.
*
* -EBUSY: The current owner has a lower priority but the console
* in an unsafe state. The caller should try using
* the handover acquire method.
*/
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
- struct nbcon_state *cur)
+ struct nbcon_state *cur, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
do {
/*
- * Panic does not imply that the console is owned. However, it
- * is critical that non-panic CPUs during panic are unable to
- * acquire ownership in order to satisfy the assumptions of
- * nbcon_waiter_matches(). In particular, the assumption that
- * lower priorities are ignored during panic.
+ * Panic does not imply that the console is owned. However,
+ * since all non-panic CPUs are stopped during panic(), it
+ * is safer to have them avoid gaining console ownership.
+ *
+ * If this acquire is a reacquire (and an unsafe takeover
+ * has not previously occurred) then it is allowed to attempt
+ * a direct acquire in panic. This gives console drivers an
+ * opportunity to perform any necessary cleanup if they were
+ * interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic())
+ if (other_cpu_in_panic() &&
+ (!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
+ }
if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
return -EPERM;
@@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #1 implies this context is EMERGENCY.
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
- * Events #4 and #5 are not possible due to the other_cpu_in_panic()
- * check in nbcon_context_try_acquire_direct().
+ * Event #4 occurs when a non-panic CPU reacquires.
+ * Event #5 is not possible due to the other_cpu_in_panic() check
+ * in nbcon_context_try_acquire_handover().
*/
return (cur->req_prio == expected_prio);
@@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
WARN_ON_ONCE(!cur->unsafe);
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * wait for a handover in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
/* Handover is not possible on the same CPU. */
if (cur->cpu == cpu)
return -EBUSY;
@@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs;
/**
* nbcon_context_try_acquire - Try to acquire nbcon console
- * @ctxt: The context of the caller
+ * @ctxt: The context of the caller
+ * @is_reacquire: This acquire is a reacquire
*
* Context: Under @ctxt->con->device_lock() or local_irq_save().
* Return: True if the console was acquired. False otherwise.
@@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs;
* in an unsafe state. Otherwise, on success the caller may assume
* the console is not in an unsafe state.
*/
-static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
nbcon_state_read(con, &cur);
try_again:
- err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
if (err != -EBUSY)
goto out;
@@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
- while (!nbcon_context_try_acquire(ctxt))
+ while (!nbcon_context_try_acquire(ctxt, true))
cpu_relax();
nbcon_write_context_set_buf(wctxt, NULL, 0);
@@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
cant_migrate();
}
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
goto out;
/*
@@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
ctxt->prio = nbcon_get_default_prio();
ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return -EPERM;
while (nbcon_seq_read(con) < stop_seq) {
@@ -1671,6 +1690,9 @@ bool nbcon_alloc(struct console *con)
{
struct nbcon_state state = { };
+ /* Synchronize the kthread start. */
+ lockdep_assert_console_list_lock_held();
+
/* The write_thread() callback is mandatory. */
if (WARN_ON(!con->write_thread))
return false;
@@ -1701,12 +1723,15 @@ bool nbcon_alloc(struct console *con)
return false;
}
- if (printk_kthreads_running) {
+ if (printk_kthreads_ready && !have_boot_console) {
if (!nbcon_kthread_create(con)) {
kfree(con->pbufs);
con->pbufs = NULL;
return false;
}
+
+ /* Might be the first kthread. */
+ printk_kthreads_running = true;
}
}
@@ -1716,14 +1741,30 @@ bool nbcon_alloc(struct console *con)
/**
* nbcon_free - Free and cleanup the nbcon console specific data
* @con: Console to free/cleanup nbcon data
+ *
+ * Important: @have_nbcon_console must be updated before calling
+ * this function. In particular, it can be set only when there
+ * is still another nbcon console registered.
*/
void nbcon_free(struct console *con)
{
struct nbcon_state state = { };
- if (printk_kthreads_running)
+ /* Synchronize the kthread stop. */
+ lockdep_assert_console_list_lock_held();
+
+ if (printk_kthreads_running) {
nbcon_kthread_stop(con);
+ /* Might be the last nbcon console.
+ *
+ * Do not rely on printk_kthreads_check_locked(). It is not
+ * called in some code paths, see nbcon_free() callers.
+ */
+ if (!have_nbcon_console)
+ printk_kthreads_running = false;
+ }
+
nbcon_state_set(con, &state);
/* Boot consoles share global printk buffers. */
@@ -1762,7 +1803,7 @@ bool nbcon_device_try_acquire(struct console *con)
ctxt->console = con;
ctxt->prio = NBCON_PRIO_NORMAL;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return false;
if (!nbcon_context_enter_unsafe(ctxt))
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1eea80d0648e..0efbcdda9aab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume);
static int unregister_console_locked(struct console *console);
/* True when system boot is far enough to create printer threads. */
-static bool printk_kthreads_ready __ro_after_init;
+bool printk_kthreads_ready __ro_after_init;
static struct task_struct *printk_legacy_kthread;
@@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void)
if (!printk_kthreads_ready)
return;
+ /* Start or stop the legacy kthread when needed. */
if (have_legacy_console || have_boot_console) {
if (!printk_legacy_kthread &&
force_legacy_kthread() &&
@@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);
- if (console->flags & CON_NBCON)
- nbcon_free(console);
-
- console_sysfs_notify();
-
- if (console->exit)
- res = console->exit(console);
-
/*
* With this console gone, the global flags tracking registered
* console types may have changed. Update them.
@@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console)
if (!found_nbcon_con)
have_nbcon_console = found_nbcon_con;
+ /* @have_nbcon_console must be updated before calling nbcon_free(). */
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
+
+ console_sysfs_notify();
+
+ if (console->exit)
+ res = console->exit(console);
+
/* Changed console list, may require printer threads to start/stop. */
printk_kthreads_check_locked();
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3d85800757aa..eb0cb11d0d12 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -60,6 +60,20 @@
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);
+/*
+ * Hashed pointers policy selected by "hash_pointers=..." boot param
+ *
+ * `auto` - Hashed pointers enabled unless disabled by slub_debug_enabled=true
+ * `always` - Hashed pointers enabled unconditionally
+ * `never` - Hashed pointers disabled unconditionally
+ */
+enum hash_pointers_policy {
+ HASH_PTR_AUTO = 0,
+ HASH_PTR_ALWAYS,
+ HASH_PTR_NEVER
+};
+static enum hash_pointers_policy hash_pointers_mode __initdata;
+
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
@@ -1699,10 +1713,9 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
return buf;
}
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
+__diag_push();
+__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
+ "Not a valid __printf() conversion candidate.");
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
struct printf_spec spec)
{
@@ -1717,7 +1730,7 @@ static char *va_format(char *buf, char *end, struct va_format *va_fmt,
return buf;
}
-#pragma GCC diagnostic pop
+__diag_pop();
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
@@ -2289,12 +2302,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
return resource_string(buf, end, ptr, spec, fmt);
}
-int __init no_hash_pointers_enable(char *str)
+void __init hash_pointers_finalize(bool slub_debug)
{
- if (no_hash_pointers)
- return 0;
+ switch (hash_pointers_mode) {
+ case HASH_PTR_ALWAYS:
+ no_hash_pointers = false;
+ break;
+ case HASH_PTR_NEVER:
+ no_hash_pointers = true;
+ break;
+ case HASH_PTR_AUTO:
+ default:
+ no_hash_pointers = slub_debug;
+ break;
+ }
- no_hash_pointers = true;
+ if (!no_hash_pointers)
+ return;
pr_warn("**********************************************************\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
@@ -2307,11 +2331,39 @@ int __init no_hash_pointers_enable(char *str)
pr_warn("** the kernel, report this immediately to your system **\n");
pr_warn("** administrator! **\n");
pr_warn("** **\n");
+ pr_warn("** Use hash_pointers=always to force this mode off **\n");
+ pr_warn("** **\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warn("**********************************************************\n");
+}
+
+static int __init hash_pointers_mode_parse(char *str)
+{
+ if (!str) {
+ pr_warn("Hash pointers mode empty; falling back to auto.\n");
+ hash_pointers_mode = HASH_PTR_AUTO;
+ } else if (strncmp(str, "auto", 4) == 0) {
+ pr_info("Hash pointers mode set to auto.\n");
+ hash_pointers_mode = HASH_PTR_AUTO;
+ } else if (strncmp(str, "never", 5) == 0) {
+ pr_info("Hash pointers mode set to never.\n");
+ hash_pointers_mode = HASH_PTR_NEVER;
+ } else if (strncmp(str, "always", 6) == 0) {
+ pr_info("Hash pointers mode set to always.\n");
+ hash_pointers_mode = HASH_PTR_ALWAYS;
+ } else {
+ pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str);
+ hash_pointers_mode = HASH_PTR_AUTO;
+ }
return 0;
}
+early_param("hash_pointers", hash_pointers_mode_parse);
+
+static int __init no_hash_pointers_enable(char *str)
+{
+ return hash_pointers_mode_parse("never");
+}
early_param("no_hash_pointers", no_hash_pointers_enable);
/*
diff --git a/mm/Kconfig b/mm/Kconfig
index d5d4eca947a6..e443fe8cd6cf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1242,10 +1242,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool
-# struct io_mapping based helper. Selected by drivers that need them
-config IO_MAPPING
- bool
-
config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..ef54aa615d9d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 94af19c4dfed..87e825349bdf 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio,
target -= dests->weight_arr[i];
}
+ /* If the folio is already in the right node, don't do anything */
+ if (folio_nid(folio) == dests->node_id_arr[i])
+ return;
+
isolate:
if (!folio_isolate_lru(folio))
return;
diff --git a/mm/execmem.c b/mm/execmem.c
index 627e6cf64f4f..0822305413ec 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
return vmalloc(size);
}
@@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay to schedule asynchronous free if fast path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -130,6 +137,27 @@ err_restore:
return err;
}
+static int execmem_force_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
static void execmem_cache_clean(struct work_struct *work)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
@@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -256,7 +283,7 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
- vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP;
+ unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p) {
+ alloc_size = size;
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ }
+
if (!p)
return err;
@@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem;
/* fill memory with instructions that will trap */
- execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+ execmem_fill_trapping_insns(p, alloc_size);
err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
- err = execmem_cache_add(p, alloc_size);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
goto err_reset_direct_map;
@@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
-static bool execmem_cache_free(void *ptr)
+static inline bool is_pending_free(void *ptr)
{
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
- unsigned long addr = (unsigned long)ptr;
- MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
- void *area;
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
- mutex_lock(mutex);
- area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
- return false;
- }
- size = mas_range_len(&mas);
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
- execmem_cache_add(ptr, size);
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
- schedule_work(&execmem_cache_clean_work);
+ execmem_fill_trapping_insns(ptr, size);
+ execmem_restore_rox(ptr, size);
- return true;
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
}
-int execmem_make_temp_rw(void *ptr, size_t size)
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long addr = (unsigned long)ptr;
- int ret;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
- ret = set_memory_nx(addr, nr);
- if (ret)
- return ret;
+ guard(mutex)(&execmem_cache.mutex);
- return set_memory_rw(addr, nr);
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ area = pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
}
-int execmem_restore_rox(void *ptr, size_t size)
+static bool execmem_cache_free(void *ptr)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ void *area;
+ int err;
- return set_memory_rox(addr, nr);
+ guard(mutex)(&execmem_cache.mutex);
+
+ area = mas_walk(&mas);
+ if (!area)
+ return false;
+
+ err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
+ if (err) {
+ /*
+ * mas points to exact slot we've got the area from, nothing
+ * else can modify the tree because of the mutex, so there
+ * won't be any allocations in mas_store_gfp() and it will just
+ * change the pointer.
+ */
+ area = pending_free_set(area);
+ mas_store_gfp(&mas, area, GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
+ }
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * when ROX cache is not used the permissions defined by architectures for
+ * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must
+ * be writable anyway
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+ return 0;
+}
+
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
return NULL;
@@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
- vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS;
+ unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
- void *p;
+ void *p = NULL;
size = PAGE_ALIGN(size);
@@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p);
}
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+ void *p __free(execmem) = execmem_alloc(type, size);
+ int err;
+
+ if (!p)
+ return NULL;
+
+ err = execmem_force_rw(p, size);
+ if (err)
+ return NULL;
+
+ return no_free_ptr(p);
+}
+
void execmem_free(void *ptr)
{
/*
@@ -399,11 +506,6 @@ void execmem_free(void *ptr)
vfree(ptr);
}
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
- return text_poke_copy(dst, src, size);
-}
-
bool execmem_is_rox(enum execmem_type type)
{
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
diff --git a/mm/internal.h b/mm/internal.h
index 1da16d550a45..45b725c3dc03 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- vm_flags_t vm_flags, unsigned long start,
+ unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index d3586e95c12c..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- * Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size)
-{
- vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
- if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
- return -EINVAL;
-
- pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
-
- /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
- return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ed4873e18c75..9142964ab9c9 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
}
static inline void poison_slab_object(struct kmem_cache *cache, void *object,
- bool init, bool still_accessible)
+ bool init)
{
void *tagged_object = object;
object = kasan_reset_tag(object);
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(still_accessible))
- return;
-
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
@@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
if (!kasan_arch_is_ready() || is_kfence_address(object))
return false;
- poison_slab_object(cache, object, init, still_accessible);
+ /*
+ * If this point is reached with an object that must still be
+ * accessible under RCU, we can't poison it; in that case, also skip the
+ * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
+ * has been disabled manually.
+ *
+ * Putting the object on the quarantine wouldn't help catch UAFs (since
+ * we can't poison it here), and it would mask bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users not being careful enough about object
+ * reuse; so overall, putting the object into the quarantine here would
+ * be counterproductive.
+ */
+ if (still_accessible)
+ return false;
+
+ poison_slab_object(cache, object, init);
/*
* If the object is put into quarantine, do not let slab put the object
@@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
- poison_slab_object(slab->slab_cache, ptr, false, false);
+ poison_slab_object(slab->slab_cache, ptr, false);
return true;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a55fb1dcd224..374a6a5193a7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_folio_and_swap_cache(src);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1684,10 +1704,10 @@ maybe_install_pmd:
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
diff --git a/mm/madvise.c b/mm/madvise.c
index bb80fc5ea08f..35ed4ab0d7c5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
@@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
&guard_remove_walk_ops, NULL);
}
+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+ switch (behavior) {
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_REMOVE:
+ case MADV_DONTFORK:
+ case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivally discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ struct vm_area_struct *vma = madv_behavior->vma;
+
+ /* If the VMA isn't sealed we're good. */
+ if (!vma_is_sealed(vma))
+ return true;
+
+ /* For a sealed VMA, we only care about discard operations. */
+ if (!is_discard(madv_behavior->behavior))
+ return true;
+
+ /*
+ * We explicitly permit all file-backed mappings, whether MAP_SHARED or
+ * MAP_PRIVATE.
+ *
+ * The latter causes some complications. Because now, one can mmap()
+ * read/write a MAP_PRIVATE mapping, write to it, then mprotect()
+ * read-only, mseal() and a discard will be permitted.
+ *
+ * However, in order to avoid issues with potential use of madvise(...,
+ * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
+ * permit this.
+ */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /* If the user could write to the mapping anyway, then this is fine. */
+ if ((vma->vm_flags & VM_WRITE) &&
+ arch_vma_access_permitted(vma, /* write= */ true,
+ /* execute= */ false, /* foreign= */ false))
+ return true;
+
+ /* Otherwise, we are not permitted to perform this operation. */
+ return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ return true;
+}
+#endif
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
+ if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM;
switch (behavior) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3047b9ac667e..e2e685b971bb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk)
{
struct hwpoison_walk *hwp = walk->private;
- pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t pte;
+ int ret;
- return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
- hwp->pfn, &hwp->tk);
+ ptl = huge_pte_lock(h, walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
+ ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hwpoison_hugetlb_range NULL
diff --git a/mm/mempool.c b/mm/mempool.c
index 204a216b6418..1c38e873e546 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element)
{
- BUG_ON(pool->curr_nr >= pool->min_nr);
+ BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
@@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
-
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ /*
+ * max() used here to ensure storage for at least 1 element to support
+ * zero minimum pool
+ */
+ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
- * First pre-allocate the guaranteed number of buffers.
+ * First pre-allocate the guaranteed number of buffers,
+ * also pre-allocate 1 element for zero minimum pool.
*/
- while (pool->curr_nr < pool->min_nr) {
+ while (pool->curr_nr < max(1, pool->min_nr)) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
@@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool)
* wake-up path of previous test. This explicit check ensures the
* allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened.
- *
- * Inline the same logic as previous test, add_element() cannot be
- * directly used here since it has BUG_ON to deny if min_nr equals
- * curr_nr, so here picked rest of add_element() to use without
- * BUG_ON check.
*/
if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) {
- /* Inline the logic of add_element() */
- poison_element(pool, element);
- if (kasan_poison_element(pool, element))
- pool->elements[pool->curr_nr++] = element;
+ add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
diff --git a/mm/mincore.c b/mm/mincore.c
index 42d6c9c8da86..10dabefc3acc 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE
unsigned char present;
unsigned char *vec = walk->private;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
@@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
+ spin_unlock(ptl);
#else
BUG();
#endif
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 729fb7d0dd59..b006cec8e6fe 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -164,8 +164,7 @@ retry:
*/
/* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read;
rcu_read_unlock();
@@ -236,11 +235,8 @@ retry:
goto fallback;
}
- /*
- * Verify the vma we locked belongs to the same address space and it's
- * not behind of the last search position.
- */
- if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
+ /* Verify the vma is not behind the last search position. */
+ if (unlikely(from_addr >= vma->vm_end))
goto fallback_unlock;
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2ddd37b2f462..78bded7acf79 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long charged = 0;
int error;
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
if (newflags == oldflags) {
diff --git a/mm/mremap.c b/mm/mremap.c
index e15cf2e444c7..677a4d744df9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
old_pte, max_nr_ptes);
force_flush = true;
}
- pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
+ pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
@@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
return -EFAULT;
/* If mseal()'d, mremap() is prohibited. */
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
/* Align to hugetlb page size, if required. */
diff --git a/mm/mseal.c b/mm/mseal.c
index c27197ac04e8..e5b205562d2e 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -11,148 +11,74 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
-static inline void set_vma_sealed(struct vm_area_struct *vma)
-{
- vm_flags_set(vma, VM_SEALED);
-}
-
-static bool is_madv_discard(int behavior)
-{
- switch (behavior) {
- case MADV_FREE:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_REMOVE:
- case MADV_DONTFORK:
- case MADV_WIPEONFORK:
- case MADV_GUARD_INSTALL:
- return true;
- }
-
- return false;
-}
-
-static bool is_ro_anon(struct vm_area_struct *vma)
-{
- /* check anonymous mapping. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
- return false;
-
- /*
- * check for non-writable:
- * PROT=RO or PKRU is not writeable.
- */
- if (!(vma->vm_flags & VM_WRITE) ||
- !arch_vma_access_permitted(vma, true, false, false))
- return true;
-
- return false;
-}
-
/*
- * Check if a vma is allowed to be modified by madvise.
+ * mseal() disallows an input range which contain unmapped ranges (VMA holes).
+ *
+ * It disallows unmapped regions from start to end whether they exist at the
+ * start, in the middle, or at the end of the range, or any combination thereof.
+ *
+ * This is because after sealng a range, there's nothing to stop memory mapping
+ * of ranges in the remaining gaps later, meaning that the user might then
+ * wrongly consider the entirety of the mseal()'d range to be sealed when it
+ * in fact isn't.
*/
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
- if (!is_madv_discard(behavior))
- return true;
-
- if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
- return false;
-
- /* Allow by default. */
- return true;
-}
-
-static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, vm_flags_t newflags)
-{
- int ret = 0;
- vm_flags_t oldflags = vma->vm_flags;
-
- if (newflags == oldflags)
- goto out;
-
- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
-
- set_vma_sealed(vma);
-out:
- *prev = vma;
- return ret;
-}
/*
- * Check for do_mseal:
- * 1> start is part of a valid vma.
- * 2> end is part of a valid vma.
- * 3> No gap (unallocated address) between start and end.
- * 4> map is sealable.
+ * Does the [start, end) range contain any unmapped memory?
+ *
+ * We ensure that:
+ * - start is part of a valid VMA.
+ * - end is part of a valid VMA.
+ * - no gap (unallocated memory) exists between start and end.
*/
-static int check_mm_seal(unsigned long start, unsigned long end)
+static bool range_contains_unmapped(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
- unsigned long nstart = start;
-
+ unsigned long prev_end = start;
VMA_ITERATOR(vmi, current->mm, start);
- /* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
- if (vma->vm_start > nstart)
- /* unallocated memory found. */
- return -ENOMEM;
-
- if (vma->vm_end >= end)
- return 0;
+ if (vma->vm_start > prev_end)
+ return true;
- nstart = vma->vm_end;
+ prev_end = vma->vm_end;
}
- return -ENOMEM;
+ return prev_end < end;
}
-/*
- * Apply sealing.
- */
-static int apply_mm_seal(unsigned long start, unsigned long end)
+static int mseal_apply(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
- unsigned long nstart;
struct vm_area_struct *vma, *prev;
+ unsigned long curr_start = start;
+ VMA_ITERATOR(vmi, mm, start);
- VMA_ITERATOR(vmi, current->mm, start);
-
+ /* We know there are no gaps so this will be non-NULL. */
vma = vma_iter_load(&vmi);
- /*
- * Note: check_mm_seal should already checked ENOMEM case.
- * so vma should not be null, same for the other ENOMEM cases.
- */
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- nstart = start;
for_each_vma_range(vmi, vma, end) {
- int error;
- unsigned long tmp;
- vm_flags_t newflags;
+ unsigned long curr_end = MIN(vma->vm_end, end);
- newflags = vma->vm_flags | VM_SEALED;
- tmp = vma->vm_end;
- if (tmp > end)
- tmp = end;
- error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
- if (error)
- return error;
- nstart = vma_iter_end(&vmi);
+ if (!(vma->vm_flags & VM_SEALED)) {
+ vma = vma_modify_flags(&vmi, prev, vma,
+ curr_start, curr_end,
+ vma->vm_flags | VM_SEALED);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+ vm_flags_set(vma, VM_SEALED);
+ }
+
+ prev = vma;
+ curr_start = curr_end;
}
return 0;
@@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- /*
- * First pass, this helps to avoid
- * partial sealing in case of error in input address range,
- * e.g. ENOMEM error.
- */
- ret = check_mm_seal(start, end);
- if (ret)
+ if (range_contains_unmapped(mm, start, end)) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Second pass, this should success, unless there are errors
@@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
* reaching the max supported VMAs, however, those cases shall
* be rare.
*/
- ret = apply_mm_seal(start, end);
+ ret = mseal_apply(mm, start, end);
out:
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 736d0e0f0618..8b819fafd57b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, vm_flags_t vm_flags, int node,
+ pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
return __vmalloc_noprof(size, gfp_mask);
diff --git a/mm/rmap.c b/mm/rmap.c
index f93ce27132ab..568198e9efc2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
flush_cache_range(vma, address, end_addr);
/* Nuke the page table entry. */
- pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
diff --git a/mm/shmem.c b/mm/shmem.c
index 7fdd707ac1ac..e2c76a30802b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/*
* Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
*
* Checking folio is not enough: by the time a swapcache folio is locked, it
* might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it still presents, else returns -1.
*/
-static bool shmem_confirm_swap(struct address_space *mapping,
- pgoff_t index, swp_entry_t swap)
+static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
+ swp_entry_t swap)
{
- return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = -1;
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (entry == swp_to_radix_entry(swap))
+ ret = xas_get_order(&xas);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -891,7 +903,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -903,14 +917,25 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = radix_to_swp_entry(expected);
do {
+ iter = swap;
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+ * partially freed without split of both entry and
+ * folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -1992,30 +2017,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
swp_entry_t entry, int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << order;
struct folio *new;
+ gfp_t alloc_gfp;
void *shadow;
- int nr_pages;
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
- gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+ alloc_gfp = gfp;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(order))
+ return ERR_PTR(-EINVAL);
+ } else if (order) {
+ /*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, then fallback
+ * to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages)
+ goto fallback;
- gfp = limit_gfp_mask(huge_gfp, gfp);
+ alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ }
+retry:
+ new = shmem_alloc_folio(alloc_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
- new = shmem_alloc_folio(gfp, order, info, index);
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- nr_pages = folio_nr_pages(new);
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- gfp, entry)) {
+ alloc_gfp, entry)) {
folio_put(new);
- return ERR_PTR(-ENOMEM);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
/*
@@ -2030,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
*/
if (swapcache_prepare(entry, nr_pages)) {
folio_put(new);
- return ERR_PTR(-EEXIST);
+ new = ERR_PTR(-EEXIST);
+ /* Try smaller folio to avoid cache conflict */
+ goto fallback;
}
__folio_set_locked(new);
@@ -2044,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
folio_add_lru(new);
swap_read_folio(new, NULL);
return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fallback to, abort */
+ if (!order)
+ return new;
+ entry.val += index - round_down(index, nr_pages);
+ alloc_gfp = gfp;
+ nr_pages = 1;
+ order = 0;
+ goto retry;
}
/*
@@ -2249,7 +2302,7 @@ unlock:
if (xas_error(&xas))
return xas_error(&xas);
- return entry_order;
+ return 0;
}
/*
@@ -2266,133 +2319,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swap, index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
- swp_entry_t swap;
- int error, nr_pages, order, split_order;
+ int error, nr_pages, order;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
+ swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (is_poisoned_swp_entry(index_entry))
return -EIO;
- si = get_swap_device(swap);
- if (!si) {
- if (!shmem_confirm_swap(mapping, index, swap))
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
+ if (unlikely(!si)) {
+ if (order < 0)
return -EEXIST;
else
return -EINVAL;
}
+ if (unlikely(order < 0)) {
+ put_swap_device(si);
+ return -EEXIST;
+ }
+
+ /* index may point to the middle of a large entry, get the sub entry */
+ if (order) {
+ offset = index - round_down(index, 1 << order);
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
- order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int nr_pages = 1 << order;
- bool fallback_order0 = false;
-
- /* Or update major stats only when swapin succeeds?? */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ /* Direct swapin skipping swap cache & readahead */
+ folio = shmem_swap_alloc_folio(inode, vma, index,
+ index_entry, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ goto failed;
+ }
+ skip_swapcache = true;
+ } else {
+ /* Cached swapin only supports order 0 folio */
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
+ if (!folio) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ }
+ if (order > folio_order(folio)) {
/*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(swap, nr_pages) != nr_pages))
- fallback_order0 = true;
-
- /* Skip swapcache for synchronous device. */
- if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
- if (!IS_ERR(folio)) {
- skip_swapcache = true;
- goto alloced;
- }
-
- /*
- * Fallback to swapin order-0 folio unless the swap entry
- * already exists.
- */
- error = PTR_ERR(folio);
- folio = NULL;
- if (error == -EEXIST)
- goto failed;
- }
-
- /*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
-
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
- } else if (order != folio_order(folio)) {
- /*
- * Swap readahead may swap in order 0 folios into swapcache
+ * Swapin may get smaller folios due to various reasons:
+ * It may fallback to order 0 due to memory pressure or race,
+ * swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still stores
* large swap entries. In such cases, we should split the
* large swap entry to prevent possible data corruption.
*/
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- folio_put(folio);
- folio = NULL;
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
+ error = shmem_split_large_entry(inode, index, index_entry, gfp);
+ if (error)
+ goto failed_nolock;
+ }
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
+ /*
+ * If the folio is large, round down swap and index by folio size.
+ * No matter what race occurs, the swap layer ensures we either get
+ * a valid folio that has its swap entry aligned by size, or a
+ * temporarily invalid one which we'll abort very soon and retry.
+ *
+ * shmem_add_to_page_cache ensures the whole range contains expected
+ * entries and prevents any corruption, so any race split is fine
+ * too, it will succeed as long as the entries are still there.
+ */
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > 1) {
+ swap.val = round_down(swap.val, nr_pages);
+ index = round_down(index, nr_pages);
}
-alloced:
- /* We have to do this with folio locked to prevent races */
+ /*
+ * We have to do this with the folio locked to prevent races.
+ * The shmem_confirm_swap below only checks if the first swap
+ * entry matches the folio, that's enough to ensure the folio
+ * is not used outside of shmem, as shmem swap entries
+ * and swap cache folios are never partially freed.
+ */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap) ||
- xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
+ shmem_confirm_swap(mapping, index, swap) < 0 ||
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
@@ -2415,8 +2444,7 @@ alloced:
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2439,18 +2467,19 @@ alloced:
*foliop = folio;
return 0;
failed:
- if (!shmem_confirm_swap(mapping, index, swap))
+ if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache);
unlock:
- if (skip_swapcache)
- swapcache_clear(si, swap, folio_nr_pages(folio));
- if (folio) {
+ if (folio)
folio_unlock(folio);
+failed_nolock:
+ if (skip_swapcache)
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ if (folio)
folio_put(folio);
- }
put_swap_device(si);
return error;
@@ -5960,8 +5989,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
- gfp, NULL, NULL);
+ error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+ &folio, SGP_CACHE, gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/slub.c b/mm/slub.c
index cf7c6032d5fd..30003763d224 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -6312,9 +6312,8 @@ void __init kmem_cache_init(void)
if (debug_guardpage_minorder())
slub_max_order = 0;
- /* Print slub debugging pointers without hashing */
- if (__slub_debug_enabled())
- no_hash_pointers_enable(NULL);
+ /* Inform pointer hashing choice about slub debugging state. */
+ hash_pointers_finalize(__slub_debug_enabled());
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
diff --git a/mm/vma.c b/mm/vma.c
index 9ba93be621da..3b12c7579831 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
}
/* Don't bother splitting the VMA if we can't unmap it anyway */
- if (!can_modify_vma(vms->vma)) {
+ if (vma_is_sealed(vms->vma)) {
error = -EPERM;
goto start_split_failed;
}
@@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
for_each_vma_range(*(vms->vmi), next, vms->end) {
long nrpages;
- if (!can_modify_vma(next)) {
+ if (vma_is_sealed(next)) {
error = -EPERM;
goto modify_vma_failed;
}
diff --git a/mm/vma.h b/mm/vma.h
index acdcc515c459..b123a9cdedb0 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -559,38 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
}
#ifdef CONFIG_64BIT
-
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
return (vma->vm_flags & VM_SEALED);
}
-
-/*
- * check if a vma is sealed for modification.
- * return true, if modification is allowed.
- */
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- if (unlikely(vma_is_sealed(vma)))
- return false;
-
- return true;
-}
-
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
-
#else
-
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- return true;
-}
-
-static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
- return true;
+ return false;
}
-
#endif
#if defined(CONFIG_STACK_GROWSUP)
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
index 632ab44737ec..c952640f163b 100644
--- a/tools/testing/selftests/cachestat/test_cachestat.c
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs)
cs->nr_evicted, cs->nr_recently_evicted);
}
+enum file_type {
+ FILE_MMAP,
+ FILE_SHMEM
+};
+
bool write_exactly(int fd, size_t filesize)
{
int random_fd = open("/dev/urandom", O_RDONLY);
@@ -201,8 +206,20 @@ out1:
out:
return ret;
}
+const char *file_type_str(enum file_type type)
+{
+ switch (type) {
+ case FILE_SHMEM:
+ return "shmem";
+ case FILE_MMAP:
+ return "mmap";
+ default:
+ return "unknown";
+ }
+}
-bool test_cachestat_shmem(void)
+
+bool run_cachestat_test(enum file_type type)
{
size_t PS = sysconf(_SC_PAGESIZE);
size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */
@@ -212,27 +229,50 @@ bool test_cachestat_shmem(void)
char *filename = "tmpshmcstat";
struct cachestat cs;
bool ret = true;
+ int fd;
unsigned long num_pages = compute_len / PS;
- int fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ if (type == FILE_SHMEM)
+ fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ else
+ fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd < 0) {
- ksft_print_msg("Unable to create shmem file.\n");
+ ksft_print_msg("Unable to create %s file.\n",
+ file_type_str(type));
ret = false;
goto out;
}
if (ftruncate(fd, filesize)) {
- ksft_print_msg("Unable to truncate shmem file.\n");
+ ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type));
ret = false;
goto close_fd;
}
+ switch (type) {
+ case FILE_SHMEM:
+ if (!write_exactly(fd, filesize)) {
+ ksft_print_msg("Unable to write to file.\n");
+ ret = false;
+ goto close_fd;
+ }
+ break;
+ case FILE_MMAP:
+ char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
- if (!write_exactly(fd, filesize)) {
- ksft_print_msg("Unable to write to shmem file.\n");
+ if (map == MAP_FAILED) {
+ ksft_print_msg("mmap failed.\n");
+ ret = false;
+ goto close_fd;
+ }
+ for (int i = 0; i < filesize; i++)
+ map[i] = 'A';
+ break;
+ default:
+ ksft_print_msg("Unsupported file type.\n");
ret = false;
goto close_fd;
}
-
syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0);
if (syscall_ret) {
@@ -308,12 +348,18 @@ int main(void)
break;
}
- if (test_cachestat_shmem())
+ if (run_cachestat_test(FILE_SHMEM))
ksft_test_result_pass("cachestat works with a shmem file\n");
else {
ksft_test_result_fail("cachestat fails with a shmem file\n");
ret = 1;
}
+ if (run_cachestat_test(FILE_MMAP))
+ ksft_test_result_pass("cachestat works with a mmap file\n");
+ else {
+ ksft_test_result_fail("cachestat fails with a mmap file\n");
+ ret = 1;
+ }
return ret;
}
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
transhuge-stress
pagemap_ioctl
pfnmap
+process_madv
*.tmp*
protection_keys
protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..471cae8427f1
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+FIXTURE(process_madvise)
+{
+ unsigned long page_size;
+ pid_t child_pid;
+ int remote_pidfd;
+ int pidfd;
+};
+
+FIXTURE_SETUP(process_madvise)
+{
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+ self->pidfd = PIDFD_SELF;
+ self->remote_pidfd = -1;
+ self->child_pid = -1;
+};
+
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+ /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL);
+ waitpid(self->child_pid, NULL, 0);
+ }
+
+ if (self->remote_pidfd >= 0)
+ close(self->remote_pidfd);
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+ size_t vlen, int advice, unsigned int flags)
+{
+ return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * This test uses PIDFD_SELF to target the current process. The main
+ * goal is to verify the basic behavior of process_madvise() with
+ * a vector of non-contiguous memory ranges, not its cross-process
+ * capabilities.
+ */
+TEST_F(process_madvise, basic)
+{
+ const unsigned long pagesize = self->page_size;
+ const int madvise_pages = 4;
+ struct iovec vec[madvise_pages];
+ int pidfd = self->pidfd;
+ ssize_t ret;
+ char *map;
+
+ /*
+ * Create a single large mapping. We will pick pages from this
+ * mapping to advise on. This ensures we test non-contiguous iovecs.
+ */
+ map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ /* Fill the entire region with a known pattern. */
+ memset(map, 'A', pagesize * 10);
+
+ /*
+ * Setup the iovec to point to 4 non-contiguous pages
+ * within the mapping.
+ */
+ vec[0].iov_base = &map[0 * pagesize];
+ vec[0].iov_len = pagesize;
+ vec[1].iov_base = &map[3 * pagesize];
+ vec[1].iov_len = pagesize;
+ vec[2].iov_base = &map[5 * pagesize];
+ vec[2].iov_len = pagesize;
+ vec[3].iov_base = &map[8 * pagesize];
+ vec[3].iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
+ if (ret == -1 && errno == EPERM)
+ SKIP(return,
+ "process_madvise() unsupported or permission denied, try running as root.\n");
+ else if (errno == EINVAL)
+ SKIP(return,
+ "process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+ /* The call should succeed and report the total bytes processed. */
+ ASSERT_EQ(ret, madvise_pages * pagesize);
+
+ /* Check that advised pages are now zero. */
+ for (int i = 0; i < madvise_pages; i++) {
+ char *advised_page = (char *)vec[i].iov_base;
+
+ /* Content must be 0, not 'A'. */
+ ASSERT_EQ(*advised_page, '\0');
+ }
+
+ /* Check that an un-advised page in between is still 'A'. */
+ char *unadvised_page = &map[1 * pagesize];
+
+ for (int i = 0; i < pagesize; i++)
+ ASSERT_EQ(unadvised_page[i], 'A');
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+/*
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process, other advices are difficult to verify reliably.
+ *
+ * The test verifies that a memory region in a child process,
+ * focus on process_madv remote result, only check addresses and lengths.
+ * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+ const unsigned long pagesize = self->page_size;
+ long huge_page_size;
+ int pipe_info[2];
+ ssize_t ret;
+ struct iovec vec;
+
+ struct child_info {
+ pid_t pid;
+ void *map_addr;
+ } info;
+
+ huge_page_size = read_pmd_pagesize();
+ if (huge_page_size <= 0)
+ SKIP(return, "Could not determine a valid huge page size.\n");
+
+ ASSERT_EQ(pipe(pipe_info), 0);
+
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0) {
+ char *map;
+ size_t map_size = 2 * huge_page_size;
+
+ close(pipe_info[0]);
+
+ map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Fault in as small pages */
+ for (size_t i = 0; i < map_size; i += pagesize)
+ map[i] = 'A';
+
+ /* Send info and pause */
+ info.pid = getpid();
+ info.map_addr = map;
+ ret = write(pipe_info[1], &info, sizeof(info));
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[1]);
+
+ pause();
+ exit(0);
+ }
+
+ close(pipe_info[1]);
+
+ /* Receive child info */
+ ret = read(pipe_info[0], &info, sizeof(info));
+ if (ret <= 0) {
+ waitpid(self->child_pid, NULL, 0);
+ SKIP(return, "Failed to read child info from pipe.\n");
+ }
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[0]);
+ self->child_pid = info.pid;
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ vec.iov_base = info.map_addr;
+ vec.iov_len = huge_page_size;
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
+ 0);
+ if (ret == -1) {
+ if (errno == EINVAL)
+ SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
+ else if (errno == EPERM)
+ SKIP(return,
+ "No process_madvise() permissions, try running as root.\n");
+ return;
+ }
+
+ ASSERT_EQ(ret, huge_page_size);
+}
+
+/*
+ * Test process_madvise() with a pidfd for a process that has already
+ * exited to ensure correct error handling.
+ */
+TEST_F(process_madvise, exited_process_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /*
+ * Using a pidfd for a process that has already exited should fail
+ * with ESRCH.
+ */
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0)
+ exit(0);
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ /* Wait for the child to ensure it has terminated. */
+ waitpid(self->child_pid, NULL, 0);
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
+ 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, ESRCH);
+}
+
+/*
+ * Test process_madvise() with bad pidfds to ensure correct error
+ * handling.
+ */
+TEST_F(process_madvise, bad_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /* Using an invalid fd number (-1) should fail with EBADF. */
+ ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+
+ /*
+ * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+ * with EBADF.
+ */
+ ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+}
+
+/*
+ * Test that process_madvise() rejects vlen > UIO_MAXIOV.
+ * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
+ */
+TEST_F(process_madvise, invalid_vlen)
+{
+ const unsigned long pagesize = self->page_size;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Test process_madvise() with an invalid flag value. Currently, only a flag
+ * value of 0 is supported. This test is reserved for the future, e.g., if
+ * synchronous flags are added.
+ */
+TEST_F(process_madvise, flag)
+{
+ const unsigned long pagesize = self->page_size;
+ unsigned int invalid_flag;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ invalid_flag = 0x80000000;
+
+ ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
test pagemap_scan IOCTL
- pfnmap
tests for VM_PFNMAP handling
+- process_madv
+ test for process_madv
- cow
test copy-on-write semantics
- thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ]
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index a838c37f93e5..3639aa8dd2b0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
#define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
#define FIRST_USER_ADDRESS 0UL