diff options
87 files changed, 3394 insertions, 2314 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index bf03263b9f46..bc0e7fefc39d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -861,3 +861,25 @@ Description: This is a read-only entry to show the value of sb.s_encoding_flags, SB_ENC_STRICT_MODE_FL 0x00000001 SB_ENC_NO_COMPAT_FALLBACK_FL 0x00000002 ============================ ========== + +What: /sys/fs/f2fs/<disk>/reserved_pin_section +Date: June 2025 +Contact: "Chao Yu" <chao@kernel.org> +Description: This threshold is used to control triggering garbage collection while + fallocating on pinned file, so, it can guarantee there is enough free + reserved section before preallocating on pinned file. + By default, the value is ovp_sections, especially, for zoned ufs, the + value is 1. + +What: /sys/fs/f2fs/<disk>/gc_boost_gc_multiple +Date: June 2025 +Contact: "Daeho Jeong" <daehojeong@google.com> +Description: Set a multiplier for the background GC migration window when F2FS GC is + boosted. The range should be from 1 to the segment count in a section. + Default: 5 + +What: /sys/fs/f2fs/<disk>/gc_boost_gc_greedy +Date: June 2025 +Contact: "Daeho Jeong" <daehojeong@google.com> +Description: Control GC algorithm for boost GC. 0: cost benefit, 1: greedy + Default: 1 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c1a5cd991f2e..747a55abf494 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1828,6 +1828,27 @@ backtraces on all cpus. Format: 0 | 1 + hash_pointers= + [KNL,EARLY] + By default, when pointers are printed to the console + or buffers via the %p format string, that pointer is + "hashed", i.e. obscured by hashing the pointer value. + This is a security feature that hides actual kernel + addresses from unprivileged users, but it also makes + debugging the kernel more difficult since unequal + pointers can no longer be compared. The choices are: + Format: { auto | always | never } + Default: auto + + auto - Hash pointers unless slab_debug is enabled. + always - Always hash pointers (even if slab_debug is + enabled). + never - Never hash pointers. This option should only + be specified when debugging the kernel. Do + not use on production kernels. The boot + param "no_hash_pointers" is an alias for + this mode. + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. @@ -4216,18 +4237,7 @@ no_hash_pointers [KNL,EARLY] - Force pointers printed to the console or buffers to be - unhashed. By default, when a pointer is printed via %p - format string, that pointer is "hashed", i.e. obscured - by hashing the pointer value. This is a security feature - that hides actual kernel addresses from unprivileged - users, but it also makes debugging the kernel more - difficult since unequal pointers can no longer be - compared. However, if this command-line option is - specified, then all normal pointers will have their true - value printed. This option should only be specified when - debugging the kernel. Please do not use on production - kernels. + Alias for "hash_pointers=never". nohibernate [HIBERNATION] Disable hibernation and resume. @@ -6644,6 +6654,10 @@ Documentation/admin-guide/mm/slab.rst. (slub_debug legacy name also accepted for now) + Using this option implies the "no_hash_pointers" + option which can be undone by adding the + "hash_pointers=always" option. + slab_max_order= [MM] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 50cfc7842930..5063179cfc70 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -133,4 +133,3 @@ More Memory Management Functions .. kernel-doc:: mm/mmu_notifier.c .. kernel-doc:: mm/balloon_compaction.c .. kernel-doc:: mm/huge_memory.c -.. kernel-doc:: mm/io-mapping.c diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml index 077d2a539c83..fed3e1b8c43f 100644 --- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml @@ -22,6 +22,11 @@ properties: compatible: items: - enum: + - apple,s5l8960x-i2c + - apple,t7000-i2c + - apple,s8000-i2c + - apple,t8010-i2c + - apple,t8015-i2c - apple,t8103-i2c - apple,t8112-i2c - apple,t6000-i2c diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 8eeb7ea14f61..e5bb89452aff 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -238,9 +238,9 @@ usrjquota=<file> Appoint specified file and type during mount, so that quota grpjquota=<file> information can be properly updated during recovery flow, prjjquota=<file> <quota file>: must be in root directory; jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1]. -offusrjquota Turn off user journalled quota. -offgrpjquota Turn off group journalled quota. -offprjjquota Turn off project journalled quota. +usrjquota= Turn off user journalled quota. +grpjquota= Turn off group journalled quota. +prjjquota= Turn off project journalled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. alloc_mode=%s Adjust block allocation policy, which supports "reuse" diff --git a/arch/arm/include/asm/cti.h b/arch/arm/include/asm/cti.h deleted file mode 100644 index f8500e5d6ea8..000000000000 --- a/arch/arm/include/asm/cti.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASMARM_CTI_H -#define __ASMARM_CTI_H - -#include <asm/io.h> -#include <asm/hardware/coresight.h> - -/* The registers' definition is from section 3.2 of - * Embedded Cross Trigger Revision: r0p0 - */ -#define CTICONTROL 0x000 -#define CTISTATUS 0x004 -#define CTILOCK 0x008 -#define CTIPROTECTION 0x00C -#define CTIINTACK 0x010 -#define CTIAPPSET 0x014 -#define CTIAPPCLEAR 0x018 -#define CTIAPPPULSE 0x01c -#define CTIINEN 0x020 -#define CTIOUTEN 0x0A0 -#define CTITRIGINSTATUS 0x130 -#define CTITRIGOUTSTATUS 0x134 -#define CTICHINSTATUS 0x138 -#define CTICHOUTSTATUS 0x13c -#define CTIPERIPHID0 0xFE0 -#define CTIPERIPHID1 0xFE4 -#define CTIPERIPHID2 0xFE8 -#define CTIPERIPHID3 0xFEC -#define CTIPCELLID0 0xFF0 -#define CTIPCELLID1 0xFF4 -#define CTIPCELLID2 0xFF8 -#define CTIPCELLID3 0xFFC - -/* The below are from section 3.6.4 of - * CoreSight v1.0 Architecture Specification - */ -#define LOCKACCESS 0xFB0 -#define LOCKSTATUS 0xFB4 - -/** - * struct cti - cross trigger interface struct - * @base: mapped virtual address for the cti base - * @irq: irq number for the cti - * @trig_out_for_irq: triger out number which will cause - * the @irq happen - * - * cti struct used to operate cti registers. - */ -struct cti { - void __iomem *base; - int irq; - int trig_out_for_irq; -}; - -/** - * cti_init - initialize the cti instance - * @cti: cti instance - * @base: mapped virtual address for the cti base - * @irq: irq number for the cti - * @trig_out: triger out number which will cause - * the @irq happen - * - * called by machine code to pass the board dependent - * @base, @irq and @trig_out to cti. - */ -static inline void cti_init(struct cti *cti, - void __iomem *base, int irq, int trig_out) -{ - cti->base = base; - cti->irq = irq; - cti->trig_out_for_irq = trig_out; -} - -/** - * cti_map_trigger - use the @chan to map @trig_in to @trig_out - * @cti: cti instance - * @trig_in: trigger in number - * @trig_out: trigger out number - * @channel: channel number - * - * This function maps one trigger in of @trig_in to one trigger - * out of @trig_out using the channel @chan. - */ -static inline void cti_map_trigger(struct cti *cti, - int trig_in, int trig_out, int chan) -{ - void __iomem *base = cti->base; - unsigned long val; - - val = __raw_readl(base + CTIINEN + trig_in * 4); - val |= BIT(chan); - __raw_writel(val, base + CTIINEN + trig_in * 4); - - val = __raw_readl(base + CTIOUTEN + trig_out * 4); - val |= BIT(chan); - __raw_writel(val, base + CTIOUTEN + trig_out * 4); -} - -/** - * cti_enable - enable the cti module - * @cti: cti instance - * - * enable the cti module - */ -static inline void cti_enable(struct cti *cti) -{ - __raw_writel(0x1, cti->base + CTICONTROL); -} - -/** - * cti_disable - disable the cti module - * @cti: cti instance - * - * enable the cti module - */ -static inline void cti_disable(struct cti *cti) -{ - __raw_writel(0, cti->base + CTICONTROL); -} - -/** - * cti_irq_ack - clear the cti irq - * @cti: cti instance - * - * clear the cti irq - */ -static inline void cti_irq_ack(struct cti *cti) -{ - void __iomem *base = cti->base; - unsigned long val; - - val = __raw_readl(base + CTIINTACK); - val |= BIT(cti->trig_out_for_irq); - __raw_writel(val, base + CTIINTACK); -} - -/** - * cti_unlock - unlock cti module - * @cti: cti instance - * - * unlock the cti module, or else any writes to the cti - * module is not allowed. - */ -static inline void cti_unlock(struct cti *cti) -{ - __raw_writel(CS_LAR_KEY, cti->base + LOCKACCESS); -} - -/** - * cti_lock - lock cti module - * @cti: cti instance - * - * lock the cti module, so any writes to the cti - * module will be not allowed. - */ -static inline void cti_lock(struct cti *cti) -{ - __raw_writel(~CS_LAR_KEY, cti->base + LOCKACCESS); -} -#endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index abd9725796e9..34e5d78af076 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -721,7 +721,7 @@ void mark_rodata_ro(void) static void __init declare_vma(struct vm_struct *vma, void *va_start, void *va_end, - vm_flags_t vm_flags) + unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init); pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9f6b7dab2d9a..7bde68247b5f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -120,7 +120,7 @@ struct its_array its_pages; static void *__its_alloc(struct its_array *pages) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); if (!page) return NULL; @@ -237,7 +237,6 @@ static void *its_alloc(void) if (!page) return NULL; - execmem_make_temp_rw(page, PAGE_SIZE); if (pages == &its_pages) set_memory_x((unsigned long)page, 1); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 252e82bcfd2f..4450acec9390 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command) static inline void *alloc_tramp(unsigned long size) { - return execmem_alloc(EXECMEM_FTRACE, size); + return execmem_alloc_rw(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 47cb8eb138ba..6079d15dab8c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, return len; } -/* Make page to RO mode when allocate it */ -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - /* - * TODO: Once additional kernel code protection mechanisms are set, ensure - * that the page was not maliciously altered and it is still zeroed. - */ - set_memory_rox((unsigned long)page, 1); - - return page; -} - /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7456df985d96..bb57e93b4caf 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void) static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +void execmem_fill_trapping_insns(void *ptr, size_t size) { - /* fill memory with INT3 instructions */ - if (writeable) - memset(ptr, INT3_INSN_OPCODE, size); - else - text_poke_set(ptr, INT3_INSN_OPCODE, size); + memset(ptr, INT3_INSN_OPCODE, size); } #endif @@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void) .pgprot = pgprot, .alignment = MODULE_ALIGN, }, - [EXECMEM_KPROBES ... EXECMEM_BPF] = { + [EXECMEM_KPROBES] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL_ROX, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_FTRACE] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = pgprot, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index c8d115b58e44..070d014fdc5d 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -992,7 +992,6 @@ config I2C_APPLE tristate "Apple SMBus platform driver" depends on !I2C_PASEMI depends on ARCH_APPLE || COMPILE_TEST - default ARCH_APPLE help Say Y here if you want to use the I2C controller present on Apple Silicon chips such as the M1. diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 13889f52b6f7..ff2289b52c84 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -155,9 +155,9 @@ static const struct geni_i2c_clk_fld geni_i2c_clk_map_19p2mhz[] = { /* source_clock = 32 MHz */ static const struct geni_i2c_clk_fld geni_i2c_clk_map_32mhz[] = { - { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 40 }, - { I2C_MAX_FAST_MODE_FREQ, 4, 3, 11, 20 }, - { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 6, 15 }, + { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 38 }, + { I2C_MAX_FAST_MODE_FREQ, 4, 3, 9, 19 }, + { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 5, 15 }, {} }; diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index c8b4b404f6c1..e6815f6cae78 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -742,11 +742,14 @@ static void stm32f7_i2c_dma_callback(void *arg) { struct stm32f7_i2c_dev *i2c_dev = arg; struct stm32_i2c_dma *dma = i2c_dev->dma; + struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; stm32f7_i2c_disable_dma_req(i2c_dev); dmaengine_terminate_async(dma->chan_using); dma_unmap_single(i2c_dev->dev, dma->dma_buf, dma->dma_len, dma->dma_data_dir); + if (!f7_msg->smbus) + i2c_put_dma_safe_msg_buf(f7_msg->buf, i2c_dev->msg, true); complete(&dma->dma_complete); } @@ -882,6 +885,7 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev, { struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; void __iomem *base = i2c_dev->base; + u8 *dma_buf; u32 cr1, cr2; int ret; @@ -931,17 +935,23 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev, /* Configure DMA or enable RX/TX interrupt */ i2c_dev->use_dma = false; - if (i2c_dev->dma && f7_msg->count >= STM32F7_I2C_DMA_LEN_MIN - && !i2c_dev->atomic) { - ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma, - msg->flags & I2C_M_RD, - f7_msg->count, f7_msg->buf, - stm32f7_i2c_dma_callback, - i2c_dev); - if (!ret) - i2c_dev->use_dma = true; - else - dev_warn(i2c_dev->dev, "can't use DMA\n"); + if (i2c_dev->dma && !i2c_dev->atomic) { + dma_buf = i2c_get_dma_safe_msg_buf(msg, STM32F7_I2C_DMA_LEN_MIN); + if (dma_buf) { + f7_msg->buf = dma_buf; + ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma, + msg->flags & I2C_M_RD, + f7_msg->count, f7_msg->buf, + stm32f7_i2c_dma_callback, + i2c_dev); + if (ret) { + dev_warn(i2c_dev->dev, "can't use DMA\n"); + i2c_put_dma_safe_msg_buf(f7_msg->buf, msg, false); + f7_msg->buf = msg->buf; + } else { + i2c_dev->use_dma = true; + } + } } if (!i2c_dev->use_dma) { diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 4f05afab161f..4eb31b913c1a 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -134,6 +134,8 @@ #define I2C_MST_FIFO_STATUS_TX GENMASK(23, 16) #define I2C_MST_FIFO_STATUS_RX GENMASK(7, 0) +#define I2C_MASTER_RESET_CNTRL 0x0a8 + /* configuration load timeout in microseconds */ #define I2C_CONFIG_LOAD_TIMEOUT 1000000 @@ -184,6 +186,9 @@ enum msg_end_type { * @has_mst_fifo: The I2C controller contains the new MST FIFO interface that * provides additional features and allows for longer messages to * be transferred in one go. + * @has_mst_reset: The I2C controller contains MASTER_RESET_CTRL register which + * provides an alternative to controller reset when configured as + * I2C master * @quirks: I2C adapter quirks for limiting write/read transfer size and not * allowing 0 length transfers. * @supports_bus_clear: Bus Clear support to recover from bus hang during @@ -213,6 +218,7 @@ struct tegra_i2c_hw_feature { bool has_multi_master_mode; bool has_slcg_override_reg; bool has_mst_fifo; + bool has_mst_reset; const struct i2c_adapter_quirks *quirks; bool supports_bus_clear; bool has_apb_dma; @@ -605,6 +611,26 @@ static int tegra_i2c_wait_for_config_load(struct tegra_i2c_dev *i2c_dev) return 0; } +static int tegra_i2c_master_reset(struct tegra_i2c_dev *i2c_dev) +{ + if (!i2c_dev->hw->has_mst_reset) + return -EOPNOTSUPP; + + /* + * Writing 1 to I2C_MASTER_RESET_CNTRL will reset all internal state of + * Master logic including FIFOs. Clear this bit to 0 for normal operation. + * SW needs to wait for 2us after assertion and de-assertion of this soft + * reset. + */ + i2c_writel(i2c_dev, 0x1, I2C_MASTER_RESET_CNTRL); + fsleep(2); + + i2c_writel(i2c_dev, 0x0, I2C_MASTER_RESET_CNTRL); + fsleep(2); + + return 0; +} + static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) { u32 val, clk_divisor, clk_multiplier, tsu_thd, tlow, thigh, non_hs_mode; @@ -612,6 +638,16 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) int err; /* + * Reset the controller before initializing it. + * In case if device_reset() returns -ENOENT, i.e. when the reset is + * not available, the internal software reset will be used if it is + * supported by the controller. + */ + err = device_reset(i2c_dev->dev); + if (err == -ENOENT) + err = tegra_i2c_master_reset(i2c_dev); + + /* * The reset shouldn't ever fail in practice. The failure will be a * sign of a severe problem that needs to be resolved. Still we don't * want to fail the initialization completely because this may break @@ -619,7 +655,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) * emit a noisy warning on error, which won't stay unnoticed and * won't hose machine entirely. */ - err = device_reset(i2c_dev->dev); WARN_ON_ONCE(err); if (IS_DVC(i2c_dev)) @@ -1266,17 +1301,9 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev, if (i2c_dev->dma_mode) { if (i2c_dev->msg_read) { - dma_sync_single_for_device(i2c_dev->dma_dev, - i2c_dev->dma_phys, - xfer_size, DMA_FROM_DEVICE); - err = tegra_i2c_dma_submit(i2c_dev, xfer_size); if (err) return err; - } else { - dma_sync_single_for_cpu(i2c_dev->dma_dev, - i2c_dev->dma_phys, - xfer_size, DMA_TO_DEVICE); } } @@ -1286,11 +1313,6 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev, if (i2c_dev->dma_mode) { memcpy(i2c_dev->dma_buf + I2C_PACKET_HEADER_SIZE, msg->buf, i2c_dev->msg_len); - - dma_sync_single_for_device(i2c_dev->dma_dev, - i2c_dev->dma_phys, - xfer_size, DMA_TO_DEVICE); - err = tegra_i2c_dma_submit(i2c_dev, xfer_size); if (err) return err; @@ -1331,13 +1353,8 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev, return -ETIMEDOUT; } - if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE) { - dma_sync_single_for_cpu(i2c_dev->dma_dev, - i2c_dev->dma_phys, - xfer_size, DMA_FROM_DEVICE); - + if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE) memcpy(i2c_dev->msg_buf, i2c_dev->dma_buf, i2c_dev->msg_len); - } } time_left = tegra_i2c_wait_completion(i2c_dev, &i2c_dev->msg_complete, @@ -1468,6 +1485,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = false, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = false, .has_apb_dma = true, @@ -1492,6 +1510,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = false, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = false, .has_apb_dma = true, @@ -1516,6 +1535,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = false, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = true, .has_apb_dma = true, @@ -1540,6 +1560,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = true, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = true, .has_apb_dma = true, @@ -1564,6 +1585,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = true, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = true, .has_apb_dma = true, @@ -1588,6 +1610,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .has_multi_master_mode = false, .has_slcg_override_reg = true, .has_mst_fifo = false, + .has_mst_reset = false, .quirks = &tegra_i2c_quirks, .supports_bus_clear = true, .has_apb_dma = false, @@ -1612,6 +1635,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_multi_master_mode = true, .has_slcg_override_reg = true, .has_mst_fifo = true, + .has_mst_reset = true, .quirks = &tegra194_i2c_quirks, .supports_bus_clear = true, .has_apb_dma = false, diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 3445cc3b476b..ed90858a27b7 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -370,6 +370,7 @@ static const struct acpi_device_id i2c_acpi_force_100khz_device_ids[] = { * the device works without issues on Windows at what is expected to be * a 400KHz frequency. The root cause of the issue is not known. */ + { "DLL0945", 0 }, { "ELAN06FA", 0 }, {} }; diff --git a/drivers/i2c/muxes/i2c-mux-mule.c b/drivers/i2c/muxes/i2c-mux-mule.c index 284ff4afeeac..d3b32b794172 100644 --- a/drivers/i2c/muxes/i2c-mux-mule.c +++ b/drivers/i2c/muxes/i2c-mux-mule.c @@ -47,7 +47,6 @@ static int mule_i2c_mux_probe(struct platform_device *pdev) struct mule_i2c_reg_mux *priv; struct i2c_client *client; struct i2c_mux_core *muxc; - struct device_node *dev; unsigned int readback; int ndev, ret; bool old_fw; @@ -95,7 +94,7 @@ static int mule_i2c_mux_probe(struct platform_device *pdev) "Failed to register mux remove\n"); /* Create device adapters */ - for_each_child_of_node(mux_dev->of_node, dev) { + for_each_child_of_node_scoped(mux_dev->of_node, dev) { u32 reg; ret = of_property_read_u32(dev, "reg", ®); diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 3103b932b674..ee060e26f51d 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -996,6 +996,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); int num_entries = exfat_calc_num_entries(p_uniname); + unsigned int clu_count = 0; if (num_entries < 0) return num_entries; @@ -1133,6 +1134,10 @@ rewind: } else { if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; + + /* break if the cluster chain includes a loop */ + if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi))) + goto not_found; } } @@ -1195,6 +1200,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) int i, count = 0; int dentries_per_clu; unsigned int entry_type; + unsigned int clu_count = 0; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -1227,6 +1233,12 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; + + if (unlikely(++clu_count > sbi->used_clusters)) { + exfat_fs_error(sb, "FAT or bitmap is corrupted"); + return -EIO; + } + } } diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 23065f948ae7..232cc7f8ab92 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -490,5 +490,15 @@ int exfat_count_num_clusters(struct super_block *sb, } *ret_count = count; + + /* + * since exfat_count_used_clusters() is not called, sbi->used_clusters + * cannot be used here. + */ + if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) { + exfat_fs_error(sb, "The cluster chain has a loop"); + return -EIO; + } + return 0; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 6b82497572b4..538d2b6ac2ec 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -622,9 +622,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (pos > valid_size) pos = valid_size; - if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) { - ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1, - iocb->ki_flags & IOCB_SYNC); + if (iocb->ki_pos > pos) { + ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos); if (err < 0) return err; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index fede0283d6e2..f5f1c4e8a29f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -890,6 +890,7 @@ static int exfat_check_dir_empty(struct super_block *sb, { int i, dentries_per_clu; unsigned int type; + unsigned int clu_count = 0; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -926,6 +927,10 @@ static int exfat_check_dir_empty(struct super_block *sb, } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; + + /* break if the cluster chain includes a loop */ + if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi))) + break; } } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index ea5c1334a214..8926e63f5bb7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -341,13 +341,12 @@ static void exfat_hash_init(struct super_block *sb) INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); } -static int exfat_read_root(struct inode *inode) +static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - struct exfat_chain cdir; - int num_subdirs, num_clu = 0; + int num_subdirs; exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); ei->entry = -1; @@ -360,12 +359,9 @@ static int exfat_read_root(struct inode *inode) ei->hint_stat.clu = sbi->root_dir; ei->hint_femp.eidx = EXFAT_HINT_NONE; - exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); - if (exfat_count_num_clusters(sb, &cdir, &num_clu)) - return -EIO; - i_size_write(inode, num_clu << sbi->cluster_size_bits); + i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi)); - num_subdirs = exfat_count_dir_entries(sb, &cdir); + num_subdirs = exfat_count_dir_entries(sb, root_clu); if (num_subdirs < 0) return -EIO; set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR); @@ -578,7 +574,8 @@ static int exfat_verify_boot_region(struct super_block *sb) } /* mount the file system volume */ -static int __exfat_fill_super(struct super_block *sb) +static int __exfat_fill_super(struct super_block *sb, + struct exfat_chain *root_clu) { int ret; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -595,6 +592,18 @@ static int __exfat_fill_super(struct super_block *sb) goto free_bh; } + /* + * Call exfat_count_num_cluster() before searching for up-case and + * bitmap directory entries to avoid infinite loop if they are missing + * and the cluster chain includes a loop. + */ + exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size); + if (ret) { + exfat_err(sb, "failed to count the number of clusters in root"); + goto free_bh; + } + ret = exfat_create_upcase_table(sb); if (ret) { exfat_err(sb, "failed to load upcase table"); @@ -627,6 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) struct exfat_sb_info *sbi = sb->s_fs_info; struct exfat_mount_options *opts = &sbi->options; struct inode *root_inode; + struct exfat_chain root_clu; int err; if (opts->allow_utime == (unsigned short)-1) @@ -645,7 +655,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; - err = __exfat_fill_super(sb); + err = __exfat_fill_super(sb, &root_clu); if (err) { exfat_err(sb, "failed to recognize exfat type"); goto check_nls_io; @@ -680,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) root_inode->i_ino = EXFAT_ROOT_INO; inode_set_iversion(root_inode, 1); - err = exfat_read_root(root_inode); + err = exfat_read_root(root_inode, &root_clu); if (err) { exfat_err(sb, "failed to initialize root inode"); goto put_inode; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f149ec28aefd..db3831f7f2f5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,7 +82,7 @@ repeat: if (folio_test_uptodate(folio)) goto out; - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); if (err) { @@ -309,7 +309,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, continue; } - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); f2fs_folio_put(folio, err ? true : false); @@ -485,7 +485,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, folio_mark_uptodate(folio); if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -1045,7 +1045,7 @@ void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b3c1df93a163..5c1f47e45dab 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -23,20 +23,18 @@ static struct kmem_cache *cic_entry_slab; static struct kmem_cache *dic_entry_slab; -static void *page_array_alloc(struct inode *inode, int nr) +static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) return f2fs_kmem_cache_alloc(sbi->page_array_slab, - GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); + GFP_F2FS_ZERO, false, sbi); return f2fs_kzalloc(sbi, size, GFP_NOFS); } -static void page_array_free(struct inode *inode, void *pages, int nr) +static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (!pages) @@ -73,17 +71,15 @@ static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) return cc->cluster_idx << cc->log_cluster_size; } -bool f2fs_is_compressed_page(struct page *page) +bool f2fs_is_compressed_page(struct folio *folio) { - if (!PagePrivate(page)) - return false; - if (!page_private(page)) + if (!folio->private) return false; - if (page_private_nonpointer(page)) + if (folio_test_f2fs_nonpointer(folio)) return false; - f2fs_bug_on(F2FS_P_SB(page), - *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + f2fs_bug_on(F2FS_F_SB(folio), + *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } @@ -149,13 +145,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) if (cc->rpages) return 0; - cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size); return cc->rpages ? 0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { - page_array_free(cc->inode, cc->rpages, cc->cluster_size); + page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -216,13 +212,13 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic) ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo decompress failed, ret:%d", ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo invalid rlen:%zu, expected:%lu", dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -296,13 +292,13 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 decompress failed, ret:%d", ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 invalid ret:%d, expected:%lu", ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -424,13 +420,13 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) workspace_size = zstd_dstream_workspace_bound(max_window_size); - workspace = f2fs_vmalloc(F2FS_I_SB(dic->inode), workspace_size); + workspace = f2fs_vmalloc(dic->sbi, workspace_size); if (!workspace) return -ENOMEM; stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_init_dstream failed", __func__); vfree(workspace); return -EIO; @@ -466,14 +462,14 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) ret = zstd_decompress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_decompress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s ZSTD invalid rlen:%zu, expected:%lu", __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); @@ -622,6 +618,7 @@ static void *f2fs_vmap(struct page **pages, unsigned int count) static int f2fs_compress_pages(struct compress_ctx *cc) { + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -642,7 +639,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); cc->valid_nr_cpages = cc->nr_cpages; - cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); + cc->cpages = page_array_alloc(sbi, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -716,7 +713,7 @@ out_free_cpages: if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -734,7 +731,7 @@ static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_sb_info *sbi = dic->sbi; struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -796,25 +793,27 @@ out_end_io: f2fs_decompress_end_io(dic, ret, in_task); } +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr); + /* * This is called when a page of a compressed cluster has been read from disk * (or failed to be read from disk). It checks whether this page was the last * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct decompress_io_ctx *dic = folio->private; + struct f2fs_sb_info *sbi = dic->sbi; dec_page_count(sbi, F2FS_RD_DATA); if (failed) WRITE_ONCE(dic->failed, true); else if (blkaddr && in_task) - f2fs_cache_compressed_page(sbi, page, + f2fs_cache_compressed_page(sbi, folio, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) @@ -1340,7 +1339,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->valid_nr_cpages); - cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1420,7 +1419,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); - unlock_page(fio.page); + folio_unlock(fio.folio); } if (fio.compr_blocks) @@ -1442,13 +1441,13 @@ unlock_continue: spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: - page_array_free(cc->inode, cic->rpages, cc->cluster_size); + page_array_free(sbi, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) { if (!cc->cpages[i]) @@ -1469,18 +1468,18 @@ out_free: f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; } -void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio) { + struct page *page = &folio->page; struct f2fs_sb_info *sbi = bio->bi_private; - struct compress_io_ctx *cic = - (struct compress_io_ctx *)page_private(page); - enum count_type type = WB_DATA_TYPE(page, - f2fs_is_compressed_page(page)); + struct compress_io_ctx *cic = folio->private; + enum count_type type = WB_DATA_TYPE(folio, + f2fs_is_compressed_page(folio)); int i; if (unlikely(bio->bi_status != BLK_STS_OK)) @@ -1499,7 +1498,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + page_array_free(sbi, cic->rpages, cic->nr_rpages); kmem_cache_free(cic_entry_slab, cic); } @@ -1633,14 +1632,13 @@ static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; int i; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return 0; - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size); if (!dic->tpages) return -ENOMEM; @@ -1670,10 +1668,9 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return; if (!bypass_destroy_callback && cops->destroy_decompress_ctx) @@ -1700,7 +1697,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + dic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!dic->rpages) { kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); @@ -1708,6 +1705,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; + dic->sbi = sbi; + dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm; atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; @@ -1721,7 +1720,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); + dic->cpages = page_array_alloc(sbi, dic->nr_cpages); if (!dic->cpages) { ret = -ENOMEM; goto out_free; @@ -1751,6 +1750,8 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback) { int i; + /* use sbi in dic to avoid UFA of dic->inode*/ + struct f2fs_sb_info *sbi = dic->sbi; f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); @@ -1762,7 +1763,7 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->tpages[i]); } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); + page_array_free(sbi, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1771,10 +1772,10 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->cpages[i]); } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + page_array_free(sbi, dic->cpages, dic->nr_cpages); } - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + page_array_free(sbi, dic->rpages, dic->nr_rpages); kmem_cache_free(dic_entry_slab, dic); } @@ -1793,8 +1794,7 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) f2fs_free_dic(dic, false); } else { INIT_WORK(&dic->free_work, f2fs_late_free_dic); - queue_work(F2FS_I_SB(dic->inode)->post_read_wq, - &dic->free_work); + queue_work(dic->sbi->post_read_wq, &dic->free_work); } } } @@ -1921,8 +1921,8 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1); } -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr) +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr) { struct folio *cfolio; int ret; @@ -1953,9 +1953,9 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, return; } - set_page_private_data(&cfolio->page, ino); + folio_set_f2fs_data(cfolio, ino); - memcpy(folio_address(cfolio), page_address(page), PAGE_SIZE); + memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE); folio_mark_uptodate(cfolio); f2fs_folio_put(cfolio, true); } @@ -2012,7 +2012,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - if (ino != get_page_private_data(&folio->page)) { + if (ino != folio_get_f2fs_data(folio)) { folio_unlock(folio); continue; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 711ad80b38d0..7961e0ddfca3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,14 +47,14 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -bool f2fs_is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(const struct folio *folio) { - struct address_space *mapping = page_folio(page)->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode; struct f2fs_sb_info *sbi; - if (fscrypt_is_bounce_page(page)) - return page_private_gcing(fscrypt_pagecache_page(page)); + if (fscrypt_is_bounce_folio(folio)) + return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio)); inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -65,7 +65,7 @@ bool f2fs_is_cp_guaranteed(struct page *page) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || - page_private_gcing(page)) + folio_test_f2fs_gcing(folio)) return true; return false; } @@ -142,9 +142,9 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(&folio->page)) { + if (f2fs_is_compressed_page(folio)) { if (ctx && !ctx->decompression_attempted) - f2fs_end_read_compressed_page(&folio->page, true, 0, + f2fs_end_read_compressed_page(folio, true, 0, in_task); f2fs_put_folio_dic(folio, in_task); continue; @@ -181,14 +181,13 @@ static void f2fs_verify_bio(struct work_struct *work) * as those were handled separately by f2fs_end_read_compressed_page(). */ if (may_have_compressed_pages) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) { + if (!f2fs_is_compressed_page(folio) && + !fsverity_verify_page(&folio->page)) { bio->bi_status = BLK_STS_IOERR; break; } @@ -233,16 +232,15 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; - bio_for_each_segment_all(bv, ctx->bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, ctx->bio) { + struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, false, blkaddr, + if (f2fs_is_compressed_page(folio)) + f2fs_end_read_compressed_page(folio, false, blkaddr, in_task); else all_compressed = false; @@ -280,9 +278,9 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio)); struct bio_post_read_ctx *ctx; - bool intask = in_task(); + bool intask = in_task() && !irqs_disabled(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; @@ -339,13 +337,13 @@ static void f2fs_write_end_io(struct bio *bio) } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(&folio->page)) { - f2fs_compress_write_end_io(bio, &folio->page); + if (f2fs_is_compressed_page(folio)) { + f2fs_compress_write_end_io(bio, folio); continue; } #endif - type = WB_DATA_TYPE(&folio->page, false); + type = WB_DATA_TYPE(folio, false); if (unlikely(bio->bi_status != BLK_STS_OK)) { mapping_set_error(folio->mapping, -EIO); @@ -355,12 +353,12 @@ static void f2fs_write_end_io(struct bio *bio) } f2fs_bug_on(sbi, is_node_folio(folio) && - folio->index != nid_of_node(&folio->page)); + folio->index != nid_of_node(folio)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, folio); - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -419,7 +417,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); - struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -447,7 +444,7 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_FUA; if (fio->type == DATA && - F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) op_flags |= REQ_PRIO; return op_flags; @@ -546,14 +543,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) } static bool __has_merged_page(struct bio *bio, struct inode *inode, - struct page *page, nid_t ino) + struct folio *folio, nid_t ino) { struct folio_iter fi; if (!bio) return false; - if (!inode && !page && !ino) + if (!inode && !folio && !ino) return true; bio_for_each_folio_all(fi, bio) { @@ -564,7 +561,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (IS_ERR(target)) continue; } - if (f2fs_is_compressed_page(&target->page)) { + if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_folio(target); if (IS_ERR(target)) continue; @@ -572,9 +569,9 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (inode && inode == target->mapping->host) return true; - if (page && page == &target->page) + if (folio && folio == target) return true; - if (ino && ino == ino_of_node(&target->page)) + if (ino && ino == ino_of_node(target)) return true; } @@ -641,7 +638,7 @@ unlock_out: } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type, bool force) { enum temp_type temp; @@ -653,7 +650,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); - ret = __has_merged_page(io->bio, inode, page, ino); + ret = __has_merged_page(io->bio, inode, folio, ino); f2fs_up_read(&io->io_rwsem); } if (ret) @@ -671,10 +668,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, page, ino, type, false); + __submit_merged_write_cond(sbi, inode, folio, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -691,7 +688,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct folio *data_folio = fio->encrypted_page ? page_folio(fio->encrypted_page) : fio_folio; @@ -713,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false)); + __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); @@ -779,7 +776,7 @@ static void del_bio_entry(struct bio_entry *be) static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; @@ -848,7 +845,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) break; } @@ -865,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) { target = be->bio; del_bio_entry(be); @@ -886,15 +883,15 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; - struct page *page = fio->encrypted_page ? - fio->encrypted_page : fio->page; - struct folio *folio = page_folio(fio->page); + struct folio *data_folio = fio->encrypted_page ? + page_folio(fio->encrypted_page) : fio->folio; + struct folio *folio = fio->folio; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; - trace_f2fs_submit_folio_bio(page_folio(page), fio); + trace_f2fs_submit_folio_bio(data_folio, fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -905,16 +902,16 @@ alloc_new: f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, folio->index, fio, GFP_NOIO); - add_bio_entry(fio->sbi, bio, page, fio->temp); + add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp); } else { - if (add_ipu_page(fio, &bio, page)) + if (add_ipu_page(fio, &bio, &data_folio->page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -949,7 +946,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; - struct page *bio_page; + struct folio *bio_folio; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -980,44 +977,44 @@ next: verify_fio_blkaddr(fio); if (fio->encrypted_page) - bio_page = fio->encrypted_page; + bio_folio = page_folio(fio->encrypted_page); else if (fio->compressed_page) - bio_page = fio->compressed_page; + bio_folio = page_folio(fio->compressed_page); else - bio_page = fio->page; + bio_folio = fio->folio; /* set submitted = true as a return value */ fio->submitted = 1; - type = WB_DATA_TYPE(bio_page, fio->compressed_page); + type = WB_DATA_TYPE(bio_folio, fio->compressed_page); inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio))) + bio_folio->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio, GFP_NOIO); + bio_folio->index, fio, GFP_NOIO); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { + if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->folio, + folio_size(fio->folio)); io->last_block_in_bio = fio->new_blkaddr; - trace_f2fs_submit_folio_write(page_folio(fio->page), fio); + trace_f2fs_submit_folio_write(fio->folio, fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { @@ -1553,10 +1550,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) unsigned int start_pgofs; int bidx = 0; bool is_hole; + bool lfs_dio_write; if (!maxblocks) return 0; + lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + map->m_may_create); + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; @@ -1572,8 +1573,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) end = pgofs + maxblocks; next_dnode: - if (map->m_may_create) + if (map->m_may_create) { + if (f2fs_lfs_mode(sbi)) + f2fs_balance_fs(sbi, true); f2fs_map_lock(sbi, flag); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1589,7 +1593,7 @@ next_dnode: start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); @@ -1603,7 +1607,7 @@ next_block: /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - !f2fs_is_pinned_file(inode)))) { + !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; @@ -1687,10 +1691,15 @@ next_block: if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; + + if (lfs_dio_write) + map->m_last_pblk = NULL_ADDR; } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { + if (lfs_dio_write && !f2fs_is_pinned_file(inode)) + map->m_last_pblk = blkaddr; goto sync_out; } @@ -1715,14 +1724,6 @@ skip: dn.ofs_in_node = end_offset; } - if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - map->m_may_create) { - /* the next block to be allocated may not be contiguous. */ - if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) == - CAP_BLKS_PER_SEC(sbi) - 1) - goto sync_out; - } - if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) @@ -2303,7 +2304,7 @@ submit_and_realloc: } if (!bio) { - bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); if (IS_ERR(bio)) { @@ -2376,6 +2377,14 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned max_nr_pages = nr_pages; int ret = 0; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + index = rac ? readahead_index(rac) : folio->index; + max_nr_pages = round_up(index + nr_pages, cc.cluster_size) - + round_down(index, cc.cluster_size); + } +#endif + map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; @@ -2642,7 +2651,7 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) int f2fs_do_write_data_page(struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; @@ -2652,7 +2661,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* Use COW inode to make dnode_of_data for atomic write */ atomic_commit = f2fs_is_atomic_file(inode) && - page_private_atomic(folio_page(folio, 0)); + folio_test_f2fs_atomic(folio); if (atomic_commit) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else @@ -2683,7 +2692,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { folio_clear_uptodate(folio); - clear_page_private_gcing(folio_page(folio, 0)); + folio_clear_f2fs_gcing(folio); goto out_writepage; } got_it: @@ -2753,7 +2762,7 @@ got_it: trace_f2fs_do_write_data_page(folio, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (atomic_commit) - clear_page_private_atomic(folio_page(folio, 0)); + folio_clear_f2fs_atomic(folio); out_writepage: f2fs_put_dnode(&dn); out: @@ -2771,7 +2780,6 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, bool allow_balance) { struct inode *inode = folio->mapping->host; - struct page *page = folio_page(folio, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) @@ -2788,7 +2796,7 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, - .page = page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, @@ -2890,7 +2898,7 @@ out: inode_dec_dirty_pages(inode); if (err) { folio_clear_uptodate(folio); - clear_page_private_gcing(page); + folio_clear_f2fs_gcing(folio); } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && @@ -3376,7 +3384,7 @@ restart: f2fs_do_read_inline_data(folio, ifolio); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_page_private_inline(&ifolio->page); + folio_set_f2fs_inline(ifolio); goto out; } err = f2fs_convert_inline_folio(&dn, folio); @@ -3698,7 +3706,7 @@ static int f2fs_write_end(const struct kiocb *iocb, folio_mark_dirty(folio); if (f2fs_is_atomic_file(inode)) - set_page_private_atomic(folio_page(folio, 0)); + folio_set_f2fs_atomic(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3733,7 +3741,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - clear_page_private_all(&folio->page); + folio_detach_private(folio); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) @@ -3742,7 +3750,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - clear_page_private_all(&folio->page); + folio_detach_private(folio); return true; } @@ -4160,7 +4168,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { - struct f2fs_map_blocks map = {}; + struct f2fs_map_blocks map = { NULL, }; pgoff_t next_pgofs = 0; int err; @@ -4169,6 +4177,10 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); + if (flags & IOMAP_WRITE && iomap->private) { + map.m_last_pblk = (unsigned long)iomap->private; + iomap->private = NULL; + } /* * If the blocks being overwritten are already allocated, @@ -4207,6 +4219,9 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); + + if (flags & IOMAP_WRITE && map.m_last_pblk) + iomap->private = (void *)map.m_last_pblk; } else { if (flags & IOMAP_WRITE) return -ENOTBLK; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 16c2dfb4f595..43a83bbd3bc5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); +static DEFINE_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -91,7 +91,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) seg_blks = get_seg_entry(sbi, j)->valid_blocks; /* update segment stats */ - if (IS_CURSEG(sbi, j)) + if (is_curseg(sbi, j)) dev_stats[i].devstats[0][DEVSTAT_INUSE]++; else if (seg_blks == BLKS_PER_SEG(sbi)) dev_stats[i].devstats[0][DEVSTAT_FULL]++; @@ -109,7 +109,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) sec_blks = get_sec_entry(sbi, j)->valid_blocks; /* update section stats */ - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j))) dev_stats[i].devstats[1][DEVSTAT_INUSE]++; else if (sec_blks == BLKS_PER_SEC(sbi)) dev_stats[i].devstats[1][DEVSTAT_FULL]++; @@ -439,9 +439,8 @@ static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_for_each_entry(si, &f2fs_stat_list, stat_list) { struct f2fs_sb_info *sbi = si->sbi; @@ -753,7 +752,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -765,7 +764,6 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; struct f2fs_dev_stats *dev_stats; - unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -817,9 +815,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_add_tail(&si->stat_list, &f2fs_stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -827,11 +825,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_del(&si->stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); kfree(si->dev_stats); kfree(si); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c36b3b22bfff..fffd7749d6d1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -454,7 +454,7 @@ static void init_dent_inode(struct inode *dir, struct inode *inode, f2fs_folio_wait_writeback(ifolio, NODE, true, true); /* copy name info. to this inode folio */ - ri = F2FS_INODE(&ifolio->page); + ri = F2FS_INODE(ifolio); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { @@ -897,7 +897,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, f2fs_clear_page_cache_dirty_tag(folio); folio_clear_dirty_for_io(folio); folio_clear_uptodate(folio); - clear_page_private_all(&folio->page); + folio_detach_private(folio); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cfe925a3d555..199c1e7a83ef 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,10 +19,10 @@ #include "node.h" #include <trace/events/f2fs.h> -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_info ei; int devi; @@ -411,10 +411,10 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; - struct f2fs_extent *i_ext = &F2FS_INODE(&ifolio->page)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_tree *et; struct extent_node *en; - struct extent_info ei; + struct extent_info ei = {0}; if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ @@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ if (!__may_extent_tree(dn->inode, type)) return; - ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + dn->ofs_in_node; ei.len = 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c78464792ceb..46be7560548c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -386,7 +386,7 @@ struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ struct discard_info di; /* discard info */ struct list_head list; /* command list */ - struct completion wait; /* compleation */ + struct completion wait; /* completion */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ @@ -732,6 +732,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; @@ -875,6 +876,7 @@ struct f2fs_inode_info { /* linked in global inode list for cache donation */ struct list_head gdonate_list; pgoff_t donate_start, donate_end; /* inclusive */ + atomic_t open_count; /* # of open files */ struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; @@ -1123,8 +1125,8 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ -#define WB_DATA_TYPE(p, f) \ - (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +#define WB_DATA_TYPE(folio, f) \ + (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -1240,7 +1242,10 @@ struct f2fs_io_info { blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ - struct page *page; /* page to be written */ + union { + struct page *page; /* page to be written */ + struct folio *folio; + }; struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ @@ -1286,7 +1291,7 @@ struct f2fs_bio_info { struct f2fs_dev_info { struct file *bdev_file; struct block_device *bdev; - char path[MAX_PATH_LEN]; + char path[MAX_PATH_LEN + 1]; unsigned int total_segments; block_t start_blk; block_t end_blk; @@ -1427,7 +1432,7 @@ enum { enum { MEMORY_MODE_NORMAL, /* memory mode for normal devices */ - MEMORY_MODE_LOW, /* memory mode for low memry devices */ + MEMORY_MODE_LOW, /* memory mode for low memory devices */ }; enum errors_option { @@ -1491,7 +1496,7 @@ enum compress_flag { #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* compressed data chksum */ + __le32 chksum; /* compressed data checksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1536,6 +1541,7 @@ struct compress_io_ctx { struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ @@ -1576,6 +1582,7 @@ struct decompress_io_ctx { bool failed; /* IO error occurred before decompression? */ bool need_verity; /* need fs-verity verification after decompression? */ + unsigned char compress_algorithm; /* backup algorithm type */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ @@ -1724,6 +1731,9 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ + /* free sections reserved for pinned file */ + unsigned int reserved_pin_section; + /* threshold for gc trials on pinned files */ unsigned short gc_pin_file_threshold; struct f2fs_rwsem pin_sem; @@ -2013,16 +2023,11 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) return F2FS_I_SB(mapping->host); } -static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio) +static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) { return F2FS_M_SB(folio->mapping); } -static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) -{ - return F2FS_F_SB(page_folio(page)); -} - static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); @@ -2043,14 +2048,14 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(const struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) { - return (struct f2fs_node *)page_address(page); + return (struct f2fs_node *)folio_address(folio); } -static inline struct f2fs_inode *F2FS_INODE(struct page *page) +static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio) { - return &((struct f2fs_node *)page_address(page))->i; + return &((struct f2fs_node *)folio_address(folio))->i; } static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) @@ -2453,6 +2458,13 @@ release_quota: } #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool folio_test_f2fs_##name(const struct folio *folio) \ +{ \ + unsigned long priv = (unsigned long)folio->private; \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + return (priv & v) == v; \ +} \ static inline bool page_private_##name(struct page *page) \ { \ return PagePrivate(page) && \ @@ -2461,6 +2473,17 @@ static inline bool page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void folio_set_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + if (!folio->private) \ + folio_attach_private(folio, (void *)v); \ + else { \ + v |= (unsigned long)folio->private; \ + folio->private = (void *)v; \ + } \ +} \ static inline void set_page_private_##name(struct page *page) \ { \ if (!PagePrivate(page)) \ @@ -2470,6 +2493,16 @@ static inline void set_page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void folio_clear_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (unsigned long)folio->private; \ + \ + v &= ~(1UL << PAGE_PRIVATE_##flagname); \ + if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ + folio_detach_private(folio); \ + else \ + folio->private = (void *)v; \ +} \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ @@ -2492,39 +2525,23 @@ PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); -static inline unsigned long get_page_private_data(struct page *page) +static inline unsigned long folio_get_f2fs_data(struct folio *folio) { - unsigned long data = page_private(page); + unsigned long data = (unsigned long)folio->private; if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) return 0; return data >> PAGE_PRIVATE_MAX; } -static inline void set_page_private_data(struct page *page, unsigned long data) +static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data) { - if (!PagePrivate(page)) - attach_page_private(page, (void *)0); - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); - page_private(page) |= data << PAGE_PRIVATE_MAX; -} - -static inline void clear_page_private_data(struct page *page) -{ - page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); - if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) - detach_page_private(page); -} + data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); -static inline void clear_page_private_all(struct page *page) -{ - clear_page_private_data(page); - clear_page_private_reference(page); - clear_page_private_gcing(page); - clear_page_private_inline(page); - clear_page_private_atomic(page); - - f2fs_bug_on(F2FS_P_SB(page), page_private(page)); + if (!folio_test_private(folio)) + folio_attach_private(folio, (void *)data); + else + folio->private = (void *)((unsigned long)folio->private | data); } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, @@ -3011,9 +3028,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) -static inline bool IS_INODE(struct page *page) +static inline bool IS_INODE(const struct folio *folio) { - struct f2fs_node *p = F2FS_NODE(page); + struct f2fs_node *p = F2FS_NODE(folio); return RAW_IS_INODE(p); } @@ -3031,20 +3048,20 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, - struct page *node_page) + struct folio *node_folio) { - if (!IS_INODE(node_page)) + if (!IS_INODE(node_folio)) return 0; return inode ? get_extra_isize(inode) : - offset_in_addr(&F2FS_NODE(node_page)->i); + offset_in_addr(&F2FS_NODE(node_folio)->i); } static inline __le32 *get_dnode_addr(struct inode *inode, struct folio *node_folio) { - return blkaddr_in_node(F2FS_NODE(&node_folio->page)) + - get_dnode_base(inode, &node_folio->page); + return blkaddr_in_node(F2FS_NODE(node_folio)) + + get_dnode_base(inode, node_folio); } static inline block_t data_blkaddr(struct inode *inode, @@ -3366,9 +3383,10 @@ static inline unsigned int addrs_per_page(struct inode *inode, return addrs; } -static inline void *inline_xattr_addr(struct inode *inode, struct folio *folio) +static inline +void *inline_xattr_addr(struct inode *inode, const struct folio *folio) { - struct f2fs_inode *ri = F2FS_INODE(&folio->page); + struct f2fs_inode *ri = F2FS_INODE(folio); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); @@ -3628,13 +3646,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc); */ void f2fs_set_inode_flags(struct inode *inode); bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); void f2fs_handle_failed_inode(struct inode *inode); @@ -3784,8 +3803,8 @@ void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); -int f2fs_recover_xattr_data(struct inode *inode, struct page *page); -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3852,7 +3871,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type seg_type); -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); @@ -3886,7 +3905,7 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, static inline struct inode *fio_inode(struct f2fs_io_info *fio) { - return page_folio(fio->page)->mapping->host; + return fio->folio->mapping->host; } #define DEF_FRAGMENT_SIZE 4 @@ -3953,7 +3972,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -bool f2fs_is_cp_guaranteed(struct page *page); +bool f2fs_is_cp_guaranteed(const struct folio *folio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -3961,7 +3980,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); @@ -4303,7 +4322,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage); +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, @@ -4345,7 +4364,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage); +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); @@ -4435,20 +4454,20 @@ enum cluster_check_type { CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; -bool f2fs_is_compressed_page(struct page *page); +bool f2fs_is_compressed_page(struct folio *folio); struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); -void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); bool f2fs_is_compress_backend_ready(struct inode *inode); bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); @@ -4486,8 +4505,6 @@ void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr); bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); @@ -4504,7 +4521,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); sbi->compr_saved_block += diff; \ } while (0) #else -static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) @@ -4522,7 +4539,7 @@ static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } -static inline void f2fs_end_read_compressed_page(struct page *page, +static inline void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); @@ -4542,8 +4559,6 @@ static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { } -static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, - struct page *page, nid_t ino, block_t blkaddr) { } static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c677230699fd..42faaed6a02d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -629,7 +629,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - return finish_preallocate_blocks(inode); + err = finish_preallocate_blocks(inode); + if (!err) + atomic_inc(&F2FS_I(inode)->open_count); + return err; } void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -708,7 +711,7 @@ next: * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. */ - fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); f2fs_update_age_extent_cache_range(dn, fofs, len); @@ -815,12 +818,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } - count = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + count = ADDRS_PER_PAGE(dn.node_folio, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) { + if (dn.ofs_in_node || IS_INODE(dn.node_folio)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } @@ -1043,11 +1046,24 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = setattr_prepare(idmap, dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -1064,20 +1080,19 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, !IS_ALIGNED(attr->ia_size, F2FS_BLK_TO_BYTES(fi->i_cluster_size))) return -EINVAL; + /* + * To prevent scattered pin block generation, we don't allow + * smaller/equal size unaligned truncation for pinned file. + * We only support overwrite IO to pinned file, so don't + * care about larger size truncation. + */ + if (f2fs_is_pinned_file(inode) && + attr->ia_size <= i_size_read(inode) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi)))) + return -EINVAL; } - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; - - err = fscrypt_prepare_setattr(dentry, attr); - if (err) - return err; - - err = fsverity_prepare_setattr(dentry, attr); - if (err) - return err; - if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) @@ -1085,12 +1100,11 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); err = dquot_transfer(idmap, inode, attr); if (err) { - set_sbi_flag(F2FS_I_SB(inode), - SBI_QUOTA_NEED_REPAIR); - f2fs_unlock_op(F2FS_I_SB(inode)); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(sbi); return err; } /* @@ -1100,7 +1114,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); - f2fs_unlock_op(F2FS_I_SB(inode)); + f2fs_unlock_op(sbi); } if (attr->ia_valid & ATTR_SIZE) { @@ -1163,7 +1177,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); return err; } @@ -1223,7 +1237,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -1322,7 +1336,7 @@ next_dnode: goto next; } - done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) - + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = f2fs_data_blkaddr(&dn); @@ -1411,7 +1425,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } ilen = min((pgoff_t) - ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) - + ADDRS_PER_PAGE(dn.node_folio, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = f2fs_data_blkaddr(&dn); @@ -1453,7 +1467,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE); folio_mark_dirty(fdst); - set_page_private_gcing(&fdst->page); + folio_set_f2fs_gcing(fdst); f2fs_folio_put(fdst, true); f2fs_folio_put(fsrc, true); @@ -1707,7 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); end = min(pg_end, end_offset - dn.ofs_in_node + index); ret = f2fs_do_zero_range(&dn, index, end); @@ -1888,9 +1902,8 @@ next_alloc: } } - if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? - ZONED_PIN_SEC_REQUIRED_COUNT : - GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + if (has_not_enough_free_secs(sbi, 0, + sbi->reserved_pin_section)) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); @@ -2028,6 +2041,9 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + if (atomic_dec_and_test(&F2FS_I(inode)->open_count)) + f2fs_remove_donate_inode(inode); + /* * f2fs_release_file is called at every close calls. So we should * not drop any inmemory pages by close called by other process. @@ -2978,7 +2994,7 @@ do_map: f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); f2fs_folio_put(folio, true); idx++; @@ -3876,7 +3892,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4054,7 +4070,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4218,7 +4234,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - index); for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { struct block_device *cur_bdev; @@ -4415,7 +4431,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); redirty_idx = folio_next_index(folio); folio_unlock(folio); folio_put_refs(folio, 2); @@ -4825,6 +4841,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); const loff_t pos = iocb->ki_pos; ssize_t ret; + bool dio; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4833,12 +4850,15 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, iov_iter_count(to), READ); + dio = f2fs_should_use_dio(inode, iocb, to); + /* In LFS mode, if there is inflight dio, wait for its completion */ if (f2fs_lfs_mode(F2FS_I_SB(inode)) && - get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) && + (!f2fs_is_pinned_file(inode) || !dio)) inode_dio_wait(inode); - if (f2fs_should_use_dio(inode, iocb, to)) { + if (dio) { ret = f2fs_dio_read_iter(iocb, to); } else { ret = filemap_read(iocb, to, 0); @@ -4846,8 +4866,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); } - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4870,8 +4889,7 @@ static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos, f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -5216,8 +5234,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); - if (trace_f2fs_datawrite_end_enabled()) - trace_f2fs_datawrite_end(inode, orig_pos, ret); + trace_f2fs_datawrite_end(inode, orig_pos, ret); } /* Don't leave any preallocated blocks around past i_size. */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3cb5242f4ddf..098e9f71421e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,10 +141,10 @@ do_gc: FOREGROUND : BACKGROUND); sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) || - gc_control.one_time; + (gc_control.one_time && gc_th->boost_gc_greedy); /* foreground GC was been triggered via f2fs_balance_fs() */ - if (foreground) + if (foreground && !f2fs_sb_has_blkzoned(sbi)) sync_mode = false; gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; @@ -197,6 +197,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO; + gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE; + gc_th->boost_gc_greedy = GC_GREEDY; if (f2fs_sb_has_blkzoned(sbi)) { gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED; @@ -278,12 +280,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode == SSR) { - p->gc_mode = GC_GREEDY; - p->dirty_bitmap = dirty_i->dirty_segmap[type]; - p->max_search = dirty_i->nr_dirty[type]; - p->ofs_unit = 1; - } else if (p->alloc_mode == AT_SSR) { + if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) { p->gc_mode = GC_GREEDY; p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; @@ -389,14 +386,15 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) } static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, - unsigned int segno, struct victim_sel_policy *p) + unsigned int segno, struct victim_sel_policy *p, + unsigned int valid_thresh_ratio) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; - if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >= - CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio / - 100)) + if (p->one_time_gc && (valid_thresh_ratio < 100) && + (get_valid_blocks(sbi, segno, true) >= + CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100)) return UINT_MAX; /* alloc_mode == LFS */ @@ -777,6 +775,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, unsigned int secno, last_victim; unsigned int last_segment; unsigned int nsearched; + unsigned int valid_thresh_ratio = 100; bool is_atgc; int ret = 0; @@ -786,7 +785,11 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, p.alloc_mode = alloc_mode; p.age = age; p.age_threshold = sbi->am.age_threshold; - p.one_time_gc = one_time; + if (one_time) { + p.one_time_gc = one_time; + if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG)) + valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio; + } retry: select_policy(sbi, gc_type, type, &p); @@ -912,7 +915,7 @@ retry: goto next; } - cost = get_gc_cost(sbi, segno, &p); + cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio); if (p.min_cost > cost) { p.min_segno = segno; @@ -1162,8 +1165,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - if (IS_INODE(&node_folio->page)) { - base = offset_in_addr(F2FS_INODE(&node_folio->page)); + if (IS_INODE(node_folio)) { + base = offset_in_addr(F2FS_INODE(node_folio)); max_addrs = DEF_ADDRS_PER_INODE; } else { base = 0; @@ -1177,7 +1180,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - *nofs = ofs_of_node(&node_folio->page); + *nofs = ofs_of_node(node_folio); source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node); f2fs_folio_put(node_folio, true); @@ -1249,7 +1252,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) } got_it: /* read folio */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; /* @@ -1353,7 +1356,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_out; /* read page */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) @@ -1473,7 +1476,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,7 +1486,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, @@ -1499,11 +1502,11 @@ retry: f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; @@ -1749,7 +1752,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, !has_enough_free_blocks(sbi, sbi->gc_thread->boost_zoned_gc_percent)) window_granularity *= - BOOST_GC_MULTIPLE; + sbi->gc_thread->boost_gc_multiple; end_segno = start_segno + window_granularity; } @@ -1891,6 +1894,7 @@ gc_more: /* Let's run FG_GC, if we don't have enough space. */ if (has_not_enough_free_secs(sbi, 0, 0)) { gc_type = FG_GC; + gc_control->one_time = false; /* * For example, if there are many prefree_segments below given @@ -2064,7 +2068,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, segno))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5c1eaf55e127..24e8b1c27acc 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -68,6 +68,8 @@ struct f2fs_gc_kthread { unsigned int no_zoned_gc_percent; unsigned int boost_zoned_gc_percent; unsigned int valid_thresh_ratio; + unsigned int boost_gc_multiple; + unsigned int boost_gc_greedy; }; struct gc_inode_list { @@ -194,6 +196,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) - return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC); + return !has_enough_free_blocks(sbi, + sbi->gc_thread->boost_zoned_gc_percent); return has_enough_invalid_blocks(sbi); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 901c630685ce..58ac831ef704 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -33,9 +33,9 @@ bool f2fs_may_inline_data(struct inode *inode) return !f2fs_post_read_required(inode); } -static bool inode_has_blocks(struct inode *inode, struct page *ipage) +static bool inode_has_blocks(struct inode *inode, struct folio *ifolio) { - struct f2fs_inode *ri = F2FS_INODE(ipage); + struct f2fs_inode *ri = F2FS_INODE(ifolio); int i; if (F2FS_HAS_BLOCKS(inode)) @@ -48,12 +48,12 @@ static bool inode_has_blocks(struct inode *inode, struct page *ipage) return false; } -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage) +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio) { if (!f2fs_has_inline_data(inode)) return false; - if (inode_has_blocks(inode, ipage)) + if (inode_has_blocks(inode, ifolio)) return false; if (!support_inline_data(inode)) @@ -150,7 +150,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .io_type = FS_DATA_IO, }; @@ -206,7 +206,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0); - clear_page_private_inline(&dn->inode_folio->page); + folio_clear_f2fs_inline(dn->inode_folio); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -286,7 +286,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_page_private_inline(&ifolio->page); + folio_clear_f2fs_inline(ifolio); f2fs_folio_put(ifolio, 1); return 0; } @@ -305,8 +305,8 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ - if (IS_INODE(&nfolio->page)) - ri = F2FS_INODE(&nfolio->page); + if (IS_INODE(nfolio)) + ri = F2FS_INODE(nfolio); if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { @@ -825,7 +825,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ifolio) - - (char *)F2FS_INODE(&ifolio->page); + (char *)F2FS_INODE(ifolio); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 083d52a42bfb..8c4eafe9ffac 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,7 +108,7 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) f2fs_folio_wait_writeback(ifolio, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); - set_raw_inline(inode, F2FS_INODE(&ifolio->page)); + set_raw_inline(inode, F2FS_INODE(ifolio)); folio_mark_dirty(ifolio); return; } @@ -116,14 +116,15 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) return; } -static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static +bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; if (!f2fs_sb_has_inode_chksum(sbi)) return false; - if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), @@ -133,9 +134,9 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page return true; } -static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_node *node = F2FS_NODE(folio); struct f2fs_inode *ri = &node->i; __le32 ino = node->footer.ino; __le32 gen = ri->i_generation; @@ -164,34 +165,34 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio) return true; #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_enable_inode_chksum(sbi, &folio->page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) #else - if (!f2fs_enable_inode_chksum(sbi, &folio->page) || + if (!f2fs_enable_inode_chksum(sbi, folio) || folio_test_dirty(folio) || folio_test_writeback(folio)) #endif return true; - ri = &F2FS_NODE(&folio->page)->i; + ri = &F2FS_NODE(folio)->i; provided = le32_to_cpu(ri->i_inode_checksum); - calculated = f2fs_inode_chksum(sbi, &folio->page); + calculated = f2fs_inode_chksum(sbi, folio); if (provided != calculated) f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", - folio->index, ino_of_node(&folio->page), + folio->index, ino_of_node(folio), provided, calculated); return provided == calculated; } -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) return; - ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio)); } static bool sanity_check_compress_inode(struct inode *inode, @@ -266,28 +267,28 @@ err_level: return false; } -static bool sanity_check_inode(struct inode *inode, struct page *node_page) +static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri = F2FS_INODE(node_page); + struct f2fs_inode *ri = F2FS_INODE(node_folio); unsigned long long iblocks; - iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); if (!iblocks) { f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } - if (ino_of_node(node_page) != nid_of_node(node_page)) { + if (ino_of_node(node_folio) != nid_of_node(node_folio)) { f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, - ino_of_node(node_page), nid_of_node(node_page)); + ino_of_node(node_folio), nid_of_node(node_folio)); return false; } - if (ino_of_node(node_page) == fi->i_xattr_nid) { + if (ino_of_node(node_folio) == fi->i_xattr_nid) { f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; @@ -354,7 +355,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_sanity_check_inline_data(inode, node_page)) { + if (f2fs_sanity_check_inline_data(inode, node_folio)) { f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; @@ -419,7 +420,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -469,7 +470,7 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, &node_folio->page)) { + if (!sanity_check_inode(inode, node_folio)) { f2fs_folio_put(node_folio, true); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); @@ -481,9 +482,9 @@ static int do_read_inode(struct inode *inode) __recover_inline_status(inode, node_folio); /* try to recover cold bit for non-dir inode */ - if (!S_ISDIR(inode->i_mode) && !is_cold_node(&node_folio->page)) { + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) { f2fs_folio_wait_writeback(node_folio, NODE, true, true); - set_cold_node(&node_folio->page, false); + set_cold_node(node_folio, false); folio_mark_dirty(node_folio); } @@ -531,7 +532,7 @@ static int do_read_inode(struct inode *inode) init_idisk_time(inode); - if (!sanity_check_extent_cache(inode, &node_folio->page)) { + if (!sanity_check_extent_cache(inode, node_folio)) { f2fs_folio_put(node_folio, true); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; @@ -669,7 +670,7 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) f2fs_inode_synced(inode); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = fi->i_advise; @@ -748,11 +749,11 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) /* deleted inode */ if (inode->i_nlink == 0) - clear_page_private_inline(&node_folio->page); + folio_clear_f2fs_inline(node_folio); init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_I_SB(inode), &node_folio->page); + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio); #endif } @@ -820,7 +821,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } -static void f2fs_remove_donate_inode(struct inode *inode) +void f2fs_remove_donate_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -933,6 +934,19 @@ retry: f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + /* + * If both f2fs_truncate() and f2fs_update_inode_page() failed + * due to fuzzed corrupted inode, call f2fs_inode_synced() to + * avoid triggering later f2fs_bug_on(). + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(sbi, + "f2fs_evict_inode: inode is dirty, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } } if (freeze_protected) sb_end_intwrite(inode->i_sb); @@ -949,8 +963,12 @@ no_delete: if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); - else - f2fs_inode_synced(inode); + + /* + * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META] + * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint. + */ + f2fs_inode_synced(inode); /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 07e333ee21b7..b882771e4699 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1298,19 +1298,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; const char *target; if (!dentry) return ERR_PTR(-ECHILD); - page = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + folio = read_mapping_folio(inode->i_mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); - target = fscrypt_get_symlink(inode, page_address(page), + target = fscrypt_get_symlink(inode, folio_address(folio), inode->i_sb->s_blocksize, done); - put_page(page); + folio_put(folio); return target; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bfe104db284e..27743b93e186 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -135,7 +135,7 @@ static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid)); } -static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { struct folio *src_folio; struct folio *dst_folio; @@ -149,7 +149,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_folio = get_current_nat_folio(sbi, nid); if (IS_ERR(src_folio)) - return &src_folio->page; + return src_folio; dst_folio = f2fs_grab_meta_folio(sbi, dst_off); f2fs_bug_on(sbi, folio_test_dirty(src_folio)); @@ -161,7 +161,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) set_to_next_nat(nm_i, nid); - return &dst_folio->page; + return dst_folio; } static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, @@ -185,7 +185,7 @@ static void __free_nat_entry(struct nat_entry *e) /* must be locked by nat_tree_lock */ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, - struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty) { if (no_fail) f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); @@ -195,6 +195,12 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + if (init_dirty) { + INIT_LIST_HEAD(&ne->list); + nm_i->nat_cnt[TOTAL_NAT]++; + return ne; + } + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); @@ -204,14 +210,17 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, return ne; } -static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty) { struct nat_entry *ne; ne = radix_tree_lookup(&nm_i->nat_root, n); - /* for recent accessed nat entry, move it to tail of lru list */ - if (ne && !get_nat_flag(ne, IS_DIRTY)) { + /* + * for recent accessed nat entry which will not be dirtied soon + * later, move it to tail of lru list. + */ + if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) { spin_lock(&nm_i->nat_list_lock); if (!list_empty(&ne->list)) list_move_tail(&ne->list, &nm_i->nat_entries); @@ -256,7 +265,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry *ne, bool init_dirty) { struct nat_entry_set *head; bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; @@ -279,7 +288,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, goto refresh_list; nm_i->nat_cnt[DIRTY_NAT]++; - nm_i->nat_cnt[RECLAIMABLE_NAT]--; + if (!init_dirty) + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -312,8 +322,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio) { - return is_node_folio(folio) && IS_DNODE(&folio->page) && - is_cold_node(&folio->page); + return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -384,7 +393,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) bool need = false; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) @@ -401,7 +410,7 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) bool is_cp = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; f2fs_up_read(&nm_i->nat_tree_lock); @@ -415,7 +424,7 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) bool need_update = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ino); + e = __lookup_nat_cache(nm_i, ino, false); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) @@ -440,9 +449,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, return; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (!e) - e = __init_nat_entry(nm_i, new, ne, false); + e = __init_nat_entry(nm_i, new, ne, false, false); else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -459,11 +468,13 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); + bool init_dirty = false; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ni->nid); + e = __lookup_nat_cache(nm_i, ni->nid, true); if (!e) { - e = __init_nat_entry(nm_i, new, NULL, true); + init_dirty = true; + e = __init_nat_entry(nm_i, new, NULL, true, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -499,11 +510,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, nat_set_blkaddr(e, new_blkaddr); if (!__is_valid_data_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); - __set_nat_cache_dirty(nm_i, e); + __set_nat_cache_dirty(nm_i, e, init_dirty); /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) - e = __lookup_nat_cache(nm_i, ni->ino); + e = __lookup_nat_cache(nm_i, ni->ino, false); if (e) { if (fsync_done && ni->nid == ni->ino) set_nat_flag(e, HAS_FSYNCED_INODE, true); @@ -555,20 +566,24 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry ne; struct nat_entry *e; pgoff_t index; - block_t blkaddr; int i; + bool need_cache = true; ni->flag = 0; ni->nid = nid; retry: /* Check nat cache */ f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); f2fs_up_read(&nm_i->nat_tree_lock); + if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) { + need_cache = false; + goto sanity_check; + } return 0; } @@ -594,7 +609,7 @@ retry: up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); - goto cache; + goto sanity_check; } /* Fill node_info from nat page */ @@ -609,14 +624,23 @@ retry: ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_folio_put(folio, true); -cache: - blkaddr = le32_to_cpu(ne.block_addr); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - return -EFAULT; +sanity_check: + if (__is_valid_data_blkaddr(ni->blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni->blk_addr, + DATA_GENERIC_ENHANCE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } /* cache nat entry */ - cache_nat_entry(sbi, nid, &ne); + if (need_cache) + cache_nat_entry(sbi, nid, &ne); return 0; } @@ -636,7 +660,7 @@ static void f2fs_ra_node_pages(struct folio *parent, int start, int n) end = start + n; end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { - nid = get_nid(&parent->page, i, false); + nid = get_nid(parent, i, false); f2fs_ra_node_page(sbi, nid); } @@ -795,7 +819,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) parent = nfolio[0]; if (level != 0) - nids[1] = get_nid(&parent->page, offset[0], true); + nids[1] = get_nid(parent, offset[0], true); dn->inode_folio = nfolio[0]; dn->inode_folio_locked = true; @@ -803,6 +827,16 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) for (i = 1; i <= level; i++) { bool done = false; + if (nids[i] && nids[i] == dn->inode->i_ino) { + err = -EFSCORRUPTED; + f2fs_err_ratelimited(sbi, + "inode mapping table is corrupted, run fsck to fix it, " + "ino:%lu, nid:%u, level:%d, offset:%d", + dn->inode->i_ino, nids[i], level, offset[level]); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto release_pages; + } + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!f2fs_alloc_nid(sbi, &(nids[i]))) { @@ -846,7 +880,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (i < level) { parent = nfolio[i]; - nids[i + 1] = get_nid(&parent->page, offset[i], false); + nids[i + 1] = get_nid(parent, offset[i], false); } } dn->nid = nids[level]; @@ -961,9 +995,9 @@ static int truncate_dnode(struct dnode_of_data *dn) else if (IS_ERR(folio)) return PTR_ERR(folio); - if (IS_INODE(&folio->page) || ino_of_node(&folio->page) != dn->inode->i_ino) { + if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", - dn->inode->i_ino, dn->nid, ino_of_node(&folio->page)); + dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); f2fs_folio_put(folio, true); @@ -1007,7 +1041,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -1070,7 +1104,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = get_nid(&dn->inode_folio->page, offset[0], true); + nid[0] = get_nid(dn->inode_folio, offset[0], true); if (!nid[0]) return 0; @@ -1083,14 +1117,14 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, idx = i - 1; goto fail; } - nid[i + 1] = get_nid(&folios[i]->page, offset[i + 1], false); + nid[i + 1] = get_nid(folios[i], offset[i + 1], false); } f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { - child_nid = get_nid(&folios[idx]->page, i, false); + child_nid = get_nid(folios[idx], i, false); if (!child_nid) continue; dn->nid = child_nid; @@ -1159,7 +1193,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) set_new_dnode(&dn, inode, folio, NULL, 0); folio_unlock(folio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); switch (level) { case 0: case 1: @@ -1188,7 +1222,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(&folio->page, offset[0], true); + dn.nid = get_nid(folio, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1220,7 +1254,7 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + if (offset[1] == 0 && get_nid(folio, offset[0], true)) { folio_lock(folio); BUG_ON(!is_node_folio(folio)); set_nid(folio, offset[0], 0, true); @@ -1367,8 +1401,8 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs) set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_folio_wait_writeback(folio, NODE, true, true); - fill_node_footer(&folio->page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(&folio->page, S_ISDIR(dn->inode->i_mode)); + fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(folio, S_ISDIR(dn->inode->i_mode)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); if (folio_mark_dirty(folio)) @@ -1400,7 +1434,7 @@ static int read_node_folio(struct folio *folio, blk_opf_t op_flags) .type = NODE, .op = REQ_OP_READ, .op_flags = op_flags, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, }; int err; @@ -1462,17 +1496,15 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - struct page *page = &folio->page; - - if (unlikely(nid != nid_of_node(page) || - (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + if (unlikely(nid != nid_of_node(folio) || + (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(page))) || + !f2fs_has_xattr_block(ofs_of_node(folio))) || time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); @@ -1553,7 +1585,7 @@ struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid) static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start) { struct f2fs_sb_info *sbi = F2FS_F_SB(parent); - nid_t nid = get_nid(&parent->page, start, false); + nid_t nid = get_nid(parent, start, false); return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR); } @@ -1618,9 +1650,9 @@ static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return ERR_PTR(-EIO); } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1630,7 +1662,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if (!folio_test_dirty(folio)) { @@ -1660,11 +1692,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, - .ino = ino_of_node(&folio->page), + .ino = ino_of_node(folio), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, .io_type = io_type, @@ -1689,11 +1721,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && wbc->sync_mode == WB_SYNC_NONE && - IS_DNODE(&folio->page) && is_cold_node(&folio->page)) + IS_DNODE(folio) && is_cold_node(folio)) goto redirty_out; /* get old block addr of this node page */ - nid = nid_of_node(&folio->page); + nid = nid_of_node(folio); f2fs_bug_on(sbi, folio->index != nid); if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) @@ -1731,7 +1763,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(&folio->page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); dec_page_count(sbi, F2FS_DIRTY_NODES); f2fs_up_read(&sbi->node_write); @@ -1827,9 +1859,9 @@ retry: goto out; } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1839,7 +1871,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if (!folio_test_dirty(folio) && folio != last_folio) { @@ -1849,17 +1881,17 @@ continue_unlock: f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); if (!atomic || folio == last_folio) { - set_fsync_mark(&folio->page, 1); + set_fsync_mark(folio, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode(inode, folio); - set_dentry_mark(&folio->page, + set_dentry_mark(folio, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ @@ -1935,7 +1967,7 @@ static bool flush_dirty_inode(struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(&folio->page); + nid_t ino = ino_of_node(folio); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) @@ -1964,7 +1996,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(&folio->page)) + if (!IS_INODE(folio)) continue; folio_lock(folio); @@ -1975,10 +2007,10 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); continue; } unlock: @@ -2027,13 +2059,13 @@ next_step: * 1. dentry dnodes * 2. file dnodes */ - if (step == 0 && IS_DNODE(&folio->page)) + if (step == 0 && IS_DNODE(folio)) continue; - if (step == 1 && (!IS_DNODE(&folio->page) || - is_cold_node(&folio->page))) + if (step == 1 && (!IS_DNODE(folio) || + is_cold_node(folio))) continue; - if (step == 2 && (!IS_DNODE(&folio->page) || - !is_cold_node(&folio->page))) + if (step == 2 && (!IS_DNODE(folio) || + !is_cold_node(folio))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) @@ -2057,15 +2089,15 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) + if (IS_INODE(folio) && flush_dirty_inode(folio)) goto lock_node; write_node: f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -2073,8 +2105,8 @@ write_node: if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { @@ -2201,12 +2233,12 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(&folio->page)) - f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); + if (IS_INODE(folio)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio); #endif if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -2351,7 +2383,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * - __remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, false); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) goto err_out; @@ -2714,7 +2746,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio) if (IS_ERR(ifolio)) return PTR_ERR(ifolio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); if (ri->i_inline & F2FS_INLINE_XATTR) { if (!f2fs_has_inline_xattr(inode)) { set_inode_flag(inode, FI_INLINE_XATTR); @@ -2740,7 +2772,7 @@ update_inode: return 0; } -int f2fs_recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2778,8 +2810,8 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - if (page) { - memcpy(F2FS_NODE(&xfolio->page), F2FS_NODE(page), + if (folio) { + memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio), VALID_XATTR_BLOCK_SIZE); folio_mark_dirty(xfolio); } @@ -2788,10 +2820,10 @@ recover_xnid: return 0; } -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio) { struct f2fs_inode *src, *dst; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(folio); struct node_info old_ni, new_ni; struct folio *ifolio; int err; @@ -2814,11 +2846,11 @@ retry: if (!folio_test_uptodate(ifolio)) folio_mark_uptodate(ifolio); - fill_node_footer(&ifolio->page, ino, ino, 0, true); - set_cold_node(&ifolio->page, false); + fill_node_footer(ifolio, ino, ino, 0, true); + set_cold_node(ifolio, false); - src = F2FS_INODE(page); - dst = F2FS_INODE(&ifolio->page); + src = F2FS_INODE(folio); + dst = F2FS_INODE(ifolio); memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; @@ -2884,7 +2916,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, if (IS_ERR(folio)) return PTR_ERR(folio); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; @@ -2904,6 +2936,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; int i; + bool init_dirty; down_write(&curseg->journal_rwsem); for (i = 0; i < nats_in_cursum(journal); i++) { @@ -2914,12 +2947,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) if (f2fs_check_nid_range(sbi, nid)) continue; + init_dirty = false; + raw_ne = nat_in_journal(journal, i); - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, true); if (!ne) { + init_dirty = true; ne = __alloc_nat_entry(sbi, nid, true); - __init_nat_entry(nm_i, ne, &raw_ne, true); + __init_nat_entry(nm_i, ne, &raw_ne, true, true); } /* @@ -2934,7 +2970,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); } - __set_nat_cache_dirty(nm_i, ne); + __set_nat_cache_dirty(nm_i, ne, init_dirty); } update_nats_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); @@ -2959,11 +2995,10 @@ add_out: } static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, - struct page *page) + const struct f2fs_nat_block *nat_blk) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; - struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; int i = 0; @@ -3000,7 +3035,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; - struct page *page = NULL; + struct folio *folio = NULL; /* * there are two steps to flush nat entries: @@ -3014,11 +3049,11 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { down_write(&curseg->journal_rwsem); } else { - page = get_next_nat_page(sbi, start_nid); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = get_next_nat_folio(sbi, start_nid); + if (IS_ERR(folio)) + return PTR_ERR(folio); - nat_blk = page_address(page); + nat_blk = folio_address(folio); f2fs_bug_on(sbi, !nat_blk); } @@ -3054,8 +3089,8 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); - f2fs_put_page(page, 1); + __update_nat_bits(sbi, start_nid, nat_blk); + f2fs_folio_put(folio, true); } /* Allow dirty nats by node block allocation in write_begin */ @@ -3395,10 +3430,10 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) } kvfree(nm_i->free_nid_count); - kvfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bitmap); kvfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(nm_i->nat_bitmap_mir); + kfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1446c433b3ec..030390543b54 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -31,7 +31,7 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 -/* control total # of node writes used for roll-fowrad recovery */ +/* control total # of node writes used for roll-forward recovery */ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ @@ -243,41 +243,41 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) #endif } -static inline nid_t ino_of_node(struct page *node_page) +static inline nid_t ino_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.ino); } -static inline nid_t nid_of_node(struct page *node_page) +static inline nid_t nid_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(const struct page *node_page) +static inline unsigned int ofs_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } -static inline __u64 cpver_of_node(struct page *node_page) +static inline __u64 cpver_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le64_to_cpu(rn->footer.cp_ver); } -static inline block_t next_blkaddr_of_node(struct folio *node_folio) +static inline block_t next_blkaddr_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(&node_folio->page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.next_blkaddr); } -static inline void fill_node_footer(struct page *page, nid_t nid, +static inline void fill_node_footer(const struct folio *folio, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int old_flag = 0; if (reset) @@ -293,17 +293,18 @@ static inline void fill_node_footer(struct page *page, nid_t nid, (old_flag & OFFSET_BIT_MASK)); } -static inline void copy_node_footer(struct page *dst, struct page *src) +static inline void copy_node_footer(const struct folio *dst, + const struct folio *src) { struct f2fs_node *src_rn = F2FS_NODE(src); struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } -static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); + struct f2fs_node *rn = F2FS_NODE(folio); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) @@ -313,19 +314,19 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline bool is_recoverable_dnode(struct page *page) +static inline bool is_recoverable_dnode(const struct folio *folio) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); __u64 cp_ver = cur_cp_version(ckpt); /* Don't care crc part, if fsck.f2fs sets it. */ if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) - return (cp_ver << 32) == (cpver_of_node(page) << 32); + return (cp_ver << 32) == (cpver_of_node(folio) << 32); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); - return cp_ver == cpver_of_node(page); + return cp_ver == cpver_of_node(folio); } /* @@ -349,9 +350,9 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(const struct page *node_page) +static inline bool IS_DNODE(const struct folio *node_folio) { - unsigned int ofs = ofs_of_node(node_page); + unsigned int ofs = ofs_of_node(node_folio); if (f2fs_has_xattr_block(ofs)) return true; @@ -369,7 +370,7 @@ static inline bool IS_DNODE(const struct page *node_page) static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { - struct f2fs_node *rn = F2FS_NODE(&folio->page); + struct f2fs_node *rn = F2FS_NODE(folio); f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -380,9 +381,9 @@ static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) return folio_mark_dirty(folio); } -static inline nid_t get_nid(struct page *p, int off, bool i) +static inline nid_t get_nid(const struct folio *folio, int off, bool i) { - struct f2fs_node *rn = F2FS_NODE(p); + struct f2fs_node *rn = F2FS_NODE(folio); if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); @@ -396,19 +397,19 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(const struct page *page, int type) +static inline int is_node(const struct folio *folio, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); return le32_to_cpu(rn->footer.flag) & BIT(type); } -#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) -#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) -#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT) +#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT) +#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT) -static inline void set_cold_node(struct page *page, bool is_dir) +static inline void set_cold_node(const struct folio *folio, bool is_dir) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) @@ -418,9 +419,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_mark(struct page *page, int mark, int type) +static inline void set_mark(struct folio *folio, int mark, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= BIT(type); @@ -429,8 +430,8 @@ static inline void set_mark(struct page *page, int mark, int type) rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_P_SB(page), page); + f2fs_inode_chksum_set(F2FS_F_SB(folio), folio); #endif } -#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) -#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) +#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 51ebed4e1521..4cb3a91801b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -157,10 +157,10 @@ static int init_recovered_filename(const struct inode *dir, return 0; } -static int recover_dentry(struct inode *inode, struct page *ipage, +static int recover_dentry(struct inode *inode, struct folio *ifolio, struct list_head *dir_list) { - struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + struct f2fs_inode *raw_inode = F2FS_INODE(ifolio); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct f2fs_filename fname; @@ -233,14 +233,14 @@ out: else name = raw_inode->i_name; f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), name, + __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } -static int recover_quota_data(struct inode *inode, struct page *page) +static int recover_quota_data(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct iattr attr; uid_t i_uid = le32_to_cpu(raw->i_uid); gid_t i_gid = le32_to_cpu(raw->i_gid); @@ -277,16 +277,16 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) clear_inode_flag(inode, FI_DATA_EXIST); } -static int recover_inode(struct inode *inode, struct page *page) +static int recover_inode(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct f2fs_inode_info *fi = F2FS_I(inode); char *name; int err; inode->i_mode = le16_to_cpu(raw->i_mode); - err = recover_quota_data(inode, page); + err = recover_quota_data(inode, folio); if (err) return err; @@ -333,10 +333,10 @@ static int recover_inode(struct inode *inode, struct page *page) if (file_enc_name(inode)) name = "<encrypted>"; else - name = F2FS_INODE(page)->i_name; + name = F2FS_INODE(folio)->i_name; f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", - ino_of_node(page), name, raw->i_inline); + ino_of_node(folio), name, raw->i_inline); return 0; } @@ -375,7 +375,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, if (IS_ERR(folio)) return PTR_ERR(folio); - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); *is_detecting = false; return 0; @@ -424,22 +424,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } - if (!is_fsync_dnode(&folio->page)) + if (!is_fsync_dnode(folio)) goto next; - entry = get_fsync_inode(head, ino_of_node(&folio->page)); + entry = get_fsync_inode(head, ino_of_node(folio)); if (!entry) { bool quota_inode = false; if (!check_only && - IS_INODE(&folio->page) && - is_dent_dnode(&folio->page)) { - err = f2fs_recover_inode_page(sbi, &folio->page); + IS_INODE(folio) && + is_dent_dnode(folio)) { + err = f2fs_recover_inode_page(sbi, folio); if (err) { f2fs_folio_put(folio, true); break; @@ -451,7 +451,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(&folio->page), + entry = add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); @@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, } entry->blkaddr = blkaddr; - if (IS_INODE(&folio->page) && is_dent_dnode(&folio->page)) + if (IS_INODE(folio) && is_dent_dnode(folio)) entry->last_dentry = blkaddr; next: /* check next segment */ @@ -527,7 +527,7 @@ got_it: nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); - max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode); + max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); @@ -552,8 +552,8 @@ got_it: if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - offset = ofs_of_node(&node_folio->page); - ino = ino_of_node(&node_folio->page); + offset = ofs_of_node(node_folio); + ino = ino_of_node(node_folio); f2fs_folio_put(node_folio, true); if (ino != dn->inode->i_ino) { @@ -624,16 +624,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, { struct dnode_of_data dn; struct node_info ni; - unsigned int start, end; + unsigned int start = 0, end = 0, index; int err = 0, recovered = 0; /* step 1: recover xattr */ - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; - } else if (f2fs_has_xattr_block(ofs_of_node(&folio->page))) { - err = f2fs_recover_xattr_data(inode, &folio->page); + } else if (f2fs_has_xattr_block(ofs_of_node(folio))) { + err = f2fs_recover_xattr_data(inode, folio); if (!err) recovered++; goto out; @@ -648,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 3: recover data indices */ - start = f2fs_start_bidx_of_node(ofs_of_node(&folio->page), inode); - end = start + ADDRS_PER_PAGE(&folio->page, inode); + start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); + end = start + ADDRS_PER_PAGE(folio, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: @@ -668,18 +668,18 @@ retry_dn: if (err) goto err; - f2fs_bug_on(sbi, ni.ino != ino_of_node(&folio->page)); + f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); - if (ofs_of_node(&dn.node_folio->page) != ofs_of_node(&folio->page)) { + if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", - inode->i_ino, ofs_of_node(&dn.node_folio->page), - ofs_of_node(&folio->page)); + inode->i_ino, ofs_of_node(dn.node_folio), + ofs_of_node(folio)); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } - for (; start < end; start++, dn.ofs_in_node++) { + for (index = start; index < end; index++, dn.ofs_in_node++) { block_t src, dest; src = f2fs_data_blkaddr(&dn); @@ -708,9 +708,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT))) f2fs_i_size_write(inode, - (loff_t)(start + 1) << PAGE_SHIFT); + (loff_t)(index + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -758,16 +758,18 @@ retry_prev: } } - copy_node_footer(&dn.node_folio->page, &folio->page); - fill_node_footer(&dn.node_folio->page, dn.nid, ni.ino, - ofs_of_node(&folio->page), false); + copy_node_footer(dn.node_folio, folio); + fill_node_footer(dn.node_folio, dn.nid, ni.ino, + ofs_of_node(folio), false); folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); out: - f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", - inode->i_ino, file_keep_isize(inode) ? "keep" : "recover", - recovered, err); + f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + "range (%u, %u), recovered = %d, err = %d", + inode->i_ino, nid_of_node(folio), + file_keep_isize(inode) ? "keep" : "recover", + start, end, recovered, err); return err; } @@ -778,6 +780,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, int err = 0; block_t blkaddr; unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + unsigned int recoverable_dnode = 0; + unsigned int fsynced_dnode = 0; + unsigned int total_dnode = 0; + unsigned int recovered_inode = 0; + unsigned int recovered_dentry = 0; + unsigned int recovered_dnode = 0; + + f2fs_notice(sbi, "do_recover_data: start to recover dnode"); /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -796,38 +806,43 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } + recoverable_dnode++; - entry = get_fsync_inode(inode_list, ino_of_node(&folio->page)); + entry = get_fsync_inode(inode_list, ino_of_node(folio)); if (!entry) goto next; + fsynced_dnode++; /* * inode(x) | CP | inode(x) | dnode(F) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(&folio->page)) { - err = recover_inode(entry->inode, &folio->page); + if (IS_INODE(folio)) { + err = recover_inode(entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_inode++; } if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, &folio->page, dir_list); + err = recover_dentry(entry->inode, folio, dir_list); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dentry++; } err = do_recover_data(sbi, entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dnode++; if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); @@ -840,9 +855,15 @@ next: f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + total_dnode++; } if (!err) err = f2fs_allocate_new_segments(sbi); + + f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, " + "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d", + recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode, + recovered_dentry, recovered_dnode, err); return err; } @@ -855,6 +876,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, " + "check_only: %d", check_only); + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae1223ef648f..cc82d42ef14c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode) goto next; } - blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { @@ -455,7 +455,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .init_gc_type = BG_GC, + .init_gc_type = f2fs_sb_has_blkzoned(sbi) ? + FG_GC : BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, @@ -772,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) @@ -799,7 +800,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -838,7 +839,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, return; } - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -855,7 +856,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; - if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + if (segno == NULL_SEGNO || is_curseg(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); @@ -888,7 +889,7 @@ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); @@ -2107,7 +2108,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!f2fs_realtime_discard_enable(sbi) || (!se->valid_blocks && - !IS_CURSEG(sbi, cpc->trim_start)) || + !is_curseg(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -2235,7 +2236,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); - if (!IS_CURSEC(sbi, secno) && + if (!is_cursec(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); @@ -3619,7 +3620,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(fio->page) && is_cold_node(fio->page)) + if (IS_DNODE(fio->folio) && is_cold_node(fio->folio)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; @@ -3665,8 +3666,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; - type = __get_age_segment_type(inode, - page_folio(fio->page)->index); + type = __get_age_segment_type(inode, fio->folio->index); if (type != NO_CHECK_TYPE) return type; @@ -3677,8 +3677,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { - if (IS_DNODE(fio->page)) - return is_cold_node(fio->page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->folio)) + return is_cold_node(fio->folio) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } @@ -3746,7 +3746,7 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) @@ -3850,10 +3850,10 @@ skip_new_segment: up_write(&sit_i->sentry_lock); - if (page && IS_NODESEG(curseg->seg_type)) { - fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (folio && IS_NODESEG(curseg->seg_type)) { + fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg)); - f2fs_inode_chksum_set(sbi, page); + f2fs_inode_chksum_set(sbi, folio); } if (fio) { @@ -3931,7 +3931,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3940,15 +3940,21 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, folio); + f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi, + CP_ERROR_FLAG)); goto out; } + + f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi, + fio->new_blkaddr, DATA_GENERIC_ENHANCE)); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); @@ -3972,7 +3978,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, - .page = folio_page(folio, 0), + .folio = folio, .encrypted_page = NULL, .in_list = 0, }; @@ -4100,14 +4106,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg) { /* for recovery flow */ - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { - if (IS_CURSEG(sbi, segno)) { + if (is_curseg(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); @@ -4191,7 +4197,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { @@ -5143,7 +5149,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; - if (IS_CURSEC(sbi, secno)) + if (is_cursec(sbi, secno)) continue; set_bit(secno, dirty_i->dirty_secmap); } @@ -5279,7 +5285,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); @@ -5806,9 +5812,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; - kvfree(sit_i->sit_bitmap); + kfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(sit_i->sit_bitmap_mir); + kfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index db619fd2f51a..5e2ee5c686b1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -34,34 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_CURSEG(sbi, seg) \ - (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) - -#define IS_CURSEC(sbi, secno) \ - (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ - SEGS_PER_SEC(sbi))) - #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) @@ -318,6 +290,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } +static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (segno == CURSEG_I(sbi, i)->segno) + return true; + } + return false; +} + +static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno)) + return true; + } + return false; +} + static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { @@ -509,7 +503,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_segments++; - if (!inmem && IS_CURSEC(sbi, secno)) + if (!inmem && is_cursec(sbi, secno)) goto unlock_out; /* check large section */ @@ -674,8 +668,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int data_blocks = 0; - if (f2fs_lfs_mode(sbi) && - unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (f2fs_lfs_mode(sbi)) { total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); @@ -684,7 +677,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, if (lower_p) *lower_p = node_secs + dent_secs + data_secs; if (upper_p) - *upper_p = node_secs + dent_secs + + *upper_p = node_secs + dent_secs + data_secs + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + (data_blocks ? 1 : 0); if (curseg_p) @@ -986,7 +979,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { - if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bbf1dad6843f..e16c4e2830c2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -27,6 +27,8 @@ #include <linux/part_stat.h> #include <linux/zstd.h> #include <linux/lz4.h> +#include <linux/ctype.h> +#include <linux/fs_parser.h> #include "f2fs.h" #include "node.h" @@ -125,29 +127,20 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, - Opt_nodiscard, Opt_noheap, Opt_heap, Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, - Opt_noinline_xattr, Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, - Opt_noinline_dentry, Opt_flush_merge, - Opt_noflush_merge, Opt_barrier, - Opt_nobarrier, Opt_fastboot, Opt_extent_cache, - Opt_noextent_cache, - Opt_noinline_data, Opt_data_flush, Opt_reserve_root, Opt_resgid, @@ -156,21 +149,13 @@ enum { Opt_fault_injection, Opt_fault_type, Opt_lazytime, - Opt_nolazytime, Opt_quota, - Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_usrjquota, Opt_grpjquota, Opt_prjjquota, - Opt_offusrjquota, - Opt_offgrpjquota, - Opt_offprjjquota, - Opt_jqfmt_vfsold, - Opt_jqfmt_vfsv0, - Opt_jqfmt_vfsv1, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -180,107 +165,209 @@ enum { Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, Opt_checkpoint_merge, - Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, - Opt_compress_extension, Opt_nocompress_extension, + Opt_compress_extension, Opt_compress_chksum, Opt_compress_mode, Opt_compress_cache, Opt_atgc, Opt_gc_merge, - Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, Opt_age_extent_cache, Opt_errors, Opt_nat_bits, + Opt_jqfmt, + Opt_checkpoint, Opt_err, }; -static match_table_t f2fs_tokens = { - {Opt_gc_background, "background_gc=%s"}, - {Opt_disable_roll_forward, "disable_roll_forward"}, - {Opt_norecovery, "norecovery"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_noheap, "no_heap"}, - {Opt_heap, "heap"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_active_logs, "active_logs=%u"}, - {Opt_disable_ext_identify, "disable_ext_identify"}, - {Opt_inline_xattr, "inline_xattr"}, - {Opt_noinline_xattr, "noinline_xattr"}, - {Opt_inline_xattr_size, "inline_xattr_size=%u"}, - {Opt_inline_data, "inline_data"}, - {Opt_inline_dentry, "inline_dentry"}, - {Opt_noinline_dentry, "noinline_dentry"}, - {Opt_flush_merge, "flush_merge"}, - {Opt_noflush_merge, "noflush_merge"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_fastboot, "fastboot"}, - {Opt_extent_cache, "extent_cache"}, - {Opt_noextent_cache, "noextent_cache"}, - {Opt_noinline_data, "noinline_data"}, - {Opt_data_flush, "data_flush"}, - {Opt_reserve_root, "reserve_root=%u"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_mode, "mode=%s"}, - {Opt_fault_injection, "fault_injection=%u"}, - {Opt_fault_type, "fault_type=%u"}, - {Opt_lazytime, "lazytime"}, - {Opt_nolazytime, "nolazytime"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_prjquota, "prjquota"}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_prjjquota, "prjjquota=%s"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_offprjjquota, "prjjquota="}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_alloc, "alloc_mode=%s"}, - {Opt_fsync, "fsync_mode=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption"}, - {Opt_inlinecrypt, "inlinecrypt"}, - {Opt_checkpoint_disable, "checkpoint=disable"}, - {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, - {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, - {Opt_checkpoint_enable, "checkpoint=enable"}, - {Opt_checkpoint_merge, "checkpoint_merge"}, - {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, - {Opt_compress_algorithm, "compress_algorithm=%s"}, - {Opt_compress_log_size, "compress_log_size=%u"}, - {Opt_compress_extension, "compress_extension=%s"}, - {Opt_nocompress_extension, "nocompress_extension=%s"}, - {Opt_compress_chksum, "compress_chksum"}, - {Opt_compress_mode, "compress_mode=%s"}, - {Opt_compress_cache, "compress_cache"}, - {Opt_atgc, "atgc"}, - {Opt_gc_merge, "gc_merge"}, - {Opt_nogc_merge, "nogc_merge"}, - {Opt_discard_unit, "discard_unit=%s"}, - {Opt_memory_mode, "memory=%s"}, - {Opt_age_extent_cache, "age_extent_cache"}, - {Opt_errors, "errors=%s"}, - {Opt_nat_bits, "nat_bits"}, +static const struct constant_table f2fs_param_background_gc[] = { + {"on", BGGC_MODE_ON}, + {"off", BGGC_MODE_OFF}, + {"sync", BGGC_MODE_SYNC}, + {} +}; + +static const struct constant_table f2fs_param_mode[] = { + {"adaptive", FS_MODE_ADAPTIVE}, + {"lfs", FS_MODE_LFS}, + {"fragment:segment", FS_MODE_FRAGMENT_SEG}, + {"fragment:block", FS_MODE_FRAGMENT_BLK}, + {} +}; + +static const struct constant_table f2fs_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; + +static const struct constant_table f2fs_param_alloc_mode[] = { + {"default", ALLOC_MODE_DEFAULT}, + {"reuse", ALLOC_MODE_REUSE}, + {} +}; +static const struct constant_table f2fs_param_fsync_mode[] = { + {"posix", FSYNC_MODE_POSIX}, + {"strict", FSYNC_MODE_STRICT}, + {"nobarrier", FSYNC_MODE_NOBARRIER}, + {} +}; + +static const struct constant_table f2fs_param_compress_mode[] = { + {"fs", COMPR_MODE_FS}, + {"user", COMPR_MODE_USER}, + {} +}; + +static const struct constant_table f2fs_param_discard_unit[] = { + {"block", DISCARD_UNIT_BLOCK}, + {"segment", DISCARD_UNIT_SEGMENT}, + {"section", DISCARD_UNIT_SECTION}, + {} +}; + +static const struct constant_table f2fs_param_memory_mode[] = { + {"normal", MEMORY_MODE_NORMAL}, + {"low", MEMORY_MODE_LOW}, + {} +}; + +static const struct constant_table f2fs_param_errors[] = { + {"remount-ro", MOUNT_ERRORS_READONLY}, + {"continue", MOUNT_ERRORS_CONTINUE}, + {"panic", MOUNT_ERRORS_PANIC}, + {} +}; + +static const struct fs_parameter_spec f2fs_param_specs[] = { + fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), + fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag("no_heap", Opt_noheap), + fsparam_flag("heap", Opt_heap), + fsparam_flag_no("user_xattr", Opt_user_xattr), + fsparam_flag_no("acl", Opt_acl), + fsparam_s32("active_logs", Opt_active_logs), + fsparam_flag("disable_ext_identify", Opt_disable_ext_identify), + fsparam_flag_no("inline_xattr", Opt_inline_xattr), + fsparam_s32("inline_xattr_size", Opt_inline_xattr_size), + fsparam_flag_no("inline_data", Opt_inline_data), + fsparam_flag_no("inline_dentry", Opt_inline_dentry), + fsparam_flag_no("flush_merge", Opt_flush_merge), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("fastboot", Opt_fastboot), + fsparam_flag_no("extent_cache", Opt_extent_cache), + fsparam_flag("data_flush", Opt_data_flush), + fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_gid("resgid", Opt_resgid), + fsparam_uid("resuid", Opt_resuid), + fsparam_enum("mode", Opt_mode, f2fs_param_mode), + fsparam_s32("fault_injection", Opt_fault_injection), + fsparam_u32("fault_type", Opt_fault_type), + fsparam_flag_no("lazytime", Opt_lazytime), + fsparam_flag_no("quota", Opt_quota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_string_empty("usrjquota", Opt_usrjquota), + fsparam_string_empty("grpjquota", Opt_grpjquota), + fsparam_string_empty("prjjquota", Opt_prjjquota), + fsparam_flag("nat_bits", Opt_nat_bits), + fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt), + fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode), + fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode), + fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("inlinecrypt", Opt_inlinecrypt), + fsparam_string("checkpoint", Opt_checkpoint), + fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge), + fsparam_string("compress_algorithm", Opt_compress_algorithm), + fsparam_u32("compress_log_size", Opt_compress_log_size), + fsparam_string("compress_extension", Opt_compress_extension), + fsparam_string("nocompress_extension", Opt_nocompress_extension), + fsparam_flag("compress_chksum", Opt_compress_chksum), + fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode), + fsparam_flag("compress_cache", Opt_compress_cache), + fsparam_flag("atgc", Opt_atgc), + fsparam_flag_no("gc_merge", Opt_gc_merge), + fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit), + fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), + fsparam_flag("age_extent_cache", Opt_age_extent_cache), + fsparam_enum("errors", Opt_errors, f2fs_param_errors), + {} +}; + +/* Resort to a match_table for this interestingly formatted option */ +static match_table_t f2fs_checkpoint_tokens = { + {Opt_checkpoint_disable, "disable"}, + {Opt_checkpoint_disable_cap, "disable:%u"}, + {Opt_checkpoint_disable_cap_perc, "disable:%u%%"}, + {Opt_checkpoint_enable, "enable"}, {Opt_err, NULL}, }; +#define F2FS_SPEC_background_gc (1 << 0) +#define F2FS_SPEC_inline_xattr_size (1 << 1) +#define F2FS_SPEC_active_logs (1 << 2) +#define F2FS_SPEC_reserve_root (1 << 3) +#define F2FS_SPEC_resgid (1 << 4) +#define F2FS_SPEC_resuid (1 << 5) +#define F2FS_SPEC_mode (1 << 6) +#define F2FS_SPEC_fault_injection (1 << 7) +#define F2FS_SPEC_fault_type (1 << 8) +#define F2FS_SPEC_jqfmt (1 << 9) +#define F2FS_SPEC_alloc_mode (1 << 10) +#define F2FS_SPEC_fsync_mode (1 << 11) +#define F2FS_SPEC_checkpoint_disable_cap (1 << 12) +#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13) +#define F2FS_SPEC_compress_level (1 << 14) +#define F2FS_SPEC_compress_algorithm (1 << 15) +#define F2FS_SPEC_compress_log_size (1 << 16) +#define F2FS_SPEC_compress_extension (1 << 17) +#define F2FS_SPEC_nocompress_extension (1 << 18) +#define F2FS_SPEC_compress_chksum (1 << 19) +#define F2FS_SPEC_compress_mode (1 << 20) +#define F2FS_SPEC_discard_unit (1 << 21) +#define F2FS_SPEC_memory_mode (1 << 22) +#define F2FS_SPEC_errors (1 << 23) + +struct f2fs_fs_context { + struct f2fs_mount_info info; + unsigned int opt_mask; /* Bits changed */ + unsigned int spec_mask; + unsigned short qname_mask; +}; + +#define F2FS_CTX_INFO(ctx) ((ctx)->info) + +static inline void ctx_set_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt |= flag; + ctx->opt_mask |= flag; +} + +static inline void ctx_clear_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt &= ~flag; + ctx->opt_mask |= flag; +} + +static inline bool ctx_test_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + return ctx->info.opt & flag; +} + void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, - const char *fmt, ...) + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -292,11 +379,19 @@ void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (limit_rate) - printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk_ratelimited("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); else - printk("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); va_end(args); } @@ -390,159 +485,90 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, - substring_t *args) +/* + * Note the name of the specified quota file. + */ +static int f2fs_note_qf_name(struct fs_context *fc, int qtype, + struct fs_parameter *param) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; char *qname; - int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + if (param->size < 1) { + f2fs_err(NULL, "Missing quota name"); return -EINVAL; } - if (f2fs_sb_has_quota_ino(sbi)) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + if (strchr(param->string, '/')) { + f2fs_err(NULL, "quotafile must be on filesystem root"); + return -EINVAL; + } + if (ctx->info.s_qf_names[qtype]) { + if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) { + f2fs_err(NULL, "Quota file already specified"); + return -EINVAL; + } return 0; } - qname = match_strdup(args); + qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { - f2fs_err(sbi, "Not enough memory for storing quotafile name"); + f2fs_err(NULL, "Not enough memory for storing quotafile name"); return -ENOMEM; } - if (F2FS_OPTION(sbi).s_qf_names[qtype]) { - if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) - ret = 0; - else - f2fs_err(sbi, "%s quota file already specified", - QTYPE2NAME(qtype)); - goto errout; - } - if (strchr(qname, '/')) { - f2fs_err(sbi, "quotafile must be on filesystem root"); - goto errout; - } - F2FS_OPTION(sbi).s_qf_names[qtype] = qname; - set_opt(sbi, QUOTA); + F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname; + ctx->qname_mask |= 1 << qtype; return 0; -errout: - kfree(qname); - return ret; } -static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) +/* + * Clear the name of the specified quota file. + */ +static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; - if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); - return -EINVAL; - } - kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); - F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; + kfree(ctx->info.s_qf_names[qtype]); + ctx->info.s_qf_names[qtype] = NULL; + ctx->qname_mask |= 1 << qtype; return 0; } -static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +static void f2fs_unnote_qf_name_all(struct fs_context *fc) { - /* - * We do the test below only for project quotas. 'usrquota' and - * 'grpquota' mount options are allowed even without quota feature - * to support legacy quotas in quota files. - */ - if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { - f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); - return -1; - } - if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && - F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) - clear_opt(sbi, USRQUOTA); - - if (test_opt(sbi, GRPQUOTA) && - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) - clear_opt(sbi, GRPQUOTA); - - if (test_opt(sbi, PRJQUOTA) && - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) - clear_opt(sbi, PRJQUOTA); - - if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || - test_opt(sbi, PRJQUOTA)) { - f2fs_err(sbi, "old and new quota format mixing"); - return -1; - } - - if (!F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_err(sbi, "journaled quota format not specified"); - return -1; - } - } + int i; - if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); - F2FS_OPTION(sbi).s_jquota_fmt = 0; - } - return 0; + for (i = 0; i < MAXQUOTAS; i++) + f2fs_unnote_qf_name(fc, i); } #endif -static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, - const char *opt, - const substring_t *arg, - bool is_remount) +static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param, + struct f2fs_fs_context *ctx) { - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg->from ? arg->from : "", - }; - struct fscrypt_dummy_policy *policy = - &F2FS_OPTION(sbi).dummy_enc_policy; int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { - f2fs_warn(sbi, "test_dummy_encryption option not supported"); + f2fs_warn(NULL, "test_dummy_encryption option not supported"); return -EINVAL; } - - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. - */ - if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { - f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); - return -EINVAL; - } - - err = fscrypt_parse_test_dummy_encryption(¶m, policy); + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->info.dummy_enc_policy); if (err) { - if (err == -EEXIST) - f2fs_warn(sbi, - "Can't change test_dummy_encryption on remount"); - else if (err == -EINVAL) - f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", - opt); + if (err == -EINVAL) + f2fs_warn(NULL, "Value of option \"%s\" is unrecognized", + param->key); + else if (err == -EEXIST) + f2fs_warn(NULL, "Conflicting test_dummy_encryption options"); else - f2fs_warn(sbi, "Error processing option \"%s\" [%d]", - opt, err); + f2fs_warn(NULL, "Error processing option \"%s\" [%d]", + param->key, err); return -EINVAL; } - f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION -static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, +static bool is_compress_extension_exist(struct f2fs_mount_info *info, const char *new_ext, bool is_ext) { unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -550,11 +576,11 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, int i; if (is_ext) { - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ext = info->extensions; + ext_cnt = info->compress_ext_cnt; } else { - ext = F2FS_OPTION(sbi).noextensions; - ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + ext = info->noextensions; + ext_cnt = info->nocompress_ext_cnt; } for (i = 0; i < ext_cnt; i++) { @@ -572,28 +598,28 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, * extension will be treated as special cases and will not be compressed. * 3. Don't allow the non-compress extension specifies all files. */ -static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN], + int noext_cnt, + unsigned char (*ext)[F2FS_EXTENSION_LEN], + int ext_cnt) { - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned char (*noext)[F2FS_EXTENSION_LEN]; - int ext_cnt, noext_cnt, index = 0, no_index = 0; - - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int index = 0, no_index = 0; if (!noext_cnt) return 0; for (no_index = 0; no_index < noext_cnt; no_index++) { + if (strlen(noext[no_index]) == 0) + continue; if (!strcasecmp("*", noext[no_index])) { - f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + f2fs_info(NULL, "Don't allow the nocompress extension specifies all files"); return -EINVAL; } for (index = 0; index < ext_cnt; index++) { + if (strlen(ext[index]) == 0) + continue; if (!strcasecmp(ext[index], noext[no_index])) { - f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension", ext[index]); return -EINVAL; } @@ -603,58 +629,62 @@ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) } #ifdef CONFIG_F2FS_FS_LZ4 -static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += 3; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } if (kstrtouint(str + 1, 10, &level)) return -EINVAL; if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { - f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + f2fs_info(NULL, "invalid lz4hc compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; #else if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } - f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + f2fs_info(NULL, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif } #endif #ifdef CONFIG_F2FS_FS_ZSTD -static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str) { int level; int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += len; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } if (kstrtoint(str + 1, 10, &level)) @@ -662,685 +692,750 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) /* f2fs does not support negative compress level now */ if (level < 0) { - f2fs_info(sbi, "do not support negative compress level: %d", level); + f2fs_info(NULL, "do not support negative compress level: %d", level); return -ERANGE; } if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { - f2fs_info(sbi, "invalid zstd compress level: %d", level); + f2fs_info(NULL, "invalid zstd compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } #endif #endif -static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) +static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; + struct f2fs_fs_context *ctx = fc->fs_private; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; unsigned char (*noext)[F2FS_EXTENSION_LEN]; int ext_cnt, noext_cnt; + char *name; #endif - char *p, *name; - int arg = 0; - kuid_t uid; - kgid_t gid; - int ret; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; + substring_t args[MAX_OPT_ARGS]; + struct fs_parse_result result; + int token, ret, arg; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); + token = fs_parse(fc, f2fs_param_specs, param, &result); + if (token < 0) + return token; - switch (token) { - case Opt_gc_background: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "on")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - } else if (!strcmp(name, "off")) { - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_warn(sbi, "zoned devices need bggc"); - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; - } else if (!strcmp(name, "sync")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_norecovery: - /* requires ro mount, checked in f2fs_default_check */ - set_opt(sbi, NORECOVERY); - break; - case Opt_discard: - if (!f2fs_hw_support_discard(sbi)) { - f2fs_warn(sbi, "device does not support discard"); - break; - } - set_opt(sbi, DISCARD); - break; - case Opt_nodiscard: - if (f2fs_hw_should_discard(sbi)) { - f2fs_warn(sbi, "discard is required for zoned block devices"); - return -EINVAL; - } - clear_opt(sbi, DISCARD); - break; - case Opt_noheap: - case Opt_heap: - f2fs_warn(sbi, "heap/no_heap options were deprecated"); - break; + switch (token) { + case Opt_gc_background: + F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_background_gc; + break; + case Opt_disable_roll_forward: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* requires ro mount, checked in f2fs_validate_options */ + ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY); + break; + case Opt_discard: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + else + ctx_set_opt(ctx, F2FS_MOUNT_DISCARD); + break; + case Opt_noheap: + case Opt_heap: + f2fs_warn(NULL, "heap/no_heap options were deprecated"); + break; #ifdef CONFIG_F2FS_FS_XATTR - case Opt_user_xattr: - set_opt(sbi, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; - case Opt_inline_xattr: - set_opt(sbi, INLINE_XATTR); - break; - case Opt_noinline_xattr: - clear_opt(sbi, INLINE_XATTR); - break; - case Opt_inline_xattr_size: - if (args->from && match_int(args, &arg)) - return -EINVAL; - set_opt(sbi, INLINE_XATTR_SIZE); - F2FS_OPTION(sbi).inline_xattr_size = arg; - break; + case Opt_user_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER); + else + ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER); + break; + case Opt_inline_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (result.int_32 < MIN_INLINE_XATTR_SIZE || + result.int_32 > MAX_INLINE_XATTR_SIZE) { + f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u", + (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE); + return -EINVAL; + } + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE); + F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32; + ctx->spec_mask |= F2FS_SPEC_inline_xattr_size; + break; #else - case Opt_user_xattr: - case Opt_nouser_xattr: - case Opt_inline_xattr: - case Opt_noinline_xattr: - case Opt_inline_xattr_size: - f2fs_info(sbi, "xattr options not supported"); - break; + case Opt_user_xattr: + case Opt_inline_xattr: + case Opt_inline_xattr_size: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; + case Opt_acl: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL); + else + ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL); + break; #else - case Opt_acl: - case Opt_noacl: - f2fs_info(sbi, "acl options not supported"); - break; + case Opt_acl: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && - arg != NR_CURSEG_PERSIST_TYPE) - return -EINVAL; - F2FS_OPTION(sbi).active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - case Opt_inline_data: - set_opt(sbi, INLINE_DATA); - break; - case Opt_inline_dentry: - set_opt(sbi, INLINE_DENTRY); - break; - case Opt_noinline_dentry: - clear_opt(sbi, INLINE_DENTRY); - break; - case Opt_flush_merge: - set_opt(sbi, FLUSH_MERGE); - break; - case Opt_noflush_merge: - clear_opt(sbi, FLUSH_MERGE); - break; - case Opt_nobarrier: - set_opt(sbi, NOBARRIER); - break; - case Opt_barrier: - clear_opt(sbi, NOBARRIER); - break; - case Opt_fastboot: - set_opt(sbi, FASTBOOT); - break; - case Opt_extent_cache: - set_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noextent_cache: - if (f2fs_sb_has_device_alias(sbi)) { - f2fs_err(sbi, "device aliasing requires extent cache"); - return -EINVAL; - } - clear_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noinline_data: - clear_opt(sbi, INLINE_DATA); - break; - case Opt_data_flush: - set_opt(sbi, DATA_FLUSH); - break; - case Opt_reserve_root: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (test_opt(sbi, RESERVE_ROOT)) { - f2fs_info(sbi, "Preserve previous reserve_root=%u", - F2FS_OPTION(sbi).root_reserved_blocks); - } else { - F2FS_OPTION(sbi).root_reserved_blocks = arg; - set_opt(sbi, RESERVE_ROOT); - } - break; - case Opt_resuid: - if (args->from && match_int(args, &arg)) - return -EINVAL; - uid = make_kuid(current_user_ns(), arg); - if (!uid_valid(uid)) { - f2fs_err(sbi, "Invalid uid value %d", arg); - return -EINVAL; - } - F2FS_OPTION(sbi).s_resuid = uid; - break; - case Opt_resgid: - if (args->from && match_int(args, &arg)) - return -EINVAL; - gid = make_kgid(current_user_ns(), arg); - if (!gid_valid(gid)) { - f2fs_err(sbi, "Invalid gid value %d", arg); - return -EINVAL; - } - F2FS_OPTION(sbi).s_resgid = gid; - break; - case Opt_mode: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "adaptive")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - } else if (!strcmp(name, "lfs")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - } else if (!strcmp(name, "fragment:segment")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; - } else if (!strcmp(name, "fragment:block")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; + case Opt_active_logs: + if (result.int_32 != 2 && result.int_32 != 4 && + result.int_32 != NR_CURSEG_PERSIST_TYPE) + return -EINVAL; + ctx->spec_mask |= F2FS_SPEC_active_logs; + F2FS_CTX_INFO(ctx).active_logs = result.int_32; + break; + case Opt_disable_ext_identify: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA); + break; + case Opt_inline_dentry: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + break; + case Opt_flush_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + break; + case Opt_barrier: + if (result.negated) + ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER); + else + ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER); + break; + case Opt_fastboot: + ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT); + break; + case Opt_extent_cache: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + else + ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + break; + case Opt_data_flush: + ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH); + break; + case Opt_reserve_root: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_root; + break; + case Opt_resuid: + F2FS_CTX_INFO(ctx).s_resuid = result.uid; + ctx->spec_mask |= F2FS_SPEC_resuid; + break; + case Opt_resgid: + F2FS_CTX_INFO(ctx).s_resgid = result.gid; + ctx->spec_mask |= F2FS_SPEC_resgid; + break; + case Opt_mode: + F2FS_CTX_INFO(ctx).fs_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_mode; + break; #ifdef CONFIG_F2FS_FAULT_INJECTION - case Opt_fault_injection: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, arg, 0, FAULT_RATE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_injection: + F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32; + ctx->spec_mask |= F2FS_SPEC_fault_injection; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; - case Opt_fault_type: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, 0, arg, FAULT_TYPE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_type: + if (result.uint_32 > BIT(FAULT_MAX)) + return -EINVAL; + F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fault_type; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; #else - case Opt_fault_injection: - case Opt_fault_type: - f2fs_info(sbi, "fault injection options not supported"); - break; + case Opt_fault_injection: + case Opt_fault_type: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif - case Opt_lazytime: - set_opt(sbi, LAZYTIME); - break; - case Opt_nolazytime: - clear_opt(sbi, LAZYTIME); - break; + case Opt_lazytime: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME); + else + ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME); + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - set_opt(sbi, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi, GRPQUOTA); - break; - case Opt_prjquota: - set_opt(sbi, PRJQUOTA); - break; - case Opt_usrjquota: - ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_grpjquota: - ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_prjjquota: - ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sbi, USRQUOTA); - if (ret) - return ret; - break; - case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sbi, GRPQUOTA); - if (ret) - return ret; - break; - case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sbi, PRJQUOTA); - if (ret) - return ret; - break; - case Opt_jqfmt_vfsold: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; - break; - case Opt_jqfmt_vfsv0: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; - break; - case Opt_jqfmt_vfsv1: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; - break; - case Opt_noquota: - clear_opt(sbi, QUOTA); - clear_opt(sbi, USRQUOTA); - clear_opt(sbi, GRPQUOTA); - clear_opt(sbi, PRJQUOTA); - break; + case Opt_quota: + if (result.negated) { + ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + } else + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_usrquota: + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_grpquota: + ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA); + break; + case Opt_prjquota: + ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA); + break; + case Opt_usrjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, USRQUOTA); + else + ret = f2fs_note_qf_name(fc, USRQUOTA, param); + if (ret) + return ret; + break; + case Opt_grpjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, GRPQUOTA); + else + ret = f2fs_note_qf_name(fc, GRPQUOTA, param); + if (ret) + return ret; + break; + case Opt_prjjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, PRJQUOTA); + else + ret = f2fs_note_qf_name(fc, PRJQUOTA, param); + if (ret) + return ret; + break; + case Opt_jqfmt: + F2FS_CTX_INFO(ctx).s_jquota_fmt = result.int_32; + ctx->spec_mask |= F2FS_SPEC_jqfmt; + break; #else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - case Opt_prjquota: - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_prjjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_offprjjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - case Opt_noquota: - f2fs_info(sbi, "quota operations not supported"); - break; + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + f2fs_info(NULL, "quota operations not supported"); + break; #endif - case Opt_alloc: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - - if (!strcmp(name, "default")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; - } else if (!strcmp(name, "reuse")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_fsync: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "posix")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - } else if (!strcmp(name, "strict")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; - } else if (!strcmp(name, "nobarrier")) { - F2FS_OPTION(sbi).fsync_mode = - FSYNC_MODE_NOBARRIER; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], - is_remount); - if (ret) - return ret; - break; - case Opt_inlinecrypt: + case Opt_alloc: + F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_alloc_mode; + break; + case Opt_fsync: + F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fsync_mode; + break; + case Opt_test_dummy_encryption: + ret = f2fs_parse_test_dummy_encryption(param, ctx); + if (ret) + return ret; + break; + case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - set_opt(sbi, INLINECRYPT); + ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT); #else - f2fs_info(sbi, "inline encryption not supported"); + f2fs_info(NULL, "inline encryption not supported"); #endif - break; + break; + case Opt_checkpoint: + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].from = args[0].to = NULL; + arg = 0; + + /* revert to match_table for checkpoint= options */ + token = match_token(param->string, f2fs_checkpoint_tokens, args); + switch (token) { case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg < 0 || arg > 100) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap_perc = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap_perc = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable_cap: if (args->from && match_int(args, &arg)) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable: - set_opt(sbi, DISABLE_CHECKPOINT); + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: - clear_opt(sbi, DISABLE_CHECKPOINT); - break; - case Opt_checkpoint_merge: - set_opt(sbi, MERGE_CHECKPOINT); - break; - case Opt_nocheckpoint_merge: - clear_opt(sbi, MERGE_CHECKPOINT); + ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; + default: + return -EINVAL; + } + break; + case Opt_checkpoint_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + else + ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION - case Opt_compress_algorithm: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "lzo")) { + case Opt_compress_algorithm: + name = param->string; + if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO - F2FS_OPTION(sbi).compress_level = 0; - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZO; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzo compression"); + f2fs_info(NULL, "kernel doesn't support lzo compression"); #endif - } else if (!strncmp(name, "lz4", 3)) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 - ret = f2fs_set_lz4hc_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZ4; + ret = f2fs_set_lz4hc_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lz4 compression"); + f2fs_info(NULL, "kernel doesn't support lz4 compression"); #endif - } else if (!strncmp(name, "zstd", 4)) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD - ret = f2fs_set_zstd_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_ZSTD; + ret = f2fs_set_zstd_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support zstd compression"); + f2fs_info(NULL, "kernel doesn't support zstd compression"); #endif - } else if (!strcmp(name, "lzo-rle")) { + } else if (!strcmp(name, "lzo-rle")) { #ifdef CONFIG_F2FS_FS_LZORLE - F2FS_OPTION(sbi).compress_level = 0; - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZORLE; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzorle compression"); + f2fs_info(NULL, "kernel doesn't support lzorle compression"); #endif - } else { - kfree(name); - return -EINVAL; - } - kfree(name); + } else + return -EINVAL; + break; + case Opt_compress_log_size: + if (result.uint_32 < MIN_COMPRESS_LOG_SIZE || + result.uint_32 > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(NULL, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_log_size; + break; + case Opt_compress_extension: + name = param->string; + ext = F2FS_CTX_INFO(ctx).extensions; + ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, true)) break; - case Opt_compress_log_size: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg < MIN_COMPRESS_LOG_SIZE || - arg > MAX_COMPRESS_LOG_SIZE) { - f2fs_err(sbi, - "Compress cluster log size is out of range"); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_log_size = arg; + + ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).compress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_compress_extension; + break; + case Opt_nocompress_extension: + name = param->string; + noext = F2FS_CTX_INFO(ctx).noextensions; + noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, false)) break; - case Opt_compress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).nocompress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_nocompress_extension; + break; + case Opt_compress_chksum: + F2FS_CTX_INFO(ctx).compress_chksum = true; + ctx->spec_mask |= F2FS_SPEC_compress_chksum; + break; + case Opt_compress_mode: + F2FS_CTX_INFO(ctx).compress_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_mode; + break; + case Opt_compress_cache: + ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(NULL, "compression options not supported"); + break; +#endif + case Opt_atgc: + ctx_set_opt(ctx, F2FS_MOUNT_ATGC); + break; + case Opt_gc_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE); + break; + case Opt_discard_unit: + F2FS_CTX_INFO(ctx).discard_unit = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_discard_unit; + break; + case Opt_memory_mode: + F2FS_CTX_INFO(ctx).memory_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_memory_mode; + break; + case Opt_age_extent_cache: + ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE); + break; + case Opt_errors: + F2FS_CTX_INFO(ctx).errors = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_errors; + break; + case Opt_nat_bits: + ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); + break; + } + return 0; +} - if (strlen(name) >= F2FS_EXTENSION_LEN || - ext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; - } +/* + * Check quota settings consistency. + */ +static int f2fs_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + #ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + bool quota_turnon = sb_any_quota_loaded(sb); + char *old_qname, *new_qname; + bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota; + int i; - if (is_compress_extension_exist(sbi, name, true)) { - kfree(name); - break; - } + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) && + !f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); + return -EINVAL; + } - ret = strscpy(ext[ext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).compress_ext_cnt++; - kfree(name); - break; - case Opt_nocompress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; + if (ctx->qname_mask) { + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + old_qname = F2FS_OPTION(sbi).s_qf_names[i]; + new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (quota_turnon && + !!old_qname != !!new_qname) + goto err_jquota_change; - if (strlen(name) >= F2FS_EXTENSION_LEN || - noext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; + if (old_qname) { + if (strcmp(old_qname, new_qname) == 0) { + ctx->qname_mask &= ~(1 << i); + continue; + } + goto err_jquota_specified; } - if (is_compress_extension_exist(sbi, name, false)) { - kfree(name); - break; + if (quota_feature) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + ctx->qname_mask &= ~(1 << i); + kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]); + F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL; } + } + } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA]; + grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA]; + prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA]; + usrquota = test_opt(sbi, USRQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA); + grpquota = test_opt(sbi, GRPQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA); + prjquota = test_opt(sbi, PRJQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA); + + if (usr_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + grpquota = false; + } + if (prj_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + prjquota = false; + } + if (usr_qf_name || grp_qf_name || prj_qf_name) { + if (grpquota || usrquota || prjquota) { + f2fs_err(sbi, "old and new quota format mixing"); + return -EINVAL; + } + if (!(ctx->spec_mask & F2FS_SPEC_jqfmt || + F2FS_OPTION(sbi).s_jquota_fmt)) { + f2fs_err(sbi, "journaled quota format not specified"); + return -EINVAL; + } + } + return 0; + +err_jquota_change: + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; +err_jquota_specified: + f2fs_err(sbi, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; - ret = strscpy(noext[noext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).nocompress_ext_cnt++; - kfree(name); - break; - case Opt_compress_chksum: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - F2FS_OPTION(sbi).compress_chksum = true; - break; - case Opt_compress_mode: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "fs")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; - } else if (!strcmp(name, "user")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_compress_cache: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - set_opt(sbi, COMPRESS_CACHE); - break; #else - case Opt_compress_algorithm: - case Opt_compress_log_size: - case Opt_compress_extension: - case Opt_nocompress_extension: - case Opt_compress_chksum: - case Opt_compress_mode: - case Opt_compress_cache: - f2fs_info(sbi, "compression options not supported"); - break; + if (f2fs_readonly(sbi->sb)) + return 0; + if (f2fs_sb_has_quota_ino(sbi)) { + f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + + return 0; #endif - case Opt_atgc: - set_opt(sbi, ATGC); - break; - case Opt_gc_merge: - set_opt(sbi, GC_MERGE); - break; - case Opt_nogc_merge: - clear_opt(sbi, GC_MERGE); - break; - case Opt_discard_unit: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "block")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_BLOCK; - } else if (!strcmp(name, "segment")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SEGMENT; - } else if (!strcmp(name, "section")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_memory_mode: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "normal")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_NORMAL; - } else if (!strcmp(name, "low")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_LOW; - } else { - kfree(name); - return -EINVAL; +} + +static int f2fs_check_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy, + &F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount"); + return -EINVAL; + } + return 0; +} + +static inline bool test_compression_spec(unsigned int mask) +{ + return mask & (F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static inline void clear_compression_spec(struct f2fs_fs_context *ctx) +{ + ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static int f2fs_check_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i, cnt; + + if (!f2fs_sb_has_compression(sbi)) { + if (test_compression_spec(ctx->spec_mask) || + ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE)) + f2fs_info(sbi, "Image doesn't support compression"); + clear_compression_spec(ctx); + ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE; + return 0; + } + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).extensions[i], true)) { + F2FS_CTX_INFO(ctx).extensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_age_extent_cache: - set_opt(sbi, AGE_EXTENT_CACHE); - break; - case Opt_errors: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "remount-ro")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_READONLY; - } else if (!strcmp(name, "continue")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_CONTINUE; - } else if (!strcmp(name, "panic")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_PANIC; - } else { - kfree(name); - return -EINVAL; + } + if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid extension length/number"); + return -EINVAL; + } + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).noextensions[i], false)) { + F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_nat_bits: - set_opt(sbi, NAT_BITS); - break; - default: - f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", - p); + } + if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid noextension length/number"); return -EINVAL; } } + + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with new extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_OPTION(sbi).extensions, + F2FS_OPTION(sbi).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with old extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions, + F2FS_OPTION(sbi).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new extensions conflicts with old noextensions"); + return -EINVAL; + } +#endif return 0; } -static int f2fs_default_check(struct f2fs_sb_info *sbi) +static int f2fs_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) { -#ifdef CONFIG_QUOTA - if (f2fs_check_quota_options(sbi)) + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb)) return -EINVAL; -#else - if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (f2fs_hw_should_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } - if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (!f2fs_hw_support_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "device does not support discard"); + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + ctx->opt_mask &= ~F2FS_MOUNT_DISCARD; + } + + if (f2fs_sb_has_device_alias(sbi) && + (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) && + !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } -#endif + + if (test_opt(sbi, RESERVE_ROOT) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) { + f2fs_info(sbi, "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; + } + + err = f2fs_check_test_dummy_encryption(fc, sb); + if (err) + return err; + + err = f2fs_check_compression(fc, sb); + if (err) + return err; + + err = f2fs_check_quota_consistency(fc, sb); + if (err) + return err; if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, @@ -1354,15 +1449,19 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) * devices, but mandatory for host-managed zoned block devices. */ if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) { + f2fs_warn(sbi, "zoned devices need bggc"); + return -EINVAL; + } #ifdef CONFIG_BLK_DEV_ZONED - if (F2FS_OPTION(sbi).discard_unit != - DISCARD_UNIT_SECTION) { + if ((ctx->spec_mask & F2FS_SPEC_discard_unit) && + F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; + F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION; } - if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + if ((ctx->spec_mask & F2FS_SPEC_mode) && + F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) { f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); return -EINVAL; } @@ -1372,43 +1471,25 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) #endif } -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_test_compress_extension(sbi)) { - f2fs_err(sbi, "invalid compress or nocompress extension"); - return -EINVAL; - } -#endif - - if (test_opt(sbi, INLINE_XATTR_SIZE)) { - int min_size, max_size; - + if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) { if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); return -EINVAL; } - if (!test_opt(sbi, INLINE_XATTR)) { + if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) { f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); return -EINVAL; } - - min_size = MIN_INLINE_XATTR_SIZE; - max_size = MAX_INLINE_XATTR_SIZE; - - if (F2FS_OPTION(sbi).inline_xattr_size < min_size || - F2FS_OPTION(sbi).inline_xattr_size > max_size) { - f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d", - min_size, max_size); - return -EINVAL; - } } - if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) && + F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } - if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -1417,12 +1498,190 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + return 0; +} + +static void f2fs_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + char *qname; + int i; + + if (quota_feature) + return; + + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; + + qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (qname) { + qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i], + GFP_KERNEL | __GFP_NOFAIL); + set_opt(sbi, QUOTA); + } + F2FS_OPTION(sbi).s_qf_names[i] = qname; + } + + if (ctx->spec_mask & F2FS_SPEC_jqfmt) + F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt; + + if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } +#endif +} + +static void f2fs_apply_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy)) + return; + swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy); + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +} - if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "norecovery requires readonly mount"); +static void f2fs_apply_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN]; + unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN]; + int ctx_cnt, sbi_cnt, i; + + if (ctx->spec_mask & F2FS_SPEC_compress_level) + F2FS_OPTION(sbi).compress_level = + F2FS_CTX_INFO(ctx).compress_level; + if (ctx->spec_mask & F2FS_SPEC_compress_algorithm) + F2FS_OPTION(sbi).compress_algorithm = + F2FS_CTX_INFO(ctx).compress_algorithm; + if (ctx->spec_mask & F2FS_SPEC_compress_log_size) + F2FS_OPTION(sbi).compress_log_size = + F2FS_CTX_INFO(ctx).compress_log_size; + if (ctx->spec_mask & F2FS_SPEC_compress_chksum) + F2FS_OPTION(sbi).compress_chksum = + F2FS_CTX_INFO(ctx).compress_chksum; + if (ctx->spec_mask & F2FS_SPEC_compress_mode) + F2FS_OPTION(sbi).compress_mode = + F2FS_CTX_INFO(ctx).compress_mode; + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).extensions; + ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).extensions; + sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt; + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).noextensions; + ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).noextensions; + sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt; + } +#endif +} + +static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + F2FS_OPTION(sbi).opt &= ~ctx->opt_mask; + F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt; + + if (ctx->spec_mask & F2FS_SPEC_background_gc) + F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode; + if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size) + F2FS_OPTION(sbi).inline_xattr_size = + F2FS_CTX_INFO(ctx).inline_xattr_size; + if (ctx->spec_mask & F2FS_SPEC_active_logs) + F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs; + if (ctx->spec_mask & F2FS_SPEC_reserve_root) + F2FS_OPTION(sbi).root_reserved_blocks = + F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_resgid) + F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; + if (ctx->spec_mask & F2FS_SPEC_resuid) + F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid; + if (ctx->spec_mask & F2FS_SPEC_mode) + F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (ctx->spec_mask & F2FS_SPEC_fault_injection) + (void)f2fs_build_fault_attr(sbi, + F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE); + if (ctx->spec_mask & F2FS_SPEC_fault_type) + (void)f2fs_build_fault_attr(sbi, 0, + F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE); +#endif + if (ctx->spec_mask & F2FS_SPEC_alloc_mode) + F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode; + if (ctx->spec_mask & F2FS_SPEC_fsync_mode) + F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap) + F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc) + F2FS_OPTION(sbi).unusable_cap_perc = + F2FS_CTX_INFO(ctx).unusable_cap_perc; + if (ctx->spec_mask & F2FS_SPEC_discard_unit) + F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit; + if (ctx->spec_mask & F2FS_SPEC_memory_mode) + F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; + if (ctx->spec_mask & F2FS_SPEC_errors) + F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + + f2fs_apply_compression(fc, sb); + f2fs_apply_test_dummy_encryption(fc, sb); + f2fs_apply_quota_options(fc, sb); +} + +static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount) +{ + if (f2fs_sb_has_device_alias(sbi) && + !test_opt(sbi, READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } + if (!remount) + return 0; + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && + sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } +#endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + f2fs_warn(sbi, "LFS is not compatible with IPU"); + return -EINVAL; + } return 0; } @@ -1442,6 +1701,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); + atomic_set(&fi->open_count, 0); init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); @@ -1718,7 +1978,7 @@ static void f2fs_put_super(struct super_block *sb) destroy_percpu_info(sbi); f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -2329,11 +2589,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_flush_ckpt_thread(sbi); } -static int f2fs_remount(struct super_block *sb, int *flags, char *data) +static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; + unsigned int flags = fc->sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; bool need_restart_flush = false, need_stop_flush = false; @@ -2379,7 +2640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", err); @@ -2389,23 +2650,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); - /* parse mount options */ - err = parse_options(sbi, data, true); + err = f2fs_check_opt_consistency(fc, sb); if (err) goto restore_opts; -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi) && - sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { - f2fs_err(sbi, - "zoned: max open zones %u is too small, need at least %u open zones", - sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); - err = -EINVAL; - goto restore_opts; - } -#endif + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, true); if (err) goto restore_opts; @@ -2416,20 +2667,20 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) + if (f2fs_readonly(sb) && (flags & SB_RDONLY)) goto skip; - if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { + if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } #ifdef CONFIG_QUOTA - if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { + if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { + } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -2441,12 +2692,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif - if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { - err = -EINVAL; - f2fs_warn(sbi, "LFS is not compatible with IPU"); - goto restore_opts; - } - /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; @@ -2485,7 +2730,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); goto restore_opts; @@ -2496,7 +2741,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & SB_RDONLY) || + if ((flags & SB_RDONLY) || (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { @@ -2510,7 +2755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY) { + if (flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2523,7 +2768,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); need_restart_flush = true; @@ -2565,11 +2810,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * triggered while remount and we need to take care of it before * returning from remount. */ - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); } else { - /* Flush if the prevous checkpoint, if exists. */ + /* Flush if the previous checkpoint, if exists. */ f2fs_flush_ckpt_thread(sbi); err = f2fs_start_ckpt_thread(sbi); @@ -2592,7 +2837,7 @@ skip: (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); sbi->umount_lock_holder = NULL; return 0; @@ -3263,7 +3508,6 @@ static const struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, - .remount_fs = f2fs_remount, .shutdown = f2fs_shutdown, }; @@ -3451,6 +3695,7 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio, f2fs_bug_on(sbi, 1); ret = submit_bio_wait(bio); + bio_put(bio); folio_end_writeback(folio); return ret; @@ -4522,14 +4767,14 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sbi->readdir_ra = true; } -static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct f2fs_fs_context *ctx = fc->fs_private; struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; int err; bool skip_recovery = false, need_fsck = false; - char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; @@ -4592,18 +4837,14 @@ try_onemore: sizeof(raw_super->uuid)); default_options(sbi, false); - /* parse mount options */ - options = kstrdup((const char *)data, GFP_KERNEL); - if (data && !options) { - err = -ENOMEM; - goto free_sb_buf; - } - err = parse_options(sbi, options, false); + err = f2fs_check_opt_consistency(fc, sb); if (err) - goto free_options; + goto free_sb_buf; + + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, false); if (err) goto free_options; @@ -4770,6 +5011,10 @@ try_onemore: /* get segno of first zoned block device */ sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi); + sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ? + ZONED_PIN_SEC_REQUIRED_COUNT : + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) @@ -4930,7 +5175,6 @@ reset_checkpoint: if (err) goto sync_free_meta; } - kvfree(options); /* recover broken superblock */ if (recovery) { @@ -5013,7 +5257,7 @@ free_iostat: f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -5024,8 +5268,8 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); - kvfree(options); + /* no need to free dummy_enc_policy, we just keep it in ctx when failed */ + swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy); free_sb_buf: kfree(raw_super); free_sbi: @@ -5041,12 +5285,39 @@ free_sbi: return err; } -static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int f2fs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); + return get_tree_bdev(fc, f2fs_fill_super); } +static int f2fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + + return __f2fs_remount(fc, sb); +} + +static void f2fs_fc_free(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + + if (!ctx) + return; + +#ifdef CONFIG_QUOTA + f2fs_unnote_qf_name_all(fc); +#endif + fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy); + kfree(ctx); +} + +static const struct fs_context_operations f2fs_context_ops = { + .parse_param = f2fs_parse_param, + .get_tree = f2fs_get_tree, + .reconfigure = f2fs_reconfigure, + .free = f2fs_fc_free, +}; + static void kill_f2fs_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -5088,10 +5359,24 @@ static void kill_f2fs_super(struct super_block *sb) } } +static int f2fs_init_fs_context(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &f2fs_context_ops; + + return 0; +} + static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", - .mount = f2fs_mount, + .init_fs_context = f2fs_init_fs_context, .kill_sb = kill_f2fs_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 75134d69a0bd..f736052dea50 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -628,6 +628,27 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -824,6 +845,27 @@ out: return count; } + if (!strcmp(a->attr.name, "reserved_pin_section")) { + if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi))) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) { + if (t < 1 || t > SEGS_PER_SEC(sbi)) + return -EINVAL; + sbi->gc_thread->boost_gc_multiple = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) { + if (t > GC_GREEDY) + return -EINVAL; + sbi->gc_thread->boost_gc_greedy = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1050,6 +1092,8 @@ GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio); +GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple); +GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy); /* SM_INFO ATTR */ SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); @@ -1130,6 +1174,7 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); +F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1220,6 +1265,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_zoned_gc_percent), ATTR_LIST(gc_boost_zoned_gc_percent), ATTR_LIST(gc_valid_thresh_ratio), + ATTR_LIST(gc_boost_gc_multiple), + ATTR_LIST(gc_boost_gc_greedy), ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), @@ -1323,6 +1370,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), + ATTR_LIST(reserved_pin_section), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 32048052c64a..5d07c469b571 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -127,6 +127,8 @@ #define __diag_GCC_8(s) #endif +#define __diag_GCC_all(s) __diag(s) + #define __diag_ignore_all(option, comment) \ __diag(__diag_GCC_ignore option) diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 3be35680a54f..7de229134e30 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -60,27 +60,11 @@ enum execmem_range_flags { * will trap * @ptr: pointer to memory to fill * @size: size of the range to fill - * @writable: is the memory poited by @ptr is writable or ROX * * A hook for architecures to fill execmem ranges with invalid instructions. * Architectures that use EXECMEM_ROX_CACHE must implement this. */ -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); - -/** - * execmem_make_temp_rw - temporarily remap region with read-write - * permissions - * @ptr: address of the region to remap - * @size: size of the region to remap - * - * Remaps a part of the cached large page in the ROX cache in the range - * [@ptr, @ptr + @size) as writable and not executable. The caller must - * have exclusive ownership of this range and ensure nothing will try to - * execute code in this range. - * - * Return: 0 on success or negative error code on failure. - */ -int execmem_make_temp_rw(void *ptr, size_t size); +void execmem_fill_trapping_insns(void *ptr, size_t size); /** * execmem_restore_rox - restore read-only-execute permissions @@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size); */ int execmem_restore_rox(void *ptr, size_t size); #else -static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif @@ -166,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void); void *execmem_alloc(enum execmem_type type, size_t size); /** + * execmem_alloc_rw - allocate writable executable memory + * @type: type of the allocation + * @size: how many bytes of memory are required + * + * Allocates memory that will contain executable code, either generated or + * loaded from kernel modules. + * + * Allocates memory that will contain data coupled with executable code, + * like data sections in kernel modules. + * + * Forces writable permissions on the allocated memory and the caller is + * responsible to manage the permissions afterwards. + * + * For architectures that use ROX cache the permissions will be set to R+W. + * For architectures that don't use ROX cache the default permissions for @type + * will be used as they must be writable. + * + * Return: a pointer to the allocated memory or %NULL + */ +void *execmem_alloc_rw(enum execmem_type type, size_t size); + +/** * execmem_free - free executable memory * @ptr: pointer to the memory that should be freed */ @@ -186,19 +191,6 @@ struct vm_struct *execmem_vmap(size_t size); #endif /** - * execmem_update_copy - copy an update to executable memory - * @dst: destination address to update - * @src: source address containing the data - * @size: how many bytes of memory shold be copied - * - * Copy @size bytes from @src to @dst using text poking if the memory at - * @dst is read-only. - * - * Return: a pointer to @dst or NULL on error - */ -void *execmem_update_copy(void *dst, const void *src, size_t size); - -/** * execmem_is_rox - check if execmem is read-only * @type - the execmem type to check * diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5206d63b3386..2f8b8bfc0e73 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -268,7 +268,7 @@ struct node_footer { /* Node IDs in an Indirect Block */ #define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_PAGE(page, inode) (addrs_per_page(inode, IS_INODE(page))) +#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(folio))) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8d0e3ad89b94..10dd161690a2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -332,12 +332,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return (struct page *)page_private(bounce_page); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return folio->mapping == NULL; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { return bounce_folio->private; } @@ -517,12 +518,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return ERR_PTR(-EINVAL); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return false; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7376c1df9c90..c16353cc6e3c 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap) kfree(iomap); } -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size); - #endif /* _LINUX_IO_MAPPING_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 349f0d9aad22..1ae97a0b8ec7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1f4f44951abe..11a078de9150 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); #include <linux/tracepoint-defs.h> #include <linux/types.h> #include <linux/cleanup.h> +#include <linux/sched/mm.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) @@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8e4d6eda8a8d..8d3fa3a91ce4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -837,8 +837,6 @@ void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) -#define folio_start_writeback_keepwrite(folio) \ - __folio_start_writeback(folio, true) static __always_inline bool folio_test_head(const struct folio *folio) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e3b99920be05..4c035637eeb7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, } #endif +/** + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of get_and_clear_full_ptes() if it is known that we don't + * need to clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); +} + #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same @@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, } #endif +/** + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of clear_full_ptes() if it is known that we don't need to + * clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + clear_full_ptes(mm, addr, ptep, nr, 0); +} + /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 20803fcb49a7..6cd020eea37a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, default: VM_WARN_ON_ONCE(true); } + + /* + * Anon folios must have an associated live anon_vma as long as they're + * mapped into userspace. + * Note that the atomic_read() mainly does two things: + * + * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to + * check that the associated anon_vma has not yet been freed (subject + * to KASAN's usual limitations). This check will pass if the + * anon_vma's refcount has already dropped to 0 but an RCU grace + * period hasn't passed since then. + * 2. If the anon_vma has not yet been freed, it checks that the + * anon_vma still has a nonzero refcount (as opposed to being in the + * middle of an RCU delay for getting freed). + */ + if (folio_test_anon(folio) && !folio_test_ksm(folio)) { + unsigned long mapping = (unsigned long)folio->mapping; + struct anon_vma *anon_vma; + + anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); + VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); + } } /* diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h index 876130091384..f06f7b785091 100644 --- a/include/linux/sprintf.h +++ b/include/linux/sprintf.h @@ -23,7 +23,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list); /* These are for specific cases, do not use without real need */ extern bool no_hash_pointers; -int no_hash_pointers_enable(char *str); +void hash_pointers_finalize(bool slub_debug); /* Used for Rust formatting ('%pA') */ char *rust_fmt_argument(char *buf, char *end, const void *ptr); diff --git a/kernel/fork.c b/kernel/fork.c index 9ce93fd20f82..c4ada32598bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) diff --git a/kernel/module/main.c b/kernel/module/main.c index 7f8bb51aedd4..c66b26184936 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1322,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index bbed41ad29cf..ef282001f200 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) @@ -179,6 +180,7 @@ static inline void nbcon_kthread_wake(struct console *con) #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) +#define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..646801813415 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) /** * nbcon_context_try_acquire_direct - Try to acquire directly - * @ctxt: The context of the caller - * @cur: The current console state + * @ctxt: The context of the caller + * @cur: The current console state + * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. @@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) * * Errors: * - * -EPERM: A panic is in progress and this is not the panic CPU. - * Or the current owner or waiter has the same or higher - * priority. No acquire method can be successful in - * this case. + * -EPERM: A panic is in progress and this is neither the panic + * CPU nor is this a reacquire. Or the current owner or + * waiter has the same or higher priority. No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * If this acquire is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the other_cpu_in_panic() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (other_cpu_in_panic()) + return -EPERM; + /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) nbcon_state_read(con, &cur); try_again: - err = nbcon_context_try_acquire_direct(ctxt, &cur); + err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; @@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); - while (!nbcon_context_try_acquire(ctxt)) + while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); @@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) cant_migrate(); } - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* @@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { @@ -1671,6 +1690,9 @@ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; + /* Synchronize the kthread start. */ + lockdep_assert_console_list_lock_held(); + /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; @@ -1701,12 +1723,15 @@ bool nbcon_alloc(struct console *con) return false; } - if (printk_kthreads_running) { + if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } + + /* Might be the first kthread. */ + printk_kthreads_running = true; } } @@ -1716,14 +1741,30 @@ bool nbcon_alloc(struct console *con) /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data + * + * Important: @have_nbcon_console must be updated before calling + * this function. In particular, it can be set only when there + * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; - if (printk_kthreads_running) + /* Synchronize the kthread stop. */ + lockdep_assert_console_list_lock_held(); + + if (printk_kthreads_running) { nbcon_kthread_stop(con); + /* Might be the last nbcon console. + * + * Do not rely on printk_kthreads_check_locked(). It is not + * called in some code paths, see nbcon_free() callers. + */ + if (!have_nbcon_console) + printk_kthreads_running = false; + } + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ @@ -1762,7 +1803,7 @@ bool nbcon_device_try_acquire(struct console *con) ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..0efbcdda9aab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume); static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ -static bool printk_kthreads_ready __ro_after_init; +bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; @@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void) if (!printk_kthreads_ready) return; + /* Start or stop the legacy kthread when needed. */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && @@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); - if (console->flags & CON_NBCON) - nbcon_free(console); - - console_sysfs_notify(); - - if (console->exit) - res = console->exit(console); - /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. @@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console) if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; + /* @have_nbcon_console must be updated before calling nbcon_free(). */ + if (console->flags & CON_NBCON) + nbcon_free(console); + + console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3d85800757aa..eb0cb11d0d12 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -60,6 +60,20 @@ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); +/* + * Hashed pointers policy selected by "hash_pointers=..." boot param + * + * `auto` - Hashed pointers enabled unless disabled by slub_debug_enabled=true + * `always` - Hashed pointers enabled unconditionally + * `never` - Hashed pointers disabled unconditionally + */ +enum hash_pointers_policy { + HASH_PTR_AUTO = 0, + HASH_PTR_ALWAYS, + HASH_PTR_NEVER +}; +static enum hash_pointers_policy hash_pointers_mode __initdata; + noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { @@ -1699,10 +1713,9 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, return buf; } -#pragma GCC diagnostic push -#ifndef __clang__ -#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" -#endif +__diag_push(); +__diag_ignore(GCC, all, "-Wsuggest-attribute=format", + "Not a valid __printf() conversion candidate."); static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec) { @@ -1717,7 +1730,7 @@ static char *va_format(char *buf, char *end, struct va_format *va_fmt, return buf; } -#pragma GCC diagnostic pop +__diag_pop(); static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, @@ -2289,12 +2302,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, return resource_string(buf, end, ptr, spec, fmt); } -int __init no_hash_pointers_enable(char *str) +void __init hash_pointers_finalize(bool slub_debug) { - if (no_hash_pointers) - return 0; + switch (hash_pointers_mode) { + case HASH_PTR_ALWAYS: + no_hash_pointers = false; + break; + case HASH_PTR_NEVER: + no_hash_pointers = true; + break; + case HASH_PTR_AUTO: + default: + no_hash_pointers = slub_debug; + break; + } - no_hash_pointers = true; + if (!no_hash_pointers) + return; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); @@ -2307,11 +2331,39 @@ int __init no_hash_pointers_enable(char *str) pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); + pr_warn("** Use hash_pointers=always to force this mode off **\n"); + pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); +} + +static int __init hash_pointers_mode_parse(char *str) +{ + if (!str) { + pr_warn("Hash pointers mode empty; falling back to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "auto", 4) == 0) { + pr_info("Hash pointers mode set to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "never", 5) == 0) { + pr_info("Hash pointers mode set to never.\n"); + hash_pointers_mode = HASH_PTR_NEVER; + } else if (strncmp(str, "always", 6) == 0) { + pr_info("Hash pointers mode set to always.\n"); + hash_pointers_mode = HASH_PTR_ALWAYS; + } else { + pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str); + hash_pointers_mode = HASH_PTR_AUTO; + } return 0; } +early_param("hash_pointers", hash_pointers_mode_parse); + +static int __init no_hash_pointers_enable(char *str) +{ + return hash_pointers_mode_parse("never"); +} early_param("no_hash_pointers", no_hash_pointers_enable); /* diff --git a/mm/Kconfig b/mm/Kconfig index d5d4eca947a6..e443fe8cd6cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1242,10 +1242,6 @@ config KMAP_LOCAL config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY bool -# struct io_mapping based helper. Selected by drivers that need them -config IO_MAPPING - bool - config MEMFD_CREATE bool "Enable memfd_create() system call" if EXPERT diff --git a/mm/Makefile b/mm/Makefile index 1a7a11d4933d..ef54aa615d9d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o -obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 94af19c4dfed..87e825349bdf 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio, target -= dests->weight_arr[i]; } + /* If the folio is already in the right node, don't do anything */ + if (folio_nid(folio) == dests->node_id_arr[i]) + return; + isolate: if (!folio_isolate_lru(folio)) return; diff --git a/mm/execmem.c b/mm/execmem.c index 627e6cf64f4f..0822305413ec 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } @@ -93,8 +93,15 @@ struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; + unsigned int pending_free_cnt; /* protected by mutex */ }; +/* delay to schedule asynchronous free if fast path free fails */ +#define FREE_DELAY (msecs_to_jiffies(10)) + +/* mark entries in busy_areas that should be freed asynchronously */ +#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) + static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, @@ -130,6 +137,27 @@ err_restore: return err; } +static int execmem_force_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; @@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work) static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); -static int execmem_cache_add(void *ptr, size_t size) +static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) { struct maple_tree *free_areas = &execmem_cache.free_areas; - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; - int err; lower = addr; upper = addr + size - 1; - mutex_lock(mutex); area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; @@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size) upper = mas.last; mas_set_range(&mas, lower, upper); - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); - mutex_unlock(mutex); - if (err) - return err; + return mas_store_gfp(&mas, (void *)lower, gfp_mask); +} - return 0; +static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) +{ + guard(mutex)(&execmem_cache.mutex); + + return execmem_cache_add_locked(ptr, size, gfp_mask); } static bool within_range(struct execmem_range *range, struct ma_state *mas, @@ -256,7 +283,7 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { - vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; + unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + if (!p) { + alloc_size = size; + p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + } + if (!p) return err; @@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) goto err_free_mem; /* fill memory with instructions that will trap */ - execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); + execmem_fill_trapping_insns(p, alloc_size); err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size); + err = execmem_cache_add(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; @@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) return __execmem_cache_alloc(range, size); } -static bool execmem_cache_free(void *ptr) +static inline bool is_pending_free(void *ptr) { - struct maple_tree *busy_areas = &execmem_cache.busy_areas; - struct mutex *mutex = &execmem_cache.mutex; - unsigned long addr = (unsigned long)ptr; - MA_STATE(mas, busy_areas, addr, addr); - size_t size; - void *area; + return ((unsigned long)ptr & PENDING_FREE_MASK); +} - mutex_lock(mutex); - area = mas_walk(&mas); - if (!area) { - mutex_unlock(mutex); - return false; - } - size = mas_range_len(&mas); +static inline void *pending_free_set(void *ptr) +{ + return (void *)((unsigned long)ptr | PENDING_FREE_MASK); +} - mas_store_gfp(&mas, NULL, GFP_KERNEL); - mutex_unlock(mutex); +static inline void *pending_free_clear(void *ptr) +{ + return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); +} - execmem_fill_trapping_insns(ptr, size, /* writable = */ false); +static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) +{ + size_t size = mas_range_len(mas); + int err; - execmem_cache_add(ptr, size); + err = execmem_force_rw(ptr, size); + if (err) + return err; - schedule_work(&execmem_cache_clean_work); + execmem_fill_trapping_insns(ptr, size); + execmem_restore_rox(ptr, size); - return true; + err = execmem_cache_add_locked(ptr, size, gfp_mask); + if (err) + return err; + + mas_store_gfp(mas, NULL, gfp_mask); + return 0; } -int execmem_make_temp_rw(void *ptr, size_t size) +static void execmem_cache_free_slow(struct work_struct *work); +static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); + +static void execmem_cache_free_slow(struct work_struct *work) { - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long addr = (unsigned long)ptr; - int ret; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas, busy_areas, 0, ULONG_MAX); + void *area; - ret = set_memory_nx(addr, nr); - if (ret) - return ret; + guard(mutex)(&execmem_cache.mutex); - return set_memory_rw(addr, nr); + if (!execmem_cache.pending_free_cnt) + return; + + mas_for_each(&mas, area, ULONG_MAX) { + if (!is_pending_free(area)) + continue; + + area = pending_free_clear(area); + if (__execmem_cache_free(&mas, area, GFP_KERNEL)) + continue; + + execmem_cache.pending_free_cnt--; + } + + if (execmem_cache.pending_free_cnt) + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + else + schedule_work(&execmem_cache_clean_work); } -int execmem_restore_rox(void *ptr, size_t size) +static bool execmem_cache_free(void *ptr) { - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, busy_areas, addr, addr); + void *area; + int err; - return set_memory_rox(addr, nr); + guard(mutex)(&execmem_cache.mutex); + + area = mas_walk(&mas); + if (!area) + return false; + + err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); + if (err) { + /* + * mas points to exact slot we've got the area from, nothing + * else can modify the tree because of the mutex, so there + * won't be any allocations in mas_store_gfp() and it will just + * change the pointer. + */ + area = pending_free_set(area); + mas_store_gfp(&mas, area, GFP_KERNEL); + execmem_cache.pending_free_cnt++; + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + return true; + } + + schedule_work(&execmem_cache_clean_work); + + return true; } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ +/* + * when ROX cache is not used the permissions defined by architectures for + * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must + * be writable anyway + */ +static inline int execmem_force_rw(void *ptr, size_t size) +{ + return 0; +} + static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; @@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; - void *p; + void *p = NULL; size = PAGE_ALIGN(size); @@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size) return kasan_reset_tag(p); } +void *execmem_alloc_rw(enum execmem_type type, size_t size) +{ + void *p __free(execmem) = execmem_alloc(type, size); + int err; + + if (!p) + return NULL; + + err = execmem_force_rw(p, size); + if (err) + return NULL; + + return no_free_ptr(p); +} + void execmem_free(void *ptr) { /* @@ -399,11 +506,6 @@ void execmem_free(void *ptr) vfree(ptr); } -void *execmem_update_copy(void *dst, const void *src, size_t size) -{ - return text_poke_copy(dst, src, size); -} - bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); diff --git a/mm/internal.h b/mm/internal.h index 1da16d550a45..45b725c3dc03 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - vm_flags_t vm_flags, unsigned long start, + unsigned long vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); diff --git a/mm/io-mapping.c b/mm/io-mapping.c deleted file mode 100644 index d3586e95c12c..000000000000 --- a/mm/io-mapping.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include <linux/mm.h> -#include <linux/io-mapping.h> - -/** - * io_mapping_map_user - remap an I/O mapping to userspace - * @iomap: the source io_mapping - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size) -{ - vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; - - if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) - return -EINVAL; - - pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); - - /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ - return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); -} -EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ed4873e18c75..9142964ab9c9 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object, } static inline void poison_slab_object(struct kmem_cache *cache, void *object, - bool init, bool still_accessible) + bool init) { void *tagged_object = object; object = kasan_reset_tag(object); - /* RCU slabs could be legally used after free within the RCU period. */ - if (unlikely(still_accessible)) - return; - kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); @@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - poison_slab_object(cache, object, init, still_accessible); + /* + * If this point is reached with an object that must still be + * accessible under RCU, we can't poison it; in that case, also skip the + * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG + * has been disabled manually. + * + * Putting the object on the quarantine wouldn't help catch UAFs (since + * we can't poison it here), and it would mask bugs caused by + * SLAB_TYPESAFE_BY_RCU users not being careful enough about object + * reuse; so overall, putting the object into the quarantine here would + * be counterproductive. + */ + if (still_accessible) + return false; + + poison_slab_object(cache, object, init); /* * If the object is put into quarantine, do not let slab put the object @@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; - poison_slab_object(slab->slab_cache, ptr, false, false); + poison_slab_object(slab->slab_cache, ptr, false); return true; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a55fb1dcd224..374a6a5193a7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { + unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; - pte_t *_pte; pte_t pteval; + pte_t *_pte; + unsigned int nr_ptes; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, + address += nr_ptes * PAGE_SIZE) { + nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); @@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct page *src_page = pte_page(pteval); src = page_folio(src_page); - if (!folio_test_large(src)) + + if (folio_test_large(src)) { + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; + + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); + } else { release_pte_folio(src); + } + /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src, src_page, vma); + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); - free_folio_and_swap_cache(src); + free_swap_cache(src); + folio_put_refs(src, nr_ptes); } } @@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + int nr_mapped_ptes = 0, result = SCAN_FAIL; + unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; - int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); @@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; /* step 2: clear page table and adjust rmap */ - for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, + pte += nr_batch_ptes) { + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); + nr_batch_ptes = 1; + if (pte_none(ptent)) continue; /* @@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); + if (folio_page(folio, i) != page) goto abort; + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); + /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ - ptep_clear(mm, addr, pte); - folio_remove_rmap_pte(folio, page, vma); - nr_ptes++; + clear_ptes(mm, addr, pte, nr_batch_ptes); + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); + nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ - if (nr_ptes) { - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + if (nr_mapped_ptes) { + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ @@ -1684,10 +1704,10 @@ maybe_install_pmd: : SCAN_SUCCEED; goto drop_folio; abort: - if (nr_ptes) { + if (nr_mapped_ptes) { flush_tlb_mm(mm); - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) diff --git a/mm/madvise.c b/mm/madvise.c index bb80fc5ea08f..35ed4ab0d7c5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> +#include <linux/mmu_context.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> @@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior) &guard_remove_walk_ops, NULL); } +#ifdef CONFIG_64BIT +/* Does the madvise operation result in discarding of mapped data? */ +static bool is_discard(int behavior) +{ + switch (behavior) { + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_REMOVE: + case MADV_DONTFORK: + case MADV_WIPEONFORK: + case MADV_GUARD_INSTALL: + return true; + } + + return false; +} + +/* + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular + * circumstances - discarding of data from read-only anonymous SEALED mappings. + * + * This is because users cannot trivally discard data from these VMAs, and may + * only do so via an appropriate madvise() call. + */ +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + struct vm_area_struct *vma = madv_behavior->vma; + + /* If the VMA isn't sealed we're good. */ + if (!vma_is_sealed(vma)) + return true; + + /* For a sealed VMA, we only care about discard operations. */ + if (!is_discard(madv_behavior->behavior)) + return true; + + /* + * We explicitly permit all file-backed mappings, whether MAP_SHARED or + * MAP_PRIVATE. + * + * The latter causes some complications. Because now, one can mmap() + * read/write a MAP_PRIVATE mapping, write to it, then mprotect() + * read-only, mseal() and a discard will be permitted. + * + * However, in order to avoid issues with potential use of madvise(..., + * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, + * permit this. + */ + if (!vma_is_anonymous(vma)) + return true; + + /* If the user could write to the mapping anyway, then this is fine. */ + if ((vma->vm_flags & VM_WRITE) && + arch_vma_access_permitted(vma, /* write= */ true, + /* execute= */ false, /* foreign= */ false)) + return true; + + /* Otherwise, we are not permitted to perform this operation. */ + return false; +} +#else +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + return true; +} +#endif + /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; int error; - if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) + if (unlikely(!can_madvise_modify(madv_behavior))) return -EPERM; switch (behavior) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3047b9ac667e..e2e685b971bb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, struct mm_walk *walk) { struct hwpoison_walk *hwp = walk->private; - pte_t pte = huge_ptep_get(walk->mm, addr, ptep); struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t pte; + int ret; - return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), - hwp->pfn, &hwp->tk); + ptl = huge_pte_lock(h, walk->mm, ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); + ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); + spin_unlock(ptl); + return ret; } #else #define hwpoison_hugetlb_range NULL diff --git a/mm/mempool.c b/mm/mempool.c index 204a216b6418..1c38e873e546 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) static __always_inline void add_element(mempool_t *pool, void *element) { - BUG_ON(pool->curr_nr >= pool->min_nr); + BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; @@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); - - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), + /* + * max() used here to ensure storage for at least 1 element to support + * zero minimum pool + */ + pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* - * First pre-allocate the guaranteed number of buffers. + * First pre-allocate the guaranteed number of buffers, + * also pre-allocate 1 element for zero minimum pool. */ - while (pool->curr_nr < pool->min_nr) { + while (pool->curr_nr < max(1, pool->min_nr)) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); @@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool) * wake-up path of previous test. This explicit check ensures the * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. - * - * Inline the same logic as previous test, add_element() cannot be - * directly used here since it has BUG_ON to deny if min_nr equals - * curr_nr, so here picked rest of add_element() to use without - * BUG_ON check. */ if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - /* Inline the logic of add_element() */ - poison_element(pool, element); - if (kasan_poison_element(pool, element)) - pool->elements[pool->curr_nr++] = element; + add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); if (wq_has_sleeper(&pool->wait)) wake_up(&pool->wait); diff --git a/mm/mincore.c b/mm/mincore.c index 42d6c9c8da86..10dabefc3acc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, #ifdef CONFIG_HUGETLB_PAGE unsigned char present; unsigned char *vec = walk->private; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. @@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; + spin_unlock(ptl); #else BUG(); #endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 729fb7d0dd59..b006cec8e6fe 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -164,8 +164,7 @@ retry: */ /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -236,11 +235,8 @@ retry: goto fallback; } - /* - * Verify the vma we locked belongs to the same address space and it's - * not behind of the last search position. - */ - if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + /* Verify the vma is not behind the last search position. */ + if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 2ddd37b2f462..78bded7acf79 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long charged = 0; int error; - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; if (newflags == oldflags) { diff --git a/mm/mremap.c b/mm/mremap.c index e15cf2e444c7..677a4d744df9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc, old_pte, max_nr_ptes); force_flush = true; } - pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); + pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); @@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) return -EFAULT; /* If mseal()'d, mremap() is prohibited. */ - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; /* Align to hugetlb page size, if required. */ diff --git a/mm/mseal.c b/mm/mseal.c index c27197ac04e8..e5b205562d2e 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -11,148 +11,74 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/mmu_context.h> #include <linux/syscalls.h> #include <linux/sched.h> #include "internal.h" -static inline void set_vma_sealed(struct vm_area_struct *vma) -{ - vm_flags_set(vma, VM_SEALED); -} - -static bool is_madv_discard(int behavior) -{ - switch (behavior) { - case MADV_FREE: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_REMOVE: - case MADV_DONTFORK: - case MADV_WIPEONFORK: - case MADV_GUARD_INSTALL: - return true; - } - - return false; -} - -static bool is_ro_anon(struct vm_area_struct *vma) -{ - /* check anonymous mapping. */ - if (vma->vm_file || vma->vm_flags & VM_SHARED) - return false; - - /* - * check for non-writable: - * PROT=RO or PKRU is not writeable. - */ - if (!(vma->vm_flags & VM_WRITE) || - !arch_vma_access_permitted(vma, true, false, false)) - return true; - - return false; -} - /* - * Check if a vma is allowed to be modified by madvise. + * mseal() disallows an input range which contain unmapped ranges (VMA holes). + * + * It disallows unmapped regions from start to end whether they exist at the + * start, in the middle, or at the end of the range, or any combination thereof. + * + * This is because after sealng a range, there's nothing to stop memory mapping + * of ranges in the remaining gaps later, meaning that the user might then + * wrongly consider the entirety of the mseal()'d range to be sealed when it + * in fact isn't. */ -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) -{ - if (!is_madv_discard(behavior)) - return true; - - if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) - return false; - - /* Allow by default. */ - return true; -} - -static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, vm_flags_t newflags) -{ - int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - - if (newflags == oldflags) - goto out; - - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto out; - } - - set_vma_sealed(vma); -out: - *prev = vma; - return ret; -} /* - * Check for do_mseal: - * 1> start is part of a valid vma. - * 2> end is part of a valid vma. - * 3> No gap (unallocated address) between start and end. - * 4> map is sealable. + * Does the [start, end) range contain any unmapped memory? + * + * We ensure that: + * - start is part of a valid VMA. + * - end is part of a valid VMA. + * - no gap (unallocated memory) exists between start and end. */ -static int check_mm_seal(unsigned long start, unsigned long end) +static bool range_contains_unmapped(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct vm_area_struct *vma; - unsigned long nstart = start; - + unsigned long prev_end = start; VMA_ITERATOR(vmi, current->mm, start); - /* going through each vma to check. */ for_each_vma_range(vmi, vma, end) { - if (vma->vm_start > nstart) - /* unallocated memory found. */ - return -ENOMEM; - - if (vma->vm_end >= end) - return 0; + if (vma->vm_start > prev_end) + return true; - nstart = vma->vm_end; + prev_end = vma->vm_end; } - return -ENOMEM; + return prev_end < end; } -/* - * Apply sealing. - */ -static int apply_mm_seal(unsigned long start, unsigned long end) +static int mseal_apply(struct mm_struct *mm, + unsigned long start, unsigned long end) { - unsigned long nstart; struct vm_area_struct *vma, *prev; + unsigned long curr_start = start; + VMA_ITERATOR(vmi, mm, start); - VMA_ITERATOR(vmi, current->mm, start); - + /* We know there are no gaps so this will be non-NULL. */ vma = vma_iter_load(&vmi); - /* - * Note: check_mm_seal should already checked ENOMEM case. - * so vma should not be null, same for the other ENOMEM cases. - */ prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - nstart = start; for_each_vma_range(vmi, vma, end) { - int error; - unsigned long tmp; - vm_flags_t newflags; + unsigned long curr_end = MIN(vma->vm_end, end); - newflags = vma->vm_flags | VM_SEALED; - tmp = vma->vm_end; - if (tmp > end) - tmp = end; - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); - if (error) - return error; - nstart = vma_iter_end(&vmi); + if (!(vma->vm_flags & VM_SEALED)) { + vma = vma_modify_flags(&vmi, prev, vma, + curr_start, curr_end, + vma->vm_flags | VM_SEALED); + if (IS_ERR(vma)) + return PTR_ERR(vma); + vm_flags_set(vma, VM_SEALED); + } + + prev = vma; + curr_start = curr_end; } return 0; @@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - /* - * First pass, this helps to avoid - * partial sealing in case of error in input address range, - * e.g. ENOMEM error. - */ - ret = check_mm_seal(start, end); - if (ret) + if (range_contains_unmapped(mm, start, end)) { + ret = -ENOMEM; goto out; + } /* * Second pass, this should success, unless there are errors @@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) * reaching the max supported VMAs, however, those cases shall * be rare. */ - ret = apply_mm_seal(start, end); + ret = mseal_apply(mm, start, end); out: - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 736d0e0f0618..8b819fafd57b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, vm_flags_t vm_flags, int node, + pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); diff --git a/mm/rmap.c b/mm/rmap.c index f93ce27132ab..568198e9efc2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ - pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); + pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. diff --git a/mm/shmem.c b/mm/shmem.c index 7fdd707ac1ac..e2c76a30802b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. */ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -891,7 +903,9 @@ static int shmem_add_to_page_cache(struct folio *folio, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -903,14 +917,25 @@ static int shmem_add_to_page_cache(struct folio *folio, gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } @@ -1992,30 +2017,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + int nr_pages = 1 << order; struct folio *new; + gfp_t alloc_gfp; void *shadow; - int nr_pages; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { - gfp_t huge_gfp = vma_thp_gfp_mask(vma); + alloc_gfp = gfp; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + goto fallback; - gfp = limit_gfp_mask(huge_gfp, gfp); + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; } - new = shmem_alloc_folio(gfp, order, info, index); - if (!new) - return ERR_PTR(-ENOMEM); - - nr_pages = folio_nr_pages(new); if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, - gfp, entry)) { + alloc_gfp, entry)) { folio_put(new); - return ERR_PTR(-ENOMEM); + new = ERR_PTR(-ENOMEM); + goto fallback; } /* @@ -2030,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); - return ERR_PTR(-EEXIST); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; } __folio_set_locked(new); @@ -2044,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, folio_add_lru(new); swap_read_folio(new, NULL); return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -2249,7 +2302,7 @@ unlock: if (xas_error(&xas)) return xas_error(&xas); - return entry_order; + return 0; } /* @@ -2266,133 +2319,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - swp_entry_t swap; - int error, nr_pages, order, split_order; + int error, nr_pages, order; + pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(swap)) + if (is_poisoned_swp_entry(index_entry)) return -EIO; - si = get_swap_device(swap); - if (!si) { - if (!shmem_confirm_swap(mapping, index, swap)) + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); + if (unlikely(!si)) { + if (order < 0) return -EEXIST; else return -EINVAL; } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } + + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); - order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int nr_pages = 1 << order; - bool fallback_order0 = false; - - /* Or update major stats only when swapin succeeds?? */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* Cached swapin only supports order 0 folio */ + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } + if (order > folio_order(folio)) { /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. - */ - if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(swap, nr_pages) != nr_pages)) - fallback_order0 = true; - - /* Skip swapcache for synchronous device. */ - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); - if (!IS_ERR(folio)) { - skip_swapcache = true; - goto alloced; - } - - /* - * Fallback to swapin order-0 folio unless the swap entry - * already exists. - */ - error = PTR_ERR(folio); - folio = NULL; - if (error == -EEXIST) - goto failed; - } - - /* - * Now swap device can only swap in order 0 folio, then we - * should split the large swap entry stored in the pagecache - * if necessary. - */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } - - /* Here we actually start the io */ - folio = shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { - error = -ENOMEM; - goto failed; - } - } else if (order != folio_order(folio)) { - /* - * Swap readahead may swap in order 0 folios into swapcache + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - folio_put(folio); - folio = NULL; - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) + goto failed_nolock; + } - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); } -alloced: - /* We have to do this with folio locked to prevent races */ + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. + */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap) || - xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { + shmem_confirm_swap(mapping, index, swap) < 0 || + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } @@ -2415,8 +2444,7 @@ alloced: goto failed; } - error = shmem_add_to_page_cache(folio, mapping, - round_down(index, nr_pages), + error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; @@ -2439,18 +2467,19 @@ alloced: *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: - if (skip_swapcache) - swapcache_clear(si, swap, folio_nr_pages(folio)); - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } put_swap_device(si); return error; @@ -5960,8 +5989,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, - gfp, NULL, NULL); + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), + &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); diff --git a/mm/slub.c b/mm/slub.c index cf7c6032d5fd..30003763d224 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6312,9 +6312,8 @@ void __init kmem_cache_init(void) if (debug_guardpage_minorder()) slub_max_order = 0; - /* Print slub debugging pointers without hashing */ - if (__slub_debug_enabled()) - no_hash_pointers_enable(NULL); + /* Inform pointer hashing choice about slub debugging state. */ + hash_pointers_finalize(__slub_debug_enabled()); kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; @@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, } /* Don't bother splitting the VMA if we can't unmap it anyway */ - if (!can_modify_vma(vms->vma)) { + if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } @@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; - if (!can_modify_vma(next)) { + if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } @@ -559,38 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, } #ifdef CONFIG_64BIT - static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } - -/* - * check if a vma is sealed for modification. - * return true, if modification is allowed. - */ -static inline bool can_modify_vma(struct vm_area_struct *vma) -{ - if (unlikely(vma_is_sealed(vma))) - return false; - - return true; -} - -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); - #else - -static inline bool can_modify_vma(struct vm_area_struct *vma) -{ - return true; -} - -static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) +static inline bool vma_is_sealed(struct vm_area_struct *vma) { - return true; + return false; } - #endif #if defined(CONFIG_STACK_GROWSUP) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..c952640f163b 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs) cs->nr_evicted, cs->nr_recently_evicted); } +enum file_type { + FILE_MMAP, + FILE_SHMEM +}; + bool write_exactly(int fd, size_t filesize) { int random_fd = open("/dev/urandom", O_RDONLY); @@ -201,8 +206,20 @@ out1: out: return ret; } +const char *file_type_str(enum file_type type) +{ + switch (type) { + case FILE_SHMEM: + return "shmem"; + case FILE_MMAP: + return "mmap"; + default: + return "unknown"; + } +} -bool test_cachestat_shmem(void) + +bool run_cachestat_test(enum file_type type) { size_t PS = sysconf(_SC_PAGESIZE); size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ @@ -212,27 +229,50 @@ bool test_cachestat_shmem(void) char *filename = "tmpshmcstat"; struct cachestat cs; bool ret = true; + int fd; unsigned long num_pages = compute_len / PS; - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + if (type == FILE_SHMEM) + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + else + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd < 0) { - ksft_print_msg("Unable to create shmem file.\n"); + ksft_print_msg("Unable to create %s file.\n", + file_type_str(type)); ret = false; goto out; } if (ftruncate(fd, filesize)) { - ksft_print_msg("Unable to truncate shmem file.\n"); + ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type)); ret = false; goto close_fd; } + switch (type) { + case FILE_SHMEM: + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to file.\n"); + ret = false; + goto close_fd; + } + break; + case FILE_MMAP: + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); - if (!write_exactly(fd, filesize)) { - ksft_print_msg("Unable to write to shmem file.\n"); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + for (int i = 0; i < filesize; i++) + map[i] = 'A'; + break; + default: + ksft_print_msg("Unsupported file type.\n"); ret = false; goto close_fd; } - syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); if (syscall_ret) { @@ -308,12 +348,18 @@ int main(void) break; } - if (test_cachestat_shmem()) + if (run_cachestat_test(FILE_SHMEM)) ksft_test_result_pass("cachestat works with a shmem file\n"); else { ksft_test_result_fail("cachestat fails with a shmem file\n"); ret = 1; } + if (run_cachestat_test(FILE_MMAP)) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index f2dafa0b700b..e7b23a8a05fe 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,6 +21,7 @@ on-fault-limit transhuge-stress pagemap_ioctl pfnmap +process_madv *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ae6f994d3add..d13b3cef2a2b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap +TEST_GEN_FILES += process_madv TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c new file mode 100644 index 000000000000..471cae8427f1 --- /dev/null +++ b/tools/testing/selftests/mm/process_madv.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include <errno.h> +#include <setjmp.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <sched.h> +#include "vm_util.h" + +#include "../pidfd/pidfd.h" + +FIXTURE(process_madvise) +{ + unsigned long page_size; + pid_t child_pid; + int remote_pidfd; + int pidfd; +}; + +FIXTURE_SETUP(process_madvise) +{ + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + self->pidfd = PIDFD_SELF; + self->remote_pidfd = -1; + self->child_pid = -1; +}; + +FIXTURE_TEARDOWN_PARENT(process_madvise) +{ + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } + + if (self->remote_pidfd >= 0) + close(self->remote_pidfd); +} + +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t vlen, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); +} + +/* + * This test uses PIDFD_SELF to target the current process. The main + * goal is to verify the basic behavior of process_madvise() with + * a vector of non-contiguous memory ranges, not its cross-process + * capabilities. + */ +TEST_F(process_madvise, basic) +{ + const unsigned long pagesize = self->page_size; + const int madvise_pages = 4; + struct iovec vec[madvise_pages]; + int pidfd = self->pidfd; + ssize_t ret; + char *map; + + /* + * Create a single large mapping. We will pick pages from this + * mapping to advise on. This ensures we test non-contiguous iovecs. + */ + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + /* Fill the entire region with a known pattern. */ + memset(map, 'A', pagesize * 10); + + /* + * Setup the iovec to point to 4 non-contiguous pages + * within the mapping. + */ + vec[0].iov_base = &map[0 * pagesize]; + vec[0].iov_len = pagesize; + vec[1].iov_base = &map[3 * pagesize]; + vec[1].iov_len = pagesize; + vec[2].iov_base = &map[5 * pagesize]; + vec[2].iov_len = pagesize; + vec[3].iov_base = &map[8 * pagesize]; + vec[3].iov_len = pagesize; + + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); + if (ret == -1 && errno == EPERM) + SKIP(return, + "process_madvise() unsupported or permission denied, try running as root.\n"); + else if (errno == EINVAL) + SKIP(return, + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); + + /* The call should succeed and report the total bytes processed. */ + ASSERT_EQ(ret, madvise_pages * pagesize); + + /* Check that advised pages are now zero. */ + for (int i = 0; i < madvise_pages; i++) { + char *advised_page = (char *)vec[i].iov_base; + + /* Content must be 0, not 'A'. */ + ASSERT_EQ(*advised_page, '\0'); + } + + /* Check that an un-advised page in between is still 'A'. */ + char *unadvised_page = &map[1 * pagesize]; + + for (int i = 0; i < pagesize; i++) + ASSERT_EQ(unadvised_page[i], 'A'); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize * 10), 0); +} + +/* + * This test deterministically validates process_madvise() with MADV_COLLAPSE + * on a remote process, other advices are difficult to verify reliably. + * + * The test verifies that a memory region in a child process, + * focus on process_madv remote result, only check addresses and lengths. + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. + */ +TEST_F(process_madvise, remote_collapse) +{ + const unsigned long pagesize = self->page_size; + long huge_page_size; + int pipe_info[2]; + ssize_t ret; + struct iovec vec; + + struct child_info { + pid_t pid; + void *map_addr; + } info; + + huge_page_size = read_pmd_pagesize(); + if (huge_page_size <= 0) + SKIP(return, "Could not determine a valid huge page size.\n"); + + ASSERT_EQ(pipe(pipe_info), 0); + + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) { + char *map; + size_t map_size = 2 * huge_page_size; + + close(pipe_info[0]); + + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Fault in as small pages */ + for (size_t i = 0; i < map_size; i += pagesize) + map[i] = 'A'; + + /* Send info and pause */ + info.pid = getpid(); + info.map_addr = map; + ret = write(pipe_info[1], &info, sizeof(info)); + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[1]); + + pause(); + exit(0); + } + + close(pipe_info[1]); + + /* Receive child info */ + ret = read(pipe_info[0], &info, sizeof(info)); + if (ret <= 0) { + waitpid(self->child_pid, NULL, 0); + SKIP(return, "Failed to read child info from pipe.\n"); + } + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[0]); + self->child_pid = info.pid; + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + vec.iov_base = info.map_addr; + vec.iov_len = huge_page_size; + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, + 0); + if (ret == -1) { + if (errno == EINVAL) + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); + else if (errno == EPERM) + SKIP(return, + "No process_madvise() permissions, try running as root.\n"); + return; + } + + ASSERT_EQ(ret, huge_page_size); +} + +/* + * Test process_madvise() with a pidfd for a process that has already + * exited to ensure correct error handling. + */ +TEST_F(process_madvise, exited_process_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* + * Using a pidfd for a process that has already exited should fail + * with ESRCH. + */ + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) + exit(0); + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + /* Wait for the child to ensure it has terminated. */ + waitpid(self->child_pid, NULL, 0); + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, + 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ESRCH); +} + +/* + * Test process_madvise() with bad pidfds to ensure correct error + * handling. + */ +TEST_F(process_madvise, bad_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* Using an invalid fd number (-1) should fail with EBADF. */ + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); + + /* + * Using a valid fd that is not a pidfd (e.g. stdin) should fail + * with EBADF. + */ + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); +} + +/* + * Test that process_madvise() rejects vlen > UIO_MAXIOV. + * The kernel should return -EINVAL when the number of iovecs exceeds 1024. + */ +TEST_F(process_madvise, invalid_vlen) +{ + const unsigned long pagesize = self->page_size; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +/* + * Test process_madvise() with an invalid flag value. Currently, only a flag + * value of 0 is supported. This test is reserved for the future, e.g., if + * synchronous flags are added. + */ +TEST_F(process_madvise, flag) +{ + const unsigned long pagesize = self->page_size; + unsigned int invalid_flag; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + invalid_flag = 0x80000000; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index a38c984103ce..471e539d82b8 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -65,6 +65,8 @@ separated by spaces: test pagemap_scan IOCTL - pfnmap tests for VM_PFNMAP handling +- process_madv + test for process_madv - cow test copy-on-write semantics - thp @@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +# PROCESS_MADV test +CATEGORY="process_madv" run_test ./process_madv + CATEGORY="vma_merge" run_test ./merge if [ -x ./memfd_secret ] diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index a838c37f93e5..3639aa8dd2b0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr; #define CAP_IPC_LOCK 14 #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif #define FIRST_USER_ADDRESS 0UL |