summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/core-api/mm-api.rst1
-rw-r--r--Documentation/devicetree/bindings/i2c/apple,i2c.yaml5
-rw-r--r--arch/arm/include/asm/cti.h160
-rw-r--r--arch/arm64/mm/mmu.c4
-rw-r--r--arch/x86/kernel/alternative.c3
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c18
-rw-r--r--arch/x86/mm/init.c24
-rw-r--r--drivers/i2c/busses/Kconfig1
-rw-r--r--drivers/i2c/busses/i2c-qcom-geni.c6
-rw-r--r--drivers/i2c/busses/i2c-stm32f7.c32
-rw-r--r--drivers/i2c/busses/i2c-tegra.c64
-rw-r--r--drivers/i2c/i2c-core-acpi.c1
-rw-r--r--drivers/i2c/muxes/i2c-mux-mule.c3
-rw-r--r--drivers/media/platform/qcom/venus/pm_helpers.c12
-rw-r--r--drivers/net/usb/usbnet.c6
-rw-r--r--fs/exfat/dir.c12
-rw-r--r--fs/exfat/fatent.c10
-rw-r--r--fs/exfat/file.c5
-rw-r--r--fs/exfat/namei.c5
-rw-r--r--fs/exfat/super.c32
-rw-r--r--include/linux/execmem.h54
-rw-r--r--include/linux/io-mapping.h3
-rw-r--r--include/linux/mm.h6
-rw-r--r--include/linux/mmap_lock.h30
-rw-r--r--include/linux/page-flags.h2
-rw-r--r--include/linux/pgtable.h45
-rw-r--r--include/linux/rmap.h22
-rw-r--r--kernel/events/core.c36
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/module/main.c13
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile1
-rw-r--r--mm/damon/vaddr.c4
-rw-r--r--mm/execmem.c206
-rw-r--r--mm/internal.h2
-rw-r--r--mm/io-mapping.c30
-rw-r--r--mm/kasan/common.c25
-rw-r--r--mm/khugepaged.c58
-rw-r--r--mm/madvise.c71
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/mempool.c24
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mmap_lock.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/mseal.c166
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c279
-rw-r--r--mm/vma.c4
-rw-r--r--mm/vma.h27
-rw-r--r--tools/testing/selftests/cachestat/test_cachestat.c62
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/process_madv.c344
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh5
-rw-r--r--tools/testing/selftests/perf_events/.gitignore1
-rw-r--r--tools/testing/selftests/perf_events/Makefile2
-rw-r--r--tools/testing/selftests/perf_events/mmap.c236
-rw-r--r--tools/testing/vma/vma_internal.h6
61 files changed, 1482 insertions, 738 deletions
diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 50cfc7842930..5063179cfc70 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -133,4 +133,3 @@ More Memory Management Functions
.. kernel-doc:: mm/mmu_notifier.c
.. kernel-doc:: mm/balloon_compaction.c
.. kernel-doc:: mm/huge_memory.c
-.. kernel-doc:: mm/io-mapping.c
diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
index 077d2a539c83..fed3e1b8c43f 100644
--- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
@@ -22,6 +22,11 @@ properties:
compatible:
items:
- enum:
+ - apple,s5l8960x-i2c
+ - apple,t7000-i2c
+ - apple,s8000-i2c
+ - apple,t8010-i2c
+ - apple,t8015-i2c
- apple,t8103-i2c
- apple,t8112-i2c
- apple,t6000-i2c
diff --git a/arch/arm/include/asm/cti.h b/arch/arm/include/asm/cti.h
deleted file mode 100644
index f8500e5d6ea8..000000000000
--- a/arch/arm/include/asm/cti.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASMARM_CTI_H
-#define __ASMARM_CTI_H
-
-#include <asm/io.h>
-#include <asm/hardware/coresight.h>
-
-/* The registers' definition is from section 3.2 of
- * Embedded Cross Trigger Revision: r0p0
- */
-#define CTICONTROL 0x000
-#define CTISTATUS 0x004
-#define CTILOCK 0x008
-#define CTIPROTECTION 0x00C
-#define CTIINTACK 0x010
-#define CTIAPPSET 0x014
-#define CTIAPPCLEAR 0x018
-#define CTIAPPPULSE 0x01c
-#define CTIINEN 0x020
-#define CTIOUTEN 0x0A0
-#define CTITRIGINSTATUS 0x130
-#define CTITRIGOUTSTATUS 0x134
-#define CTICHINSTATUS 0x138
-#define CTICHOUTSTATUS 0x13c
-#define CTIPERIPHID0 0xFE0
-#define CTIPERIPHID1 0xFE4
-#define CTIPERIPHID2 0xFE8
-#define CTIPERIPHID3 0xFEC
-#define CTIPCELLID0 0xFF0
-#define CTIPCELLID1 0xFF4
-#define CTIPCELLID2 0xFF8
-#define CTIPCELLID3 0xFFC
-
-/* The below are from section 3.6.4 of
- * CoreSight v1.0 Architecture Specification
- */
-#define LOCKACCESS 0xFB0
-#define LOCKSTATUS 0xFB4
-
-/**
- * struct cti - cross trigger interface struct
- * @base: mapped virtual address for the cti base
- * @irq: irq number for the cti
- * @trig_out_for_irq: triger out number which will cause
- * the @irq happen
- *
- * cti struct used to operate cti registers.
- */
-struct cti {
- void __iomem *base;
- int irq;
- int trig_out_for_irq;
-};
-
-/**
- * cti_init - initialize the cti instance
- * @cti: cti instance
- * @base: mapped virtual address for the cti base
- * @irq: irq number for the cti
- * @trig_out: triger out number which will cause
- * the @irq happen
- *
- * called by machine code to pass the board dependent
- * @base, @irq and @trig_out to cti.
- */
-static inline void cti_init(struct cti *cti,
- void __iomem *base, int irq, int trig_out)
-{
- cti->base = base;
- cti->irq = irq;
- cti->trig_out_for_irq = trig_out;
-}
-
-/**
- * cti_map_trigger - use the @chan to map @trig_in to @trig_out
- * @cti: cti instance
- * @trig_in: trigger in number
- * @trig_out: trigger out number
- * @channel: channel number
- *
- * This function maps one trigger in of @trig_in to one trigger
- * out of @trig_out using the channel @chan.
- */
-static inline void cti_map_trigger(struct cti *cti,
- int trig_in, int trig_out, int chan)
-{
- void __iomem *base = cti->base;
- unsigned long val;
-
- val = __raw_readl(base + CTIINEN + trig_in * 4);
- val |= BIT(chan);
- __raw_writel(val, base + CTIINEN + trig_in * 4);
-
- val = __raw_readl(base + CTIOUTEN + trig_out * 4);
- val |= BIT(chan);
- __raw_writel(val, base + CTIOUTEN + trig_out * 4);
-}
-
-/**
- * cti_enable - enable the cti module
- * @cti: cti instance
- *
- * enable the cti module
- */
-static inline void cti_enable(struct cti *cti)
-{
- __raw_writel(0x1, cti->base + CTICONTROL);
-}
-
-/**
- * cti_disable - disable the cti module
- * @cti: cti instance
- *
- * enable the cti module
- */
-static inline void cti_disable(struct cti *cti)
-{
- __raw_writel(0, cti->base + CTICONTROL);
-}
-
-/**
- * cti_irq_ack - clear the cti irq
- * @cti: cti instance
- *
- * clear the cti irq
- */
-static inline void cti_irq_ack(struct cti *cti)
-{
- void __iomem *base = cti->base;
- unsigned long val;
-
- val = __raw_readl(base + CTIINTACK);
- val |= BIT(cti->trig_out_for_irq);
- __raw_writel(val, base + CTIINTACK);
-}
-
-/**
- * cti_unlock - unlock cti module
- * @cti: cti instance
- *
- * unlock the cti module, or else any writes to the cti
- * module is not allowed.
- */
-static inline void cti_unlock(struct cti *cti)
-{
- __raw_writel(CS_LAR_KEY, cti->base + LOCKACCESS);
-}
-
-/**
- * cti_lock - lock cti module
- * @cti: cti instance
- *
- * lock the cti module, so any writes to the cti
- * module will be not allowed.
- */
-static inline void cti_lock(struct cti *cti)
-{
- __raw_writel(~CS_LAR_KEY, cti->base + LOCKACCESS);
-}
-#endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index abd9725796e9..34e5d78af076 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -721,7 +721,7 @@ void mark_rodata_ro(void)
static void __init declare_vma(struct vm_struct *vma,
void *va_start, void *va_end,
- vm_flags_t vm_flags)
+ unsigned long vm_flags)
{
phys_addr_t pa_start = __pa_symbol(va_start);
unsigned long size = va_end - va_start;
@@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init);
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
- pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
+ pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9f6b7dab2d9a..7bde68247b5f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -120,7 +120,7 @@ struct its_array its_pages;
static void *__its_alloc(struct its_array *pages)
{
- void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
if (!page)
return NULL;
@@ -237,7 +237,6 @@ static void *its_alloc(void)
if (!page)
return NULL;
- execmem_make_temp_rw(page, PAGE_SIZE);
if (pages == &its_pages)
set_memory_x((unsigned long)page, 1);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 252e82bcfd2f..4450acec9390 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command)
static inline void *alloc_tramp(unsigned long size)
{
- return execmem_alloc(EXECMEM_FTRACE, size);
+ return execmem_alloc_rw(EXECMEM_FTRACE, size);
}
static inline void tramp_free(void *tramp)
{
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 47cb8eb138ba..6079d15dab8c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
return len;
}
-/* Make page to RO mode when allocate it */
-void *alloc_insn_page(void)
-{
- void *page;
-
- page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
- if (!page)
- return NULL;
-
- /*
- * TODO: Once additional kernel code protection mechanisms are set, ensure
- * that the page was not maliciously altered and it is still zeroed.
- */
- set_memory_rox((unsigned long)page, 1);
-
- return page;
-}
-
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7456df985d96..bb57e93b4caf 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void)
static struct execmem_info execmem_info __ro_after_init;
#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
+void execmem_fill_trapping_insns(void *ptr, size_t size)
{
- /* fill memory with INT3 instructions */
- if (writeable)
- memset(ptr, INT3_INSN_OPCODE, size);
- else
- text_poke_set(ptr, INT3_INSN_OPCODE, size);
+ memset(ptr, INT3_INSN_OPCODE, size);
}
#endif
@@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void)
.pgprot = pgprot,
.alignment = MODULE_ALIGN,
},
- [EXECMEM_KPROBES ... EXECMEM_BPF] = {
+ [EXECMEM_KPROBES] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL_ROX,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_FTRACE] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_BPF] = {
.flags = EXECMEM_KASAN_SHADOW,
.start = start,
.end = MODULES_END,
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index c8d115b58e44..070d014fdc5d 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -992,7 +992,6 @@ config I2C_APPLE
tristate "Apple SMBus platform driver"
depends on !I2C_PASEMI
depends on ARCH_APPLE || COMPILE_TEST
- default ARCH_APPLE
help
Say Y here if you want to use the I2C controller present on Apple
Silicon chips such as the M1.
diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c
index 13889f52b6f7..ff2289b52c84 100644
--- a/drivers/i2c/busses/i2c-qcom-geni.c
+++ b/drivers/i2c/busses/i2c-qcom-geni.c
@@ -155,9 +155,9 @@ static const struct geni_i2c_clk_fld geni_i2c_clk_map_19p2mhz[] = {
/* source_clock = 32 MHz */
static const struct geni_i2c_clk_fld geni_i2c_clk_map_32mhz[] = {
- { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 40 },
- { I2C_MAX_FAST_MODE_FREQ, 4, 3, 11, 20 },
- { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 6, 15 },
+ { I2C_MAX_STANDARD_MODE_FREQ, 8, 14, 18, 38 },
+ { I2C_MAX_FAST_MODE_FREQ, 4, 3, 9, 19 },
+ { I2C_MAX_FAST_MODE_PLUS_FREQ, 2, 3, 5, 15 },
{}
};
diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c
index c8b4b404f6c1..e6815f6cae78 100644
--- a/drivers/i2c/busses/i2c-stm32f7.c
+++ b/drivers/i2c/busses/i2c-stm32f7.c
@@ -742,11 +742,14 @@ static void stm32f7_i2c_dma_callback(void *arg)
{
struct stm32f7_i2c_dev *i2c_dev = arg;
struct stm32_i2c_dma *dma = i2c_dev->dma;
+ struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg;
stm32f7_i2c_disable_dma_req(i2c_dev);
dmaengine_terminate_async(dma->chan_using);
dma_unmap_single(i2c_dev->dev, dma->dma_buf, dma->dma_len,
dma->dma_data_dir);
+ if (!f7_msg->smbus)
+ i2c_put_dma_safe_msg_buf(f7_msg->buf, i2c_dev->msg, true);
complete(&dma->dma_complete);
}
@@ -882,6 +885,7 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev,
{
struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg;
void __iomem *base = i2c_dev->base;
+ u8 *dma_buf;
u32 cr1, cr2;
int ret;
@@ -931,17 +935,23 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev,
/* Configure DMA or enable RX/TX interrupt */
i2c_dev->use_dma = false;
- if (i2c_dev->dma && f7_msg->count >= STM32F7_I2C_DMA_LEN_MIN
- && !i2c_dev->atomic) {
- ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma,
- msg->flags & I2C_M_RD,
- f7_msg->count, f7_msg->buf,
- stm32f7_i2c_dma_callback,
- i2c_dev);
- if (!ret)
- i2c_dev->use_dma = true;
- else
- dev_warn(i2c_dev->dev, "can't use DMA\n");
+ if (i2c_dev->dma && !i2c_dev->atomic) {
+ dma_buf = i2c_get_dma_safe_msg_buf(msg, STM32F7_I2C_DMA_LEN_MIN);
+ if (dma_buf) {
+ f7_msg->buf = dma_buf;
+ ret = stm32_i2c_prep_dma_xfer(i2c_dev->dev, i2c_dev->dma,
+ msg->flags & I2C_M_RD,
+ f7_msg->count, f7_msg->buf,
+ stm32f7_i2c_dma_callback,
+ i2c_dev);
+ if (ret) {
+ dev_warn(i2c_dev->dev, "can't use DMA\n");
+ i2c_put_dma_safe_msg_buf(f7_msg->buf, msg, false);
+ f7_msg->buf = msg->buf;
+ } else {
+ i2c_dev->use_dma = true;
+ }
+ }
}
if (!i2c_dev->use_dma) {
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index 4f05afab161f..4eb31b913c1a 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -134,6 +134,8 @@
#define I2C_MST_FIFO_STATUS_TX GENMASK(23, 16)
#define I2C_MST_FIFO_STATUS_RX GENMASK(7, 0)
+#define I2C_MASTER_RESET_CNTRL 0x0a8
+
/* configuration load timeout in microseconds */
#define I2C_CONFIG_LOAD_TIMEOUT 1000000
@@ -184,6 +186,9 @@ enum msg_end_type {
* @has_mst_fifo: The I2C controller contains the new MST FIFO interface that
* provides additional features and allows for longer messages to
* be transferred in one go.
+ * @has_mst_reset: The I2C controller contains MASTER_RESET_CTRL register which
+ * provides an alternative to controller reset when configured as
+ * I2C master
* @quirks: I2C adapter quirks for limiting write/read transfer size and not
* allowing 0 length transfers.
* @supports_bus_clear: Bus Clear support to recover from bus hang during
@@ -213,6 +218,7 @@ struct tegra_i2c_hw_feature {
bool has_multi_master_mode;
bool has_slcg_override_reg;
bool has_mst_fifo;
+ bool has_mst_reset;
const struct i2c_adapter_quirks *quirks;
bool supports_bus_clear;
bool has_apb_dma;
@@ -605,6 +611,26 @@ static int tegra_i2c_wait_for_config_load(struct tegra_i2c_dev *i2c_dev)
return 0;
}
+static int tegra_i2c_master_reset(struct tegra_i2c_dev *i2c_dev)
+{
+ if (!i2c_dev->hw->has_mst_reset)
+ return -EOPNOTSUPP;
+
+ /*
+ * Writing 1 to I2C_MASTER_RESET_CNTRL will reset all internal state of
+ * Master logic including FIFOs. Clear this bit to 0 for normal operation.
+ * SW needs to wait for 2us after assertion and de-assertion of this soft
+ * reset.
+ */
+ i2c_writel(i2c_dev, 0x1, I2C_MASTER_RESET_CNTRL);
+ fsleep(2);
+
+ i2c_writel(i2c_dev, 0x0, I2C_MASTER_RESET_CNTRL);
+ fsleep(2);
+
+ return 0;
+}
+
static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
{
u32 val, clk_divisor, clk_multiplier, tsu_thd, tlow, thigh, non_hs_mode;
@@ -612,6 +638,16 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
int err;
/*
+ * Reset the controller before initializing it.
+ * In case if device_reset() returns -ENOENT, i.e. when the reset is
+ * not available, the internal software reset will be used if it is
+ * supported by the controller.
+ */
+ err = device_reset(i2c_dev->dev);
+ if (err == -ENOENT)
+ err = tegra_i2c_master_reset(i2c_dev);
+
+ /*
* The reset shouldn't ever fail in practice. The failure will be a
* sign of a severe problem that needs to be resolved. Still we don't
* want to fail the initialization completely because this may break
@@ -619,7 +655,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
* emit a noisy warning on error, which won't stay unnoticed and
* won't hose machine entirely.
*/
- err = device_reset(i2c_dev->dev);
WARN_ON_ONCE(err);
if (IS_DVC(i2c_dev))
@@ -1266,17 +1301,9 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (i2c_dev->dma_mode) {
if (i2c_dev->msg_read) {
- dma_sync_single_for_device(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_FROM_DEVICE);
-
err = tegra_i2c_dma_submit(i2c_dev, xfer_size);
if (err)
return err;
- } else {
- dma_sync_single_for_cpu(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_TO_DEVICE);
}
}
@@ -1286,11 +1313,6 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (i2c_dev->dma_mode) {
memcpy(i2c_dev->dma_buf + I2C_PACKET_HEADER_SIZE,
msg->buf, i2c_dev->msg_len);
-
- dma_sync_single_for_device(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_TO_DEVICE);
-
err = tegra_i2c_dma_submit(i2c_dev, xfer_size);
if (err)
return err;
@@ -1331,13 +1353,8 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
return -ETIMEDOUT;
}
- if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE) {
- dma_sync_single_for_cpu(i2c_dev->dma_dev,
- i2c_dev->dma_phys,
- xfer_size, DMA_FROM_DEVICE);
-
+ if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE)
memcpy(i2c_dev->msg_buf, i2c_dev->dma_buf, i2c_dev->msg_len);
- }
}
time_left = tegra_i2c_wait_completion(i2c_dev, &i2c_dev->msg_complete,
@@ -1468,6 +1485,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = false,
.has_apb_dma = true,
@@ -1492,6 +1510,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = false,
.has_apb_dma = true,
@@ -1516,6 +1535,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = false,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1540,6 +1560,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1564,6 +1585,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = true,
@@ -1588,6 +1610,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = {
.has_multi_master_mode = false,
.has_slcg_override_reg = true,
.has_mst_fifo = false,
+ .has_mst_reset = false,
.quirks = &tegra_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = false,
@@ -1612,6 +1635,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = {
.has_multi_master_mode = true,
.has_slcg_override_reg = true,
.has_mst_fifo = true,
+ .has_mst_reset = true,
.quirks = &tegra194_i2c_quirks,
.supports_bus_clear = true,
.has_apb_dma = false,
diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 3445cc3b476b..ed90858a27b7 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -370,6 +370,7 @@ static const struct acpi_device_id i2c_acpi_force_100khz_device_ids[] = {
* the device works without issues on Windows at what is expected to be
* a 400KHz frequency. The root cause of the issue is not known.
*/
+ { "DLL0945", 0 },
{ "ELAN06FA", 0 },
{}
};
diff --git a/drivers/i2c/muxes/i2c-mux-mule.c b/drivers/i2c/muxes/i2c-mux-mule.c
index 284ff4afeeac..d3b32b794172 100644
--- a/drivers/i2c/muxes/i2c-mux-mule.c
+++ b/drivers/i2c/muxes/i2c-mux-mule.c
@@ -47,7 +47,6 @@ static int mule_i2c_mux_probe(struct platform_device *pdev)
struct mule_i2c_reg_mux *priv;
struct i2c_client *client;
struct i2c_mux_core *muxc;
- struct device_node *dev;
unsigned int readback;
int ndev, ret;
bool old_fw;
@@ -95,7 +94,7 @@ static int mule_i2c_mux_probe(struct platform_device *pdev)
"Failed to register mux remove\n");
/* Create device adapters */
- for_each_child_of_node(mux_dev->of_node, dev) {
+ for_each_child_of_node_scoped(mux_dev->of_node, dev) {
u32 reg;
ret = of_property_read_u32(dev, "reg", &reg);
diff --git a/drivers/media/platform/qcom/venus/pm_helpers.c b/drivers/media/platform/qcom/venus/pm_helpers.c
index 8dd5a9b0d060..e32f8862a9f9 100644
--- a/drivers/media/platform/qcom/venus/pm_helpers.c
+++ b/drivers/media/platform/qcom/venus/pm_helpers.c
@@ -48,7 +48,8 @@ static int core_clks_enable(struct venus_core *core)
int ret;
opp = dev_pm_opp_find_freq_ceil(dev, &freq);
- dev_pm_opp_put(opp);
+ if (!IS_ERR(opp))
+ dev_pm_opp_put(opp);
for (i = 0; i < res->clks_num; i++) {
if (IS_V6(core)) {
@@ -660,7 +661,8 @@ static int decide_core(struct venus_inst *inst)
/*TODO : divide this inst->load by work_route */
opp = dev_pm_opp_find_freq_floor(dev, &max_freq);
- dev_pm_opp_put(opp);
+ if (!IS_ERR(opp))
+ dev_pm_opp_put(opp);
min_loaded_core(inst, &min_coreid, &min_load, false);
min_loaded_core(inst, &min_lp_coreid, &min_lp_load, true);
@@ -1121,7 +1123,8 @@ static int load_scale_v4(struct venus_inst *inst)
freq = max(freq_core1, freq_core2);
opp = dev_pm_opp_find_freq_floor(dev, &max_freq);
- dev_pm_opp_put(opp);
+ if (!IS_ERR(opp))
+ dev_pm_opp_put(opp);
if (freq > max_freq) {
dev_dbg(dev, VDBGL "requested clock rate: %lu scaling clock rate : %lu\n",
@@ -1131,7 +1134,8 @@ static int load_scale_v4(struct venus_inst *inst)
}
opp = dev_pm_opp_find_freq_ceil(dev, &freq);
- dev_pm_opp_put(opp);
+ if (!IS_ERR(opp))
+ dev_pm_opp_put(opp);
set_freq:
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index a38ffbf4b3f0..511c4154cf74 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1120,6 +1120,9 @@ static void __handle_link_change(struct usbnet *dev)
if (!test_bit(EVENT_DEV_OPEN, &dev->flags))
return;
+ if (test_and_clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags))
+ netif_carrier_on(dev->net);
+
if (!netif_carrier_ok(dev->net)) {
/* kill URBs for reading packets to save bus bandwidth */
unlink_urbs(dev, &dev->rxq);
@@ -1129,9 +1132,6 @@ static void __handle_link_change(struct usbnet *dev)
* tx queue is stopped by netcore after link becomes off
*/
} else {
- if (test_and_clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags))
- netif_carrier_on(dev->net);
-
/* submitting URBs for reading packets */
queue_work(system_bh_wq, &dev->bh_work);
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 3103b932b674..ee060e26f51d 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -996,6 +996,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int num_entries = exfat_calc_num_entries(p_uniname);
+ unsigned int clu_count = 0;
if (num_entries < 0)
return num_entries;
@@ -1133,6 +1134,10 @@ rewind:
} else {
if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ goto not_found;
}
}
@@ -1195,6 +1200,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
int i, count = 0;
int dentries_per_clu;
unsigned int entry_type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -1227,6 +1233,12 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ if (unlikely(++clu_count > sbi->used_clusters)) {
+ exfat_fs_error(sb, "FAT or bitmap is corrupted");
+ return -EIO;
+ }
+
}
}
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 23065f948ae7..232cc7f8ab92 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -490,5 +490,15 @@ int exfat_count_num_clusters(struct super_block *sb,
}
*ret_count = count;
+
+ /*
+ * since exfat_count_used_clusters() is not called, sbi->used_clusters
+ * cannot be used here.
+ */
+ if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) {
+ exfat_fs_error(sb, "The cluster chain has a loop");
+ return -EIO;
+ }
+
return 0;
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 6b82497572b4..538d2b6ac2ec 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -622,9 +622,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (pos > valid_size)
pos = valid_size;
- if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
- ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
- iocb->ki_flags & IOCB_SYNC);
+ if (iocb->ki_pos > pos) {
+ ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos);
if (err < 0)
return err;
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index fede0283d6e2..f5f1c4e8a29f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -890,6 +890,7 @@ static int exfat_check_dir_empty(struct super_block *sb,
{
int i, dentries_per_clu;
unsigned int type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -926,6 +927,10 @@ static int exfat_check_dir_empty(struct super_block *sb,
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ break;
}
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index ea5c1334a214..8926e63f5bb7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -341,13 +341,12 @@ static void exfat_hash_init(struct super_block *sb)
INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
}
-static int exfat_read_root(struct inode *inode)
+static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct exfat_chain cdir;
- int num_subdirs, num_clu = 0;
+ int num_subdirs;
exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
ei->entry = -1;
@@ -360,12 +359,9 @@ static int exfat_read_root(struct inode *inode)
ei->hint_stat.clu = sbi->root_dir;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- i_size_write(inode, num_clu << sbi->cluster_size_bits);
+ i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi));
- num_subdirs = exfat_count_dir_entries(sb, &cdir);
+ num_subdirs = exfat_count_dir_entries(sb, root_clu);
if (num_subdirs < 0)
return -EIO;
set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR);
@@ -578,7 +574,8 @@ static int exfat_verify_boot_region(struct super_block *sb)
}
/* mount the file system volume */
-static int __exfat_fill_super(struct super_block *sb)
+static int __exfat_fill_super(struct super_block *sb,
+ struct exfat_chain *root_clu)
{
int ret;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -595,6 +592,18 @@ static int __exfat_fill_super(struct super_block *sb)
goto free_bh;
}
+ /*
+ * Call exfat_count_num_cluster() before searching for up-case and
+ * bitmap directory entries to avoid infinite loop if they are missing
+ * and the cluster chain includes a loop.
+ */
+ exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size);
+ if (ret) {
+ exfat_err(sb, "failed to count the number of clusters in root");
+ goto free_bh;
+ }
+
ret = exfat_create_upcase_table(sb);
if (ret) {
exfat_err(sb, "failed to load upcase table");
@@ -627,6 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
struct exfat_sb_info *sbi = sb->s_fs_info;
struct exfat_mount_options *opts = &sbi->options;
struct inode *root_inode;
+ struct exfat_chain root_clu;
int err;
if (opts->allow_utime == (unsigned short)-1)
@@ -645,7 +655,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
- err = __exfat_fill_super(sb);
+ err = __exfat_fill_super(sb, &root_clu);
if (err) {
exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
@@ -680,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
root_inode->i_ino = EXFAT_ROOT_INO;
inode_set_iversion(root_inode, 1);
- err = exfat_read_root(root_inode);
+ err = exfat_read_root(root_inode, &root_clu);
if (err) {
exfat_err(sb, "failed to initialize root inode");
goto put_inode;
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 3be35680a54f..7de229134e30 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -60,27 +60,11 @@ enum execmem_range_flags {
* will trap
* @ptr: pointer to memory to fill
* @size: size of the range to fill
- * @writable: is the memory poited by @ptr is writable or ROX
*
* A hook for architecures to fill execmem ranges with invalid instructions.
* Architectures that use EXECMEM_ROX_CACHE must implement this.
*/
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
-
-/**
- * execmem_make_temp_rw - temporarily remap region with read-write
- * permissions
- * @ptr: address of the region to remap
- * @size: size of the region to remap
- *
- * Remaps a part of the cached large page in the ROX cache in the range
- * [@ptr, @ptr + @size) as writable and not executable. The caller must
- * have exclusive ownership of this range and ensure nothing will try to
- * execute code in this range.
- *
- * Return: 0 on success or negative error code on failure.
- */
-int execmem_make_temp_rw(void *ptr, size_t size);
+void execmem_fill_trapping_insns(void *ptr, size_t size);
/**
* execmem_restore_rox - restore read-only-execute permissions
@@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size);
*/
int execmem_restore_rox(void *ptr, size_t size);
#else
-static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
#endif
@@ -166,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void);
void *execmem_alloc(enum execmem_type type, size_t size);
/**
+ * execmem_alloc_rw - allocate writable executable memory
+ * @type: type of the allocation
+ * @size: how many bytes of memory are required
+ *
+ * Allocates memory that will contain executable code, either generated or
+ * loaded from kernel modules.
+ *
+ * Allocates memory that will contain data coupled with executable code,
+ * like data sections in kernel modules.
+ *
+ * Forces writable permissions on the allocated memory and the caller is
+ * responsible to manage the permissions afterwards.
+ *
+ * For architectures that use ROX cache the permissions will be set to R+W.
+ * For architectures that don't use ROX cache the default permissions for @type
+ * will be used as they must be writable.
+ *
+ * Return: a pointer to the allocated memory or %NULL
+ */
+void *execmem_alloc_rw(enum execmem_type type, size_t size);
+
+/**
* execmem_free - free executable memory
* @ptr: pointer to the memory that should be freed
*/
@@ -186,19 +191,6 @@ struct vm_struct *execmem_vmap(size_t size);
#endif
/**
- * execmem_update_copy - copy an update to executable memory
- * @dst: destination address to update
- * @src: source address containing the data
- * @size: how many bytes of memory shold be copied
- *
- * Copy @size bytes from @src to @dst using text poking if the memory at
- * @dst is read-only.
- *
- * Return: a pointer to @dst or NULL on error
- */
-void *execmem_update_copy(void *dst, const void *src, size_t size);
-
-/**
* execmem_is_rox - check if execmem is read-only
* @type - the execmem type to check
*
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 7376c1df9c90..c16353cc6e3c 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap)
kfree(iomap);
}
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size);
-
#endif /* _LINUX_IO_MAPPING_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 349f0d9aad22..1ae97a0b8ec7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
/* Bits set in the VMA until the stack is in its final location */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 1f4f44951abe..11a078de9150 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w);
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>
+#include <linux/sched/mm.h>
#define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
* reused and attached to a different mm before we lock it.
* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
* detached.
+ *
+ * WARNING! The vma passed to this function cannot be used if the function
+ * fails to lock it because in certain cases RCU lock is dropped and then
+ * reacquired. Once RCU lock is dropped the vma can be concurently freed.
*/
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
struct vm_area_struct *vma)
@@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
}
rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ if (unlikely(vma->vm_mm != mm)) {
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ struct mm_struct *other_mm = vma->vm_mm;
+
+ /*
+ * __mmdrop() is a heavy operation and we don't need RCU
+ * protection here. Release RCU lock during these operations.
+ * We reinstate the RCU read lock as the caller expects it to
+ * be held when this function returns even on error.
+ */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+ rcu_read_lock();
+ return NULL;
+ }
+
/*
* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8e4d6eda8a8d..8d3fa3a91ce4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -837,8 +837,6 @@ void set_page_writeback(struct page *page);
#define folio_start_writeback(folio) \
__folio_start_writeback(folio, false)
-#define folio_start_writeback_keepwrite(folio) \
- __folio_start_writeback(folio, true)
static __always_inline bool folio_test_head(const struct folio *folio)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e3b99920be05..4c035637eeb7 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
}
#endif
+/**
+ * get_and_clear_ptes - Clear present PTEs that map consecutive pages of
+ * the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of get_and_clear_full_ptes() if it is known that we don't
+ * need to clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ return get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
#ifndef clear_full_ptes
/**
* clear_full_ptes - Clear present PTEs that map consecutive pages of the same
@@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
}
#endif
+/**
+ * clear_ptes - Clear present PTEs that map consecutive pages of the same folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of clear_full_ptes() if it is known that we don't need to
+ * clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
/*
* If two threads concurrently fault at the same page, the thread that
* won the race updates the PTE and its local TLB/Cache. The other thread
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 20803fcb49a7..6cd020eea37a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
default:
VM_WARN_ON_ONCE(true);
}
+
+ /*
+ * Anon folios must have an associated live anon_vma as long as they're
+ * mapped into userspace.
+ * Note that the atomic_read() mainly does two things:
+ *
+ * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
+ * check that the associated anon_vma has not yet been freed (subject
+ * to KASAN's usual limitations). This check will pass if the
+ * anon_vma's refcount has already dropped to 0 but an RCU grace
+ * period hasn't passed since then.
+ * 2. If the anon_vma has not yet been freed, it checks that the
+ * anon_vma still has a nonzero refcount (as opposed to being in the
+ * middle of an RCU delay for getting freed).
+ */
+ if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
+ unsigned long mapping = (unsigned long)folio->mapping;
+ struct anon_vma *anon_vma;
+
+ anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
+ VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
+ }
}
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 22fdf0c187cd..8060c2857bb2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
+ .may_split = perf_mmap_may_split,
};
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
@@ -7051,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = 0;
goto unlock;
}
-
- atomic_set(&rb->aux_mmap_count, 1);
}
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
@@ -7115,15 +7123,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
+ ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
+ if (!ret) {
+ atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
+ }
}
- ret = 0;
-
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -7131,6 +7140,7 @@ unlock:
atomic_inc(&event->mmap_count);
} else if (rb) {
+ /* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
@@ -7138,6 +7148,9 @@ aux_unlock:
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
+ if (ret)
+ return ret;
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7145,13 +7158,20 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (!ret)
- ret = map_range(rb, vma);
-
mapped = get_mapped(event, event_mapped);
if (mapped)
mapped(event, vma->vm_mm);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
+
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ce93fd20f82..c4ada32598bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
}
if (mm_pgtables_bytes(mm))
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7f8bb51aedd4..c66b26184936 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1322,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
else
execmem_type = EXECMEM_MODULE_TEXT;
- ptr = execmem_alloc(execmem_type, size);
+ ptr = execmem_alloc_rw(execmem_type, size);
if (!ptr)
return -ENOMEM;
- if (execmem_is_rox(execmem_type)) {
- int err = execmem_make_temp_rw(ptr, size);
-
- if (err) {
- execmem_free(ptr);
- return -ENOMEM;
- }
-
- mod->mem[type].is_rox = true;
- }
+ mod->mem[type].is_rox = execmem_is_rox(execmem_type);
/*
* The pointer to these blocks of memory are stored on the module
diff --git a/mm/Kconfig b/mm/Kconfig
index d5d4eca947a6..e443fe8cd6cf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1242,10 +1242,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool
-# struct io_mapping based helper. Selected by drivers that need them
-config IO_MAPPING
- bool
-
config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..ef54aa615d9d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 94af19c4dfed..87e825349bdf 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio,
target -= dests->weight_arr[i];
}
+ /* If the folio is already in the right node, don't do anything */
+ if (folio_nid(folio) == dests->node_id_arr[i])
+ return;
+
isolate:
if (!folio_isolate_lru(folio))
return;
diff --git a/mm/execmem.c b/mm/execmem.c
index 627e6cf64f4f..0822305413ec 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
return vmalloc(size);
}
@@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay to schedule asynchronous free if fast path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -130,6 +137,27 @@ err_restore:
return err;
}
+static int execmem_force_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
static void execmem_cache_clean(struct work_struct *work)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
@@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -256,7 +283,7 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
- vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP;
+ unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p) {
+ alloc_size = size;
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ }
+
if (!p)
return err;
@@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem;
/* fill memory with instructions that will trap */
- execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+ execmem_fill_trapping_insns(p, alloc_size);
err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
- err = execmem_cache_add(p, alloc_size);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
goto err_reset_direct_map;
@@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
-static bool execmem_cache_free(void *ptr)
+static inline bool is_pending_free(void *ptr)
{
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
- unsigned long addr = (unsigned long)ptr;
- MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
- void *area;
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
- mutex_lock(mutex);
- area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
- return false;
- }
- size = mas_range_len(&mas);
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
- execmem_cache_add(ptr, size);
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
- schedule_work(&execmem_cache_clean_work);
+ execmem_fill_trapping_insns(ptr, size);
+ execmem_restore_rox(ptr, size);
- return true;
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
}
-int execmem_make_temp_rw(void *ptr, size_t size)
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long addr = (unsigned long)ptr;
- int ret;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
- ret = set_memory_nx(addr, nr);
- if (ret)
- return ret;
+ guard(mutex)(&execmem_cache.mutex);
- return set_memory_rw(addr, nr);
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ area = pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
}
-int execmem_restore_rox(void *ptr, size_t size)
+static bool execmem_cache_free(void *ptr)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ void *area;
+ int err;
- return set_memory_rox(addr, nr);
+ guard(mutex)(&execmem_cache.mutex);
+
+ area = mas_walk(&mas);
+ if (!area)
+ return false;
+
+ err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
+ if (err) {
+ /*
+ * mas points to exact slot we've got the area from, nothing
+ * else can modify the tree because of the mutex, so there
+ * won't be any allocations in mas_store_gfp() and it will just
+ * change the pointer.
+ */
+ area = pending_free_set(area);
+ mas_store_gfp(&mas, area, GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
+ }
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * when ROX cache is not used the permissions defined by architectures for
+ * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must
+ * be writable anyway
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+ return 0;
+}
+
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
return NULL;
@@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
- vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS;
+ unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
- void *p;
+ void *p = NULL;
size = PAGE_ALIGN(size);
@@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p);
}
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+ void *p __free(execmem) = execmem_alloc(type, size);
+ int err;
+
+ if (!p)
+ return NULL;
+
+ err = execmem_force_rw(p, size);
+ if (err)
+ return NULL;
+
+ return no_free_ptr(p);
+}
+
void execmem_free(void *ptr)
{
/*
@@ -399,11 +506,6 @@ void execmem_free(void *ptr)
vfree(ptr);
}
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
- return text_poke_copy(dst, src, size);
-}
-
bool execmem_is_rox(enum execmem_type type)
{
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
diff --git a/mm/internal.h b/mm/internal.h
index 1da16d550a45..45b725c3dc03 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- vm_flags_t vm_flags, unsigned long start,
+ unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index d3586e95c12c..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- * Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size)
-{
- vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
- if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
- return -EINVAL;
-
- pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
-
- /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
- return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ed4873e18c75..9142964ab9c9 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
}
static inline void poison_slab_object(struct kmem_cache *cache, void *object,
- bool init, bool still_accessible)
+ bool init)
{
void *tagged_object = object;
object = kasan_reset_tag(object);
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(still_accessible))
- return;
-
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
@@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
if (!kasan_arch_is_ready() || is_kfence_address(object))
return false;
- poison_slab_object(cache, object, init, still_accessible);
+ /*
+ * If this point is reached with an object that must still be
+ * accessible under RCU, we can't poison it; in that case, also skip the
+ * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
+ * has been disabled manually.
+ *
+ * Putting the object on the quarantine wouldn't help catch UAFs (since
+ * we can't poison it here), and it would mask bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users not being careful enough about object
+ * reuse; so overall, putting the object into the quarantine here would
+ * be counterproductive.
+ */
+ if (still_accessible)
+ return false;
+
+ poison_slab_object(cache, object, init);
/*
* If the object is put into quarantine, do not let slab put the object
@@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
- poison_slab_object(slab->slab_cache, ptr, false, false);
+ poison_slab_object(slab->slab_cache, ptr, false);
return true;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a55fb1dcd224..374a6a5193a7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_folio_and_swap_cache(src);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1684,10 +1704,10 @@ maybe_install_pmd:
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
diff --git a/mm/madvise.c b/mm/madvise.c
index bb80fc5ea08f..35ed4ab0d7c5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
@@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
&guard_remove_walk_ops, NULL);
}
+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+ switch (behavior) {
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_REMOVE:
+ case MADV_DONTFORK:
+ case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivally discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ struct vm_area_struct *vma = madv_behavior->vma;
+
+ /* If the VMA isn't sealed we're good. */
+ if (!vma_is_sealed(vma))
+ return true;
+
+ /* For a sealed VMA, we only care about discard operations. */
+ if (!is_discard(madv_behavior->behavior))
+ return true;
+
+ /*
+ * We explicitly permit all file-backed mappings, whether MAP_SHARED or
+ * MAP_PRIVATE.
+ *
+ * The latter causes some complications. Because now, one can mmap()
+ * read/write a MAP_PRIVATE mapping, write to it, then mprotect()
+ * read-only, mseal() and a discard will be permitted.
+ *
+ * However, in order to avoid issues with potential use of madvise(...,
+ * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
+ * permit this.
+ */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /* If the user could write to the mapping anyway, then this is fine. */
+ if ((vma->vm_flags & VM_WRITE) &&
+ arch_vma_access_permitted(vma, /* write= */ true,
+ /* execute= */ false, /* foreign= */ false))
+ return true;
+
+ /* Otherwise, we are not permitted to perform this operation. */
+ return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ return true;
+}
+#endif
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
+ if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM;
switch (behavior) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3047b9ac667e..e2e685b971bb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk)
{
struct hwpoison_walk *hwp = walk->private;
- pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t pte;
+ int ret;
- return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
- hwp->pfn, &hwp->tk);
+ ptl = huge_pte_lock(h, walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
+ ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hwpoison_hugetlb_range NULL
diff --git a/mm/mempool.c b/mm/mempool.c
index 204a216b6418..1c38e873e546 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element)
{
- BUG_ON(pool->curr_nr >= pool->min_nr);
+ BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
@@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
-
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ /*
+ * max() used here to ensure storage for at least 1 element to support
+ * zero minimum pool
+ */
+ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
- * First pre-allocate the guaranteed number of buffers.
+ * First pre-allocate the guaranteed number of buffers,
+ * also pre-allocate 1 element for zero minimum pool.
*/
- while (pool->curr_nr < pool->min_nr) {
+ while (pool->curr_nr < max(1, pool->min_nr)) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
@@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool)
* wake-up path of previous test. This explicit check ensures the
* allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened.
- *
- * Inline the same logic as previous test, add_element() cannot be
- * directly used here since it has BUG_ON to deny if min_nr equals
- * curr_nr, so here picked rest of add_element() to use without
- * BUG_ON check.
*/
if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) {
- /* Inline the logic of add_element() */
- poison_element(pool, element);
- if (kasan_poison_element(pool, element))
- pool->elements[pool->curr_nr++] = element;
+ add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
diff --git a/mm/mincore.c b/mm/mincore.c
index 42d6c9c8da86..10dabefc3acc 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE
unsigned char present;
unsigned char *vec = walk->private;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
@@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
+ spin_unlock(ptl);
#else
BUG();
#endif
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 729fb7d0dd59..b006cec8e6fe 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -164,8 +164,7 @@ retry:
*/
/* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read;
rcu_read_unlock();
@@ -236,11 +235,8 @@ retry:
goto fallback;
}
- /*
- * Verify the vma we locked belongs to the same address space and it's
- * not behind of the last search position.
- */
- if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
+ /* Verify the vma is not behind the last search position. */
+ if (unlikely(from_addr >= vma->vm_end))
goto fallback_unlock;
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2ddd37b2f462..78bded7acf79 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long charged = 0;
int error;
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
if (newflags == oldflags) {
diff --git a/mm/mremap.c b/mm/mremap.c
index e15cf2e444c7..677a4d744df9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
old_pte, max_nr_ptes);
force_flush = true;
}
- pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
+ pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
@@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
return -EFAULT;
/* If mseal()'d, mremap() is prohibited. */
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
/* Align to hugetlb page size, if required. */
diff --git a/mm/mseal.c b/mm/mseal.c
index c27197ac04e8..e5b205562d2e 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -11,148 +11,74 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
-static inline void set_vma_sealed(struct vm_area_struct *vma)
-{
- vm_flags_set(vma, VM_SEALED);
-}
-
-static bool is_madv_discard(int behavior)
-{
- switch (behavior) {
- case MADV_FREE:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_REMOVE:
- case MADV_DONTFORK:
- case MADV_WIPEONFORK:
- case MADV_GUARD_INSTALL:
- return true;
- }
-
- return false;
-}
-
-static bool is_ro_anon(struct vm_area_struct *vma)
-{
- /* check anonymous mapping. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
- return false;
-
- /*
- * check for non-writable:
- * PROT=RO or PKRU is not writeable.
- */
- if (!(vma->vm_flags & VM_WRITE) ||
- !arch_vma_access_permitted(vma, true, false, false))
- return true;
-
- return false;
-}
-
/*
- * Check if a vma is allowed to be modified by madvise.
+ * mseal() disallows an input range which contain unmapped ranges (VMA holes).
+ *
+ * It disallows unmapped regions from start to end whether they exist at the
+ * start, in the middle, or at the end of the range, or any combination thereof.
+ *
+ * This is because after sealng a range, there's nothing to stop memory mapping
+ * of ranges in the remaining gaps later, meaning that the user might then
+ * wrongly consider the entirety of the mseal()'d range to be sealed when it
+ * in fact isn't.
*/
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
- if (!is_madv_discard(behavior))
- return true;
-
- if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
- return false;
-
- /* Allow by default. */
- return true;
-}
-
-static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, vm_flags_t newflags)
-{
- int ret = 0;
- vm_flags_t oldflags = vma->vm_flags;
-
- if (newflags == oldflags)
- goto out;
-
- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
-
- set_vma_sealed(vma);
-out:
- *prev = vma;
- return ret;
-}
/*
- * Check for do_mseal:
- * 1> start is part of a valid vma.
- * 2> end is part of a valid vma.
- * 3> No gap (unallocated address) between start and end.
- * 4> map is sealable.
+ * Does the [start, end) range contain any unmapped memory?
+ *
+ * We ensure that:
+ * - start is part of a valid VMA.
+ * - end is part of a valid VMA.
+ * - no gap (unallocated memory) exists between start and end.
*/
-static int check_mm_seal(unsigned long start, unsigned long end)
+static bool range_contains_unmapped(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
- unsigned long nstart = start;
-
+ unsigned long prev_end = start;
VMA_ITERATOR(vmi, current->mm, start);
- /* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
- if (vma->vm_start > nstart)
- /* unallocated memory found. */
- return -ENOMEM;
-
- if (vma->vm_end >= end)
- return 0;
+ if (vma->vm_start > prev_end)
+ return true;
- nstart = vma->vm_end;
+ prev_end = vma->vm_end;
}
- return -ENOMEM;
+ return prev_end < end;
}
-/*
- * Apply sealing.
- */
-static int apply_mm_seal(unsigned long start, unsigned long end)
+static int mseal_apply(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
- unsigned long nstart;
struct vm_area_struct *vma, *prev;
+ unsigned long curr_start = start;
+ VMA_ITERATOR(vmi, mm, start);
- VMA_ITERATOR(vmi, current->mm, start);
-
+ /* We know there are no gaps so this will be non-NULL. */
vma = vma_iter_load(&vmi);
- /*
- * Note: check_mm_seal should already checked ENOMEM case.
- * so vma should not be null, same for the other ENOMEM cases.
- */
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- nstart = start;
for_each_vma_range(vmi, vma, end) {
- int error;
- unsigned long tmp;
- vm_flags_t newflags;
+ unsigned long curr_end = MIN(vma->vm_end, end);
- newflags = vma->vm_flags | VM_SEALED;
- tmp = vma->vm_end;
- if (tmp > end)
- tmp = end;
- error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
- if (error)
- return error;
- nstart = vma_iter_end(&vmi);
+ if (!(vma->vm_flags & VM_SEALED)) {
+ vma = vma_modify_flags(&vmi, prev, vma,
+ curr_start, curr_end,
+ vma->vm_flags | VM_SEALED);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+ vm_flags_set(vma, VM_SEALED);
+ }
+
+ prev = vma;
+ curr_start = curr_end;
}
return 0;
@@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- /*
- * First pass, this helps to avoid
- * partial sealing in case of error in input address range,
- * e.g. ENOMEM error.
- */
- ret = check_mm_seal(start, end);
- if (ret)
+ if (range_contains_unmapped(mm, start, end)) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Second pass, this should success, unless there are errors
@@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
* reaching the max supported VMAs, however, those cases shall
* be rare.
*/
- ret = apply_mm_seal(start, end);
+ ret = mseal_apply(mm, start, end);
out:
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 736d0e0f0618..8b819fafd57b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, vm_flags_t vm_flags, int node,
+ pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
return __vmalloc_noprof(size, gfp_mask);
diff --git a/mm/rmap.c b/mm/rmap.c
index f93ce27132ab..568198e9efc2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
flush_cache_range(vma, address, end_addr);
/* Nuke the page table entry. */
- pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
diff --git a/mm/shmem.c b/mm/shmem.c
index 7fdd707ac1ac..e2c76a30802b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/*
* Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
*
* Checking folio is not enough: by the time a swapcache folio is locked, it
* might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it still presents, else returns -1.
*/
-static bool shmem_confirm_swap(struct address_space *mapping,
- pgoff_t index, swp_entry_t swap)
+static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
+ swp_entry_t swap)
{
- return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = -1;
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (entry == swp_to_radix_entry(swap))
+ ret = xas_get_order(&xas);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -891,7 +903,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -903,14 +917,25 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = radix_to_swp_entry(expected);
do {
+ iter = swap;
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+ * partially freed without split of both entry and
+ * folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -1992,30 +2017,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
swp_entry_t entry, int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << order;
struct folio *new;
+ gfp_t alloc_gfp;
void *shadow;
- int nr_pages;
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
- gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+ alloc_gfp = gfp;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(order))
+ return ERR_PTR(-EINVAL);
+ } else if (order) {
+ /*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, then fallback
+ * to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages)
+ goto fallback;
- gfp = limit_gfp_mask(huge_gfp, gfp);
+ alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ }
+retry:
+ new = shmem_alloc_folio(alloc_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
- new = shmem_alloc_folio(gfp, order, info, index);
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- nr_pages = folio_nr_pages(new);
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- gfp, entry)) {
+ alloc_gfp, entry)) {
folio_put(new);
- return ERR_PTR(-ENOMEM);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
/*
@@ -2030,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
*/
if (swapcache_prepare(entry, nr_pages)) {
folio_put(new);
- return ERR_PTR(-EEXIST);
+ new = ERR_PTR(-EEXIST);
+ /* Try smaller folio to avoid cache conflict */
+ goto fallback;
}
__folio_set_locked(new);
@@ -2044,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
folio_add_lru(new);
swap_read_folio(new, NULL);
return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fallback to, abort */
+ if (!order)
+ return new;
+ entry.val += index - round_down(index, nr_pages);
+ alloc_gfp = gfp;
+ nr_pages = 1;
+ order = 0;
+ goto retry;
}
/*
@@ -2249,7 +2302,7 @@ unlock:
if (xas_error(&xas))
return xas_error(&xas);
- return entry_order;
+ return 0;
}
/*
@@ -2266,133 +2319,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swap, index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
- swp_entry_t swap;
- int error, nr_pages, order, split_order;
+ int error, nr_pages, order;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
+ swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (is_poisoned_swp_entry(index_entry))
return -EIO;
- si = get_swap_device(swap);
- if (!si) {
- if (!shmem_confirm_swap(mapping, index, swap))
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
+ if (unlikely(!si)) {
+ if (order < 0)
return -EEXIST;
else
return -EINVAL;
}
+ if (unlikely(order < 0)) {
+ put_swap_device(si);
+ return -EEXIST;
+ }
+
+ /* index may point to the middle of a large entry, get the sub entry */
+ if (order) {
+ offset = index - round_down(index, 1 << order);
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
- order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int nr_pages = 1 << order;
- bool fallback_order0 = false;
-
- /* Or update major stats only when swapin succeeds?? */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ /* Direct swapin skipping swap cache & readahead */
+ folio = shmem_swap_alloc_folio(inode, vma, index,
+ index_entry, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ goto failed;
+ }
+ skip_swapcache = true;
+ } else {
+ /* Cached swapin only supports order 0 folio */
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
+ if (!folio) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ }
+ if (order > folio_order(folio)) {
/*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(swap, nr_pages) != nr_pages))
- fallback_order0 = true;
-
- /* Skip swapcache for synchronous device. */
- if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
- if (!IS_ERR(folio)) {
- skip_swapcache = true;
- goto alloced;
- }
-
- /*
- * Fallback to swapin order-0 folio unless the swap entry
- * already exists.
- */
- error = PTR_ERR(folio);
- folio = NULL;
- if (error == -EEXIST)
- goto failed;
- }
-
- /*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
-
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
- } else if (order != folio_order(folio)) {
- /*
- * Swap readahead may swap in order 0 folios into swapcache
+ * Swapin may get smaller folios due to various reasons:
+ * It may fallback to order 0 due to memory pressure or race,
+ * swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still stores
* large swap entries. In such cases, we should split the
* large swap entry to prevent possible data corruption.
*/
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- folio_put(folio);
- folio = NULL;
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
+ error = shmem_split_large_entry(inode, index, index_entry, gfp);
+ if (error)
+ goto failed_nolock;
+ }
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
+ /*
+ * If the folio is large, round down swap and index by folio size.
+ * No matter what race occurs, the swap layer ensures we either get
+ * a valid folio that has its swap entry aligned by size, or a
+ * temporarily invalid one which we'll abort very soon and retry.
+ *
+ * shmem_add_to_page_cache ensures the whole range contains expected
+ * entries and prevents any corruption, so any race split is fine
+ * too, it will succeed as long as the entries are still there.
+ */
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > 1) {
+ swap.val = round_down(swap.val, nr_pages);
+ index = round_down(index, nr_pages);
}
-alloced:
- /* We have to do this with folio locked to prevent races */
+ /*
+ * We have to do this with the folio locked to prevent races.
+ * The shmem_confirm_swap below only checks if the first swap
+ * entry matches the folio, that's enough to ensure the folio
+ * is not used outside of shmem, as shmem swap entries
+ * and swap cache folios are never partially freed.
+ */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap) ||
- xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
+ shmem_confirm_swap(mapping, index, swap) < 0 ||
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
@@ -2415,8 +2444,7 @@ alloced:
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2439,18 +2467,19 @@ alloced:
*foliop = folio;
return 0;
failed:
- if (!shmem_confirm_swap(mapping, index, swap))
+ if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache);
unlock:
- if (skip_swapcache)
- swapcache_clear(si, swap, folio_nr_pages(folio));
- if (folio) {
+ if (folio)
folio_unlock(folio);
+failed_nolock:
+ if (skip_swapcache)
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ if (folio)
folio_put(folio);
- }
put_swap_device(si);
return error;
@@ -5960,8 +5989,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
- gfp, NULL, NULL);
+ error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+ &folio, SGP_CACHE, gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/vma.c b/mm/vma.c
index 9ba93be621da..3b12c7579831 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
}
/* Don't bother splitting the VMA if we can't unmap it anyway */
- if (!can_modify_vma(vms->vma)) {
+ if (vma_is_sealed(vms->vma)) {
error = -EPERM;
goto start_split_failed;
}
@@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
for_each_vma_range(*(vms->vmi), next, vms->end) {
long nrpages;
- if (!can_modify_vma(next)) {
+ if (vma_is_sealed(next)) {
error = -EPERM;
goto modify_vma_failed;
}
diff --git a/mm/vma.h b/mm/vma.h
index acdcc515c459..b123a9cdedb0 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -559,38 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
}
#ifdef CONFIG_64BIT
-
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
return (vma->vm_flags & VM_SEALED);
}
-
-/*
- * check if a vma is sealed for modification.
- * return true, if modification is allowed.
- */
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- if (unlikely(vma_is_sealed(vma)))
- return false;
-
- return true;
-}
-
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
-
#else
-
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- return true;
-}
-
-static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
- return true;
+ return false;
}
-
#endif
#if defined(CONFIG_STACK_GROWSUP)
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
index 632ab44737ec..c952640f163b 100644
--- a/tools/testing/selftests/cachestat/test_cachestat.c
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs)
cs->nr_evicted, cs->nr_recently_evicted);
}
+enum file_type {
+ FILE_MMAP,
+ FILE_SHMEM
+};
+
bool write_exactly(int fd, size_t filesize)
{
int random_fd = open("/dev/urandom", O_RDONLY);
@@ -201,8 +206,20 @@ out1:
out:
return ret;
}
+const char *file_type_str(enum file_type type)
+{
+ switch (type) {
+ case FILE_SHMEM:
+ return "shmem";
+ case FILE_MMAP:
+ return "mmap";
+ default:
+ return "unknown";
+ }
+}
-bool test_cachestat_shmem(void)
+
+bool run_cachestat_test(enum file_type type)
{
size_t PS = sysconf(_SC_PAGESIZE);
size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */
@@ -212,27 +229,50 @@ bool test_cachestat_shmem(void)
char *filename = "tmpshmcstat";
struct cachestat cs;
bool ret = true;
+ int fd;
unsigned long num_pages = compute_len / PS;
- int fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ if (type == FILE_SHMEM)
+ fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ else
+ fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd < 0) {
- ksft_print_msg("Unable to create shmem file.\n");
+ ksft_print_msg("Unable to create %s file.\n",
+ file_type_str(type));
ret = false;
goto out;
}
if (ftruncate(fd, filesize)) {
- ksft_print_msg("Unable to truncate shmem file.\n");
+ ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type));
ret = false;
goto close_fd;
}
+ switch (type) {
+ case FILE_SHMEM:
+ if (!write_exactly(fd, filesize)) {
+ ksft_print_msg("Unable to write to file.\n");
+ ret = false;
+ goto close_fd;
+ }
+ break;
+ case FILE_MMAP:
+ char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
- if (!write_exactly(fd, filesize)) {
- ksft_print_msg("Unable to write to shmem file.\n");
+ if (map == MAP_FAILED) {
+ ksft_print_msg("mmap failed.\n");
+ ret = false;
+ goto close_fd;
+ }
+ for (int i = 0; i < filesize; i++)
+ map[i] = 'A';
+ break;
+ default:
+ ksft_print_msg("Unsupported file type.\n");
ret = false;
goto close_fd;
}
-
syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0);
if (syscall_ret) {
@@ -308,12 +348,18 @@ int main(void)
break;
}
- if (test_cachestat_shmem())
+ if (run_cachestat_test(FILE_SHMEM))
ksft_test_result_pass("cachestat works with a shmem file\n");
else {
ksft_test_result_fail("cachestat fails with a shmem file\n");
ret = 1;
}
+ if (run_cachestat_test(FILE_MMAP))
+ ksft_test_result_pass("cachestat works with a mmap file\n");
+ else {
+ ksft_test_result_fail("cachestat fails with a mmap file\n");
+ ret = 1;
+ }
return ret;
}
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
transhuge-stress
pagemap_ioctl
pfnmap
+process_madv
*.tmp*
protection_keys
protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..471cae8427f1
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+FIXTURE(process_madvise)
+{
+ unsigned long page_size;
+ pid_t child_pid;
+ int remote_pidfd;
+ int pidfd;
+};
+
+FIXTURE_SETUP(process_madvise)
+{
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+ self->pidfd = PIDFD_SELF;
+ self->remote_pidfd = -1;
+ self->child_pid = -1;
+};
+
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+ /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL);
+ waitpid(self->child_pid, NULL, 0);
+ }
+
+ if (self->remote_pidfd >= 0)
+ close(self->remote_pidfd);
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+ size_t vlen, int advice, unsigned int flags)
+{
+ return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * This test uses PIDFD_SELF to target the current process. The main
+ * goal is to verify the basic behavior of process_madvise() with
+ * a vector of non-contiguous memory ranges, not its cross-process
+ * capabilities.
+ */
+TEST_F(process_madvise, basic)
+{
+ const unsigned long pagesize = self->page_size;
+ const int madvise_pages = 4;
+ struct iovec vec[madvise_pages];
+ int pidfd = self->pidfd;
+ ssize_t ret;
+ char *map;
+
+ /*
+ * Create a single large mapping. We will pick pages from this
+ * mapping to advise on. This ensures we test non-contiguous iovecs.
+ */
+ map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ /* Fill the entire region with a known pattern. */
+ memset(map, 'A', pagesize * 10);
+
+ /*
+ * Setup the iovec to point to 4 non-contiguous pages
+ * within the mapping.
+ */
+ vec[0].iov_base = &map[0 * pagesize];
+ vec[0].iov_len = pagesize;
+ vec[1].iov_base = &map[3 * pagesize];
+ vec[1].iov_len = pagesize;
+ vec[2].iov_base = &map[5 * pagesize];
+ vec[2].iov_len = pagesize;
+ vec[3].iov_base = &map[8 * pagesize];
+ vec[3].iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
+ if (ret == -1 && errno == EPERM)
+ SKIP(return,
+ "process_madvise() unsupported or permission denied, try running as root.\n");
+ else if (errno == EINVAL)
+ SKIP(return,
+ "process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+ /* The call should succeed and report the total bytes processed. */
+ ASSERT_EQ(ret, madvise_pages * pagesize);
+
+ /* Check that advised pages are now zero. */
+ for (int i = 0; i < madvise_pages; i++) {
+ char *advised_page = (char *)vec[i].iov_base;
+
+ /* Content must be 0, not 'A'. */
+ ASSERT_EQ(*advised_page, '\0');
+ }
+
+ /* Check that an un-advised page in between is still 'A'. */
+ char *unadvised_page = &map[1 * pagesize];
+
+ for (int i = 0; i < pagesize; i++)
+ ASSERT_EQ(unadvised_page[i], 'A');
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+/*
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process, other advices are difficult to verify reliably.
+ *
+ * The test verifies that a memory region in a child process,
+ * focus on process_madv remote result, only check addresses and lengths.
+ * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+ const unsigned long pagesize = self->page_size;
+ long huge_page_size;
+ int pipe_info[2];
+ ssize_t ret;
+ struct iovec vec;
+
+ struct child_info {
+ pid_t pid;
+ void *map_addr;
+ } info;
+
+ huge_page_size = read_pmd_pagesize();
+ if (huge_page_size <= 0)
+ SKIP(return, "Could not determine a valid huge page size.\n");
+
+ ASSERT_EQ(pipe(pipe_info), 0);
+
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0) {
+ char *map;
+ size_t map_size = 2 * huge_page_size;
+
+ close(pipe_info[0]);
+
+ map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Fault in as small pages */
+ for (size_t i = 0; i < map_size; i += pagesize)
+ map[i] = 'A';
+
+ /* Send info and pause */
+ info.pid = getpid();
+ info.map_addr = map;
+ ret = write(pipe_info[1], &info, sizeof(info));
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[1]);
+
+ pause();
+ exit(0);
+ }
+
+ close(pipe_info[1]);
+
+ /* Receive child info */
+ ret = read(pipe_info[0], &info, sizeof(info));
+ if (ret <= 0) {
+ waitpid(self->child_pid, NULL, 0);
+ SKIP(return, "Failed to read child info from pipe.\n");
+ }
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[0]);
+ self->child_pid = info.pid;
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ vec.iov_base = info.map_addr;
+ vec.iov_len = huge_page_size;
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
+ 0);
+ if (ret == -1) {
+ if (errno == EINVAL)
+ SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
+ else if (errno == EPERM)
+ SKIP(return,
+ "No process_madvise() permissions, try running as root.\n");
+ return;
+ }
+
+ ASSERT_EQ(ret, huge_page_size);
+}
+
+/*
+ * Test process_madvise() with a pidfd for a process that has already
+ * exited to ensure correct error handling.
+ */
+TEST_F(process_madvise, exited_process_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /*
+ * Using a pidfd for a process that has already exited should fail
+ * with ESRCH.
+ */
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0)
+ exit(0);
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ /* Wait for the child to ensure it has terminated. */
+ waitpid(self->child_pid, NULL, 0);
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
+ 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, ESRCH);
+}
+
+/*
+ * Test process_madvise() with bad pidfds to ensure correct error
+ * handling.
+ */
+TEST_F(process_madvise, bad_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /* Using an invalid fd number (-1) should fail with EBADF. */
+ ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+
+ /*
+ * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+ * with EBADF.
+ */
+ ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+}
+
+/*
+ * Test that process_madvise() rejects vlen > UIO_MAXIOV.
+ * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
+ */
+TEST_F(process_madvise, invalid_vlen)
+{
+ const unsigned long pagesize = self->page_size;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Test process_madvise() with an invalid flag value. Currently, only a flag
+ * value of 0 is supported. This test is reserved for the future, e.g., if
+ * synchronous flags are added.
+ */
+TEST_F(process_madvise, flag)
+{
+ const unsigned long pagesize = self->page_size;
+ unsigned int invalid_flag;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ invalid_flag = 0x80000000;
+
+ ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
test pagemap_scan IOCTL
- pfnmap
tests for VM_PFNMAP handling
+- process_madv
+ test for process_madv
- cow
test copy-on-write semantics
- thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ]
diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore
index ee93dc4969b8..4931b3b6bbd3 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -2,3 +2,4 @@
sigtrap_threads
remove_on_exec
watermark_signal
+mmap
diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index 70e3ff211278..2e5d85770dfe 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal mmap
include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/mmap.c b/tools/testing/selftests/perf_events/mmap.c
new file mode 100644
index 000000000000..ea0427aac1f9
--- /dev/null
+++ b/tools/testing/selftests/perf_events/mmap.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+
+#include <dirent.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#include <linux/perf_event.h>
+
+#include "../kselftest_harness.h"
+
+#define RB_SIZE 0x3000
+#define AUX_SIZE 0x10000
+#define AUX_OFFS 0x4000
+
+#define HOLE_SIZE 0x1000
+
+/* Reserve space for rb, aux with space for shrink-beyond-vma testing. */
+#define REGION_SIZE (2 * RB_SIZE + 2 * AUX_SIZE)
+#define REGION_AUX_OFFS (2 * RB_SIZE)
+
+#define MAP_BASE 1
+#define MAP_AUX 2
+
+#define EVENT_SRC_DIR "/sys/bus/event_source/devices"
+
+FIXTURE(perf_mmap)
+{
+ int fd;
+ void *ptr;
+ void *region;
+};
+
+FIXTURE_VARIANT(perf_mmap)
+{
+ bool aux;
+ unsigned long ptr_size;
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, rb)
+{
+ .aux = false,
+ .ptr_size = RB_SIZE,
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, aux)
+{
+ .aux = true,
+ .ptr_size = AUX_SIZE,
+};
+
+static bool read_event_type(struct dirent *dent, __u32 *type)
+{
+ char typefn[512];
+ FILE *fp;
+ int res;
+
+ snprintf(typefn, sizeof(typefn), "%s/%s/type", EVENT_SRC_DIR, dent->d_name);
+ fp = fopen(typefn, "r");
+ if (!fp)
+ return false;
+
+ res = fscanf(fp, "%u", type);
+ fclose(fp);
+ return res > 0;
+}
+
+FIXTURE_SETUP(perf_mmap)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(attr),
+ .disabled = 1,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ };
+ struct perf_event_attr attr_ok = {};
+ unsigned int eacces = 0, map = 0;
+ struct perf_event_mmap_page *rb;
+ struct dirent *dent;
+ void *aux, *region;
+ DIR *dir;
+
+ self->ptr = NULL;
+
+ dir = opendir(EVENT_SRC_DIR);
+ if (!dir)
+ SKIP(return, "perf not available.");
+
+ region = mmap(NULL, REGION_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(region, MAP_FAILED);
+ self->region = region;
+
+ // Try to find a suitable event on this system
+ while ((dent = readdir(dir))) {
+ int fd;
+
+ if (!read_event_type(dent, &attr.type))
+ continue;
+
+ fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+ if (fd < 0) {
+ if (errno == EACCES)
+ eacces++;
+ continue;
+ }
+
+ // Check whether the event supports mmap()
+ rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+ if (rb == MAP_FAILED) {
+ close(fd);
+ continue;
+ }
+
+ if (!map) {
+ // Save the event in case that no AUX capable event is found
+ attr_ok = attr;
+ map = MAP_BASE;
+ }
+
+ if (!variant->aux)
+ continue;
+
+ rb->aux_offset = AUX_OFFS;
+ rb->aux_size = AUX_SIZE;
+
+ // Check whether it supports a AUX buffer
+ aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, AUX_OFFS);
+ if (aux == MAP_FAILED) {
+ munmap(rb, RB_SIZE);
+ close(fd);
+ continue;
+ }
+
+ attr_ok = attr;
+ map = MAP_AUX;
+ munmap(aux, AUX_SIZE);
+ munmap(rb, RB_SIZE);
+ close(fd);
+ break;
+ }
+ closedir(dir);
+
+ if (!map) {
+ if (!eacces)
+ SKIP(return, "No mappable perf event found.");
+ else
+ SKIP(return, "No permissions for perf_event_open()");
+ }
+
+ self->fd = syscall(SYS_perf_event_open, &attr_ok, 0, -1, -1, 0);
+ ASSERT_NE(self->fd, -1);
+
+ rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, self->fd, 0);
+ ASSERT_NE(rb, MAP_FAILED);
+
+ if (!variant->aux) {
+ self->ptr = rb;
+ return;
+ }
+
+ if (map != MAP_AUX)
+ SKIP(return, "No AUX event found.");
+
+ rb->aux_offset = AUX_OFFS;
+ rb->aux_size = AUX_SIZE;
+ aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, self->fd, AUX_OFFS);
+ ASSERT_NE(aux, MAP_FAILED);
+ self->ptr = aux;
+}
+
+FIXTURE_TEARDOWN(perf_mmap)
+{
+ ASSERT_EQ(munmap(self->region, REGION_SIZE), 0);
+ if (self->fd != -1)
+ ASSERT_EQ(close(self->fd), 0);
+}
+
+TEST_F(perf_mmap, remap)
+{
+ void *tmp, *ptr = self->ptr;
+ unsigned long size = variant->ptr_size;
+
+ // Test the invalid remaps
+ ASSERT_EQ(mremap(ptr, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+ ASSERT_EQ(mremap(ptr + HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+ ASSERT_EQ(mremap(ptr + size - HOLE_SIZE, HOLE_SIZE, size, MREMAP_MAYMOVE), MAP_FAILED);
+ // Shrink the end of the mapping such that we only unmap past end of the VMA,
+ // which should succeed and poke a hole into the PROT_NONE region
+ ASSERT_NE(mremap(ptr + size - HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+
+ // Remap the whole buffer to a new address
+ tmp = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(tmp, MAP_FAILED);
+
+ // Try splitting offset 1 hole size into VMA, this should fail
+ ASSERT_EQ(mremap(ptr + HOLE_SIZE, size - HOLE_SIZE, size - HOLE_SIZE,
+ MREMAP_MAYMOVE | MREMAP_FIXED, tmp), MAP_FAILED);
+ // Remapping the whole thing should succeed fine
+ ptr = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tmp);
+ ASSERT_EQ(ptr, tmp);
+ ASSERT_EQ(munmap(tmp, size), 0);
+}
+
+TEST_F(perf_mmap, unmap)
+{
+ unsigned long size = variant->ptr_size;
+
+ // Try to poke holes into the mappings
+ ASSERT_NE(munmap(self->ptr, HOLE_SIZE), 0);
+ ASSERT_NE(munmap(self->ptr + HOLE_SIZE, HOLE_SIZE), 0);
+ ASSERT_NE(munmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE), 0);
+}
+
+TEST_F(perf_mmap, map)
+{
+ unsigned long size = variant->ptr_size;
+
+ // Try to poke holes into the mappings by mapping anonymous memory over it
+ ASSERT_EQ(mmap(self->ptr, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+ ASSERT_EQ(mmap(self->ptr + HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+ ASSERT_EQ(mmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index a838c37f93e5..3639aa8dd2b0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
#define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
#define FIRST_USER_ADDRESS 0UL