diff options
-rw-r--r-- | ports/stm32/dma.c | 53 | ||||
-rw-r--r-- | ports/stm32/dma.h | 38 | ||||
-rw-r--r-- | ports/stm32/mpu.h | 16 | ||||
-rw-r--r-- | ports/stm32/spi.c | 8 | ||||
-rw-r--r-- | tests/ports/stm32_hardware/dma_alignment.py | 43 | ||||
-rw-r--r-- | tests/ports/stm32_hardware/dma_alignment.py.exp | 2 |
6 files changed, 158 insertions, 2 deletions
diff --git a/ports/stm32/dma.c b/ports/stm32/dma.c index af56d8123..df8a408cb 100644 --- a/ports/stm32/dma.c +++ b/ports/stm32/dma.c @@ -33,6 +33,7 @@ #include "systick.h" #include "dma.h" #include "irq.h" +#include "mpu.h" // When this option is enabled, the DMA will turn off automatically after // a period of inactivity. @@ -1852,3 +1853,55 @@ void dma_external_acquire(uint32_t controller, uint32_t stream) { void dma_external_release(uint32_t controller, uint32_t stream) { dma_disable_clock(DMA_ID_FROM_CONTROLLER_STREAM(controller, stream)); } + +#if __DCACHE_PRESENT + +void dma_protect_rx_region(void *dest, size_t len) { + #if __DCACHE_PRESENT + uint32_t start_addr = (uint32_t)dest; + uint32_t start_aligned = start_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U); + uint32_t end_addr = start_addr + len - 1; // Address of last byte in the buffer + uint32_t end_aligned = end_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U); + + uint32_t irq_state = mpu_config_start(); + + // Clean (write back) any cached memory in this region, so there's no dirty + // cache entries that might be written back later after DMA RX is done. + MP_HAL_CLEAN_DCACHE(dest, len); + + // The way we protect the whole region is to mark the first and last cache + // line as UNCACHED using the MPU. This means any unrelated reads/writes in + // these cache lines will bypass the cache, and can coexist with DMA also + // writing to parts of these cache lines. + // + // This is redundant sometimes (because the DMA region fills the entire cache line, or because + // the region fits in a single cache line.) However, the implementation is only 3 register writes so + // it's more efficient to call it every time. + mpu_config_region(MPU_REGION_DMA_UNCACHED_1, start_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B)); + mpu_config_region(MPU_REGION_DMA_UNCACHED_2, end_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B)); + + mpu_config_end(irq_state); + #endif +} + +void dma_unprotect_rx_region(void *dest, size_t len) { + #if __DCACHE_PRESENT + uint32_t irq_state = mpu_config_start(); + + // Disabling these regions removes them from the MPU + mpu_config_region(MPU_REGION_DMA_UNCACHED_1, 0, MPU_CONFIG_DISABLE); + mpu_config_region(MPU_REGION_DMA_UNCACHED_2, 0, MPU_CONFIG_DISABLE); + + // Invalidate the whole region in the cache. This may seem redundant, but it + // is possible that during the DMA operation the CPU read inside this region + // (excluding the first & last cache lines), and cache lines were filled. + // + // (This can happen in SPI if src==dest, for example, possibly due to speculative + // cache line fills.) + MP_HAL_CLEANINVALIDATE_DCACHE(dest, len); + + mpu_config_end(irq_state); + #endif +} + +#endif diff --git a/ports/stm32/dma.h b/ports/stm32/dma.h index fa3238413..7ba04baf5 100644 --- a/ports/stm32/dma.h +++ b/ports/stm32/dma.h @@ -26,6 +26,8 @@ #ifndef MICROPY_INCLUDED_STM32_DMA_H #define MICROPY_INCLUDED_STM32_DMA_H +#include "py/mpconfig.h" + typedef struct _dma_descr_t dma_descr_t; #if defined(STM32H5) @@ -164,4 +166,40 @@ void dma_nohal_start(const dma_descr_t *descr, uint32_t src_addr, uint32_t dst_a void dma_external_acquire(uint32_t controller, uint32_t stream); void dma_external_release(uint32_t controller, uint32_t stream); +#if __DCACHE_PRESENT +// On chips with D-Cache, it's necessary to call this function before using DMA to read +// into a user-supplied buffer. It does two things: +// +// 1) Cleans this region in the cache. +// +// 2) Temporarily disables caching for the first and last cache lines of the +// buffer. This prevents cache coherency issues if the start or end of the +// buffer don't align to a cache line. (For example, this can happen if the CPU +// accesses unrelated memory adjacent to the buffer and in the same cache +// line.) +// +// Only one region can be protected in this way at a time. +void dma_protect_rx_region(void *dest, size_t len); + +// Release the region protected by dma_protect_rx_region(). +// +// Call this after the DMA operation completes, before the CPU reads any of the +// data that was written. +// +// As well as restoring data caching, this function invalidates D-cache for the +// entire specified region. +void dma_unprotect_rx_region(void *dest, size_t len); + +#else + +inline static void dma_protect_rx_region(uint8_t *dest, size_t len) { + // No-ops on targets without D-Cache. +} + +inline static void dma_unprotect_rx_region(void *dest, size_t len) { + +} + +#endif // __DCACHE_PRESENT + #endif // MICROPY_INCLUDED_STM32_DMA_H diff --git a/ports/stm32/mpu.h b/ports/stm32/mpu.h index 62a428198..64880a85d 100644 --- a/ports/stm32/mpu.h +++ b/ports/stm32/mpu.h @@ -37,6 +37,10 @@ #define MPU_REGION_SDRAM1 (MPU_REGION_NUMBER4) #define MPU_REGION_SDRAM2 (MPU_REGION_NUMBER5) +// Only relevant on CPUs with D-Cache, must be higher priority than SDRAM +#define MPU_REGION_DMA_UNCACHED_1 (MPU_REGION_NUMBER6) +#define MPU_REGION_DMA_UNCACHED_2 (MPU_REGION_NUMBER7) + // Attribute value to disable a region entirely, remove it from the MPU // (i.e. the MPU_REGION_ENABLE bit is unset.) #define MPU_CONFIG_DISABLE 0 @@ -78,6 +82,18 @@ | MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \ ) +#define MPU_CONFIG_UNCACHED(size) ( \ + MPU_INSTRUCTION_ACCESS_DISABLE << MPU_RASR_XN_Pos \ + | MPU_REGION_FULL_ACCESS << MPU_RASR_AP_Pos \ + | MPU_TEX_LEVEL1 << MPU_RASR_TEX_Pos \ + | MPU_ACCESS_NOT_SHAREABLE << MPU_RASR_S_Pos \ + | MPU_ACCESS_NOT_CACHEABLE << MPU_RASR_C_Pos \ + | MPU_ACCESS_NOT_BUFFERABLE << MPU_RASR_B_Pos \ + | 0x00 << MPU_RASR_SRD_Pos \ + | (size) << MPU_RASR_SIZE_Pos \ + | MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \ + ) + static inline void mpu_init(void) { MPU->CTRL = MPU_PRIVILEGED_DEFAULT | MPU_CTRL_ENABLE_Msk; SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk; diff --git a/ports/stm32/spi.c b/ports/stm32/spi.c index 5aace8e9a..607b3fe68 100644 --- a/ports/stm32/spi.c +++ b/ports/stm32/spi.c @@ -571,6 +571,8 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de // Note: DMA transfers are limited to 65535 bytes at a time. HAL_StatusTypeDef status; + void *odest = dest; // Original values of dest & len + size_t olen = len; if (dest == NULL) { // send only @@ -613,7 +615,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de } dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi); self->spi->hdmarx = &rx_dma; - MP_HAL_CLEANINVALIDATE_DCACHE(dest, len); + dma_protect_rx_region(dest, len); uint32_t t_start = HAL_GetTick(); do { uint32_t l = MIN(len, 65535); @@ -632,6 +634,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de dma_deinit(self->tx_dma_descr); } dma_deinit(self->rx_dma_descr); + dma_unprotect_rx_region(odest, olen); } } else { // send and receive @@ -644,7 +647,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi); self->spi->hdmarx = &rx_dma; MP_HAL_CLEAN_DCACHE(src, len); - MP_HAL_CLEANINVALIDATE_DCACHE(dest, len); + dma_protect_rx_region(dest, len); uint32_t t_start = HAL_GetTick(); do { uint32_t l = MIN(len, 65535); @@ -662,6 +665,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de } while (len); dma_deinit(self->tx_dma_descr); dma_deinit(self->rx_dma_descr); + dma_unprotect_rx_region(odest, olen); } } diff --git a/tests/ports/stm32_hardware/dma_alignment.py b/tests/ports/stm32_hardware/dma_alignment.py new file mode 100644 index 000000000..1836b25d8 --- /dev/null +++ b/tests/ports/stm32_hardware/dma_alignment.py @@ -0,0 +1,43 @@ +from machine import SPI +# Regression test for DMA for DCache coherency bugs with cache line +# written originally for https://github.com/micropython/micropython/issues/13471 + +# IMPORTANT: This test requires SPI2 MISO (pin Y8 on Pyboard D) to be connected to GND + +SPI_NUM = 2 + +spi = SPI(SPI_NUM, baudrate=5_000_000) +buf = bytearray(1024) +ok = True + +for offs in range(0, len(buf)): + v = memoryview(buf)[offs : offs + 128] + spi.readinto(v, 0xFF) + if not all(b == 0x00 for b in v): + print(offs, v.hex()) + ok = False + +print("Variable offset fixed length " + ("OK" if ok else "FAIL")) + +# this takes around 30s to run, so skipped if already failing +if ok: + for op_len in range(1, 66): + wr = b"\xFF" * op_len + for offs in range(1, len(buf) - op_len - 1): + # Place some "sentinel" values before and after the DMA buffer + before = offs & 0xFF + after = (~offs) & 0xFF + buf[offs - 1] = before + buf[offs + op_len] = after + v = memoryview(buf)[offs : offs + op_len] + spi.write_readinto(wr, v) + if ( + not all(b == 0x00 for b in v) + or buf[offs - 1] != before + or buf[offs + op_len] != after + ): + print(v.hex()) + print(hex(op_len), hex(offs), hex(buf[offs - 1]), hex(buf[offs + op_len])) + ok = False + + print("Variable offset and lengths " + ("OK" if ok else "FAIL")) diff --git a/tests/ports/stm32_hardware/dma_alignment.py.exp b/tests/ports/stm32_hardware/dma_alignment.py.exp new file mode 100644 index 000000000..e890e0081 --- /dev/null +++ b/tests/ports/stm32_hardware/dma_alignment.py.exp @@ -0,0 +1,2 @@ +Variable offset fixed length OK +Variable offset and lengths OK |