summaryrefslogtreecommitdiff
path: root/py
diff options
context:
space:
mode:
authorAlessandro Gatti <a.gatti@frob.it>2025-11-10 22:43:44 +0100
committerDamien George <damien@micropython.org>2025-12-19 17:06:53 +1100
commit7a69b2d786f3a5d0ae75a7d1aa89f38be82fcd88 (patch)
tree29541964b321725949836fb027c5531216e088e3 /py
parent1df86516e0b4515c4d9b94811ca7da969de6dbc0 (diff)
py/asmrv32: Use Zcmp opcodes for function prologues and epilogues.
This commit introduces the possibility of using Zcmp opcodes when generating function prologues and epilogues, reducing the generated code size. With the addition of selected Zcmp opcodes, each generated function can be up to 30 bytes shorter and have a faster prologue and epilogue. If Zcmp opcodes can be used, register saving is a matter of a single CM.PUSH opcode rather than a series of C.SWSP opcodes. Likewise, register restoring is a single CM.POPRET opcode instead of a series of C.LWSP opcodes followed by a C.JR RA opcode. This should also lead to faster code, given that there is only one opcode doing the register saving rather than a series of them. For functions that allocate fewer than three locals, the generated code will allocate up to 12 bytes of unused stack space. Whilst this is a relatively rare occurrence for generated native and viper code, inline assembler blocks will probably incur this penalty. Still, considering that at the moment the only targets that support Zcmp opcodes are relatively high-end MCUs (the RP2350 in RV32 mode and the ESP32P4), this is probably not much of an issue. Signed-off-by: Alessandro Gatti <a.gatti@frob.it>
Diffstat (limited to 'py')
-rw-r--r--py/asmrv32.c77
-rw-r--r--py/asmrv32.h20
2 files changed, 86 insertions, 11 deletions
diff --git a/py/asmrv32.c b/py/asmrv32.c
index 1d0cea6c0..8b643af56 100644
--- a/py/asmrv32.c
+++ b/py/asmrv32.c
@@ -53,6 +53,14 @@
((((value) & ~((1U << ((bits) - 1)) - 1)) == 0) || \
(((value) & ~((1U << ((bits) - 1)) - 1)) == ~((1U << ((bits) - 1)) - 1)))
+static bool asm_rv32_allow_zba_opcodes(void) { // True when the allowed-extensions bitmask has RV32_EXT_ZBA set.
+ return asm_rv32_allowed_extensions() & RV32_EXT_ZBA;
+}
+
+static bool asm_rv32_allow_zcmp_opcodes(void) { // True when the allowed-extensions bitmask has RV32_EXT_ZCMP set.
+ return asm_rv32_allowed_extensions() & RV32_EXT_ZCMP;
+}
+
///////////////////////////////////////////////////////////////////////////////
void asm_rv32_emit_word_opcode(asm_rv32_t *state, mp_uint_t word) {
@@ -214,6 +222,14 @@ static void adjust_stack(asm_rv32_t *state, mp_int_t stack_size) {
return;
}
+ // WARNING: If REG_TEMP0 is not set to a caller-saved register, then this
+ // bit has to be rewritten to avoid clobbering the temporary
+ // register when performing the stack adjustment.
+
+ MP_STATIC_ASSERT(((REG_TEMP0 >= ASM_RV32_REG_T0) && (REG_TEMP0 <= ASM_RV32_REG_T2)) || \
+ ((REG_TEMP0 >= ASM_RV32_REG_A0) && (REG_TEMP0 <= ASM_RV32_REG_A7)) || \
+ ((REG_TEMP0 >= ASM_RV32_REG_T3) && (REG_TEMP0 <= ASM_RV32_REG_T6)));
+
// li temporary, stack_size
// c.add sp, temporary
load_full_immediate(state, REG_TEMP0, stack_size);
@@ -245,6 +261,45 @@ static void emit_function_epilogue(asm_rv32_t *state, mp_uint_t registers) {
state->saved_registers_mask = old_saved_registers_mask;
}
+static mp_uint_t compute_zcmp_sequence_length(mp_uint_t registers) { // Converts a saved-registers bitmask into the length of the RA, S0..S11 sequence needed to cover it.
+ // Can only handle RA and S0..S11 and must have at least one entry.
+ assert((registers != 0) && (registers & (~0x0FFC0302U)) == 0 && "Invalid Zcmp registers set.");
+ mp_uint_t length = 32 - mp_clz(((registers & 0x00000002) >> 1) | ((registers & 0x00000300) >> 7) | ((registers & 0x0FFC0000) >> 15)); // Packs bit 1, bits 8-9 and bits 18-27 into bits 0-12; assuming mp_clz counts leading zeros of a 32-bit value, this is the index of the highest packed bit plus one.
+ return length == 12 ? 13 : length; // A length of 12 is never returned. NOTE(review): presumably because Zcmp has no register list ending at S10 - confirm against the spec.
+}
+
+#define EMIT_ASSERT(state, condition, message) assert((((state)->base.pass != MP_ASM_PASS_EMIT) ? true : (condition)) && (message)) // Checks `condition` only during the MP_ASM_PASS_EMIT pass; in every other pass the assertion always holds.
+
+static void emit_compressed_function_prologue(asm_rv32_t *state, mp_uint_t registers_mask) { // Emits a CM.PUSH-based prologue for the registers in registers_mask and records the resulting layout in state->stack_size.
+ mp_uint_t sequence_length = compute_zcmp_sequence_length(registers_mask);
+ mp_uint_t allocated_stack = (sequence_length + 3) & (mp_uint_t)-4; // Sequence length rounded up to a multiple of 4 slots.
+ EMIT_ASSERT(state, allocated_stack >= sequence_length, "Incorrect allocated stack calculation.");
+ mp_uint_t tail_slack = allocated_stack - sequence_length; // Slots in the rounded-up block that no saved register occupies.
+ mp_uint_t locals_left = (state->locals_count < tail_slack) ? 0 : (state->locals_count - tail_slack); // Locals that do not fit in the slack, clamped at zero.
+ mp_uint_t adjustment_chunks = MIN(3, locals_left / 4); // At most three groups of four locals are handed to CM.PUSH as its immediate.
+ EMIT_ASSERT(state, (adjustment_chunks * 4) <= locals_left, "Incorrect adjustment chunks rounding.");
+ locals_left -= adjustment_chunks * 4;
+ EMIT_ASSERT(state, locals_left <= (MP_INT_MAX / sizeof(uint32_t)), "Too many locals.");
+ mp_int_t stack_size = (mp_int_t)(locals_left * sizeof(uint32_t)); // Remaining locals expressed in bytes, one uint32_t each.
+ asm_rv32_opcode_cmpush(state, MIN(3 + sequence_length, 15), adjustment_chunks); // Register-list operand is 3 + sequence length, capped at 15.
+ // CM.PUSH allocates a stack block and then puts the registers *at the end*
+ // of the block, so for example "CM.PUSH {RA, S0-S11}, -64" will put RA at
+ // SP + 60, not at SP + 0.
+ adjust_stack(state, -stack_size); // Whatever CM.PUSH did not cover is allocated with a separate stack adjustment.
+ // The stack size is expressed in bytes and as a multiple of 4, hence the
+ // bottom two bits are not used. Since there can be up to three adjustment
+ // chunks, that number can be expressed in two bits, fitting nicely in the
+ // existing variable.
+ state->stack_size = ((mp_uint_t)stack_size) | adjustment_chunks;
+}
+
+static void emit_compressed_function_epilogue(asm_rv32_t *state, mp_uint_t registers_mask) { // Emits the epilogue matching the compressed prologue: a stack adjustment followed by CM.POPRET.
+ mp_uint_t sequence_length = compute_zcmp_sequence_length(registers_mask);
+ mp_uint_t stack_size = state->stack_size & (mp_uint_t)(~0x03U); // Byte count stored by the prologue; the low two bits hold the adjustment chunks and are masked off here.
+ adjust_stack(state, stack_size); // Applies the positive counterpart of the prologue's separate stack adjustment.
+ asm_rv32_opcode_cmpopret(state, MIN(3 + sequence_length, 15), state->stack_size & 0x03); // Same register-list operand and chunk count that the prologue passed to CM.PUSH.
+}
+
static bool calculate_displacement_for_label(asm_rv32_t *state, mp_uint_t label, ptrdiff_t *displacement) {
assert(displacement != NULL && "Displacement pointer is NULL");
@@ -256,16 +311,24 @@ static bool calculate_displacement_for_label(asm_rv32_t *state, mp_uint_t label,
///////////////////////////////////////////////////////////////////////////////
void asm_rv32_entry(asm_rv32_t *state, mp_uint_t locals) {
+ state->locals_count = locals; // The compressed prologue reads this value to size the stack.
state->saved_registers_mask |= (1U << REG_FUN_TABLE) | (1U << REG_LOCAL_1) | \
(1U << REG_LOCAL_2) | (1U << REG_LOCAL_3);
- state->locals_count = locals;
- emit_function_prologue(state, state->saved_registers_mask);
+ if (asm_rv32_allow_zcmp_opcodes()) { // Zcmp allowed: use the CM.PUSH-based prologue.
+ emit_compressed_function_prologue(state, state->saved_registers_mask);
+ } else { // Otherwise keep the existing prologue.
+ emit_function_prologue(state, state->saved_registers_mask);
+ }
}
void asm_rv32_exit(asm_rv32_t *state) {
- emit_function_epilogue(state, state->saved_registers_mask);
- // c.jr ra
- asm_rv32_opcode_cjr(state, ASM_RV32_REG_RA);
+ if (asm_rv32_allow_zcmp_opcodes()) { // Zcmp allowed: the compressed epilogue ends with CM.POPRET, so no separate jump is emitted here.
+ emit_compressed_function_epilogue(state, state->saved_registers_mask);
+ } else { // Otherwise keep the existing epilogue followed by an explicit return jump.
+ emit_function_epilogue(state, state->saved_registers_mask);
+ // c.jr ra
+ asm_rv32_opcode_cjr(state, ASM_RV32_REG_RA);
+ }
}
void asm_rv32_end_pass(asm_rv32_t *state) {
@@ -557,10 +620,6 @@ void asm_rv32_emit_optimised_xor(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs)
asm_rv32_opcode_xor(state, rd, rd, rs);
}
-static bool asm_rv32_allow_zba_opcodes(void) {
- return asm_rv32_allowed_extensions() & RV32_EXT_ZBA;
-}
-
static void asm_rv32_fix_up_scaled_reg_reg_reg(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t operation_size) {
assert(operation_size <= 2 && "Operation size value out of range.");
diff --git a/py/asmrv32.h b/py/asmrv32.h
index ed1b5a835..c25b1aa4e 100644
--- a/py/asmrv32.h
+++ b/py/asmrv32.h
@@ -197,6 +197,10 @@ void asm_rv32_end_pass(asm_rv32_t *state);
((rs & 0x07) << 7) | ((imm & 0x40) >> 1) | ((imm & 0x38) << 7) | \
((imm & 0x04) << 4))
+#define RV32_ENCODE_TYPE_CMPP(op, ft6, ft2, rlist, imm) \
+ ((op & 0x03) | ((ft6 & 0x3F) << 10) | ((ft2 & 0x03) << 8) | \
+ ((rlist & 0x0F) << 4) | ((imm & 0x03) << 2)) /* Bit layout: ft6 in 15..10, ft2 in 9..8, rlist in 7..4, imm in 3..2, op in 1..0; each field is masked to its width. */
+
#define RV32_ENCODE_TYPE_CR(op, ft4, rs1, rs2) \
((op & 0x03) | ((rs2 & 0x1F) << 2) | ((rs1 & 0x1F) << 7) | ((ft4 & 0x0F) << 12))
@@ -440,6 +444,18 @@ static inline void asm_rv32_opcode_cxor(asm_rv32_t *state, mp_uint_t rd, mp_uint
asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CA(0x01, 0x23, 0x01, rd, rs));
}
+// CM.POPRET {REG_LIST}, IMMEDIATE
+static inline void asm_rv32_opcode_cmpopret(asm_rv32_t *state, mp_uint_t reg_list, mp_uint_t immediate) { // The encoder keeps only the low 4 bits of reg_list and the low 2 bits of immediate.
+ // CMPP: 10111110 .... .. 10
+ asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CMPP(0x02, 0x2F, 0x02, reg_list, immediate));
+}
+
+// CM.PUSH {REG_LIST}, -IMMEDIATE
+static inline void asm_rv32_opcode_cmpush(asm_rv32_t *state, mp_uint_t reg_list, mp_uint_t immediate) { // The encoder keeps only the low 4 bits of reg_list and the low 2 bits of immediate.
+ // CMPP: 10111000 .... .. 10
+ asm_rv32_emit_halfword_opcode(state, RV32_ENCODE_TYPE_CMPP(0x02, 0x2E, 0x00, reg_list, immediate));
+}
+
// CSRRC RD, RS, IMMEDIATE
static inline void asm_rv32_opcode_csrrc(asm_rv32_t *state, mp_uint_t rd, mp_uint_t rs, mp_int_t immediate) {
// I: ............ ..... 011 ..... 1110011
@@ -737,8 +753,8 @@ static inline uint8_t asm_rv32_allowed_extensions(void) {
#define REG_TEMP2 ASM_RV32_REG_T3
#define REG_FUN_TABLE ASM_RV32_REG_S1
#define REG_LOCAL_1 ASM_RV32_REG_S3
-#define REG_LOCAL_2 ASM_RV32_REG_S4
-#define REG_LOCAL_3 ASM_RV32_REG_S5
+#define REG_LOCAL_2 ASM_RV32_REG_S2 // NOTE(review): with REG_FUN_TABLE (S1) and REG_LOCAL_1 (S3) the saved set becomes S1..S4; presumably chosen to keep the Zcmp register list short - confirm.
+#define REG_LOCAL_3 ASM_RV32_REG_S4
#define REG_ZERO ASM_RV32_REG_ZERO
void asm_rv32_meta_comparison_eq(asm_rv32_t *state, mp_uint_t rs1, mp_uint_t rs2, mp_uint_t rd);