Skip to content

Commit fc9c3b8

Browse files
committed
Introduce preliminary macro operation fusion
Through our observations, we have identified recurring patterns in instruction sequences. By converting these specific RISC-V instruction patterns into faster, equivalent code, we can significantly improve execution efficiency. In our current analysis, we focus on commonly used benchmarks and have found the following frequently occurring instruction patterns: auipc + addi, auipc + add, multiple sw, and multiple lw.

| Metric    | commit fba5802             | macro fuse operation       | Speedup |
|-----------|----------------------------|----------------------------|---------|
| CoreMark  | 1351.065 (Iterations/Sec)  | 1352.843 (Iterations/Sec)  | +0.13%  |
| dhrystone | 1073 DMIPS                 | 1146 DMIPS                 | +6.8%   |
| nqueens   | 8295 msec                  | 7824 msec                  | +6.0%   |
1 parent fba5802 commit fc9c3b8

File tree

3 files changed

+147
-3
lines changed

3 files changed

+147
-3
lines changed

src/decode.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,14 @@
156156
_(cjalr, 1) \
157157
_(cadd, 0) \
158158
_(cswsp, 0) \
159-
)
159+
) \
160+
/* macro operation fusion: convert specific RISC-V instruction patterns
161+
* into faster and equivalent code
162+
*/ \
163+
_(fuse1, 0) \
164+
_(fuse2, 0) \
165+
_(fuse3, 0) \
166+
_(fuse4, 0)
160167
/* clang-format on */
161168

162169
/* IR list */
@@ -228,6 +235,11 @@ enum {
228235
INSN_32 = 4,
229236
};
230237

238+
/* Operand record for one instruction that has been folded into a fused
 * load/store sequence (rv_insn_fuse3 / rv_insn_fuse4): the instruction's
 * immediate plus its rd, rs1 and rs2 register indices. The fused handlers
 * index an array of these records through rv_insn_t::fuse.
 *
 * NOTE(review): if any code fills these records by copying raw bytes out of
 * an rv_insn_t, the field order here has to mirror the leading fields of
 * rv_insn_t -- confirm before reordering either struct.
 */
typedef struct {
239+
int32_t imm;
240+
uint8_t rd, rs1, rs2;
241+
} opcode_fuse_t;
242+
231243
typedef struct rv_insn {
232244
union {
233245
int32_t imm;
@@ -240,6 +252,9 @@ typedef struct rv_insn {
240252
#if RV32_HAS(EXT_C)
241253
uint8_t shamt;
242254
#endif
255+
/* fuse operation */
256+
int16_t imm2;
257+
opcode_fuse_t *fuse;
243258

244259
/* instruction length */
245260
uint8_t insn_len;

src/emulate.c

Lines changed: 129 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ extern struct target_ops gdbstub_ops;
3131
#include "decode.h"
3232
#include "riscv.h"
3333
#include "riscv_private.h"
34+
#include "state.h"
3435
#include "utils.h"
3536

3637
/* RISC-V exception code list */
@@ -310,7 +311,15 @@ static uint32_t last_pc = 0;
310311
/* RV32I Base Instruction Set */
311312

312313
/* Internal */
313-
RVOP(nop, {/* no operation */});
314+
/* Handler for the internal nop IR, which stands in for instructions that were
 * absorbed into a fused operation. It performs only the per-instruction
 * bookkeeping (x0 reset, cycle counter, PC advance) and then tail-calls the
 * handler of the IR that follows.
 *
 * The hand-off to ir[1] is unconditional, so a following IR must exist
 * whenever this handler runs.
 */
static bool do_nop(riscv_t *rv, const rv_insn_t *ir)
{
    rv->csr_cycle++;
    rv->PC += ir->insn_len;
    rv->X[rv_reg_zero] = 0;
    MUST_TAIL return ir[1].impl(rv, &ir[1]);
}
322+
314323

315324
/* LUI is used to build 32-bit constants and uses the U-type format. LUI
316325
* places the U-immediate value in the top 20 bits of the destination
@@ -1219,6 +1228,46 @@ RVOP(cswsp, {
12191228
})
12201229
#endif
12211230

1231+
/* Fused AUIPC + ADDI: writes PC + ir->imm + ir->imm2 to X[ir->rd] in one
 * step. ir->imm is the AUIPC immediate; ir->imm2 holds the ADDI immediate and
 * ir->rd the ADDI destination, both filled in by match_pattern() when it
 * rewrites the pair. Only X[ir->rd] is written by this handler.
 */
1232+
RVOP(fuse1, { rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2); })
1233+
1234+
/* Fused AUIPC + ADD: writes X[ir->rs1] + (PC + ir->imm) to X[ir->rd] in one
 * step. ir->imm is the AUIPC immediate; ir->rd and ir->rs1 are the ADD
 * destination and the ADD operand that is not the AUIPC result, both filled
 * in by match_pattern(). Only X[ir->rd] is written by this handler.
 */
1235+
RVOP(fuse2, {
1236+
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->PC + ir->imm);
1237+
})
1238+
1239+
/* multiple sw */
1240+
RVOP(fuse3, {
1241+
opcode_fuse_t *fuse = ir->fuse;
1242+
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
1243+
/* the memory addresses of the sw instructions are contiguous, so we only
1244+
* need to check the first sw instruction to determine if its memory address
1245+
* is misaligned or if the memory chunk does not exist.
1246+
*/
1247+
RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
1248+
rv->io.mem_write_w(rv, addr, rv->X[fuse[0].rs2]);
1249+
for (int i = 1; i < ir->imm2; i++) {
1250+
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
1251+
rv->io.mem_write_w(rv, addr, rv->X[fuse[i].rs2]);
1252+
}
1253+
})
1254+
1255+
/* multiple lw */
1256+
RVOP(fuse4, {
1257+
opcode_fuse_t *fuse = ir->fuse;
1258+
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
1259+
/* the memory addresses of the lw instructions are contiguous, so we only
1260+
* need to check the first lw instruction to determine if its memory address
1261+
* is misaligned or if the memory chunk does not exist.
1262+
*/
1263+
RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
1264+
rv->X[fuse[0].rd] = rv->io.mem_read_w(rv, addr);
1265+
for (int i = 1; i < ir->imm2; i++) {
1266+
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
1267+
rv->X[fuse[i].rd] = rv->io.mem_read_w(rv, addr);
1268+
}
1269+
})
1270+
12221271
static const void *dispatch_table[] = {
12231272
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
12241273
RISCV_INSN_LIST
@@ -1337,7 +1386,6 @@ static void block_translate(riscv_t *rv, block_t *block)
13371386
/* compute the end of pc */
13381387
block->pc_end += ir->insn_len;
13391388
block->n_insn++;
1340-
13411389
/* stop on branch */
13421390
if (insn_is_branch(ir->opcode)) {
13431391
/* recursive jump translation */
@@ -1356,6 +1404,82 @@ static void block_translate(riscv_t *rv, block_t *block)
13561404
block->ir[block->n_insn - 1].tailcall = true;
13571405
}
13581406

1407+
/* Try to fuse a run of consecutive word loads (RW = 1) or word stores
 * (RW = 0) starting at `ir` into a single fuse4 / fuse3 IR.
 *
 * Must be expanded directly inside a `switch` case of a loop that provides
 * the locals `block`, `i`, `ir`, `next_ir`, `count` and `sign`; the expansion
 * always ends the case with `break`.
 *
 * A run consists of instructions that
 *   - all have the same opcode and the same base register (rs1), and
 *   - have immediates that step by exactly 4 bytes from one instruction to
 *     the next, all in the same direction (ascending or descending).
 * For loads, the run also stops right after a load that overwrites the base
 * register, because the following addresses would no longer be contiguous
 * with the first one (the fused handler only checks the first address).
 * The last IR of the block is never absorbed into a run.
 *
 * On a match (count > 1) the operands of every instruction in the run are
 * copied into a heap array stored in ir->fuse (released by the block owner),
 * ir->imm2 receives the run length, `ir` takes the fused opcode/handler and
 * the other instructions of the run become nops. If the allocation fails the
 * instructions are left unfused.
 */
#define COMBINE_MEM_OPS(RW)                                                  \
    count = 1;                                                               \
    next_ir = ir + 1;                                                        \
    if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw))                  \
        break;                                                               \
    /* -1: immediates ascend, +1: immediates descend (or are equal) */       \
    sign = (ir->imm - next_ir->imm) < 0 ? -1 : 1;                            \
    for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {                   \
        const rv_insn_t *prev_ir = ir + j - 1;                               \
        next_ir = ir + j;                                                    \
        /* compare with the previous member so runs can exceed two */        \
        if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) ||            \
            ir->rs1 != next_ir->rs1 ||                                       \
            prev_ir->imm - next_ir->imm != 4 * sign)                         \
            break;                                                           \
        /* a load that clobbered the base register ends the run */           \
        if ((RW) && prev_ir->rd == ir->rs1)                                  \
            break;                                                           \
        count++;                                                             \
    }                                                                        \
    if (count > 1) {                                                         \
        opcode_fuse_t *fused = malloc(count * sizeof(*fused));               \
        if (!fused)                                                          \
            break;                                                           \
        for (int j = 0; j < count; j++) {                                    \
            next_ir = ir + j;                                                \
            /* field-wise copy: independent of rv_insn_t's layout */         \
            fused[j].imm = next_ir->imm;                                     \
            fused[j].rd = next_ir->rd;                                       \
            fused[j].rs1 = next_ir->rs1;                                     \
            fused[j].rs2 = next_ir->rs2;                                     \
            if (j == 0)                                                      \
                continue;                                                    \
            next_ir->opcode = rv_insn_nop;                                   \
            next_ir->impl = dispatch_table[next_ir->opcode];                 \
        }                                                                    \
        ir->opcode = IIF(RW)(rv_insn_fuse4, rv_insn_fuse3);                  \
        ir->fuse = fused;                                                    \
        ir->imm2 = count;                                                    \
        ir->impl = dispatch_table[ir->opcode];                               \
    }                                                                        \
    break;
1434+
1435+
1436+
/* examine whether instructions in a block match a specific pattern. If so,
1437+
* rewrite them into fused instructions.
1438+
*
1439+
* We plan to devise strategies to increase the number of instructions that
1440+
* match the pattern, such as reordering the instructions.
1441+
*/
1442+
static void match_pattern(block_t *block)
1443+
{
1444+
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
1445+
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
1446+
int32_t count = 0, sign = 1;
1447+
switch (ir->opcode) {
1448+
case rv_insn_auipc:
1449+
next_ir = ir + 1;
1450+
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
1451+
/* the destination register of instruction auipc is equal to the
1452+
* source register 1 of next instruction addi */
1453+
ir->opcode = rv_insn_fuse1;
1454+
ir->rd = next_ir->rd;
1455+
ir->imm2 = next_ir->imm;
1456+
ir->impl = dispatch_table[ir->opcode];
1457+
next_ir->opcode = rv_insn_nop;
1458+
next_ir->impl = dispatch_table[next_ir->opcode];
1459+
} else if (next_ir->opcode == rv_insn_add &&
1460+
ir->rd == next_ir->rs2) {
1461+
/* the destination register of instruction auipc is equal to the
1462+
* source register 2 of next instruction add */
1463+
ir->opcode = rv_insn_fuse2;
1464+
ir->rd = next_ir->rd;
1465+
ir->rs1 = next_ir->rs1;
1466+
ir->impl = dispatch_table[ir->opcode];
1467+
next_ir->opcode = rv_insn_nop;
1468+
next_ir->impl = dispatch_table[next_ir->opcode];
1469+
}
1470+
break;
1471+
/* If the memory addresses of a sequence of store or load instructions
1472+
* are contiguous, combine these instructions.
1473+
*/
1474+
case rv_insn_sw:
1475+
COMBINE_MEM_OPS(0);
1476+
case rv_insn_lw:
1477+
COMBINE_MEM_OPS(1);
1478+
/* FIXME: lui + addi*/
1479+
}
1480+
}
1481+
}
1482+
13591483
static block_t *prev = NULL;
13601484
static block_t *block_find_or_translate(riscv_t *rv)
13611485
{
@@ -1375,6 +1499,9 @@ static block_t *block_find_or_translate(riscv_t *rv)
13751499
/* translate the basic block */
13761500
block_translate(rv, next);
13771501

1502+
/* macro operation fusion */
1503+
match_pattern(next);
1504+
13781505
/* insert the block into block map */
13791506
block_insert(&rv->block_map, next);
13801507

src/riscv.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ void block_map_clear(block_map_t *map)
2525
block_t *block = map->map[i];
2626
if (!block)
2727
continue;
28+
for (uint32_t i = 0; i < block->n_insn; i++)
29+
free(block->ir[i].fuse);
2830
free(block->ir);
2931
free(block);
3032
map->map[i] = NULL;

0 commit comments

Comments
 (0)