Skip to content

Commit 3ae3059

Browse files
committed
Add fused instructions
To improve execution efficiency, instruction sequences that match specific patterns are combined into single fused instructions. Four fused instructions are currently implemented: auipc + addi, auipc + add, a run of five or more consecutive sw, and a run of five or more consecutive lw.
1 parent 1c11b39 commit 3ae3059

File tree

2 files changed

+157
-13
lines changed

2 files changed

+157
-13
lines changed

src/decode.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,12 @@
156156
_(cjalr, 1) \
157157
_(cadd, 0) \
158158
_(cswsp, 0) \
159-
)
159+
) \
160+
_(fuse1, 0) \
161+
_(fuse2, 0) \
162+
_(fuse3, 0) \
163+
_(fuse4, 0) \
164+
_(empty, 0)
160165
/* clang-format on */
161166

162167
/* IR list */
@@ -228,6 +233,11 @@ enum {
228233
INSN_32 = 4,
229234
};
230235

236+
/* Operands of one load/store that has been folded into a fused run of
 * lw/sw instructions. The values are copied out of the original
 * instruction when the run is fused.
 */
typedef struct mem_fuse {
    int32_t imm; /* offset added to X[rs1] to form the access address */
    uint8_t rd;  /* register written by the fused-load handler */
    uint8_t rs1; /* register holding the base address */
    uint8_t rs2; /* register whose value the fused-store handler writes */
} mem_fuse_t;
240+
231241
typedef struct rv_insn {
232242
union {
233243
int32_t imm;
@@ -240,6 +250,9 @@ typedef struct rv_insn {
240250
#if RV32_HAS(EXT_C)
241251
uint8_t shamt;
242252
#endif
253+
/* fuse operation */
254+
int32_t imm2;
255+
mem_fuse_t *mem_fuse;
243256

244257
/* instruction length */
245258
uint8_t insn_len;

src/emulate.c

Lines changed: 143 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -287,18 +287,18 @@ enum {
287287
#define RVOP_RUN_NEXT (!ir->tailcall)
288288
#endif
289289

290-
#define RVOP(inst, code) \
291-
static bool do_##inst(riscv_t *rv UNUSED, const rv_insn_t *ir UNUSED) \
292-
{ \
293-
rv->X[rv_reg_zero] = 0; \
294-
code; \
295-
rv->csr_cycle++; \
296-
nextop: \
297-
rv->PC += ir->insn_len; \
298-
if (!RVOP_RUN_NEXT) \
299-
return true; \
300-
const rv_insn_t *next = ir + 1; \
301-
MUST_TAIL return next->impl(rv, next); \
290+
/* Define the handler do_<inst> for one instruction.
 *
 * The generated function clears x0, counts one cycle, runs `code`, and then
 * advances PC by the length of the instruction. While RVOP_RUN_NEXT holds it
 * tail-calls the handler of the next IR entry (ir + 1); otherwise it returns
 * true.
 *
 * NOTE(review): the `nextop` label sits right before the PC update and is
 * presumably a jump target for handler bodies; confirm against the handlers.
 */
#define RVOP(inst, code)                                       \
    static bool do_##inst(riscv_t *rv, const rv_insn_t *ir)    \
    {                                                          \
        rv->X[rv_reg_zero] = 0;                                \
        rv->csr_cycle++;                                       \
        code;                                                  \
    nextop:                                                    \
        rv->PC += ir->insn_len;                                \
        if (RVOP_RUN_NEXT) {                                   \
            const rv_insn_t *following = ir + 1;               \
            MUST_TAIL return following->impl(rv, following);   \
        }                                                      \
        return true;                                           \
    }
303303

304304
/* RV32I Base Instruction Set */
@@ -1277,6 +1277,48 @@ RVOP(cswsp, {
12771277
})
12781278
#endif
12791279

1280+
/* auipc + addi */
1281+
RVOP(fuse1, {
1282+
rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2);
1283+
rv->PC += ir->insn_len;
1284+
})
1285+
1286+
/* auipc + add */
1287+
RVOP(fuse2, {
1288+
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->PC + ir->imm);
1289+
rv->PC += ir->insn_len;
1290+
})
1291+
1292+
/* multiple sw */
1293+
RVOP(fuse3, {
1294+
mem_fuse_t *mem_fuse = ir->mem_fuse;
1295+
for (int i = 0; i < ir->imm2; i++) {
1296+
const uint32_t addr = rv->X[mem_fuse[i].rs1] + mem_fuse[i].imm;
1297+
RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
1298+
rv->io.mem_write_w(rv, addr, rv->X[mem_fuse[i].rs2]);
1299+
}
1300+
rv->PC += ir->insn_len * (ir->imm2 - 1);
1301+
})
1302+
1303+
/* multiple lw */
1304+
RVOP(fuse4, {
1305+
mem_fuse_t *mem_fuse = ir->mem_fuse;
1306+
for (int i = 0; i < ir->imm2; i++) {
1307+
const uint32_t addr = rv->X[mem_fuse[i].rs1] + mem_fuse[i].imm;
1308+
RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
1309+
rv->X[mem_fuse[i].rd] = rv->io.mem_read_w(rv, addr);
1310+
}
1311+
rv->PC += ir->insn_len * (ir->imm2 - 1);
1312+
})
1313+
1314+
/* Handler for an IR slot whose work was absorbed by a preceding fused
 * instruction.
 *
 * It clears x0 and counts one cycle, but leaves PC untouched: the fused
 * handlers already advance PC past every slot they absorbed.
 *
 * Returns true when execution of the block stops at this slot; otherwise
 * returns whatever the next handler returns.
 */
static bool do_empty(riscv_t *rv, const rv_insn_t *ir)
{
    rv->X[rv_reg_zero] = 0;
    rv->csr_cycle++;

    /* Apply the same stop condition as the RVOP-generated handlers. Without
     * it, an empty slot in the last position of a block would chain to
     * ir + 1, which lies beyond the block's IR array.
     */
    if (!RVOP_RUN_NEXT)
        return true;

    const rv_insn_t *next = ir + 1;
    MUST_TAIL return next->impl(rv, next);
}
1321+
12801322
static const void *dispatch_table[] = {
12811323
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
12821324
RISCV_INSN_LIST
@@ -1407,6 +1449,92 @@ static void extend_block(riscv_t *rv, block_t *block)
14071449
last_ir->branch_untaken = next->ir;
14081450
}
14091451

1452+
static void match_pattern(block_t *block)
1453+
{
1454+
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
1455+
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
1456+
int32_t count = 0;
1457+
switch (ir->opcode) {
1458+
case rv_insn_auipc:
1459+
next_ir = ir + 1;
1460+
if (next_ir->opcode == rv_insn_addi) {
1461+
if (ir->rd == next_ir->rs1) {
1462+
ir->opcode = rv_insn_fuse1;
1463+
ir->rd = next_ir->rd;
1464+
ir->imm2 = next_ir->imm;
1465+
ir->impl = dispatch_table[ir->opcode];
1466+
next_ir->opcode = rv_insn_empty;
1467+
next_ir->impl = dispatch_table[next_ir->opcode];
1468+
} else if (ir->rd == next_ir->rs2) {
1469+
ir->opcode = rv_insn_fuse2;
1470+
ir->rd = next_ir->rd;
1471+
ir->rs1 = next_ir->rs1;
1472+
ir->impl = dispatch_table[ir->opcode];
1473+
next_ir->opcode = rv_insn_empty;
1474+
next_ir->impl = dispatch_table[next_ir->opcode];
1475+
}
1476+
}
1477+
break;
1478+
case rv_insn_sw:
1479+
count = 1;
1480+
for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {
1481+
next_ir = ir + j;
1482+
if (next_ir->opcode != rv_insn_sw)
1483+
break;
1484+
count++;
1485+
}
1486+
if (count >= 5) {
1487+
ir->opcode = rv_insn_fuse3;
1488+
ir->mem_fuse = malloc(count * sizeof(mem_fuse_t));
1489+
ir->imm2 = count;
1490+
ir->mem_fuse[0].imm = ir->imm;
1491+
ir->mem_fuse[0].rd = ir->rd;
1492+
ir->mem_fuse[0].rs1 = ir->rs1;
1493+
ir->mem_fuse[0].rs2 = ir->rs2;
1494+
ir->impl = dispatch_table[ir->opcode];
1495+
for (int j = 1; j < count; j++) {
1496+
next_ir = ir + j;
1497+
ir->mem_fuse[j].imm = next_ir->imm;
1498+
ir->mem_fuse[j].rd = next_ir->rd;
1499+
ir->mem_fuse[j].rs1 = next_ir->rs1;
1500+
ir->mem_fuse[j].rs2 = next_ir->rs2;
1501+
next_ir->opcode = rv_insn_empty;
1502+
next_ir->impl = dispatch_table[next_ir->opcode];
1503+
}
1504+
}
1505+
break;
1506+
case rv_insn_lw:
1507+
count = 1;
1508+
for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {
1509+
next_ir = ir + j;
1510+
if (next_ir->opcode != rv_insn_lw)
1511+
break;
1512+
count++;
1513+
}
1514+
if (count >= 5) {
1515+
ir->opcode = rv_insn_fuse4;
1516+
ir->mem_fuse = malloc(count * sizeof(mem_fuse_t));
1517+
ir->imm2 = count;
1518+
ir->mem_fuse[0].imm = ir->imm;
1519+
ir->mem_fuse[0].rd = ir->rd;
1520+
ir->mem_fuse[0].rs1 = ir->rs1;
1521+
ir->mem_fuse[0].rs2 = ir->rs2;
1522+
ir->impl = dispatch_table[ir->opcode];
1523+
for (int j = 1; j < count; j++) {
1524+
next_ir = ir + j;
1525+
ir->mem_fuse[j].imm = next_ir->imm;
1526+
ir->mem_fuse[j].rd = next_ir->rd;
1527+
ir->mem_fuse[j].rs1 = next_ir->rs1;
1528+
ir->mem_fuse[j].rs2 = next_ir->rs2;
1529+
next_ir->opcode = rv_insn_empty;
1530+
next_ir->impl = dispatch_table[next_ir->opcode];
1531+
}
1532+
}
1533+
break;
1534+
}
1535+
}
1536+
}
1537+
14101538
static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
14111539
{
14121540
block_map_t *map = &rv->block_map;
@@ -1425,6 +1553,9 @@ static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
14251553
/* translate the basic block */
14261554
block_translate(rv, next);
14271555

1556+
/* fuse instruction */
1557+
match_pattern(next);
1558+
14281559
/* insert the block into block map */
14291560
block_insert(&rv->block_map, next);
14301561

0 commit comments

Comments
 (0)