Skip to content

Commit f064d63

Browse files
author
Yen-Fu Chen
committed
Refine instruction fusion and add new one
1. Refine the original fused instructions by skipping the nop instruction and correctly updating the register values. 2. Add a new fused instruction: lui + addi. The Dhrystone benchmark gains about 3% in performance based on this modification.
1 parent 3baf584 commit f064d63

File tree

2 files changed

+90
-29
lines changed

2 files changed

+90
-29
lines changed

src/decode.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,8 @@
163163
_(fuse1, 0) \
164164
_(fuse2, 0) \
165165
_(fuse3, 0) \
166-
_(fuse4, 0)
166+
_(fuse4, 0) \
167+
_(fuse5, 0)
167168
/* clang-format on */
168169

169170
/* IR list */
@@ -253,7 +254,7 @@ typedef struct rv_insn {
253254
uint8_t shamt;
254255
#endif
255256
/* fuse operation */
256-
int16_t imm2;
257+
int32_t imm2;
257258
opcode_fuse_t *fuse;
258259

259260
/* instruction length */

src/emulate.c

Lines changed: 87 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -312,15 +312,7 @@ static uint32_t last_pc = 0;
312312

313313
/* RV32I Base Instruction Set */
314314

315-
/* Internal */
316-
static bool do_nop(riscv_t *rv, const rv_insn_t *ir)
317-
{
318-
rv->X[rv_reg_zero] = 0;
319-
rv->csr_cycle++;
320-
rv->PC += ir->insn_len;
321-
const rv_insn_t *next = ir + 1;
322-
MUST_TAIL return next->impl(rv, next);
323-
}
315+
RVOP(nop, {/* no operation */})
324316

325317
/* LUI is used to build 32-bit constants and uses the U-type format. LUI
326318
* places the U-immediate value in the top 20 bits of the destination
@@ -1251,15 +1243,38 @@ RVOP(cswsp, {
12511243
#endif
12521244

12531245
/* auipc + addi */
1254-
RVOP(fuse1, { rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2); })
1246+
static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir)
1247+
{
1248+
rv->X[rv_reg_zero] = 0;
1249+
rv->csr_cycle += 2;
1250+
rv->X[ir->rd] = rv->PC + ir->imm;
1251+
rv->X[ir->rs1] = rv->X[ir->rd] + ir->imm2;
1252+
rv->PC += 2 * ir->insn_len;
1253+
if (unlikely(RVOP_NO_NEXT(ir)))
1254+
return true;
1255+
const rv_insn_t *next = ir + 2;
1256+
MUST_TAIL return next->impl(rv, next);
1257+
}
12551258

12561259
/* auipc + add */
1257-
RVOP(fuse2, {
1258-
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->PC + ir->imm);
1259-
})
1260+
static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir)
1261+
{
1262+
rv->X[rv_reg_zero] = 0;
1263+
rv->csr_cycle += 2;
1264+
rv->X[ir->rd] = rv->PC + ir->imm;
1265+
rv->X[ir->rs2] = rv->X[ir->rd] + rv->X[ir->rs1];
1266+
rv->PC += 2 * ir->insn_len;
1267+
if (unlikely(RVOP_NO_NEXT(ir)))
1268+
return true;
1269+
const rv_insn_t *next = ir + 2;
1270+
MUST_TAIL return next->impl(rv, next);
1271+
}
12601272

12611273
/* multiple sw */
1262-
RVOP(fuse3, {
1274+
static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
1275+
{
1276+
rv->X[rv_reg_zero] = 0;
1277+
rv->csr_cycle += ir->imm2;
12631278
opcode_fuse_t *fuse = ir->fuse;
12641279
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
12651280
/* the memory addresses of the sw instructions are contiguous, so we only
@@ -1272,10 +1287,18 @@ RVOP(fuse3, {
12721287
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
12731288
rv->io.mem_write_w(addr, rv->X[fuse[i].rs2]);
12741289
}
1275-
})
1290+
rv->PC += ir->imm2 * ir->insn_len;
1291+
if (unlikely(RVOP_NO_NEXT(ir)))
1292+
return true;
1293+
const rv_insn_t *next = ir + ir->imm2;
1294+
MUST_TAIL return next->impl(rv, next);
1295+
}
12761296

12771297
/* multiple lw */
1278-
RVOP(fuse4, {
1298+
static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
1299+
{
1300+
rv->X[rv_reg_zero] = 0;
1301+
rv->csr_cycle += ir->imm2;
12791302
opcode_fuse_t *fuse = ir->fuse;
12801303
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
12811304
/* the memory addresses of the lw instructions are contiguous, so we only
@@ -1288,7 +1311,26 @@ RVOP(fuse4, {
12881311
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
12891312
rv->X[fuse[i].rd] = rv->io.mem_read_w(addr);
12901313
}
1291-
})
1314+
rv->PC += ir->imm2 * ir->insn_len;
1315+
if (unlikely(RVOP_NO_NEXT(ir)))
1316+
return true;
1317+
const rv_insn_t *next = ir + ir->imm2;
1318+
MUST_TAIL return next->impl(rv, next);
1319+
}
1320+
1321+
/* lui + addi */
1322+
static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
1323+
{
1324+
rv->X[rv_reg_zero] = 0;
1325+
rv->csr_cycle += 2;
1326+
rv->X[ir->rd] = ir->imm;
1327+
rv->X[ir->rs1] = ir->imm + ir->imm2;
1328+
rv->PC += 2 * ir->insn_len;
1329+
if (unlikely(RVOP_NO_NEXT(ir)))
1330+
return true;
1331+
const rv_insn_t *next = ir + 2;
1332+
MUST_TAIL return next->impl(rv, next);
1333+
}
12921334

12931335
static const void *dispatch_table[] = {
12941336
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
@@ -1448,9 +1490,8 @@ static void block_translate(riscv_t *rv, block_t *block)
14481490
for (int j = 1; j < count; j++) { \
14491491
next_ir = ir + j; \
14501492
memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \
1451-
next_ir->opcode = rv_insn_nop; \
1452-
next_ir->impl = dispatch_table[next_ir->opcode]; \
14531493
} \
1494+
ir->tailcall = next_ir->tailcall; \
14541495
}
14551496

14561497
/* examine whether instructions in a block match a specific pattern. If so,
@@ -1469,25 +1510,32 @@ static void match_pattern(block_t *block)
14691510
next_ir = ir + 1;
14701511
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
14711512
/* the destination register of instruction auipc is equal to the
1472-
* source register 1 of next instruction addi */
1513+
* source register 1 of next instruction addi.
1514+
*/
14731515
ir->opcode = rv_insn_fuse1;
1474-
ir->rd = next_ir->rd;
1516+
ir->rs1 = next_ir->rd;
14751517
ir->imm2 = next_ir->imm;
14761518
ir->impl = dispatch_table[ir->opcode];
1477-
next_ir->opcode = rv_insn_nop;
1478-
next_ir->impl = dispatch_table[next_ir->opcode];
1519+
ir->tailcall = next_ir->tailcall;
14791520
} else if (next_ir->opcode == rv_insn_add &&
14801521
ir->rd == next_ir->rs2) {
14811522
/* the destination register of instruction auipc is equal to the
14821523
* source register 2 of next instruction add */
14831524
ir->opcode = rv_insn_fuse2;
1484-
ir->rd = next_ir->rd;
1525+
ir->rs2 = next_ir->rd;
14851526
ir->rs1 = next_ir->rs1;
14861527
ir->impl = dispatch_table[ir->opcode];
1487-
next_ir->opcode = rv_insn_nop;
1488-
next_ir->impl = dispatch_table[next_ir->opcode];
1528+
} else if (next_ir->opcode == rv_insn_add &&
1529+
ir->rd == next_ir->rs1) {
1530+
/* the destination register of instruction auipc is equal to the
1531+
* source register 1 of next instruction add */
1532+
ir->opcode = rv_insn_fuse2;
1533+
ir->rs2 = next_ir->rd;
1534+
ir->rs1 = next_ir->rs2;
1535+
ir->impl = dispatch_table[ir->opcode];
14891536
}
14901537
break;
1538+
14911539
/* If the memory addresses of a sequence of store or load instructions
14921540
* are contiguous, combine these instructions.
14931541
*/
@@ -1497,7 +1545,19 @@ static void match_pattern(block_t *block)
14971545
case rv_insn_lw:
14981546
COMBINE_MEM_OPS(1);
14991547
break;
1500-
/* FIXME: lui + addi */
1548+
case rv_insn_lui:
1549+
next_ir = ir + 1;
1550+
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
1551+
/* the destination register of instruction lui is equal to
1552+
* the source register 1 of next instruction addi.
1553+
*/
1554+
ir->opcode = rv_insn_fuse5;
1555+
ir->rs1 = next_ir->rd;
1556+
ir->imm2 = next_ir->imm;
1557+
ir->impl = dispatch_table[ir->opcode];
1558+
ir->tailcall = next_ir->tailcall;
1559+
}
1560+
break;
15011561
/* TODO: mixture of sw and lw */
15021562
/* TODO: reorder instructions to match pattern */
15031563
}

0 commit comments

Comments
 (0)