Skip to content

Commit 1419d0e

Browse files
committed
math/big: implement subVV in riscv64 assembly
This provides an assembly implementation of subVV for riscv64, processing up to four words per loop, resulting in a significant performance gain. On a StarFive VisionFive 2: │ subvv.1 │ subvv.2 │ │ sec/op │ sec/op vs base │ SubVV/1-4 73.46n ± 0% 48.08n ± 0% -34.55% (p=0.000 n=10) SubVV/2-4 88.13n ± 0% 58.76n ± 0% -33.33% (p=0.000 n=10) SubVV/3-4 102.80n ± 0% 69.45n ± 0% -32.44% (p=0.000 n=10) SubVV/4-4 117.50n ± 0% 72.11n ± 0% -38.63% (p=0.000 n=10) SubVV/5-4 132.20n ± 0% 82.80n ± 0% -37.37% (p=0.000 n=10) SubVV/10-4 216.3n ± 0% 126.9n ± 0% -41.33% (p=0.000 n=10) SubVV/100-4 1659.0n ± 0% 886.5n ± 0% -46.56% (p=0.000 n=10) SubVV/1000-4 16.089µ ± 0% 8.401µ ± 0% -47.78% (p=0.000 n=10) SubVV/10000-4 244.7µ ± 0% 176.8µ ± 0% -27.74% (p=0.000 n=10) SubVV/100000-4 2.562m ± 0% 1.871m ± 0% -26.96% (p=0.000 n=10) geomean 1.436µ 904.4n -37.04% │ subvv.1 │ subvv.2 │ │ B/s │ B/s vs base │ SubVV/1-4 830.9Mi ± 0% 1269.5Mi ± 0% +52.79% (p=0.000 n=10) SubVV/2-4 1.353Gi ± 0% 2.029Gi ± 0% +49.99% (p=0.000 n=10) SubVV/3-4 1.739Gi ± 0% 2.575Gi ± 0% +48.06% (p=0.000 n=10) SubVV/4-4 2.029Gi ± 0% 3.306Gi ± 0% +62.96% (p=0.000 n=10) SubVV/5-4 2.254Gi ± 0% 3.600Gi ± 0% +59.67% (p=0.000 n=10) SubVV/10-4 2.755Gi ± 0% 4.699Gi ± 0% +70.53% (p=0.000 n=10) SubVV/100-4 3.594Gi ± 0% 6.723Gi ± 0% +87.08% (p=0.000 n=10) SubVV/1000-4 3.705Gi ± 0% 7.095Gi ± 0% +91.52% (p=0.000 n=10) SubVV/10000-4 2.436Gi ± 0% 3.372Gi ± 0% +38.39% (p=0.000 n=10) SubVV/100000-4 2.327Gi ± 0% 3.185Gi ± 0% +36.91% (p=0.000 n=10) geomean 2.118Gi 3.364Gi +58.84% Change-Id: I361cb3f4195b27a9f1e9486c9e1fdbeaa94d32b4 Reviewed-on: https://go-review.googlesource.com/c/go/+/595396 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Meng Zhuo <[email protected]> Reviewed-by: Mark Ryan <[email protected]> Reviewed-by: Cherry Mui <[email protected]> Reviewed-by: Carlos Amedee <[email protected]>
1 parent 49adb8e commit 1419d0e

File tree

1 file changed

+80
-1
lines changed

1 file changed

+80
-1
lines changed

src/math/big/arith_riscv64.s

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,86 @@ done:
9292
RET
9393

9494
TEXT ·subVV(SB),NOSPLIT,$0
95-
JMP ·subVV_g(SB)
95+
MOV x+24(FP), X5
96+
MOV y+48(FP), X6
97+
MOV z+0(FP), X7
98+
MOV z_len+8(FP), X30
99+
100+
MOV $4, X28
101+
MOV $0, X29 // b = 0
102+
103+
BEQZ X30, done
104+
BLTU X30, X28, loop1
105+
106+
loop4:
107+
MOV 0(X5), X8 // x[0]
108+
MOV 0(X6), X9 // y[0]
109+
MOV 8(X5), X11 // x[1]
110+
MOV 8(X6), X12 // y[1]
111+
MOV 16(X5), X14 // x[2]
112+
MOV 16(X6), X15 // y[2]
113+
MOV 24(X5), X17 // x[3]
114+
MOV 24(X6), X18 // y[3]
115+
116+
SUB X9, X8, X21 // z[0] = x[0] - y[0]
117+
SLTU X21, X8, X22
118+
SUB X29, X21, X10 // z[0] = x[0] - y[0] - b
119+
SLTU X10, X21, X23
120+
ADD X22, X23, X29 // next b
121+
122+
SUB X12, X11, X24 // z[1] = x[1] - y[1]
123+
SLTU X24, X11, X25
124+
SUB X29, X24, X13 // z[1] = x[1] - y[1] - b
125+
SLTU X13, X24, X26
126+
ADD X25, X26, X29 // next b
127+
128+
SUB X15, X14, X21 // z[2] = x[2] - y[2]
129+
SLTU X21, X14, X22
130+
SUB X29, X21, X16 // z[2] = x[2] - y[2] - b
131+
SLTU X16, X21, X23
132+
ADD X22, X23, X29 // next b
133+
134+
SUB X18, X17, X21 // z[3] = x[3] - y[3]
135+
SLTU X21, X17, X22
136+
SUB X29, X21, X19 // z[3] = x[3] - y[3] - b
137+
SLTU X19, X21, X23
138+
ADD X22, X23, X29 // next b
139+
140+
MOV X10, 0(X7) // z[0]
141+
MOV X13, 8(X7) // z[1]
142+
MOV X16, 16(X7) // z[2]
143+
MOV X19, 24(X7) // z[3]
144+
145+
ADD $32, X5
146+
ADD $32, X6
147+
ADD $32, X7
148+
SUB $4, X30
149+
150+
BGEU X30, X28, loop4
151+
BEQZ X30, done
152+
153+
loop1:
154+
MOV 0(X5), X10 // x
155+
MOV 0(X6), X11 // y
156+
157+
SUB X11, X10, X12 // z = x - y
158+
SLTU X12, X10, X14
159+
SUB X29, X12, X13 // z = x - y - b
160+
SLTU X13, X12, X15
161+
ADD X14, X15, X29 // next b
162+
163+
MOV X13, 0(X7) // z
164+
165+
ADD $8, X5
166+
ADD $8, X6
167+
ADD $8, X7
168+
SUB $1, X30
169+
170+
BNEZ X30, loop1
171+
172+
done:
173+
MOV X29, c+72(FP) // return b
174+
RET
96175

97176
TEXT ·addVW(SB),NOSPLIT,$0
98177
JMP ·addVW_g(SB)

0 commit comments

Comments
 (0)