Skip to content

Commit 3e61065

Browse files
committed
math/big: optimize amd64 asm shlVU and shrVU for shift==0 case
This adds branches for s == 0 and for s == 0 && z.base == x.base to shlVU and shrVU. In the first case (s == 0 with different base pointers) runtime.memmove is called to copy x into z, while in the second case (same base pointer) we just return. Tests and benchmarks are also added for the new branches.

Benchmarked on AMD64 Linux on i5-8300H:

name            old time/op  new time/op  delta
ShlVUCopy1e7-8  16.0ms ± 0%  11.1ms ± 1%   -30.79%  (p=0.000 n=10+19)
ShlVUNop1e7-8   10.5ms ± 1%   0.0ms ± 0%  -100.00%  (p=0.000 n=9+20)
ShrVUCopy1e7-8  15.5ms ± 0%  11.1ms ± 1%   -28.55%  (p=0.000 n=8+18)
ShrVUNop1e7-8   10.3ms ± 2%   0.0ms ± 0%  -100.00%  (p=0.000 n=9+20)

Fixes #31097
1 parent 7b62e98 commit 3e61065

File tree

2 files changed

+125
-2
lines changed

2 files changed

+125
-2
lines changed

src/math/big/arith_amd64.s

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ large:
253253

254254

255255
// func shlVU(z, x []Word, s uint) (c Word)
256-
TEXT ·shlVU(SB),NOSPLIT,$0
256+
TEXT ·shlVU(SB),NOSPLIT,$24-64
257257
MOVQ z_len+8(FP), BX // i = z
258258
SUBQ $1, BX // i--
259259
JL X8b // i < 0 (n <= 0)
@@ -262,6 +262,9 @@ TEXT ·shlVU(SB),NOSPLIT,$0
262262
MOVQ z+0(FP), R10
263263
MOVQ x+24(FP), R8
264264
MOVQ s+48(FP), CX
265+
TESTB CL, CL
266+
JZ OPTI // s == 0
267+
265268
MOVQ (R8)(BX*8), AX // w1 = x[n-1]
266269
MOVQ $0, DX
267270
SHLQ CX, DX:AX // w1>>ŝ
@@ -283,12 +286,23 @@ X8a: SHLQ CX, AX // w1<<s
283286
MOVQ AX, (R10) // z[0] = w1<<s
284287
RET
285288

289+
COPY: INCQ BX
290+
SHLQ $3, BX
291+
MOVQ R10, 0(SP)
292+
MOVQ R8, 8(SP)
293+
MOVQ BX, 16(SP)
294+
CALL runtime·memmove(SB)
295+
JMP X8b
296+
297+
OPTI: CMPQ R8, R10
298+
JNE COPY // z.base == x.base
299+
286300
X8b: MOVQ $0, c+56(FP)
287301
RET
288302

289303

290304
// func shrVU(z, x []Word, s uint) (c Word)
291-
TEXT ·shrVU(SB),NOSPLIT,$0
305+
TEXT ·shrVU(SB),NOSPLIT,$24-64
292306
MOVQ z_len+8(FP), R11
293307
SUBQ $1, R11 // n--
294308
JL X9b // n < 0 (n <= 0)
@@ -297,6 +311,9 @@ TEXT ·shrVU(SB),NOSPLIT,$0
297311
MOVQ z+0(FP), R10
298312
MOVQ x+24(FP), R8
299313
MOVQ s+48(FP), CX
314+
TESTB CL, CL
315+
JZ OPTI // s == 0
316+
300317
MOVQ (R8), AX // w1 = x[0]
301318
MOVQ $0, DX
302319
SHRQ CX, DX:AX // w1<<ŝ
@@ -320,6 +337,17 @@ X9a: SHRQ CX, AX // w1>>s
320337
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
321338
RET
322339

340+
COPY: INCQ R11
341+
SHLQ $3, R11
342+
MOVQ R10, 0(SP)
343+
MOVQ R8, 8(SP)
344+
MOVQ R11, 16(SP)
345+
CALL runtime·memmove(SB)
346+
JMP X9b
347+
348+
OPTI: CMPQ R8, R10
349+
JNE COPY // z.base == x.base
350+
323351
X9b: MOVQ $0, c+56(FP)
324352
RET
325353

src/math/big/arith_test.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,101 @@ func TestFunVW(t *testing.T) {
213213
}
214214
}
215215

216+
// TestShlVUCopy checks shlVU with a shift of 0 when the destination and
// source are backed by different arrays. The destination passed in is the
// first three words of dst, so the call is expected to return 0, copy the
// first three words of src into dst, and leave dst[3] and all of src untouched.
func TestShlVUCopy(t *testing.T) {
217+
src := []Word{2, 3, 5, 7}
218+
dst := []Word{14, 10, 6, 4}
219+
// z is deliberately one word shorter than x so that an over-long copy
// would be detected through dst[3] below.
if r := shlVU(dst[:len(dst)-1], src, 0); r != 0 {
220+
t.Errorf("ret = %v != 0", r)
221+
}
222+
// Capacities of the caller's slices must still be those of the literals above.
if cap(dst) != 4 || cap(src) != 4 {
223+
t.Errorf("Underlying array changed. Slices: dst: %v, src: %v", dst, src)
224+
}
225+
// The source must be read-only for the call.
if src[0] != 2 || src[1] != 3 || src[2] != 5 || src[3] != 7 {
226+
t.Errorf("src changed. Slices: dst: %v, src: %v", dst, src)
227+
}
228+
// dst[0:3] must equal src[0:3]; dst[3] lies outside z and must keep its value 4.
if dst[0] != 2 || dst[1] != 3 || dst[2] != 5 || dst[3] != 4 {
229+
t.Errorf("dst wrong. dst: %v", dst)
230+
}
231+
232+
}
233+
234+
// TestShlVUNop checks shlVU with a shift of 0 when the destination and the
// source start at the same address (z is a prefix of x). The call is expected
// to return 0 and leave every word of dst unchanged.
func TestShlVUNop(t *testing.T) {
235+
dst := []Word{14, 10, 6, 4}
236+
// z and x share the same base pointer; only their lengths differ.
if r := shlVU(dst[:len(dst)-1], dst, 0); r != 0 {
237+
t.Errorf("ret = %v != 0", r)
238+
}
239+
// Capacity of the caller's slice must still be that of the literal above.
if cap(dst) != 4 {
240+
t.Errorf("Underlying array changed. Slice: dst: %v", dst)
241+
}
242+
// All four words, including dst[3] which lies outside z, must be untouched.
if dst[0] != 14 || dst[1] != 10 || dst[2] != 6 || dst[3] != 4 {
243+
t.Errorf("dst wrong. dst: %v", dst)
244+
}
245+
}
246+
247+
// TestShrVUCopy checks shrVU with a shift of 0 when the destination and
// source are backed by different arrays. The destination passed in is the
// first three words of dst, so the call is expected to return 0, copy the
// first three words of src into dst, and leave dst[3] and all of src untouched.
func TestShrVUCopy(t *testing.T) {
248+
src := []Word{2, 3, 5, 7}
249+
dst := []Word{14, 10, 6, 4}
250+
// z is deliberately one word shorter than x so that an over-long copy
// would be detected through dst[3] below.
if r := shrVU(dst[:len(dst)-1], src, 0); r != 0 {
251+
t.Errorf("ret = %v != 0", r)
252+
}
253+
// Capacities of the caller's slices must still be those of the literals above.
if cap(dst) != 4 || cap(src) != 4 {
254+
t.Errorf("Underlying array changed. Slices: dst: %v, src: %v", dst, src)
255+
}
256+
// The source must be read-only for the call.
if src[0] != 2 || src[1] != 3 || src[2] != 5 || src[3] != 7 {
257+
t.Errorf("src changed. Slices: dst: %v, src: %v", dst, src)
258+
}
259+
// dst[0:3] must equal src[0:3]; dst[3] lies outside z and must keep its value 4.
if dst[0] != 2 || dst[1] != 3 || dst[2] != 5 || dst[3] != 4 {
260+
t.Errorf("dst wrong. dst: %v", dst)
261+
}
262+
}
263+
264+
// TestShrVUNop checks shrVU with a shift of 0 when the destination and the
// source start at the same address (z is a prefix of x). The call is expected
// to return 0 and leave every word of dst unchanged.
func TestShrVUNop(t *testing.T) {
265+
dst := []Word{14, 10, 6, 4}
266+
// z and x share the same base pointer; only their lengths differ.
if r := shrVU(dst[:len(dst)-1], dst, 0); r != 0 {
267+
t.Errorf("ret = %v != 0", r)
268+
}
269+
// Capacity of the caller's slice must still be that of the literal above.
if cap(dst) != 4 {
270+
t.Errorf("Underlying array changed. Slice: dst: %v", dst)
271+
}
272+
// All four words, including dst[3] which lies outside z, must be untouched.
if dst[0] != 14 || dst[1] != 10 || dst[2] != 6 || dst[3] != 4 {
273+
t.Errorf("dst wrong. dst: %v", dst)
274+
}
275+
}
276+
277+
// BenchmarkShlVUCopy1e7 measures shlVU with a shift of 0 where the source and
// destination are separate slices of equal length.
// NOTE(review): rndV is defined elsewhere; presumably it returns a vector of
// 1e7 pseudo-random Words - confirm against its definition.
func BenchmarkShlVUCopy1e7(b *testing.B) {
278+
src := rndV(1e7)
279+
dst := make([]Word, len(src))
280+
// Exclude the input setup above from the measured time.
b.ResetTimer()
281+
for i := 0; i < b.N; i++ {
282+
_ = shlVU(dst, src, 0)
283+
}
284+
}
285+
286+
// BenchmarkShlVUNop1e7 measures shlVU with a shift of 0 where the same slice
// is passed as both destination and source.
// NOTE(review): rndV is defined elsewhere; presumably it returns a vector of
// 1e7 pseudo-random Words - confirm against its definition.
func BenchmarkShlVUNop1e7(b *testing.B) {
287+
dst := rndV(1e7)
288+
// Exclude the input setup above from the measured time.
b.ResetTimer()
289+
for i := 0; i < b.N; i++ {
290+
_ = shlVU(dst, dst, 0)
291+
}
292+
}
293+
294+
// BenchmarkShrVUCopy1e7 measures shrVU with a shift of 0 where the source and
// destination are separate slices of equal length.
// NOTE(review): rndV is defined elsewhere; presumably it returns a vector of
// 1e7 pseudo-random Words - confirm against its definition.
func BenchmarkShrVUCopy1e7(b *testing.B) {
295+
src := rndV(1e7)
296+
dst := make([]Word, len(src))
297+
// Exclude the input setup above from the measured time.
b.ResetTimer()
298+
for i := 0; i < b.N; i++ {
299+
_ = shrVU(dst, src, 0)
300+
}
301+
}
302+
303+
// BenchmarkShrVUNop1e7 measures shrVU with a shift of 0 where the same slice
// is passed as both destination and source.
// NOTE(review): rndV is defined elsewhere; presumably it returns a vector of
// 1e7 pseudo-random Words - confirm against its definition.
func BenchmarkShrVUNop1e7(b *testing.B) {
304+
dst := rndV(1e7)
305+
// Exclude the input setup above from the measured time.
b.ResetTimer()
306+
for i := 0; i < b.N; i++ {
307+
_ = shrVU(dst, dst, 0)
308+
}
309+
}
310+
216311
func BenchmarkAddVW(b *testing.B) {
217312
for _, n := range benchSizes {
218313
if isRaceBuilder && n > 1e3 {

0 commit comments

Comments
 (0)