Skip to content

Commit 654a185

Browse files
committed
math/big: faster assembly kernels for AddVx/SubVx for 386.
(analog to Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae) Fixes #9243. benchmark old ns/op new ns/op delta BenchmarkAddVV_1 5.76 5.60 -2.78% BenchmarkAddVV_2 7.17 6.98 -2.65% BenchmarkAddVV_3 8.69 8.57 -1.38% BenchmarkAddVV_4 10.5 10.5 +0.00% BenchmarkAddVV_5 13.3 11.6 -12.78% BenchmarkAddVV_1e1 20.4 19.3 -5.39% BenchmarkAddVV_1e2 166 140 -15.66% BenchmarkAddVV_1e3 1588 1278 -19.52% BenchmarkAddVV_1e4 16138 12657 -21.57% BenchmarkAddVV_1e5 167608 127836 -23.73% BenchmarkAddVW_1 4.87 4.76 -2.26% BenchmarkAddVW_2 6.10 6.07 -0.49% BenchmarkAddVW_3 7.75 7.65 -1.29% BenchmarkAddVW_4 9.30 9.39 +0.97% BenchmarkAddVW_5 10.8 10.9 +0.93% BenchmarkAddVW_1e1 18.8 18.8 +0.00% BenchmarkAddVW_1e2 143 134 -6.29% BenchmarkAddVW_1e3 1390 1266 -8.92% BenchmarkAddVW_1e4 13877 12545 -9.60% BenchmarkAddVW_1e5 155330 125432 -19.25% benchmark old MB/s new MB/s speedup BenchmarkAddVV_1 5556.09 5715.12 1.03x BenchmarkAddVV_2 8926.55 9170.64 1.03x BenchmarkAddVV_3 11042.15 11201.77 1.01x BenchmarkAddVV_4 12168.21 12245.50 1.01x BenchmarkAddVV_5 12041.39 13805.73 1.15x BenchmarkAddVV_1e1 15659.65 16548.18 1.06x BenchmarkAddVV_1e2 19268.57 22728.64 1.18x BenchmarkAddVV_1e3 20141.45 25033.36 1.24x BenchmarkAddVV_1e4 19827.86 25281.92 1.28x BenchmarkAddVV_1e5 19092.06 25031.92 1.31x BenchmarkAddVW_1 822.12 840.92 1.02x BenchmarkAddVW_2 1310.89 1317.89 1.01x BenchmarkAddVW_3 1549.31 1568.26 1.01x BenchmarkAddVW_4 1720.45 1703.77 0.99x BenchmarkAddVW_5 1857.12 1828.66 0.98x BenchmarkAddVW_1e1 2126.39 2132.38 1.00x BenchmarkAddVW_1e2 2784.49 2969.21 1.07x BenchmarkAddVW_1e3 2876.89 3157.35 1.10x BenchmarkAddVW_1e4 2882.32 3188.51 1.11x BenchmarkAddVW_1e5 2575.16 3188.96 1.24x (measured on OS X 10.9.5, 2.3 GHz Intel Core i7, 8GB 1333 MHz DDR3) Change-Id: I46698729d5e0bc3e277aa0146a9d7a086c0c26f1 Reviewed-on: https://go-review.googlesource.com/2560 Reviewed-by: Keith Randall <[email protected]>
1 parent 06ed8f0 commit 654a185

File tree

1 file changed

+11
-11
lines changed

1 file changed

+11
-11
lines changed

src/math/big/arith_386.s

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
// This file provides fast assembly versions for the elementary
88
// arithmetic operations on vectors implemented in arith.go.
99

10-
// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
11-
1210
// func mulWW(x, y Word) (z1, z0 Word)
1311
TEXT ·mulWW(SB),NOSPLIT,$0
1412
MOVL x+0(FP), AX
@@ -39,15 +37,16 @@ TEXT ·addVV(SB),NOSPLIT,$0
3937
JMP E1
4038

4139
L1: MOVL (SI)(BX*4), AX
42-
RCRL $1, DX
40+
ADDL DX, DX // restore CF
4341
ADCL (CX)(BX*4), AX
44-
RCLL $1, DX
42+
SBBL DX, DX // save CF
4543
MOVL AX, (DI)(BX*4)
4644
ADDL $1, BX // i++
4745

4846
E1: CMPL BX, BP // i < n
4947
JL L1
5048

49+
NEGL DX
5150
MOVL DX, c+36(FP)
5251
RET
5352

@@ -64,15 +63,16 @@ TEXT ·subVV(SB),NOSPLIT,$0
6463
JMP E2
6564

6665
L2: MOVL (SI)(BX*4), AX
67-
RCRL $1, DX
66+
ADDL DX, DX // restore CF
6867
SBBL (CX)(BX*4), AX
69-
RCLL $1, DX
68+
SBBL DX, DX // save CF
7069
MOVL AX, (DI)(BX*4)
7170
ADDL $1, BX // i++
7271

7372
E2: CMPL BX, BP // i < n
7473
JL L2
7574

75+
NEGL DX
7676
MOVL DX, c+36(FP)
7777
RET
7878

@@ -88,8 +88,8 @@ TEXT ·addVW(SB),NOSPLIT,$0
8888

8989
L3: ADDL (SI)(BX*4), AX
9090
MOVL AX, (DI)(BX*4)
91-
RCLL $1, AX
92-
ANDL $1, AX
91+
SBBL AX, AX // save CF
92+
NEGL AX
9393
ADDL $1, BX // i++
9494

9595
E3: CMPL BX, BP // i < n
@@ -108,11 +108,11 @@ TEXT ·subVW(SB),NOSPLIT,$0
108108
MOVL $0, BX // i = 0
109109
JMP E4
110110

111-
L4: MOVL (SI)(BX*4), DX // TODO(gri) is there a reverse SUBL?
111+
L4: MOVL (SI)(BX*4), DX
112112
SUBL AX, DX
113113
MOVL DX, (DI)(BX*4)
114-
RCLL $1, AX
115-
ANDL $1, AX
114+
SBBL AX, AX // save CF
115+
NEGL AX
116116
ADDL $1, BX // i++
117117

118118
E4: CMPL BX, BP // i < n

0 commit comments

Comments
 (0)