Skip to content

math/big: optimize amd64 asm shlVU and shrVU for shift==0 case #31171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions src/math/big/arith_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ large:


// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB),NOSPLIT,$0
TEXT ·shlVU(SB),NOSPLIT,$24-64
MOVQ z_len+8(FP), BX // i = z
SUBQ $1, BX // i--
JL X8b // i < 0 (n <= 0)
Expand All @@ -262,6 +262,9 @@ TEXT ·shlVU(SB),NOSPLIT,$0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ s+48(FP), CX
TESTB CL, CL
JZ OPTI // s == 0

MOVQ (R8)(BX*8), AX // w1 = x[n-1]
MOVQ $0, DX
SHLQ CX, DX:AX // w1>>ŝ
Expand All @@ -283,12 +286,23 @@ X8a: SHLQ CX, AX // w1<<s
MOVQ AX, (R10) // z[0] = w1<<s
RET

COPY: INCQ BX
SHLQ $3, BX
MOVQ R10, 0(SP)
MOVQ R8, 8(SP)
MOVQ BX, 16(SP)
CALL runtime·memmove(SB)
JMP X8b

OPTI: CMPQ R8, R10
JNE COPY // z.base == x.base

X8b: MOVQ $0, c+56(FP)
RET


// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·shrVU(SB),NOSPLIT,$24-64
MOVQ z_len+8(FP), R11
SUBQ $1, R11 // n--
JL X9b // n < 0 (n <= 0)
Expand All @@ -297,6 +311,9 @@ TEXT ·shrVU(SB),NOSPLIT,$0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ s+48(FP), CX
TESTB CL, CL
JZ OPTI // s == 0

MOVQ (R8), AX // w1 = x[0]
MOVQ $0, DX
SHRQ CX, DX:AX // w1<<ŝ
Expand All @@ -320,6 +337,17 @@ X9a: SHRQ CX, AX // w1>>s
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
RET

COPY: INCQ R11
SHLQ $3, R11
MOVQ R10, 0(SP)
MOVQ R8, 8(SP)
MOVQ R11, 16(SP)
CALL runtime·memmove(SB)
JMP X9b

OPTI: CMPQ R8, R10
JNE COPY // z.base == x.base

X9b: MOVQ $0, c+56(FP)
RET

Expand Down
95 changes: 95 additions & 0 deletions src/math/big/arith_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,101 @@ func TestFunVW(t *testing.T) {
}
}

// TestShlVUCopy exercises shlVU with a zero shift when z and x are distinct
// slices. The call must return 0, leave x untouched, and place the first
// len(z) words of x into z without touching the word just past z.
func TestShlVUCopy(t *testing.T) {
	x := []Word{2, 3, 5, 7}
	z := []Word{14, 10, 6, 4}

	// z is passed one word short, so the last element of its backing array
	// is outside the destination and must keep its original value.
	carry := shlVU(z[:len(z)-1], x, 0)
	if carry != 0 {
		t.Errorf("ret = %v != 0", carry)
	}

	if !(cap(z) == 4 && cap(x) == 4) {
		t.Errorf("Underlying array changed. Slices: dst: %v, src: %v", z, x)
	}

	// The source operand must be unchanged.
	for i, want := range []Word{2, 3, 5, 7} {
		if x[i] != want {
			t.Errorf("src changed. Slices: dst: %v, src: %v", z, x)
			break
		}
	}

	// The first three words come from x; the fourth is the untouched original.
	for i, want := range []Word{2, 3, 5, 4} {
		if z[i] != want {
			t.Errorf("dst wrong. dst: %v", z)
			break
		}
	}
}

// TestShlVUNop exercises shlVU with a zero shift when z and x share the same
// backing array. The call must return 0 and leave every word as it was.
func TestShlVUNop(t *testing.T) {
	buf := []Word{14, 10, 6, 4}

	// Destination is the same storage as the source, one word shorter.
	head := buf[:len(buf)-1]
	carry := shlVU(head, buf, 0)
	if carry != 0 {
		t.Errorf("ret = %v != 0", carry)
	}

	if cap(buf) != 4 {
		t.Errorf("Underlying array changed. Slice: dst: %v", buf)
	}

	for i, want := range []Word{14, 10, 6, 4} {
		if buf[i] != want {
			t.Errorf("dst wrong. dst: %v", buf)
			break
		}
	}
}

// TestShrVUCopy exercises shrVU with a zero shift when z and x are distinct
// slices. The call must return 0, leave x untouched, and place the first
// len(z) words of x into z without touching the word just past z.
func TestShrVUCopy(t *testing.T) {
	x := []Word{2, 3, 5, 7}
	z := []Word{14, 10, 6, 4}

	// z is passed one word short, so the last element of its backing array
	// is outside the destination and must keep its original value.
	carry := shrVU(z[:len(z)-1], x, 0)
	if carry != 0 {
		t.Errorf("ret = %v != 0", carry)
	}

	if !(cap(z) == 4 && cap(x) == 4) {
		t.Errorf("Underlying array changed. Slices: dst: %v, src: %v", z, x)
	}

	// The source operand must be unchanged.
	for i, want := range []Word{2, 3, 5, 7} {
		if x[i] != want {
			t.Errorf("src changed. Slices: dst: %v, src: %v", z, x)
			break
		}
	}

	// The first three words come from x; the fourth is the untouched original.
	for i, want := range []Word{2, 3, 5, 4} {
		if z[i] != want {
			t.Errorf("dst wrong. dst: %v", z)
			break
		}
	}
}

// TestShrVUNop exercises shrVU with a zero shift when z and x share the same
// backing array. The call must return 0 and leave every word as it was.
func TestShrVUNop(t *testing.T) {
	buf := []Word{14, 10, 6, 4}

	// Destination is the same storage as the source, one word shorter.
	head := buf[:len(buf)-1]
	carry := shrVU(head, buf, 0)
	if carry != 0 {
		t.Errorf("ret = %v != 0", carry)
	}

	if cap(buf) != 4 {
		t.Errorf("Underlying array changed. Slice: dst: %v", buf)
	}

	for i, want := range []Word{14, 10, 6, 4} {
		if buf[i] != want {
			t.Errorf("dst wrong. dst: %v", buf)
			break
		}
	}
}

// BenchmarkShlVUCopy1e7 times shlVU with a zero shift where the destination
// is a freshly allocated slice separate from the source produced by
// rndV(1e7).
func BenchmarkShlVUCopy1e7(b *testing.B) {
	x := rndV(1e7)
	z := make([]Word, len(x))

	// Exclude operand setup from the measurement.
	b.ResetTimer()
	for iter := b.N; iter > 0; iter-- {
		shlVU(z, x, 0)
	}
}

// BenchmarkShlVUNop1e7 times shlVU with a zero shift where the same slice,
// produced by rndV(1e7), is passed as both destination and source.
func BenchmarkShlVUNop1e7(b *testing.B) {
	v := rndV(1e7)

	// Exclude operand setup from the measurement.
	b.ResetTimer()
	for iter := b.N; iter > 0; iter-- {
		shlVU(v, v, 0)
	}
}

// BenchmarkShrVUCopy1e7 times shrVU with a zero shift where the destination
// is a freshly allocated slice separate from the source produced by
// rndV(1e7).
func BenchmarkShrVUCopy1e7(b *testing.B) {
	x := rndV(1e7)
	z := make([]Word, len(x))

	// Exclude operand setup from the measurement.
	b.ResetTimer()
	for iter := b.N; iter > 0; iter-- {
		shrVU(z, x, 0)
	}
}

// BenchmarkShrVUNop1e7 times shrVU with a zero shift where the same slice,
// produced by rndV(1e7), is passed as both destination and source.
func BenchmarkShrVUNop1e7(b *testing.B) {
	v := rndV(1e7)

	// Exclude operand setup from the measurement.
	b.ResetTimer()
	for iter := b.N; iter > 0; iter-- {
		shrVU(v, v, 0)
	}
}

func BenchmarkAddVW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
Expand Down