Skip to content

Commit ee8972c

Browse files
Ruixin(Peter) Bao authored and mundaym committed
math/big: rewrite addVW to use fast path on s390x
Rewrite addVW to use a fast path and remove the original vector and non-vector implementations of addVW in assembly.

This CL uses an idea similar to CL 164968, where we copy the rest of the words once we know the carry bit is zero. In addition, since we are copying a vector of words, a faster implementation of copy is written in this CL to copy a word or multiple words at a time.

Benchmarks:

name             old time/op    new time/op    delta
AddVW/1-18       4.56ns ± 0%    4.01ns ± 6%    -12.14%   (p=0.000 n=18+20)
AddVW/2-18       5.54ns ± 0%    4.42ns ± 5%    -20.20%   (p=0.000 n=18+20)
AddVW/3-18       6.55ns ± 0%    4.61ns ± 0%    -29.62%   (p=0.000 n=16+18)
AddVW/4-18       6.11ns ± 2%    5.12ns ± 6%    -16.19%   (p=0.000 n=20+20)
AddVW/5-18       7.32ns ± 4%    5.14ns ± 0%    -29.77%   (p=0.000 n=20+19)
AddVW/10-18      10.6ns ± 2%    7.2ns ± 1%     -31.47%   (p=0.000 n=20+20)
AddVW/100-18     49.6ns ± 2%    18.0ns ± 0%    -63.63%   (p=0.000 n=20+20)
AddVW/1000-18    465ns ± 3%     244ns ± 0%     -47.54%   (p=0.000 n=20+20)
AddVW/10000-18   4.99µs ± 4%    2.97µs ± 0%    -40.54%   (p=0.000 n=20+20)
AddVW/100000-18  48.3µs ± 3%    30.8µs ± 1%    -36.29%   (p=0.000 n=20+20)
[Geo mean]       58.1ns         38.0ns         -34.57%

name             old speed       new speed       delta
AddVW/1-18       1.76GB/s ± 0%   2.00GB/s ± 6%   +14.04%   (p=0.000 n=20+20)
AddVW/2-18       2.89GB/s ± 0%   3.63GB/s ± 5%   +25.55%   (p=0.000 n=18+20)
AddVW/3-18       3.66GB/s ± 0%   5.21GB/s ± 0%   +42.25%   (p=0.000 n=18+19)
AddVW/4-18       5.24GB/s ± 2%   6.27GB/s ± 6%   +19.61%   (p=0.000 n=20+20)
AddVW/5-18       5.47GB/s ± 4%   7.78GB/s ± 0%   +42.28%   (p=0.000 n=20+18)
AddVW/10-18      7.55GB/s ± 2%   11.04GB/s ± 1%  +46.09%   (p=0.000 n=20+20)
AddVW/100-18     16.1GB/s ± 2%   44.3GB/s ± 0%   +174.77%  (p=0.000 n=20+20)
AddVW/1000-18    17.2GB/s ± 3%   32.8GB/s ± 1%   +90.58%   (p=0.000 n=20+20)
AddVW/10000-18   16.0GB/s ± 4%   26.9GB/s ± 0%   +68.11%   (p=0.000 n=20+20)
AddVW/100000-18  16.6GB/s ± 3%   26.0GB/s ± 1%   +56.94%   (p=0.000 n=20+20)
[Geo mean]       7.03GB/s        10.75GB/s       +52.93%

Change-Id: Idbb73f3178311bd2b18a93bdc1e48f26869d2f6a
Reviewed-on: https://go-review.googlesource.com/c/go/+/209679
Reviewed-by: Michael Munday <[email protected]>
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 82f2989 commit ee8972c

File tree

3 files changed

+84
-213
lines changed

3 files changed

+84
-213
lines changed

src/math/big/arith_decl_s390x.go

-3
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
1212
func subVV_check(z, x, y []Word) (c Word)
1313
func subVV_vec(z, x, y []Word) (c Word)
1414
func subVV_novec(z, x, y []Word) (c Word)
15-
func addVW_check(z, x []Word, y Word) (c Word)
16-
func addVW_vec(z, x []Word, y Word) (c Word)
17-
func addVW_novec(z, x []Word, y Word) (c Word)
1815
func subVW_check(z, x []Word, y Word) (c Word)
1916
func subVW_vec(z, x []Word, y Word) (c Word)
2017
func subVW_novec(z, x []Word, y Word) (c Word)

src/math/big/arith_s390x.s

+83-206
Original file line number | Diff line number | Diff line change
@@ -541,216 +541,93 @@ E1:
541541
RET
542542

543543
TEXT ·addVW(SB), NOSPLIT, $0
544-
MOVD addwvectorfacility+0x00(SB), R1
545-
BR (R1)
546-
547-
TEXT ·addVW_check(SB), NOSPLIT, $0
548-
MOVB ·hasVX(SB), R1
549-
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
550-
MOVD $addwvectorfacility+0x00(SB), R1
551-
MOVD $·addVW_novec(SB), R2
552-
MOVD R2, 0(R1)
553-
554-
// MOVD $·addVW_novec(SB), 0(R1)
555-
BR ·addVW_novec(SB)
556-
557-
vectorimpl:
558-
MOVD $addwvectorfacility+0x00(SB), R1
559-
MOVD $·addVW_vec(SB), R2
560-
MOVD R2, 0(R1)
561-
562-
// MOVD $·addVW_vec(SB), 0(R1)
563-
BR ·addVW_vec(SB)
564-
565-
GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
566-
DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)
567-
568-
// func addVW_vec(z, x []Word, y Word) (c Word)
569-
TEXT ·addVW_vec(SB), NOSPLIT, $0
570-
MOVD z_len+8(FP), R3
571-
MOVD x+24(FP), R8
572-
MOVD y+48(FP), R4 // c = y
573-
MOVD z+0(FP), R2
574-
575-
MOVD $0, R0 // make sure it's zero
576-
MOVD $0, R10 // i = 0
577-
MOVD R8, R5
578-
MOVD R2, R7
579-
580-
// s/JL/JMP/ below to disable the unrolled loop
581-
SUB $4, R3 // n -= 4
582-
BLT v10 // if n < 0 goto v10
583-
SUB $12, R3
584-
BLT A10
585-
586-
// n >= 0
587-
// regular loop body unrolled 16x
588-
589-
VZERO V0 // prepare V0 to be final carry register
590-
VZERO V9 // to ensure upper half is zero
591-
VLVGG $1, R4, V9
592-
593-
UU1:
594-
VLM 0(R5), V1, V4 // 64-bytes into V1..V4
595-
ADD $64, R5
596-
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
597-
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
598-
599-
VACCCQ V1, V9, V0, V25
600-
VACQ V1, V9, V0, V17
601-
VZERO V9
602-
VACCCQ V2, V9, V25, V26
603-
VACQ V2, V9, V25, V18
604-
605-
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
606-
ADD $32, R5
607-
608-
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
609-
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
610-
611-
VACCCQ V3, V9, V26, V27
612-
VACQ V3, V9, V26, V19
613-
VACCCQ V4, V9, V27, V28
614-
VACQ V4, V9, V27, V20
615-
616-
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
617-
ADD $32, R5
618-
619-
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
620-
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
621-
622-
VACCCQ V5, V9, V28, V29
623-
VACQ V5, V9, V28, V21
624-
VACCCQ V6, V9, V29, V30
625-
VACQ V6, V9, V29, V22
626-
627-
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
628-
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
629-
630-
VACCCQ V7, V9, V30, V31
631-
VACQ V7, V9, V30, V23
632-
VACCCQ V8, V9, V31, V0 // V0 has carry-over
633-
VACQ V8, V9, V31, V24
634-
635-
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
636-
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
637-
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
638-
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
639-
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
640-
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
641-
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
642-
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
643-
VSTM V17, V24, 0(R7) // 128-bytes into z
644-
ADD $128, R7
645-
ADD $128, R10 // i += 16
646-
SUB $16, R3 // n -= 16
647-
BGE UU1 // if n >= 0 goto U1
648-
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
649-
650-
A10:
651-
ADD $12, R3 // n += 16
652-
653-
// s/JL/JMP/ below to disable the unrolled loop
654-
655-
BLT v10 // if n < 0 goto v10
656-
657-
U4: // n >= 0
658-
// regular loop body unrolled 4x
659-
MOVD 0(R8)(R10*1), R5
660-
MOVD 8(R8)(R10*1), R6
661-
MOVD 16(R8)(R10*1), R7
662-
MOVD 24(R8)(R10*1), R1
663-
ADDC R4, R5
664-
ADDE R0, R6
665-
ADDE R0, R7
666-
ADDE R0, R1
667-
ADDE R0, R0
668-
MOVD R0, R4 // save CF
669-
SUB R0, R0
670-
MOVD R5, 0(R2)(R10*1)
671-
MOVD R6, 8(R2)(R10*1)
672-
MOVD R7, 16(R2)(R10*1)
673-
MOVD R1, 24(R2)(R10*1)
674-
675-
ADD $32, R10 // i += 4 -> i +=32
676-
SUB $4, R3 // n -= 4
677-
BGE U4 // if n >= 0 goto U4
678-
679-
v10:
680-
ADD $4, R3 // n += 4
681-
BLE E10 // if n <= 0 goto E4
682-
683-
L4: // n > 0
684-
MOVD 0(R8)(R10*1), R5
685-
ADDC R4, R5
544+
MOVD z_len+8(FP), R5 // length of z
545+
MOVD x+24(FP), R6
546+
MOVD y+48(FP), R7 // c = y
547+
MOVD z+0(FP), R8
548+
549+
CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
550+
551+
// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
552+
ADDC 0(R6), R7
553+
MOVD R7, 0(R8)
554+
CMPBEQ R5, $1, returnResult // len(z) == 1
555+
MOVD $0, R9
556+
ADDE 8(R6), R9
557+
MOVD R9, 8(R8)
558+
CMPBEQ R5, $2, returnResult // len(z) == 2
559+
560+
// Update the counters
561+
MOVD $16, R12 // i = 2
562+
MOVD $-2(R5), R5 // n = n - 2
563+
564+
loopOverEachWord:
565+
BRC $12, copySetup // carry = 0, copy the rest
566+
MOVD $1, R9
567+
568+
// Originally we used the carry flag generated in the previous iteration
569+
// (i.e., ADDE could be used here to do the addition). However, since we
570+
// already know the carry is 1 (otherwise we would have branched to the copy section), we can use
571+
// ADDC here so that the current iteration does not depend on the carry flag
572+
// generated in the previous iteration. This can be useful under branch prediction.
573+
ADDC 0(R6)(R12*1), R9
574+
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
575+
576+
MOVD $8(R12), R12 // i++
577+
BRCTG R5, loopOverEachWord // n--
578+
579+
// Return the current carry value
580+
returnResult:
581+
MOVD $0, R0
686582
ADDE R0, R0
687-
MOVD R0, R4 // save CF
688-
SUB R0, R0
689-
MOVD R5, 0(R2)(R10*1)
690-
691-
ADD $8, R10 // i++
692-
SUB $1, R3 // n--
693-
BGT L4 // if n > 0 goto L4
694-
695-
E10:
696-
MOVD R4, c+56(FP) // return c
697-
583+
MOVD R0, c+56(FP)
698584
RET
699585

700-
TEXT ·addVW_novec(SB), NOSPLIT, $0
701-
// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
702-
MOVD z_len+8(FP), R3
703-
MOVD x+24(FP), R8
704-
MOVD y+48(FP), R4 // c = y
705-
MOVD z+0(FP), R2
706-
MOVD $0, R0 // make sure it's 0
707-
MOVD $0, R10 // i = 0
708-
709-
// s/JL/JMP/ below to disable the unrolled loop
710-
SUB $4, R3 // n -= 4
711-
BLT v4 // if n < 4 goto v4
712-
713-
U4: // n >= 0
714-
// regular loop body unrolled 4x
715-
MOVD 0(R8)(R10*1), R5
716-
MOVD 8(R8)(R10*1), R6
717-
MOVD 16(R8)(R10*1), R7
718-
MOVD 24(R8)(R10*1), R1
719-
ADDC R4, R5
720-
ADDE R0, R6
721-
ADDE R0, R7
722-
ADDE R0, R1
723-
ADDE R0, R0
724-
MOVD R0, R4 // save CF
725-
SUB R0, R0
726-
MOVD R5, 0(R2)(R10*1)
727-
MOVD R6, 8(R2)(R10*1)
728-
MOVD R7, 16(R2)(R10*1)
729-
MOVD R1, 24(R2)(R10*1)
730-
731-
ADD $32, R10 // i += 4 -> i +=32
732-
SUB $4, R3 // n -= 4
733-
BGE U4 // if n >= 0 goto U4
734-
735-
v4:
736-
ADD $4, R3 // n += 4
737-
BLE E4 // if n <= 0 goto E4
738-
739-
L4: // n > 0
740-
MOVD 0(R8)(R10*1), R5
741-
ADDC R4, R5
742-
ADDE R0, R0
743-
MOVD R0, R4 // save CF
744-
SUB R0, R0
745-
MOVD R5, 0(R2)(R10*1)
746-
747-
ADD $8, R10 // i++
748-
SUB $1, R3 // n--
749-
BGT L4 // if n > 0 goto L4
750-
751-
E4:
752-
MOVD R4, c+56(FP) // return c
586+
// Advance x (R6) and z (R8) by the current byte offset (R12), then copy the remaining words.
587+
// Assuming that x and z either do not overlap at all or point to exactly the same
588+
// memory region, we can use a faster version of copy built only from MVC here.
589+
// The implementation below has three copy loops, copying 1 word, 4 words, and
590+
// 32 words at a time, respectively. Benchmarking showed this to be faster than calling runtime·memmove.
591+
copySetup:
592+
ADD R12, R6
593+
ADD R12, R8
594+
595+
CMPBGE R5, $4, mediumLoop
596+
597+
smallLoop: // does a loop unrolling to copy word when n < 4
598+
CMPBEQ R5, $0, returnZero
599+
MVC $8, 0(R6), 0(R8)
600+
CMPBEQ R5, $1, returnZero
601+
MVC $8, 8(R6), 8(R8)
602+
CMPBEQ R5, $2, returnZero
603+
MVC $8, 16(R6), 16(R8)
604+
605+
returnZero:
606+
MOVD $0, c+56(FP) // return 0 as carry
607+
RET
753608

609+
mediumLoop:
610+
CMPBLT R5, $4, smallLoop
611+
CMPBLT R5, $32, mediumLoopBody
612+
613+
largeLoop: // Copying 256 bytes at a time.
614+
MVC $256, 0(R6), 0(R8)
615+
MOVD $256(R6), R6
616+
MOVD $256(R8), R8
617+
MOVD $-32(R5), R5
618+
CMPBGE R5, $32, largeLoop
619+
BR mediumLoop
620+
621+
mediumLoopBody: // Copying 32 bytes at a time
622+
MVC $32, 0(R6), 0(R8)
623+
MOVD $32(R6), R6
624+
MOVD $32(R8), R8
625+
MOVD $-4(R5), R5
626+
CMPBGE R5, $4, mediumLoopBody
627+
BR smallLoop
628+
629+
returnC:
630+
MOVD R7, c+56(FP)
754631
RET
755632

756633
TEXT ·subVW(SB), NOSPLIT, $0

src/math/big/arith_s390x_test.go

+1-4
Original file line number | Diff line number | Diff line change
@@ -34,10 +34,7 @@ func TestFunVVnovec(t *testing.T) {
3434
func TestFunVWnovec(t *testing.T) {
3535
if hasVX == true {
3636
for _, a := range sumVW {
37-
arg := a
38-
testFunVW(t, "addVW_novec", addVW_novec, arg)
39-
40-
arg = argVW{a.x, a.z, a.y, a.c}
37+
arg := argVW{a.x, a.z, a.y, a.c}
4138
testFunVW(t, "subVW_novec", subVW_novec, arg)
4239
}
4340
}

0 commit comments

Comments
 (0)