@@ -541,216 +541,93 @@ E1:
541
541
RET
542
542
543
543
TEXT ·addVW(SB) , NOSPLIT , $ 0
544
- MOVD addwvectorfacility + 0x00 (SB) , R1
545
- BR (R1)
546
-
547
- TEXT ·addVW_check(SB) , NOSPLIT , $ 0
548
- MOVB ·hasVX(SB) , R1
549
- CMPBEQ R1 , $ 1 , vectorimpl // vectorfacility = 1 , vector supported
550
- MOVD $ addwvectorfacility + 0x00 (SB) , R1
551
- MOVD $·addVW_novec(SB) , R2
552
- MOVD R2 , 0 (R1)
553
-
554
- // MOVD $·addVW_novec(SB) , 0 (R1)
555
- BR ·addVW_novec(SB)
556
-
557
- vectorimpl:
558
- MOVD $ addwvectorfacility + 0x00 (SB) , R1
559
- MOVD $·addVW_vec(SB) , R2
560
- MOVD R2 , 0 (R1)
561
-
562
- // MOVD $·addVW_vec(SB) , 0 (R1)
563
- BR ·addVW_vec(SB)
564
-
565
- GLOBL addwvectorfacility + 0x00 (SB) , NOPTR , $ 8
566
- DATA addwvectorfacility + 0x00 (SB)/ 8 , $·addVW_check(SB)
567
-
568
- // func addVW_vec(z , x [] Word , y Word) (c Word)
569
- TEXT ·addVW_vec(SB) , NOSPLIT , $ 0
570
- MOVD z_len + 8 (FP) , R3
571
- MOVD x + 24 (FP) , R8
572
- MOVD y + 48 (FP) , R4 // c = y
573
- MOVD z + 0 (FP) , R2
574
-
575
- MOVD $ 0 , R0 // make sure it's zero
576
- MOVD $ 0 , R10 // i = 0
577
- MOVD R8 , R5
578
- MOVD R2 , R7
579
-
580
- // s/ JL / JMP / below to disable the unrolled loop
581
- SUB $ 4 , R3 // n - = 4
582
- BLT v10 // if n < 0 goto v10
583
- SUB $ 12 , R3
584
- BLT A10
585
-
586
- // n >= 0
587
- // regular loop body unrolled 16x
588
-
589
- VZERO V0 // prepare V0 to be final carry register
590
- VZERO V9 // to ensure upper half is zero
591
- VLVGG $ 1 , R4 , V9
592
-
593
- UU1:
594
- VLM 0 (R5) , V1 , V4 // 64 - bytes into V1..V4
595
- ADD $ 64 , R5
596
- VPDI $ 0x4 , V1 , V1 , V1 // flip the doublewords to big - endian order
597
- VPDI $ 0x4 , V2 , V2 , V2 // flip the doublewords to big - endian order
598
-
599
- VACCCQ V1 , V9 , V0 , V25
600
- VACQ V1 , V9 , V0 , V17
601
- VZERO V9
602
- VACCCQ V2 , V9 , V25 , V26
603
- VACQ V2 , V9 , V25 , V18
604
-
605
- VLM 0 (R5) , V5 , V6 // 32 - bytes into V5..V6
606
- ADD $ 32 , R5
607
-
608
- VPDI $ 0x4 , V3 , V3 , V3 // flip the doublewords to big - endian order
609
- VPDI $ 0x4 , V4 , V4 , V4 // flip the doublewords to big - endian order
610
-
611
- VACCCQ V3 , V9 , V26 , V27
612
- VACQ V3 , V9 , V26 , V19
613
- VACCCQ V4 , V9 , V27 , V28
614
- VACQ V4 , V9 , V27 , V20
615
-
616
- VLM 0 (R5) , V7 , V8 // 32 - bytes into V7..V8
617
- ADD $ 32 , R5
618
-
619
- VPDI $ 0x4 , V5 , V5 , V5 // flip the doublewords to big - endian order
620
- VPDI $ 0x4 , V6 , V6 , V6 // flip the doublewords to big - endian order
621
-
622
- VACCCQ V5 , V9 , V28 , V29
623
- VACQ V5 , V9 , V28 , V21
624
- VACCCQ V6 , V9 , V29 , V30
625
- VACQ V6 , V9 , V29 , V22
626
-
627
- VPDI $ 0x4 , V7 , V7 , V7 // flip the doublewords to big - endian order
628
- VPDI $ 0x4 , V8 , V8 , V8 // flip the doublewords to big - endian order
629
-
630
- VACCCQ V7 , V9 , V30 , V31
631
- VACQ V7 , V9 , V30 , V23
632
- VACCCQ V8 , V9 , V31 , V0 // V0 has carry - over
633
- VACQ V8 , V9 , V31 , V24
634
-
635
- VPDI $ 0x4 , V17 , V17 , V17 // flip the doublewords to big - endian order
636
- VPDI $ 0x4 , V18 , V18 , V18 // flip the doublewords to big - endian order
637
- VPDI $ 0x4 , V19 , V19 , V19 // flip the doublewords to big - endian order
638
- VPDI $ 0x4 , V20 , V20 , V20 // flip the doublewords to big - endian order
639
- VPDI $ 0x4 , V21 , V21 , V21 // flip the doublewords to big - endian order
640
- VPDI $ 0x4 , V22 , V22 , V22 // flip the doublewords to big - endian order
641
- VPDI $ 0x4 , V23 , V23 , V23 // flip the doublewords to big - endian order
642
- VPDI $ 0x4 , V24 , V24 , V24 // flip the doublewords to big - endian order
643
- VSTM V17 , V24 , 0 (R7) // 128 - bytes into z
644
- ADD $ 128 , R7
645
- ADD $ 128 , R10 // i + = 16
646
- SUB $ 16 , R3 // n - = 16
647
- BGE UU1 // if n >= 0 goto U1
648
- VLGVG $ 1 , V0 , R4 // put cf into R4 in case we branch to v10
649
-
650
- A10:
651
- ADD $ 12 , R3 // n + = 16
652
-
653
- // s/ JL / JMP / below to disable the unrolled loop
654
-
655
- BLT v10 // if n < 0 goto v10
656
-
657
- U4: // n >= 0
658
- // regular loop body unrolled 4x
659
- MOVD 0 ( R8 )( R10 * 1 ) , R5
660
- MOVD 8 ( R8 )( R10 * 1 ) , R6
661
- MOVD 16 ( R8 )( R10 * 1 ) , R7
662
- MOVD 24 ( R8 )( R10 * 1 ) , R1
663
- ADDC R4 , R5
664
- ADDE R0 , R6
665
- ADDE R0 , R7
666
- ADDE R0 , R1
667
- ADDE R0 , R0
668
- MOVD R0 , R4 // save CF
669
- SUB R0 , R0
670
- MOVD R5 , 0 (R2)( R10 * 1 )
671
- MOVD R6 , 8 (R2)( R10 * 1 )
672
- MOVD R7 , 16 (R2)( R10 * 1 )
673
- MOVD R1 , 24 (R2)( R10 * 1 )
674
-
675
- ADD $ 32 , R10 // i + = 4 - > i + = 32
676
- SUB $ 4 , R3 // n - = 4
677
- BGE U4 // if n >= 0 goto U4
678
-
679
- v10:
680
- ADD $ 4 , R3 // n + = 4
681
- BLE E10 // if n <= 0 goto E4
682
-
683
- L4: // n > 0
684
- MOVD 0 ( R8 )( R10 * 1 ) , R5
685
- ADDC R4 , R5
544
+ MOVD z_len + 8 (FP) , R5 // R5 = n = len(z)
545
+ MOVD x + 24 (FP) , R6 // R6 = address of x[0] (used as the load base below)
546
+ MOVD y + 48 (FP) , R7 // R7 = c = y
547
+ MOVD z + 0 (FP) , R8 // R8 = address of z[0] (used as the store base below)
548
+
549
+ CMPBEQ R5 , $ 0 , returnC // len(z) == 0: early return; returnC stores y (R7) as the result
550
+
551
+ // Add y to x[0] and propagate the carry into x[1]; the carry flag left by these additions decides between the copy path (carry = 0) and the add loop (carry = 1).
552
+ ADDC 0 (R6) , R7 // R7 = x[0] + y, sets the carry flag
553
+ MOVD R7 , 0 ( R8 ) // z[0] = x[0] + y
554
+ CMPBEQ R5 , $ 1 , returnResult // len(z) == 1: only the carry is left to return
555
+ MOVD $ 0 , R9
556
+ ADDE 8 (R6) , R9 // R9 = x[1] + carry
557
+ MOVD R9 , 8 ( R8 ) // z[1] = x[1] + carry
558
+ CMPBEQ R5 , $ 2 , returnResult // len(z) == 2: only the carry is left to return
559
+
560
+ // Two words are done: set up the byte offset and the remaining word count. NOTE(review): the carry from the ADDE above is tested by the BRC below, so these two instructions presumably must leave the condition code untouched - confirm.
561
+ MOVD $ 16 , R12 // i = 2, kept as a byte offset (2 words * 8 bytes)
562
+ MOVD $ - 2 (R5) , R5 // n = n - 2 (words still to process)
563
+
564
+ loopOverEachWord:
565
+ BRC $ 12 , copySetup // carry = 0: nothing left to add, copy the rest of x into z
566
+ MOVD $ 1 , R9 // R9 = 1, the carry-in that is known to be set here
567
+
568
+ // Originally the carry flag generated in the previous iteration was consumed
569
+ // directly (i.e. ADDE did the addition). Since reaching this point means the carry
570
+ // is known to be 1 (otherwise the BRC above went to the copy section), the constant 1 is
571
+ // added with ADDC instead, so the addition itself does not take the previous iteration's
572
+ // carry flag as an input. This could be useful when branch prediction happens.
573
+ ADDC 0 (R6)( R12 * 1 ) , R9 // R9 = x[i] + 1, sets the carry flag tested by the next BRC
574
+ MOVD R9 , 0 ( R8 )( R12 * 1 ) // z[i] = x[i] + c
575
+
576
+ MOVD $ 8 ( R12 ) , R12 // i++ (byte offset advances by one 8-byte word)
577
+ BRCTG R5 , loopOverEachWord // n--; keep looping while n != 0
578
+
579
+ // Turn the current carry flag into a 0/1 value and return it
580
+ returnResult:
581
+ MOVD $ 0 , R0
686
582
ADDE R0 , R0 // R0 = 0 + 0 + carry
687
- MOVD R0 , R4 // save CF
688
- SUB R0 , R0
689
- MOVD R5 , 0 (R2)( R10 * 1 )
690
-
691
- ADD $ 8 , R10 // i ++
692
- SUB $ 1 , R3 // n --
693
- BGT L4 // if n > 0 goto L4
694
-
695
- E10:
696
- MOVD R4 , c + 56 (FP) // return c
697
-
583
+ MOVD R0 , c + 56 (FP) // return c
698
584
RET
699
585
700
- TEXT ·addVW_novec(SB) , NOSPLIT , $ 0
701
- // DI = R3 , CX = R4 , SI = r10 , r8 = r8 , r10 = r2 , r11 = r5 , r12 = r6 , r13 = r7 , r14 = r1 (R0 set to 0 )
702
- MOVD z_len + 8 (FP) , R3
703
- MOVD x + 24 (FP) , R8
704
- MOVD y + 48 (FP) , R4 // c = y
705
- MOVD z + 0 (FP) , R2
706
- MOVD $ 0 , R0 // make sure it's 0
707
- MOVD $ 0 , R10 // i = 0
708
-
709
- // s/ JL / JMP / below to disable the unrolled loop
710
- SUB $ 4 , R3 // n - = 4
711
- BLT v4 // if n < 4 goto v4
712
-
713
- U4: // n >= 0
714
- // regular loop body unrolled 4x
715
- MOVD 0 ( R8 )( R10 * 1 ) , R5
716
- MOVD 8 ( R8 )( R10 * 1 ) , R6
717
- MOVD 16 ( R8 )( R10 * 1 ) , R7
718
- MOVD 24 ( R8 )( R10 * 1 ) , R1
719
- ADDC R4 , R5
720
- ADDE R0 , R6
721
- ADDE R0 , R7
722
- ADDE R0 , R1
723
- ADDE R0 , R0
724
- MOVD R0 , R4 // save CF
725
- SUB R0 , R0
726
- MOVD R5 , 0 (R2)( R10 * 1 )
727
- MOVD R6 , 8 (R2)( R10 * 1 )
728
- MOVD R7 , 16 (R2)( R10 * 1 )
729
- MOVD R1 , 24 (R2)( R10 * 1 )
730
-
731
- ADD $ 32 , R10 // i + = 4 - > i + = 32
732
- SUB $ 4 , R3 // n - = 4
733
- BGE U4 // if n >= 0 goto U4
734
-
735
- v4:
736
- ADD $ 4 , R3 // n + = 4
737
- BLE E4 // if n <= 0 goto E4
738
-
739
- L4: // n > 0
740
- MOVD 0 ( R8 )( R10 * 1 ) , R5
741
- ADDC R4 , R5
742
- ADDE R0 , R0
743
- MOVD R0 , R4 // save CF
744
- SUB R0 , R0
745
- MOVD R5 , 0 (R2)( R10 * 1 )
746
-
747
- ADD $ 8 , R10 // i ++
748
- SUB $ 1 , R3 // n --
749
- BGT L4 // if n > 0 goto L4
750
-
751
- E4:
752
- MOVD R4 , c + 56 (FP) // return c
586
+ // Advance x (R6) and z (R8) by the current byte offset (R12), then copy the remaining n (R5) words.
587
+ // Assuming that x and z either do not overlap at all or point to exactly the
588
+ // same memory region, a faster copy that uses only MVC is sufficient here.
589
+ // The implementation below has three copy loops, copying one word, 4 words, and
590
+ // 32 words at a time. Benchmarking showed this to be faster than calling runtime·memmove.
591
+ copySetup:
592
+ ADD R12 , R6 // R6 = address of x[i]
593
+ ADD R12 , R8 // R8 = address of z[i]
594
+
595
+ CMPBGE R5 , $ 4 , mediumLoop // at least 4 words left: use the chunked loops
596
+
597
+ smallLoop: // unrolled tail: copies the remaining n < 4 words one word (8 bytes) at a time
598
+ CMPBEQ R5 , $ 0 , returnZero
599
+ MVC $ 8 , 0 (R6) , 0 ( R8 ) // copy 1st remaining word from x to z
600
+ CMPBEQ R5 , $ 1 , returnZero
601
+ MVC $ 8 , 8 (R6) , 8 ( R8 ) // copy 2nd remaining word
602
+ CMPBEQ R5 , $ 2 , returnZero
603
+ MVC $ 8 , 16 (R6) , 16 ( R8 ) // n == 3: copy 3rd remaining word
604
+
605
+ returnZero:
606
+ MOVD $ 0 , c + 56 (FP) // return 0 as carry (the copy path is only entered with carry = 0)
607
+ RET
753
608
609
+ mediumLoop:
610
+ CMPBLT R5 , $ 4 , smallLoop // fewer than 4 words left: finish in the unrolled tail
611
+ CMPBLT R5 , $ 32 , mediumLoopBody // fewer than 32 words left: copy 4 words at a time
612
+
613
+ largeLoop: // copies 32 words (256 bytes) per iteration
614
+ MVC $ 256 , 0 (R6) , 0 ( R8 )
615
+ MOVD $ 256 (R6) , R6 // advance x by 256 bytes
616
+ MOVD $ 256 ( R8 ) , R8 // advance z by 256 bytes
617
+ MOVD $ - 32 (R5) , R5 // n -= 32
618
+ CMPBGE R5 , $ 32 , largeLoop
619
+ BR mediumLoop // fewer than 32 words left: re-dispatch on the remaining count
620
+
621
+ mediumLoopBody: // copies 4 words (32 bytes) per iteration
622
+ MVC $ 32 , 0 (R6) , 0 ( R8 )
623
+ MOVD $ 32 (R6) , R6 // advance x by 32 bytes
624
+ MOVD $ 32 ( R8 ) , R8 // advance z by 32 bytes
625
+ MOVD $ - 4 (R5) , R5 // n -= 4
626
+ CMPBGE R5 , $ 4 , mediumLoopBody
627
+ BR smallLoop // fewer than 4 words left
628
+
629
+ returnC:
630
+ MOVD R7 , c + 56 (FP) // len(z) == 0: R7 still holds y, which is returned as c
754
631
RET
755
632
756
633
TEXT ·subVW(SB) , NOSPLIT , $ 0
0 commit comments