Skip to content

Commit 54af9fd

Browse files
committed
internal/bytealg: add power9 version of bytes index
This adds a power9 version of the bytes.Index function for little endian. Here is the improvement on power9 for some of the Index benchmarks: Index/10 -0.14% Index/32 -3.19% Index/4K -12.66% Index/4M -13.34% Index/64M -13.17% Count/10 -0.59% Count/32 -2.88% Count/4K -12.63% Count/4M -13.35% Count/64M -13.17% IndexHard1 -23.03% IndexHard2 -13.01% IndexHard3 -22.12% IndexHard4 +0.16% CountHard1 -23.02% CountHard2 -13.01% CountHard3 -22.12% IndexPeriodic/IndexPeriodic2 -22.85% IndexPeriodic/IndexPeriodic4 -23.15% Change-Id: Id72353e2771eba2efbb1544d5f0be65f8a9f0433 Reviewed-on: https://go-review.googlesource.com/c/go/+/311380 Run-TryBot: Carlos Eduardo Seo <[email protected]> TryBot-Result: Go Bot <[email protected]> Reviewed-by: Carlos Eduardo Seo <[email protected]> Trust: Lynn Boger <[email protected]>
1 parent 122fca4 commit 54af9fd

File tree

3 files changed

+353
-2
lines changed

3 files changed

+353
-2
lines changed

src/internal/bytealg/bytealg.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ const (
1717
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
1818

1919
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
20+
21+
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
2022
)
2123

2224
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.

src/internal/bytealg/index_ppc64x.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@
88

99
package bytealg
1010

11+
import "internal/cpu"
12+
1113
const MaxBruteForce = 16
1214

15+
// SupportsPower9 is a copy of cpu.PPC64.IsPOWER9 taken at package initialization.
var SupportsPower9 = cpu.PPC64.IsPOWER9
16+
1317
func init() {
1418
MaxLen = 32 // the POWER9 assembly body does not implement separators longer than 32 bytes
1519
}

src/internal/bytealg/index_ppc64x.s

Lines changed: 347 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,33 @@ TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
5151
MOVD b_base+24(FP), R5 // R5 = separator pointer
5252
MOVD b_len+32(FP), R6 // R6 = separator length
5353
MOVD $ret+48(FP), R14 // R14 = &ret
54-
BR indexbody<>(SB)
54+
55+
#ifdef GOARCH_ppc64le
56+
MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
57+
CMP R7, $1
58+
BNE power8
59+
BR indexbodyp9<>(SB)
60+
61+
#endif
62+
power8:
63+
BR indexbody<>(SB)
5564

5665
// IndexString loads the string (a) and separator (b) arguments into R3-R6 and
// the address of the result into R14, then branches to the POWER9 search body
// on little-endian POWER9 and to the generic body otherwise.
TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
5766
MOVD a_base+0(FP), R3 // R3 = string
5867
MOVD a_len+8(FP), R4 // R4 = length
5968
MOVD b_base+16(FP), R5 // R5 = separator pointer
6069
MOVD b_len+24(FP), R6 // R6 = separator length
6170
MOVD $ret+32(FP), R14 // R14 = &ret
62-
BR indexbody<>(SB)
71+
72+
#ifdef GOARCH_ppc64le
73+
MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = cpu.PPC64.IsPOWER9
74+
CMP R7, $1 // Is this a POWER9?
75+
BNE power8 // No: use the generic body
76+
BR indexbodyp9<>(SB) // Yes: use the POWER9 body, as ·Index does
77+
78+
#endif
79+
power8:
80+
BR indexbody<>(SB)
6381

6482
// s: string we are searching
6583
// sep: string to search for
@@ -420,3 +438,330 @@ found:
420438
SUB R3, R7 // Return byte where found
421439
MOVD R7, (R14)
422440
RET
441+
442+
// indexbodyp9 is the POWER9 search body branched to from Index.
// On entry: R3 = string, R4 = string length, R5 = separator pointer,
// R6 = separator length, R14 = address where the result is stored.
// It stores the byte index of the first match, or -1 if there is none
// (including when the separator is empty or longer than the string).
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
443+
CMP R6, R4 // Compare lengths
444+
BGT notfound // If sep len is > string, notfound
445+
ADD R4, R3, LASTBYTE // LASTBYTE = &s[len(s)], one past the last byte
446+
SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
447+
CMP R6, $0 // Check sep len
448+
BEQ notfound // sep len 0 -- not found
449+
MOVD R3, R7 // Copy of string addr; R7 is the current search position
450+
MOVD $16, R16 // Index value 16
451+
MOVD $17, R17 // Index value 17; NOTE(review): R17 is not read again in this routine
452+
MOVD $18, R18 // Index value 18; NOTE(review): R18 is not read again in this routine
453+
MOVD $1, R19 // Index value 1
454+
VSPLTISB $0xFF, ONES // splat all 1s
455+
456+
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
457+
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
458+
BGE CR4, loadge16 // Load for len(sep) >= 16
459+
SUB R6, R16, R9 // 16-len of sep
460+
SLD $3, R9 // Set up for VSLO (byte count to bit count)
461+
MTVSRD R9, V9_ // Set up for VSLO
462+
VSLDOI $8, V9, V9, V9 // Set up for VSLO
463+
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
464+
465+
loadge16:
466+
ANDCC $15, R5, R9 // Find byte offset of sep
467+
ADD R9, R6, R10 // Add sep len
468+
CMP R10, $16 // Check if sep len+offset > 16
469+
BGE sepcross16 // Sep crosses 16 byte boundary; NOTE(review): BGE also branches when offset+len == 16, where sep ends exactly on the boundary and the 16 byte load at R5 reads past it -- confirm BGT was not intended
470+
471+
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
472+
LXVB16X (R8)(R0), V0_ // Load 16 bytes @R8 into V0
473+
SLD $3, R9 // Set up shift count for VSLO
474+
MTVSRD R9, V8_ // Set up shift count for VSLO
475+
VSLDOI $8, V8, V8, V8 // Set up shift count for VSLO (rotate V8 by 8 bytes)
476+
VSLO V0, V8, V0 // Shift by start byte
477+
478+
VAND V0, SEPMASK, V0 // Mask separator (< 16)
479+
BR index2plus // Dispatch on len(sep)
480+
481+
sepcross16:
482+
LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
483+
484+
VAND V0, SEPMASK, V0 // mask out separator
485+
BLE CR4, index2to16 // len(sep) <= 16
486+
BR index17plus // Handle sep > 16
487+
488+
index2plus:
489+
CMP R6, $2 // Check length of sep
490+
BNE index3plus // If not 2, check for 3
491+
ADD $16, R7, R9 // Check if next 16 bytes past last
492+
CMP R9, LASTBYTE // compare with last
493+
BGE index2to16 // 2 <= len(string) <= 16
494+
MOVD $0xff00, R21 // Mask for later
495+
MTVSRD R21, V25 // Move to Vreg
496+
VSPLTH $3, V25, V31 // Splat mask
497+
VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
498+
VSPLTISB $0, V10 // Clear V10
499+
500+
// First case: 2 byte separator
501+
// V1: 2 byte separator splatted
502+
// V2: 16 bytes at addr
503+
// V3: 16 bytes at addr+1
504+
// Compare 2 byte separator at start
505+
// and at start+1. Use VSEL to combine
506+
// those results to find the first
507+
// matching start byte, returning
508+
// that value when found. Loop as
509+
// long as len(string) > 16
510+
index2loop2:
511+
LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
512+
513+
index2loop:
514+
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
515+
VCMPEQUH V1, V2, V5 // Search for sep
516+
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
517+
VSEL V6, V5, V31, V7 // merge even and odd indices
518+
VCLZD V7, V18 // find index of first match
519+
MFVSRD V18, R25 // get first value
520+
CMP R25, $64 // Found if < 64
521+
BLT foundR25 // Return byte index where found
522+
523+
MFVSRLD V18, R25 // get second value
524+
CMP R25, $64 // Found if < 64
525+
ADD $64, R25 // Update byte offset; ADD leaves the CMP result intact for the BLT below
526+
BLT foundR25 // Return value
527+
ADD $16, R7 // R7+=16 Update string pointer
528+
ADD $17, R7, R9 // R9=R7+17 since loop unrolled
529+
CMP R9, LASTBYTE // Compare addr+17 against last byte
530+
BLT index2loop2 // If < last, continue loop
531+
CMP R7, LASTBYTE // Compare updated addr against last byte
532+
BLT index2to16 // If < 16 handle specially
533+
LXVB16X (R7)(R0), V3_ // Load 16 bytes @R7 into V3
534+
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
535+
BR index2loop
536+
537+
index3plus:
537+
CMP R6, $3 // Check if sep == 3
538+
BNE index4plus // If not check larger
539+
ADD $19, R7, R9 // Find bytes for use in this loop
540+
CMP R9, LASTBYTE // Compare against last byte
541+
BGE index2to16 // Too few bytes left for this loop, use the generic path
542+
MOVD $0xff00, R21 // Set up mask for upcoming loop
543+
MTVSRD R21, V25 // Move mask to Vreg
544+
VSPLTH $3, V25, V31 // Splat mask
545+
VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
546+
VSPLTB $2, V0, V8 // Splat 3rd byte of sep
547+
548+
// Loop to process 3 byte separator.
549+
// string[0:16] is in V2
550+
// string[16:18], left justified, is in V3
551+
// sep[0:2] splatted in V1
552+
// sep[2] splatted in V8
553+
// Load vectors at string, string+1
554+
// and string+2. Compare string, string+1
555+
// against first 2 bytes of separator
556+
// splatted, and string+2 against 3rd
557+
// byte splatted. Merge the results with
558+
// VSEL to find the first byte of a match.
559+
560+
// Special handling for last 16 bytes if the
561+
// string fits in 16 byte multiple.
562+
index3loop2:
563+
MOVD $2, R21 // Set up index for 2
564+
VSPLTISB $0, V10 // Clear V10
565+
LXVB16X (R7)(R21), V3_ // Load 16 bytes @R7+2 into V3
566+
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
567+
568+
index3loop:
569+
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7
570+
VSLDOI $1, V2, V3, V4 // string[1:17]
571+
VSLDOI $2, V2, V3, V9 // string[2:18]
572+
VCMPEQUH V1, V2, V5 // compare hw even indices
573+
VCMPEQUH V1, V4, V6 // compare hw odd indices
574+
VCMPEQUB V8, V9, V10 // compare 3rd byte of sep against string[2:18]
575+
VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
576+
VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
577+
VCLZD V7, V18 // Find first nonzero indexes
578+
MFVSRD V18, R25 // Move 1st doubleword
579+
CMP R25, $64 // If < 64 found
580+
BLT foundR25 // Return matching index
581+
582+
MFVSRLD V18, R25 // Move 2nd doubleword
583+
CMP R25, $64 // If < 64 found
584+
ADD $64, R25 // Update byte index; ADD leaves the CMP result intact for the BLT below
585+
BLT foundR25 // Return matching index
586+
ADD $16, R7 // R7+=16 string ptr
587+
ADD $19, R7, R9 // Number of string bytes for loop
588+
CMP R9, LASTBYTE // Compare against last byte of string
589+
BLT index3loop2 // If within, continue this loop
590+
CMP R7, LASTSTR // Compare against last start byte
591+
BLT index2to16 // Process remainder
592+
VSPLTISB $0, V3 // Special case for last 16 bytes
593+
BR index3loop // Continue this loop
595+
596+
// Loop to process 4 byte separator
596+
// string[0:16] in V2
597+
// string[16:19], left justified, in V3
598+
// sep[0:4] splatted in V1
599+
// Set up vectors with strings at offsets
600+
// 0, 1, 2, 3 and compare against the 4 byte
601+
// separator also splatted. Use VSEL with the
602+
// compare results to find the first byte where
603+
// a separator match is found.
604+
index4plus:
605+
CMP R6, $4 // Check if 4 byte separator
606+
BNE index5plus // If not next higher
607+
ADD $20, R7, R9 // Check string size to load
608+
CMP R9, LASTBYTE // Verify string length
609+
BGE index2to16 // If not large enough, process remaining
610+
MOVD $2, R15 // Set up index; NOTE(review): R15 is not read again in this routine
611+
612+
// Set up masks for use with VSEL
613+
MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
614+
SLD $24, R21
615+
MTVSRWS R21, V29
616+
617+
VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
618+
MOVD $0xffff, R21 // Set up mask 0xffff0000ffff0000...
619+
SLD $16, R21
620+
MTVSRWS R21, V31
621+
622+
VSPLTW $0, V0, V1 // Splat 1st word of separator
623+
624+
index4loop:
625+
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
626+
627+
next4: // NOTE(review): no branch in this routine targets next4
628+
VSPLTISB $0, V10 // Clear
629+
MOVD $3, R9 // Number of bytes beyond 16
630+
LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7+3 into V3
631+
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
632+
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
633+
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
634+
VSLDOI $3, V2, V3, V10 // V10=(V2:V3)<<3
635+
VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
636+
VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
637+
VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
638+
VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
639+
VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
640+
VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
641+
VSEL V14, V13, V31, V7 // final merge
642+
VCLZD V7, V18 // Find first index for each half
643+
MFVSRD V18, R25 // Isolate value
644+
CMP R25, $64 // If < 64, found
645+
BLT foundR25 // Return found index
646+
647+
MFVSRLD V18, R25 // Isolate other value
648+
CMP R25, $64 // If < 64, found
649+
ADD $64, R25 // Update index for the second doubleword; ADD leaves the CMP result intact for the BLT below
650+
BLT foundR25 // Return found index
651+
ADD $16, R7 // R7+=16 for next string
652+
ADD $20, R7, R9 // R7+20 for all bytes to load
653+
CMP R9, LASTBYTE // Past end? Maybe check for extra?
654+
BLT index4loop // If not, continue loop
655+
CMP R7, LASTSTR // Check remainder
656+
BLE index2to16 // Process remainder
657+
BR notfound // Not found
659+
660+
index5plus:
660+
CMP R6, $16 // Check for sep > 16
661+
BGT index17plus // Handle large sep
662+
663+
// Assumption is that the separator is smaller than the string at this point
664+
index2to16:
665+
CMP R7, LASTSTR // Compare last start byte
666+
BGT notfound // last takes len(sep) into account
667+
668+
ADD $16, R7, R9 // Check for last byte of string
669+
CMP R9, LASTBYTE // Is R7+16 past the end of the string?
670+
BGT index2to16tail // Fewer than 16 bytes left
671+
672+
// At least 16 bytes of string left
673+
// Mask the number of bytes in sep
674+
index2to16loop:
675+
LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
676+
677+
compare: // NOTE(review): no branch in this routine targets compare
678+
VAND V1, SEPMASK, V2 // Mask out sep size
679+
VCMPEQUBCC V0, V2, V3 // Compare masked string
680+
BLT CR6, found // All equal
681+
ADD $1, R7 // Update ptr to next byte
682+
CMP R7, LASTSTR // Still less than last start byte
683+
BGT notfound // Not found
684+
ADD $16, R7, R9 // Verify remaining bytes
685+
CMP R9, LASTBYTE // At least 16
686+
BLT index2to16loop // Try again
687+
688+
// Less than 16 bytes remaining in string
689+
// Separator >= 2
690+
index2to16tail:
691+
ADD R3, R4, R9 // End of string
692+
SUB R7, R9, R9 // Number of bytes left
693+
ANDCC $15, R7, R10 // 16 byte offset
694+
ADD R10, R9, R11 // offset + len
695+
CMP R11, $16 // > 16?
696+
BLE short // Does not cross 16 bytes
697+
LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
698+
BR index2to16next // Continue on
699+
700+
short:
701+
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
702+
LXVB16X (R9)(R0), V1_ // Load 16 bytes @R9 into V1
703+
SLD $3, R10 // Set up shift
704+
MTVSRD R10, V8_ // Set up shift
705+
VSLDOI $8, V8, V8, V8 // Set up shift (rotate V8 by 8 bytes)
706+
VSLO V1, V8, V1 // Shift by start byte
707+
VSPLTISB $0, V25 // Clear for later use
708+
709+
index2to16next:
710+
VAND V1, SEPMASK, V2 // Just compare size of sep
711+
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
712+
BLT CR6, found // Found
713+
ADD $1, R7 // Not found, try next partial string
714+
CMP R7, LASTSTR // Check for end of string
715+
BGT notfound // If at end, then not found
716+
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
717+
BR index2to16next // Check the next partial string
719+
720+
index17plus:
720+
CMP R6, $32 // Check if 16 < len(sep) <= 32
721+
BGT index33plus // len(sep) > 32 is not implemented
722+
SUB $16, R6, R9 // Extra > 16
723+
SLD $56, R9, R10 // Shift to use in VSLO
724+
MTVSRD R10, V9_ // Set up for VSLO
725+
LXVB16X (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
726+
VSLO V1, V9, V1 // Shift left
727+
VSPLTISB $0xff, V7 // Splat 1s
728+
VSPLTISB $0, V27 // Splat 0
729+
730+
index17to32loop:
731+
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
732+
733+
next17: // NOTE(review): no branch in this routine targets next17
734+
LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7+R9 into V3
735+
VSLO V3, V9, V3 // Shift left
736+
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
737+
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
738+
VAND V4, V5, V6 // Check if both equal
739+
VCMPEQUBCC V6, V7, V8 // All equal?
740+
BLT CR6, found // Yes
741+
ADD $1, R7 // On to next byte
742+
CMP R7, LASTSTR // Check if last start byte
743+
BGT notfound // If too high, not found
744+
BR index17to32loop // Continue
745+
746+
notfound:
747+
MOVD $-1, R8 // Return -1 if not found
748+
MOVD R8, (R14) // Store the result at the address in R14
749+
RET
750+
751+
index33plus:
752+
MOVD $0, (R0) // Case not implemented
753+
RET // Crash before return
754+
755+
foundR25:
756+
SRD $3, R25 // Convert from bits to bytes
757+
ADD R25, R7 // Add to current string address
758+
SUB R3, R7 // Subtract start of string to get the byte index
759+
MOVD R7, (R14) // Return byte where found
760+
RET
761+
762+
found:
763+
SUB R3, R7 // Return byte where found
764+
MOVD R7, (R14) // Store the result at the address in R14
765+
RET
767+

0 commit comments

Comments
 (0)