@@ -51,15 +51,33 @@ TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
51
51
MOVD b_base + 24 (FP) , R5 // R5 = separator pointer
52
52
MOVD b_len + 32 (FP) , R6 // R6 = separator length
53
53
MOVD $ ret + 48 (FP) , R14 // R14 = & ret
54
- BR indexbody<>(SB)
54
+
55
+ #ifdef GOARCH_ppc64le
56
+ MOVBZ internal∕ cpu ·PPC64 + const_offsetPPC64HasPOWER9(SB) , R7
57
+ CMP R7 , $ 1
58
+ BNE power8
59
+ BR indexbodyp9<>(SB)
60
+
61
+ #endif
62
+ power8:
63
+ BR indexbody<>(SB)
55
64
56
65
// ·IndexString is the string-argument entry point for the substring search.
// It loads the arguments into the registers the shared search bodies expect
// and tail-branches to one of them; the body stores the result through R14.
//
// Register contract handed to the body:
//   R3  = address of the string being searched (a_base)
//   R4  = length of the string (a_len)
//   R5  = address of the separator (b_base)
//   R6  = length of the separator (b_len)
//   R14 = address of the return slot (ret+32(FP))
//
// On ppc64le the POWER9 feature byte is checked at run time: when it is 1 the
// POWER9 body (indexbodyp9) is used, otherwise the generic body (indexbody).
// On other builds the feature check is compiled out and indexbody is always used.
TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
	MOVD a_base+0(FP), R3  // R3 = string
	MOVD a_len+8(FP), R4   // R4 = length
	MOVD b_base+16(FP), R5 // R5 = separator pointer
	MOVD b_len+24(FP), R6  // R6 = separator length
	MOVD $ret+32(FP), R14  // R14 = &ret

#ifdef GOARCH_ppc64le
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = POWER9 feature flag
	CMP   R7, $1
	BNE   power8               // Flag not set: use the generic body
	BR    indexbodyp9<>(SB)    // POWER9: use the POWER9 body, matching ·Index

#endif
power8:
	BR indexbody<>(SB)
63
81
64
82
// s: string we are searching
65
83
// sep: string to search for
@@ -420,3 +438,330 @@ found:
420
438
SUB R3 , R7 // Return byte where found
421
439
MOVD R7 , ( R14 )
422
440
RET
441
+
442
// indexbodyp9 searches the string at R3 (length R4) for the separator at
// R5 (length R6) and stores the result through R14: the byte index of the
// first match found, or -1 when there is none.
// R7 is the current candidate start address within the string.
// Separator lengths 2, 3 and 4 have dedicated vector loops; other lengths up
// to 16 use a byte-stepping masked compare; 17..32 use a two-vector compare.
// Separators longer than 32 bytes are not implemented (see index33plus).
// LASTBYTE, LASTSTR, ONES, SEPMASK and the V*_ names are defined outside this
// block (presumably register aliases -- confirm against the top of the file).
+ TEXT indexbodyp9<>(SB) , NOSPLIT|NOFRAME , $ 0
443
+ CMP R6 , R4 // Compare lengths
444
+ BGT notfound // If sep len is > string , notfound
445
+ ADD R4 , R3 , LASTBYTE // LASTBYTE = &s[len(s)] (one past the final byte)
446
+ SUB R6 , LASTBYTE , LASTSTR // LASTSTR = &s[len(s)-len(sep)] (last valid start address)
447
+ CMP R6 , $ 0 // Check sep len
448
+ BEQ notfound // sep len 0 -- not found
449
+ MOVD R3 , R7 // Copy of string addr
450
+ MOVD $ 16 , R16 // Index value 16
451
+ MOVD $ 17 , R17 // Index value 17
452
+ MOVD $ 18 , R18 // Index value 18
453
+ MOVD $ 1 , R19 // Index value 1
454
+ VSPLTISB $ 0xFF , ONES // Splat all 1s
455
+
456
+ CMP R6 , $ 16 , CR4 // CR4 for len(sep) >= 16
457
+ VOR ONES , ONES , SEPMASK // Set up full SEPMASK
458
+ BGE CR4 , loadge16 // Load for len(sep) >= 16
459
+ SUB R6 , R16 , R9 // 16 - len of sep
460
+ SLD $ 3 , R9 // Bytes * 8 = shift count in bits for VSLO
461
+ MTVSRD R9 , V9_ // Set up for VSLO
462
+ VSLDOI $ 8 , V9 , V9 , V9 // Rotate by 8 bytes (swap doublewords) for VSLO
463
+ VSLO ONES , V9 , SEPMASK // Mask for separator len(sep) < 16
464
+
465
+ loadge16:
466
+ ANDCC $ 15 , R5 , R9 // Find byte offset of sep
467
+ ADD R9 , R6 , R10 // Add sep len
468
+ CMP R10 , $ 16 // Check if sep len + offset >= 16
469
+ BGE sepcross16 // Sep reaches or crosses 16 byte boundary
470
+
471
+ RLDICR $ 0 , R5 , $ 59 , R8 // Adjust addr to 16 byte container
472
+ LXVB16X ( R8 )(R0) , V0_ // Load 16 bytes @ R8 into V0
473
+ SLD $ 3 , R9 // Set up shift count for VSLO
474
+ MTVSRD R9 , V8_ // Set up shift count for VSLO
475
+ VSLDOI $ 8 , V8 , V8 , V8 // Rotate by 8 bytes (swap doublewords) for VSLO
476
+ VSLO V0 , V8 , V0 // Shift by start byte
477
+
478
+ VAND V0 , SEPMASK , V0 // Mask separator (< 16 )
479
+ BR index2plus
480
+
481
+ sepcross16:
482
+ LXVB16X (R5)(R0) , V0_ // Load 16 bytes @R5 into V0
483
+
484
+ VAND V0 , SEPMASK , V0 // mask out separator
485
+ BLE CR4 , index2to16 // len(sep) <= 16: use the generic 2..16 path
486
+ BR index17plus // Handle sep > 16
487
+
488
+ index2plus:
489
+ CMP R6 , $ 2 // Check length of sep
490
+ BNE index3plus // If not 2 , check for 3
491
+ ADD $ 16 , R7 , R9 // Check if next 16 bytes past last
492
+ CMP R9 , LASTBYTE // compare with last
493
+ BGE index2to16 // 2 <= len(string) <= 16
494
+ MOVD $ 0xff00 , R21 // Mask for later
495
+ MTVSRD R21 , V25 // Move to Vreg
496
+ VSPLTH $ 3 , V25 , V31 // Splat mask
497
+ VSPLTH $ 0 , V0 , V1 // Splat 1st 2 bytes of sep
498
+ VSPLTISB $ 0 , V10 // Clear V10
499
+
500
+ // First case: 2 byte separator
501
+ // V1: 2 byte separator splatted
502
+ // V2: 16 bytes at addr
503
+ // V3: 16 bytes at addr + 1
504
+ // Compare 2 byte separator at start
505
+ // and at start + 1 . Use VSEL to combine
506
+ // those results to find the first
507
+ // matching start byte , returning
508
+ // that value when found. Loop as
509
+ // long as len(string) > 16
510
+ index2loop2:
511
+ LXVB16X (R7)(R19) , V3_ // Load 16 bytes @R7 + 1 into V3
512
+
513
+ index2loop:
514
+ LXVB16X (R7)(R0) , V2_ // Load 16 bytes @R7 into V2
515
+ VCMPEQUH V1 , V2 , V5 // Search for sep
516
+ VCMPEQUH V1 , V3 , V6 // Search for sep offset by 1
517
+ VSEL V6 , V5 , V31 , V7 // merge even and odd indices
518
+ VCLZD V7 , V18 // Leading-zero bit count per doubleword = bit index of first match
519
+ MFVSRD V18 , R25 // get first value
520
+ CMP R25 , $ 64 // Found if < 64
521
+ BLT foundR25 // Return byte index where found
522
+
523
+ MFVSRLD V18 , R25 // get second value
524
+ CMP R25 , $ 64 // Found if < 64
525
+ ADD $ 64 , R25 // Add 64 bits for the second doubleword (CMP result is still used below)
526
+ BLT foundR25 // Return value
527
+ ADD $ 16 , R7 // R7 + = 16 Update string pointer
528
+ ADD $ 17 , R7 , R9 // R9 = R7 + 17 since loop unrolled
529
+ CMP R9 , LASTBYTE // Compare addr + 17 against last byte
530
+ BLT index2loop2 // If < last , continue loop
531
+ CMP R7 , LASTBYTE // Compare addr (already advanced by 16) against last byte
532
+ BLT index2to16 // Bytes remain: finish in the generic path
533
+ LXVB16X (R7)(R0) , V3_ // Load 16 bytes @R7 into V3. NOTE(review): this fallthrough looks unreachable since R7 < LASTBYTE whenever the loop runs -- confirm
534
+ VSLDOI $ 1 , V3 , V10 , V3 // Shift left by 1 byte
535
+ BR index2loop
536
+
537
+ index3plus:
538
+ CMP R6 , $ 3 // Check if sep == 3
539
+ BNE index4plus // If not check larger
540
+ ADD $ 19 , R7 , R9 // Find bytes for use in this loop
541
+ CMP R9 , LASTBYTE // Compare against last byte
542
+ BGE index2to16 // len(string) <= 19: use the generic path
543
+ MOVD $ 0xff00 , R21 // Set up mask for upcoming loop
544
+ MTVSRD R21 , V25 // Move mask to Vreg
545
+ VSPLTH $ 3 , V25 , V31 // Splat mask
546
+ VSPLTH $ 0 , V0 , V1 // Splat 1st two bytes of sep
547
+ VSPLTB $ 2 , V0 , V8 // Splat 3rd byte of sep
548
+
549
+ // Loop to process 3 byte separator.
550
+ // string [ 0 : 16 ] is in V2
551
+ // string [ 16 : 18 ] is left-justified in V3 (loaded as string [ 2 : 18 ] , then shifted)
552
+ // sep [ 0 : 2 ] splatted in V1
553
+ // sep [ 2 ] splatted in V8
554
+ // Load vectors at string , string + 1
555
+ // and string + 2 . Compare string , string + 1
556
+ // against first 2 bytes of separator
557
+ // splatted , and string + 2 against 3rd
558
+ // byte splatted. Merge the results with
559
+ // VSEL to find the first byte of a match.
560
+
561
+ // Special handling for last 16 bytes if the
562
+ // string fits in 16 byte multiple.
563
+ index3loop2:
564
+ MOVD $ 2 , R21 // Set up index for 2
565
+ VSPLTISB $ 0 , V10 // Clear V10
566
+ LXVB16X (R7)(R21) , V3_ // Load 16 bytes @R7 + 2 into V3
567
+ VSLDOI $ 14 , V3 , V10 , V3 // Left justify next 2 bytes
568
+
569
+ index3loop:
570
+ LXVB16X (R7)(R0) , V2_ // Load 16 bytes @R7
571
+ VSLDOI $ 1 , V2 , V3 , V4 // string [ 1 : 17 ]
572
+ VSLDOI $ 2 , V2 , V3 , V9 // string [ 2 : 18 ]
573
+ VCMPEQUH V1 , V2 , V5 // compare hw even indices
574
+ VCMPEQUH V1 , V4 , V6 // compare hw odd indices
575
+ VCMPEQUB V8 , V9 , V10 // compare 3rd sep byte against string [ 2 : 18 ]
576
+ VSEL V6 , V5 , V31 , V7 // Find 1st matching byte using mask
577
+ VAND V7 , V10 , V7 // AND matched bytes with matched 3rd byte
578
+ VCLZD V7 , V18 // Find first nonzero indexes
579
+ MFVSRD V18 , R25 // Move 1st doubleword
580
+ CMP R25 , $ 64 // If < 64 found
581
+ BLT foundR25 // Return matching index
582
+
583
+ MFVSRLD V18 , R25 // Move 2nd doubleword
584
+ CMP R25 , $ 64 // If < 64 found
585
+ ADD $ 64 , R25 // Add 64 bits for the second doubleword (CMP result is still used below)
586
+ BLT foundR25 // Return matching index
587
+ ADD $ 16 , R7 // R7 + = 16 string ptr
588
+ ADD $ 19 , R7 , R9 // Number of string bytes for loop
589
+ CMP R9 , LASTBYTE // Compare against last byte of string
590
+ BLT index3loop2 // If within , continue this loop
591
+ CMP R7 , LASTSTR // Compare against last start byte
592
+ BLT index2to16 // Process remainder
593
+ VSPLTISB $ 0 , V3 // Special case for last 16 bytes
594
+ BR index3loop // Continue this loop
595
+
596
+ // Loop to process 4 byte separator
597
+ // string [ 0 : 16 ] in V2
598
+ // string [ 16 : 19 ] left-justified in V3 (loaded as string [ 3 : 19 ] , then shifted)
599
+ // sep [ 0 : 4 ] splatted in V1
600
+ // Set up vectors with strings at offsets
601
+ // 0 , 1 , 2 , 3 and compare against the 4 byte
602
+ // separator also splatted. Use VSEL with the
603
+ // compare results to find the first byte where
604
+ // a separator match is found.
605
+ index4plus:
606
+ CMP R6 , $ 4 // Check if 4 byte separator
607
+ BNE index5plus // If not next higher
608
+ ADD $ 20 , R7 , R9 // Check string size to load
609
+ CMP R9 , LASTBYTE // Verify string length
610
+ BGE index2to16 // If not large enough , process remaining
611
+ MOVD $ 2 , R15 // Set up index. NOTE(review): R15 does not appear to be read again in this function -- confirm
612
+
613
+ // Set up masks for use with VSEL
614
+ MOVD $ 0xff , R21 // Set up mask 0xff000000ff000000 ...
615
+ SLD $ 24 , R21 // R21 = 0xff000000
616
+ MTVSRWS R21 , V29 // Splat the word mask into V29
617
+
618
+ VSLDOI $ 2 , V29 , V29 , V30 // Mask 0x0000ff000000ff00 ...
619
+ MOVD $ 0xffff , R21 // Set up mask 0xffff0000ffff0000 ...
620
+ SLD $ 16 , R21 // R21 = 0xffff0000
621
+ MTVSRWS R21 , V31 // Splat the word mask into V31
622
+
623
+ VSPLTW $ 0 , V0 , V1 // Splat 1st word of separator
624
+
625
+ index4loop:
626
+ LXVB16X (R7)(R0) , V2_ // Load 16 bytes @R7 into V2
627
+
628
+ next4:
629
+ VSPLTISB $ 0 , V10 // Clear
630
+ MOVD $ 3 , R9 // Number of bytes beyond 16
631
+ LXVB16X (R7)( R9 ) , V3_ // Load 16 bytes @R7 + 3 into V3
632
+ VSLDOI $ 13 , V3 , V10 , V3 // Shift left last 3 bytes
633
+ VSLDOI $ 1 , V2 , V3 , V4 // V4=(V2:V3)<< 1
634
+ VSLDOI $ 2 , V2 , V3 , V9 // V9=(V2:V3)<< 2
635
+ VSLDOI $ 3 , V2 , V3 , V10 // V10=(V2:V3)<< 3
636
+ VCMPEQUW V1 , V2 , V5 // compare index 0 , 4 , ... with sep
637
+ VCMPEQUW V1 , V4 , V6 // compare index 1 , 5 , ... with sep
638
+ VCMPEQUW V1 , V9 , V11 // compare index 2 , 6 , ... with sep
639
+ VCMPEQUW V1 , V10 , V12 // compare index 3 , 7 , ... with sep
640
+ VSEL V6 , V5 , V29 , V13 // merge index 0 , 1 , 4 , 5 , using mask
641
+ VSEL V12 , V11 , V30 , V14 // merge index 2 , 3 , 6 , 7 , using mask
642
+ VSEL V14 , V13 , V31 , V7 // final merge
643
+ VCLZD V7 , V18 // Find first index for each half
644
+ MFVSRD V18 , R25 // Isolate value
645
+ CMP R25 , $ 64 // If < 64 , found
646
+ BLT foundR25 // Return found index
647
+
648
+ MFVSRLD V18 , R25 // Isolate other value
649
+ CMP R25 , $ 64 // If < 64 , found
650
+ ADD $ 64 , R25 // Add 64 bits for the second doubleword (CMP result is still used below)
651
+ BLT foundR25 // Return found index
652
+ ADD $ 16 , R7 // R7 + = 16 for next string
653
+ ADD $ 20 , R7 , R9 // R7 + 20 for all bytes to load
654
+ CMP R9 , LASTBYTE // Past end? Maybe check for extra?
655
+ BLT index4loop // If not , continue loop
656
+ CMP R7 , LASTSTR // Check remainder
657
+ BLE index2to16 // Process remainder
658
+ BR notfound // Not found
659
+
660
+ index5plus:
661
+ CMP R6 , $ 16 // Check for sep > 16
662
+ BGT index17plus // Handle large sep
663
+
664
+ // Assumption is that the separator is smaller than the string at this point
665
+ index2to16:
666
+ CMP R7 , LASTSTR // Compare last start byte
667
+ BGT notfound // last takes len(sep) into account
668
+
669
+ ADD $ 16 , R7 , R9 // Check for last byte of string
670
+ CMP R9 , LASTBYTE
671
+ BGT index2to16tail // Fewer than 16 bytes left: use the tail path
672
+
673
+ // At least 16 bytes of string left
674
+ // Mask the number of bytes in sep
675
+ index2to16loop:
676
+ LXVB16X (R7)(R0) , V1_ // Load 16 bytes @R7 into V1
677
+
678
+ compare:
679
+ VAND V1 , SEPMASK , V2 // Mask out sep size
680
+ VCMPEQUBCC V0 , V2 , V3 // Compare masked string
681
+ BLT CR6 , found // All equal
682
+ ADD $ 1 , R7 // Update ptr to next byte
683
+ CMP R7 , LASTSTR // Still less than last start byte
684
+ BGT notfound // Not found
685
+ ADD $ 16 , R7 , R9 // Verify remaining bytes
686
+ CMP R9 , LASTBYTE // At least 16
687
+ BLT index2to16loop // Try again
688
+
689
+ // Less than 16 bytes remaining in string
690
+ // Separator >= 2
691
+ index2to16tail:
692
+ ADD R3 , R4 , R9 // End of string
693
+ SUB R7 , R9 , R9 // Number of bytes left
694
+ ANDCC $ 15 , R7 , R10 // 16 byte offset
695
+ ADD R10 , R9 , R11 // offset + len
696
+ CMP R11 , $ 16 // offset + len > 16 ?
697
+ BLE short // Does not cross 16 bytes
698
+ LXVB16X (R7)(R0) , V1_ // Load 16 bytes @R7 into V1
699
+ BR index2to16next // Continue on
700
+
701
+ short:
702
+ RLDICR $ 0 , R7 , $ 59 , R9 // Adjust addr to 16 byte container
703
+ LXVB16X ( R9 )(R0) , V1_ // Load 16 bytes @ R9 into V1
704
+ SLD $ 3 , R10 // Set up shift
705
+ MTVSRD R10 , V8_ // Set up shift
706
+ VSLDOI $ 8 , V8 , V8 , V8 // Rotate by 8 bytes (swap doublewords) for VSLO
707
+ VSLO V1 , V8 , V1 // Shift by start byte
708
+ VSPLTISB $ 0 , V25 // Clear for later use
709
+
710
+ index2to16next:
711
+ VAND V1 , SEPMASK , V2 // Just compare size of sep
712
+ VCMPEQUBCC V0 , V2 , V3 // Compare sep and partial string
713
+ BLT CR6 , found // Found
714
+ ADD $ 1 , R7 // Not found , try next partial string
715
+ CMP R7 , LASTSTR // Check for end of string
716
+ BGT notfound // If at end , then not found
717
+ VSLDOI $ 1 , V1 , V25 , V1 // Shift string left by 1 byte
718
+ BR index2to16next // Check the next partial string
719
+
720
+ index17plus:
721
+ CMP R6 , $ 32 // Check if 16 < len(sep) <= 32
722
+ BGT index33plus
723
+ SUB $ 16 , R6 , R9 // Extra > 16
724
+ SLD $ 56 , R9 , R10 // Shift to use in VSLO
725
+ MTVSRD R10 , V9_ // Set up for VSLO
726
+ LXVB16X (R5)( R9 ) , V1_ // Load 16 bytes @R5 + R9 into V1
727
+ VSLO V1 , V9 , V1 // Shift left
728
+ VSPLTISB $ 0xff , V7 // Splat 1s
729
+ VSPLTISB $ 0 , V27 // Splat 0
730
+
731
+ index17to32loop:
732
+ LXVB16X (R7)(R0) , V2_ // Load 16 bytes @R7 into V2
733
+
734
+ next17:
735
+ LXVB16X (R7)( R9 ) , V3_ // Load 16 bytes @R7 + R9 into V3
736
+ VSLO V3 , V9 , V3 // Shift left
737
+ VCMPEQUB V0 , V2 , V4 // Compare first 16 bytes
738
+ VCMPEQUB V1 , V3 , V5 // Compare extra over 16 bytes
739
+ VAND V4 , V5 , V6 // Check if both equal
740
+ VCMPEQUBCC V6 , V7 , V8 // All equal?
741
+ BLT CR6 , found // Yes
742
+ ADD $ 1 , R7 // On to next byte
743
+ CMP R7 , LASTSTR // Check if last start byte
744
+ BGT notfound // If too high , not found
745
+ BR index17to32loop // Continue
746
+
747
+ notfound:
748
+ MOVD $ - 1 , R8 // Return - 1 if not found
749
+ MOVD R8 , ( R14 ) // Store -1 through the result pointer
750
+ RET
751
+
752
+ index33plus:
753
+ MOVD $ 0 , (R0) // Case not implemented: store through R0 (expected to hold 0) to fault
754
+ RET // Crash before return
755
+
756
+ foundR25:
757
+ SRD $ 3 , R25 // Convert from bits to bytes
758
+ ADD R25 , R7 // Add to current string address
759
+ SUB R3 , R7 // Subtract from start of string
760
+ MOVD R7 , ( R14 ) // Return byte where found
761
+ RET
762
+
763
+ found:
764
+ SUB R3 , R7 // Return byte where found
765
+ MOVD R7 , ( R14 ) // Store the index through the result pointer
766
+ RET
767
+
0 commit comments