@@ -24,22 +24,25 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count

             Debug.Assert((long)end >= Vector256<sbyte>.Count);

+            // PERF: so the JIT can reuse the zero from a register
+            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
+
             if (Sse2.IsSupported)
             {
                 if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count)
                 {
-                    Vector256<sbyte> zero = Vector256<sbyte>.Zero;
+                    Vector256<sbyte> avxZero = Vector256<sbyte>.Zero;

                     do
                     {
                         var vector = Avx.LoadVector256(input).AsSByte();
-                        if (!CheckBytesInAsciiRange(vector, zero))
+                        if (!CheckBytesInAsciiRange(vector, avxZero))
                         {
                             return false;
                         }

-                        var tmp0 = Avx2.UnpackLow(vector, zero);
-                        var tmp1 = Avx2.UnpackHigh(vector, zero);
+                        var tmp0 = Avx2.UnpackLow(vector, avxZero);
+                        var tmp1 = Avx2.UnpackHigh(vector, avxZero);

                         // Bring into the right order
                         var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
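A note on why the Permute2x128 step exists: Avx2.UnpackLow/UnpackHigh interleave the source bytes with zero within each 128-bit lane, which zero-extends them to 16-bit values but leaves the two lanes' results out of sequence; Permute2x128 with the 0x20 and 0x31 selectors stitches them back into input order. A minimal standalone sketch of that behavior (the demo class, buffers, and sample data are mine, not part of the change; compile with unsafe enabled):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class Avx2WidenDemo
    {
        static unsafe void Main()
        {
            if (!Avx2.IsSupported)
            {
                return;
            }

            byte* input = stackalloc byte[32];
            for (int i = 0; i < 32; i++)
            {
                input[i] = (byte)('A' + i); // 32 ASCII bytes in ascending order
            }
            char* output = stackalloc char[32];

            Vector256<sbyte> zero = Vector256<sbyte>.Zero;
            var vector = Avx.LoadVector256(input).AsSByte();

            // UnpackLow/UnpackHigh interleave with zero per 128-bit lane, so the widened
            // halves come out as [bytes 0..7 | 16..23] and [bytes 8..15 | 24..31].
            var tmp0 = Avx2.UnpackLow(vector, zero);
            var tmp1 = Avx2.UnpackHigh(vector, zero);

            // Permute2x128 recombines the lanes into sequential order: chars 0..15 and 16..31.
            var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
            var out1 = Avx2.Permute2x128(tmp0, tmp1, 0x31);

            Avx.Store((ushort*)output, out0.AsUInt16());
            Avx.Store((ushort*)output + Vector256<ushort>.Count, out1.AsUInt16());

            Console.WriteLine(new string(output, 0, 32)); // the 32 bytes appear as chars, still in order
        }
    }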
@@ -60,8 +63,6 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count

                 if (input <= end - Vector128<sbyte>.Count)
                 {
-                    Vector128<sbyte> zero = Vector128<sbyte>.Zero;
-
                     do
                     {
                         var vector = Sse2.LoadVector128(input).AsSByte();
@@ -122,11 +123,12 @@ out Unsafe.AsRef<Vector<short>>(output),
                     return false;
                 }

-                if (Bmi2.X64.IsSupported)
+                // BMI2 could be used, but this variant is faster on both Intel and AMD.
+                if (Sse2.X64.IsSupported)
                 {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
-                    ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
+                    Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
+                    Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                    Sse2.Store((ulong*)output, vecWide);
                 }
                 else
                 {
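The replacement above is the core of the change: instead of two pdep instructions, the eight ASCII bytes are moved into an XMM register (ConvertScalarToVector128Int64), interleaved with zero bytes (UnpackLow), which on little-endian x86 is exactly a zero-extension to UTF-16, and stored as sixteen bytes; roughly a movq/punpcklbw/movdqu sequence. A small sketch comparing it against the removed BMI2 path (sample value and names are mine; needs a CPU with both SSE2 and BMI2 and unsafe compilation):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class WidenEightBytesDemo
    {
        static unsafe void Main()
        {
            if (!Sse2.X64.IsSupported || !Bmi2.X64.IsSupported)
            {
                return;
            }

            // Eight ASCII bytes packed into a long, as TryGetAsciiString reads them.
            byte[] ascii = { (byte)'K', (byte)'e', (byte)'s', (byte)'t', (byte)'r', (byte)'e', (byte)'l', (byte)'!' };
            long value = BitConverter.ToInt64(ascii, 0);

            // New path: widen via SSE2 unpack against zero.
            char* sseOut = stackalloc char[8];
            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
            Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
            Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
            Sse2.Store((ulong*)sseOut, vecWide);

            // Old path: PDEP scatters each source byte into the low byte of a 16-bit slot.
            char* pdepOut = stackalloc char[8];
            ((ulong*)pdepOut)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
            ((ulong*)pdepOut)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);

            Console.WriteLine(new string(sseOut, 0, 8));  // Kestrel!
            Console.WriteLine(new string(pdepOut, 0, 8)); // Kestrel!
        }
    }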
@@ -152,19 +154,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                    return false;
                }

-               if (Bmi2.IsSupported)
-               {
-                   // BMI2 will work regardless of the processor's endianness.
-                   ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                   ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-               }
-               else
-               {
-                   output[0] = (char)input[0];
-                   output[1] = (char)input[1];
-                   output[2] = (char)input[2];
-                   output[3] = (char)input[3];
-               }
+               WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);

                input += sizeof(int);
                output += sizeof(int);
@@ -181,19 +171,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                    return false;
                }

-               if (Bmi2.IsSupported)
-               {
-                   // BMI2 will work regardless of the processor's endianness.
-                   ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                   ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-               }
-               else
-               {
-                   output[0] = (char)input[0];
-                   output[1] = (char)input[1];
-                   output[2] = (char)input[2];
-                   output[3] = (char)input[3];
-               }
+               WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);

                input += sizeof(int);
                output += sizeof(int);
@@ -483,6 +461,25 @@ ref Unsafe.Add(ref str, offset),
            return false;
        }

+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
+       {
+           // BMI2 could be used, but this variant is faster on both Intel and AMD.
+           if (Sse2.X64.IsSupported)
+           {
+               Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
+               Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+               Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+           }
+           else
+           {
+               output[0] = (char)input[0];
+               output[1] = (char)input[1];
+               output[2] = (char)input[2];
+               output[3] = (char)input[3];
+           }
+       }
+
        /// <summary>
        /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
        /// compares them to the WORD buffer with machine endianness.
@@ -495,11 +492,13 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta
                return false;
            }

-           if (Bmi2.X64.IsSupported)
+           // BMI2 could be used, but this variant is faster on both Intel and AMD.
+           if (Sse2.X64.IsSupported)
            {
-               // BMI2 will work regardless of the processor's endianness.
+               Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+               Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                   Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
+                   Sse2.X64.ConvertToUInt64(vecWide);
            }
            else
            {
@@ -532,11 +531,13 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar
                return false;
            }

-           if (Bmi2.IsSupported)
+           // BMI2 could be used, but this variant is faster on both Intel and AMD.
+           if (Sse2.IsSupported)
            {
-               // BMI2 will work regardless of the processor's endianness.
+               Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+               Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                   Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
+                   Sse2.ConvertToUInt32(vecWide);
            }
            else
            {
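The two-byte variant follows the same pattern at 32-bit width: the two ASCII bytes are widened to two UTF-16 chars in an XMM register and the low 32 bits are compared against an unaligned read of the target chars. A tiny sketch of that equivalence (sample string and names are mine, not from the PR):

    using System;
    using System.Runtime.CompilerServices;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class CompareTwoBytesDemo
    {
        static void Main()
        {
            if (!Sse2.IsSupported)
            {
                return;
            }

            // Two ASCII bytes packed into the low 16 bits, as the caller builds "value".
            uint value = (uint)'h' | ((uint)'i' << 8);

            // Widen the bytes 'h','i' to the UTF-16 chars 'h','i' in the low 32 bits.
            Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
            Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
            uint widened = Sse2.ConvertToUInt32(vecWide);

            // Reading the first two chars of "hi" as one unaligned uint yields the same bits.
            char[] chars = "hi".ToCharArray();
            uint fromChars = Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref chars[0]));

            Console.WriteLine(widened == fromChars); // True
        }
    }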
@@ -665,12 +666,14 @@ private static bool CheckBytesInAsciiRange(long check)
            return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
        }

+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool CheckBytesInAsciiRange(int check)
        {
            const int HighBits = unchecked((int)0x80808080);
            return (((check - 0x01010101) | check) & HighBits) == 0;
        }

+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool CheckBytesInAsciiRange(short check)
        {
            const short HighBits = unchecked((short)0x8080);
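For reference, the range check these overloads share is the classic word-at-a-time trick for bytes in [0x01, 0x7F]: a byte at or above 0x80 keeps its high bit through the OR, and a 0x00 byte underflows when 0x01 is subtracted from every byte position, so in either case a high bit survives the mask and the method returns false. A small self-contained demo (the sample inputs are mine):

    using System;

    class AsciiRangeCheckDemo
    {
        // Same body as the int overload above: true iff every byte is in 0x01..0x7F.
        static bool CheckBytesInAsciiRange(int check)
        {
            const int HighBits = unchecked((int)0x80808080);
            return (((check - 0x01010101) | check) & HighBits) == 0;
        }

        static void Main()
        {
            Console.WriteLine(CheckBytesInAsciiRange(0x7F654321)); // True:  all bytes in 0x01..0x7F
            Console.WriteLine(CheckBytesInAsciiRange(0x7F654300)); // False: contains a 0x00 byte
            Console.WriteLine(CheckBytesInAsciiRange(0x7F654380)); // False: contains a byte >= 0x80
        }
    }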