
Commit a29bacc

Removed usage of BMI2 pdep with SSE2 alternative (#19009)
* SSE2 alternative to BMI2 ParallelBitDeposit
* Codegen tuning
* Keep zero vector in register
* Better 64-bit BMI2 alternative
* Removed BMI2
1 parent d5cf36a commit a29bacc
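
For context, the widening trick this commit switches to is the interleave-with-zero idiom: unpacking packed ASCII bytes against a zero vector zero-extends each byte into a 16-bit lane, which is exactly what the old BMI2 ParallelBitDeposit calls with mask 0x00FF00FF produced. The sketch below is not part of the commit; it is a minimal standalone program (the name WidenDemo is illustrative) that runs both variants on the same packed bytes so the equivalence can be checked on hardware with SSE2 and, for the old path, BMI2.

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class WidenDemo
{
    static void Main()
    {
        uint packed = 0x64636261; // the ASCII bytes "abcd" read as a little-endian uint

        // Old path: BMI2 pdep scatters each source byte into the low byte of a
        // 16-bit lane (mask 0x00FF00FF), two deposits per four bytes.
        if (Bmi2.IsSupported)
        {
            uint lo = Bmi2.ParallelBitDeposit(packed, 0x00FF00FFu);        // 'a','b' -> 0x00620061
            uint hi = Bmi2.ParallelBitDeposit(packed >> 16, 0x00FF00FFu);  // 'c','d' -> 0x00640063
            Console.WriteLine($"pdep:   0x{lo:X8} 0x{hi:X8}");
        }

        // New path: move the scalar into an XMM register and interleave it with
        // zeros; every byte is zero-extended to a 16-bit lane by a single unpack.
        if (Sse2.IsSupported)
        {
            Vector128<byte> narrow = Sse2.ConvertScalarToVector128UInt32(packed).AsByte();
            Vector128<ushort> wide = Sse2.UnpackLow(narrow, Vector128<byte>.Zero).AsUInt16();
            Console.WriteLine($"unpack: 0x{wide.GetElement(1):X4}{wide.GetElement(0):X4} " +
                              $"0x{wide.GetElement(3):X4}{wide.GetElement(2):X4}");
        }
    }
}

The diff's "faster on both Intel and AMD" comments reflect that pdep/pext are microcoded and slow on AMD CPUs before Zen 3, while the punpcklbw-style unpack is a single cheap instruction on both vendors.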

File tree

1 file changed: +45 -42 lines changed

src/Shared/ServerInfrastructure/StringUtilities.cs

Lines changed: 45 additions & 42 deletions
@@ -24,22 +24,25 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
             Debug.Assert((long)end >= Vector256<sbyte>.Count);
 
+            // PERF: so the JIT can reuse the zero from a register
+            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
+
             if (Sse2.IsSupported)
             {
                 if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count)
                 {
-                    Vector256<sbyte> zero = Vector256<sbyte>.Zero;
+                    Vector256<sbyte> avxZero = Vector256<sbyte>.Zero;
 
                     do
                     {
                         var vector = Avx.LoadVector256(input).AsSByte();
-                        if (!CheckBytesInAsciiRange(vector, zero))
+                        if (!CheckBytesInAsciiRange(vector, avxZero))
                         {
                             return false;
                         }
 
-                        var tmp0 = Avx2.UnpackLow(vector, zero);
-                        var tmp1 = Avx2.UnpackHigh(vector, zero);
+                        var tmp0 = Avx2.UnpackLow(vector, avxZero);
+                        var tmp1 = Avx2.UnpackHigh(vector, avxZero);
 
                         // Bring into the right order
                         var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
@@ -60,8 +63,6 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
                 if (input <= end - Vector128<sbyte>.Count)
                 {
-                    Vector128<sbyte> zero = Vector128<sbyte>.Zero;
-
                     do
                     {
                         var vector = Sse2.LoadVector128(input).AsSByte();
@@ -122,11 +123,12 @@ out Unsafe.AsRef<Vector<short>>(output),
                     return false;
                 }
 
-                if (Bmi2.X64.IsSupported)
+                // BMI2 could be used, but this variant is faster on both Intel and AMD.
+                if (Sse2.X64.IsSupported)
                 {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
-                    ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
+                    Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
+                    Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                    Sse2.Store((ulong*)output, vecWide);
                 }
                 else
                 {
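
The 64-bit branch above replaces two ParallelBitDeposit calls and two 8-byte stores with one scalar-to-vector move, one unpack, and a single 16-byte store. A minimal sketch of that sequence as a hypothetical standalone helper (the names Widen64Sketch and Widen8 are illustrative, not from the repo; it assumes a 64-bit process with SSE2 available):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class Widen64Sketch
{
    public static unsafe void Widen8(long value, char* output)
    {
        Vector128<sbyte> zero = Vector128<sbyte>.Zero;

        // MOVQ: place the eight packed ASCII bytes in the low half of an XMM register.
        Vector128<sbyte> narrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();

        // PUNPCKLBW against zero: every byte becomes a zero-extended 16-bit lane.
        Vector128<ulong> wide = Sse2.UnpackLow(narrow, zero).AsUInt64();

        // One unaligned 16-byte store writes all eight UTF-16 code units at once.
        Sse2.Store((ulong*)output, wide);
    }
}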
@@ -152,19 +154,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                     return false;
                 }
 
-                if (Bmi2.IsSupported)
-                {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-                }
-                else
-                {
-                    output[0] = (char)input[0];
-                    output[1] = (char)input[1];
-                    output[2] = (char)input[2];
-                    output[3] = (char)input[3];
-                }
+                WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
                 input += sizeof(int);
                 output += sizeof(int);
@@ -181,19 +171,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                     return false;
                 }
 
-                if (Bmi2.IsSupported)
-                {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-                }
-                else
-                {
-                    output[0] = (char)input[0];
-                    output[1] = (char)input[1];
-                    output[2] = (char)input[2];
-                    output[3] = (char)input[3];
-                }
+                WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
                 input += sizeof(int);
                 output += sizeof(int);
@@ -483,6 +461,25 @@ ref Unsafe.Add(ref str, offset),
             return false;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
+        {
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.X64.IsSupported)
+            {
+                Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+            }
+            else
+            {
+                output[0] = (char)input[0];
+                output[1] = (char)input[1];
+                output[2] = (char)input[2];
+                output[3] = (char)input[3];
+            }
+        }
+
         /// <summary>
         /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
         /// compares them to the WORD buffer with machine endianness.
@@ -495,11 +492,13 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta
                 return false;
             }
 
-            if (Bmi2.X64.IsSupported)
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.X64.IsSupported)
             {
-                // BMI2 will work regardless of the processor's endianness.
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
                 return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
+                    Sse2.X64.ConvertToUInt64(vecWide);
             }
             else
             {
@@ -532,11 +531,13 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar
                 return false;
             }
 
-            if (Bmi2.IsSupported)
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.IsSupported)
            {
-                // BMI2 will work regardless of the processor's endianness.
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
                 return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
+                    Sse2.ConvertToUInt32(vecWide);
            }
            else
            {
@@ -665,12 +666,14 @@ private static bool CheckBytesInAsciiRange(long check)
             return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool CheckBytesInAsciiRange(int check)
         {
             const int HighBits = unchecked((int)0x80808080);
             return (((check - 0x01010101) | check) & HighBits) == 0;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool CheckBytesInAsciiRange(short check)
         {
             const short HighBits = unchecked((short)0x8080);
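
The CheckBytesInAsciiRange overloads that now carry AggressiveInlining all use the same SWAR trick: a word passes only when every byte is in 0x01-0x7F. Subtracting 0x01 from each byte sets a byte's high bit when that byte was zero (via the borrow), and OR-ing with the original value keeps the high bit of any byte that was already 0x80 or above. A small hypothetical restatement of the 32-bit overload, with a few worked inputs (not part of the diff):

// Hypothetical standalone version of the 32-bit overload, with sample inputs.
static bool AllBytesInAsciiRange(int check)
{
    const int HighBits = unchecked((int)0x80808080);
    // (check - 0x01010101) flags any 0x00 byte; OR with check flags any byte >= 0x80.
    return (((check - 0x01010101) | check) & HighBits) == 0;
}

// AllBytesInAsciiRange(0x48545450) -> true   bytes 'P','T','T','H', all in 0x01-0x7F
// AllBytesInAsciiRange(0x48545400) -> false  the low byte is 0x00
// AllBytesInAsciiRange(0x485454C3) -> false  0xC3 already has its high bit set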
