From 2821a198f117bc30427d84cb7f014f3bf1e87223 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?=
Date: Tue, 11 Feb 2020 18:21:44 +0100
Subject: [PATCH 1/5] SSE2 alternative to BMI2 ParallelBitDeposit

---
 .../ServerInfrastructure/StringUtilities.cs | 88 +++++++++++++------
 1 file changed, 62 insertions(+), 26 deletions(-)

diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index 856c3693e844..9e3c221b7812 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -122,12 +122,25 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
+#if USE_BMI2
             if (Bmi2.X64.IsSupported)
             {
                 // BMI2 will work regardless of the processor's endianness.
                 ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
                 ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
             }
+#else
+            if (Sse2.X64.IsSupported)
+            {
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128Int32((int)value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+
+                vecNarrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsByte();
+                vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Unsafe.WriteUnaligned(output+sizeof(int), Sse2.X64.ConvertToUInt64(vecWide));
+            }
+#endif
             else
             {
                 output[0] = (char)input[0];
@@ -152,19 +165,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
-            if (Bmi2.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-            }
-            else
-            {
-                output[0] = (char)input[0];
-                output[1] = (char)input[1];
-                output[2] = (char)input[2];
-                output[3] = (char)input[3];
-            }
+            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value);
 
             input += sizeof(int);
             output += sizeof(int);
@@ -181,19 +182,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
-            if (Bmi2.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-            }
-            else
-            {
-                output[0] = (char)input[0];
-                output[1] = (char)input[1];
-                output[2] = (char)input[2];
-                output[3] = (char)input[3];
-            }
+            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value);
 
             input += sizeof(int);
             output += sizeof(int);
@@ -378,6 +367,33 @@ ref Unsafe.Add(ref str, offset),
             return false;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value)
+        {
+#if USE_BMI2
+            if (Bmi2.IsSupported)
+            {
+                // BMI2 will work regardless of the processor's endianness.
+                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
+                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
+            }
+#else
+            if (Sse2.X64.IsSupported)
+            {
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+            }
+#endif
+            else
+            {
+                output[0] = (char)input[0];
+                output[1] = (char)input[1];
+                output[2] = (char)input[2];
+                output[3] = (char)input[3];
+            }
+        }
+
         /// <summary>
         /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
         /// compares them to the WORD buffer with machine endianness.
@@ -390,12 +406,22 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta
                 return false;
             }
 
+#if USE_BMI2
            if (Bmi2.X64.IsSupported)
            {
                // BMI2 will work regardless of the processor's endianness.
                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
                    Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
            }
+#else
+            if (Sse2.X64.IsSupported)
+            {
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
+                    Sse2.X64.ConvertToUInt64(vecWide);
+            }
+#endif
            else
            {
                if (BitConverter.IsLittleEndian)
@@ -427,12 +453,22 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar
                 return false;
             }
 
+#if USE_BMI2
            if (Bmi2.IsSupported)
            {
                // BMI2 will work regardless of the processor's endianness.
                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
                    Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
            }
+#else
+            if (Sse2.IsSupported)
+            {
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
+                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
+                    Sse2.ConvertToUInt32(vecWide);
+            }
+#endif
            else
            {
                if (BitConverter.IsLittleEndian)
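For illustration only (not part of the patch series): the sketch below shows why the SSE2 unpack used in patch 1 is a drop-in replacement for ParallelBitDeposit with the 0x00FF00FF mask. Both spread each ASCII source byte into the low byte of a UTF-16 code unit. The class name, method names and the small driver are invented for this example; a real caller must check the IsSupported flags, as the driver does.

// ---- example sketch (not part of the patches) ----
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static unsafe class WidenSketch
{
    // BMI2: each of the 4 source bytes is deposited into bits 0..7 of one 16-bit lane.
    public static void WidenWithBmi2(uint value, char* output)
    {
        ((uint*)output)[0] = Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
        ((uint*)output)[1] = Bmi2.ParallelBitDeposit(value >> 16, 0x00FF00FFu);
    }

    // SSE2: interleaving the source bytes with zero bytes (punpcklbw) yields the same layout.
    public static void WidenWithSse2(uint value, char* output)
    {
        Vector128<byte> narrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
        Vector128<ulong> wide = Sse2.UnpackLow(narrow, Vector128<byte>.Zero).AsUInt64();
        Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(wide));
    }

    public static void Main()
    {
        if (!Bmi2.IsSupported || !Sse2.X64.IsSupported)
        {
            Console.WriteLine("Needs an x64 CPU with SSE2 and BMI2.");
            return;
        }

        uint packed = 0x34333231; // the ASCII bytes of "1234", little endian
        char* a = stackalloc char[4];
        char* b = stackalloc char[4];
        WidenWithBmi2(packed, a);
        WidenWithSse2(packed, b);
        Console.WriteLine(new string(a, 0, 4)); // prints 1234
        Console.WriteLine(new string(b, 0, 4)); // prints 1234
    }
}
// ---- end of example sketch ----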
From 2aee84018699db78cc0c225a99c835d7dc052c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?=
Date: Tue, 11 Feb 2020 18:48:38 +0100
Subject: [PATCH 2/5] Codegen tuning

---
 src/Shared/ServerInfrastructure/StringUtilities.cs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index 9e3c221b7812..ba75db69c31d 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -132,13 +132,15 @@ out Unsafe.AsRef<Vector<short>>(output),
 #else
             if (Sse2.X64.IsSupported)
             {
+                Vector128<byte> zero = Vector128<byte>.Zero;
+
                 Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128Int32((int)value).AsByte();
-                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
 
                 vecNarrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsByte();
-                vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
-                Unsafe.WriteUnaligned(output+sizeof(int), Sse2.X64.ConvertToUInt64(vecWide));
+                vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                Unsafe.WriteUnaligned(output + sizeof(int), Sse2.X64.ConvertToUInt64(vecWide));
             }
 #endif
             else
@@ -150,7 +152,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 output[4] = (char)input[4];
                 output[5] = (char)input[5];
                 output[6] = (char)input[6];
-                output[7] = (char)input[7];
+                output[7] = (char)input[7];
             }
 
             input += sizeof(long);
@@ -596,12 +598,14 @@ private static bool CheckBytesInAsciiRange(long check)
             return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool CheckBytesInAsciiRange(int check)
        {
            const int HighBits = unchecked((int)0x80808080);
            return (((check - 0x01010101) | check) & HighBits) == 0;
        }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool CheckBytesInAsciiRange(short check)
        {
            const short HighBits = unchecked((short)0x8080);
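For illustration only (not part of the patch series): patch 2 also marks the scalar CheckBytesInAsciiRange overloads for aggressive inlining. The sketch below (type name and driver invented for the example) shows the bit trick those overloads rely on.

// ---- example sketch (not part of the patches) ----
using System;

internal static class AsciiRangeSketch
{
    // Same expression as the long overload in the hunk above: a lane fails the check
    // if its high bit is already set (byte >= 0x80) or if subtracting 1 borrows through
    // it (byte == 0x00). A borrow out of a zero lane can also flag the next lane, but
    // only when the input is already invalid, so the check never wrongly returns true.
    private static bool CheckBytesInAsciiRange(long check)
    {
        const long HighBits = unchecked((long)0x8080808080808080L);
        return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
    }

    private static void Main()
    {
        Console.WriteLine(CheckBytesInAsciiRange(0x4847464544434241L)); // "ABCDEFGH"    -> True
        Console.WriteLine(CheckBytesInAsciiRange(0x48474645444342C1L)); // one byte 0xC1 -> False
        Console.WriteLine(CheckBytesInAsciiRange(0x4847464544434200L)); // embedded NUL  -> False
    }
}
// ---- end of example sketch ----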
From f14d3b038b702346ef65d1d86b0d147fff6e219b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?=
Date: Tue, 11 Feb 2020 19:41:01 +0100
Subject: [PATCH 3/5] Keep zero vector in register

---
 .../ServerInfrastructure/StringUtilities.cs | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index ba75db69c31d..67dd470b4e57 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -24,22 +24,25 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
             Debug.Assert((long)end >= Vector256<sbyte>.Count);
 
+            // PERF: so the JIT can reuse the zero from a register
+            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
+
             if (Sse2.IsSupported)
             {
                 if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count)
                 {
-                    Vector256<sbyte> zero = Vector256<sbyte>.Zero;
+                    Vector256<sbyte> avxZero = Vector256<sbyte>.Zero;
 
                     do
                     {
                         var vector = Avx.LoadVector256(input).AsSByte();
-                        if (!CheckBytesInAsciiRange(vector, zero))
+                        if (!CheckBytesInAsciiRange(vector, avxZero))
                         {
                             return false;
                         }
 
-                        var tmp0 = Avx2.UnpackLow(vector, zero);
-                        var tmp1 = Avx2.UnpackHigh(vector, zero);
+                        var tmp0 = Avx2.UnpackLow(vector, avxZero);
+                        var tmp1 = Avx2.UnpackHigh(vector, avxZero);
 
                         // Bring into the right order
                         var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
@@ -60,8 +63,6 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
                 if (input <= end - Vector128<sbyte>.Count)
                 {
-                    Vector128<sbyte> zero = Vector128<sbyte>.Zero;
-
                     do
                     {
                         var vector = Sse2.LoadVector128(input).AsSByte();
@@ -132,13 +133,11 @@ out Unsafe.AsRef<Vector<short>>(output),
 #else
             if (Sse2.X64.IsSupported)
             {
-                Vector128<byte> zero = Vector128<byte>.Zero;
-
-                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128Int32((int)value).AsByte();
+                Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32((int)value).AsSByte();
                 Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
 
-                vecNarrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsByte();
+                vecNarrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsSByte();
                 vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Unsafe.WriteUnaligned(output + sizeof(int), Sse2.X64.ConvertToUInt64(vecWide));
             }
@@ -152,7 +151,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 output[4] = (char)input[4];
                 output[5] = (char)input[5];
                 output[6] = (char)input[6];
-                output[7] = (char)input[7];
+                output[7] = (char)input[7];
             }
 
             input += sizeof(long);
@@ -167,7 +166,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
-            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value);
+            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
             input += sizeof(int);
             output += sizeof(int);
@@ -184,7 +183,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
-            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value);
+            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
             input += sizeof(int);
             output += sizeof(int);
@@ -370,7 +369,7 @@ ref Unsafe.Add(ref str, offset),
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value)
+        private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
         {
 #if USE_BMI2
             if (Bmi2.IsSupported)
             {
                 // BMI2 will work regardless of the processor's endianness.
                 ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
                 ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
             }
 #else
             if (Sse2.X64.IsSupported)
             {
-                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsByte();
-                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
             }
 #endif
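For illustration only (not part of the patch series): the sketch below mirrors the shape of the change in patch 3. The all-zeros vector is materialized once in the caller and passed to an aggressively inlined helper, so after inlining every iteration reuses the same xmm register instead of re-creating the zero vector at each call site. All names are invented for the example, and the input is assumed to be already-validated ASCII.

// ---- example sketch (not part of the patches) ----
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class ZeroHoistSketch
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<ulong> WidenLow(Vector128<sbyte> narrow, Vector128<sbyte> zero)
        => Sse2.UnpackLow(narrow, zero).AsUInt64();

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<ulong> WidenHigh(Vector128<sbyte> narrow, Vector128<sbyte> zero)
        => Sse2.UnpackHigh(narrow, zero).AsUInt64();

    // Widens blocks * 16 ASCII bytes to UTF-16 chars.
    public static unsafe void Widen(byte* input, char* output, int blocks)
    {
        // Hoisted once; the helpers above receive it as a parameter, like the patch does.
        Vector128<sbyte> zero = Vector128<sbyte>.Zero;

        for (int i = 0; i < blocks; i++)
        {
            Vector128<sbyte> narrow = Sse2.LoadVector128(input + i * 16).AsSByte();
            Sse2.Store((ulong*)(output + i * 16), WidenLow(narrow, zero));      // chars 0..7 of the block
            Sse2.Store((ulong*)(output + i * 16 + 8), WidenHigh(narrow, zero)); // chars 8..15 of the block
        }
    }
}
// ---- end of example sketch ----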
From d77a1de3f15e59658bec077c622c26edc90e2bfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?=
Date: Tue, 11 Feb 2020 22:34:09 +0100
Subject: [PATCH 4/5] Better 64-bit BMI2 alternative

---
 src/Shared/ServerInfrastructure/StringUtilities.cs | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index 67dd470b4e57..38caadeaca0e 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -133,13 +133,9 @@ out Unsafe.AsRef<Vector<short>>(output),
 #else
             if (Sse2.X64.IsSupported)
             {
-                Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32((int)value).AsSByte();
+                Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
                 Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
-                Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
-
-                vecNarrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsSByte();
-                vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
-                Unsafe.WriteUnaligned(output + sizeof(int), Sse2.X64.ConvertToUInt64(vecWide));
+                Sse2.Store((ulong*)output, vecWide);
             }
 #endif
             else
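For illustration only (not part of the patch series): the two methods below contrast the 8-byte widening before and after patch 4, namely two 32-bit conversions with two 8-byte stores versus a single 64-bit conversion (movq) with one unpack and one 16-byte unaligned store. The wrapper names are invented; the bodies mirror the patch, and both assume Sse2.X64 support and previously validated ASCII input.

// ---- example sketch (not part of the patches) ----
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static unsafe class EightByteWidenSketch
{
    // Before patch 4: the long is split into two ints, each widened and stored separately.
    public static void WidenTwoHalves(long value, char* output, Vector128<sbyte> zero)
    {
        Vector128<sbyte> narrow = Sse2.ConvertScalarToVector128Int32((int)value).AsSByte();
        Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(Sse2.UnpackLow(narrow, zero).AsUInt64()));

        narrow = Sse2.ConvertScalarToVector128Int32((int)(value >> 32)).AsSByte();
        Unsafe.WriteUnaligned(output + sizeof(int), Sse2.X64.ConvertToUInt64(Sse2.UnpackLow(narrow, zero).AsUInt64()));
    }

    // After patch 4: one movq-style conversion, one unpack, one 16-byte store.
    public static void WidenWhole(long value, char* output, Vector128<sbyte> zero)
    {
        Vector128<sbyte> narrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
        Sse2.Store((ulong*)output, Sse2.UnpackLow(narrow, zero).AsUInt64());
    }
}
// ---- end of example sketch ----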
From 8f9fe415f1dd9b2dcce2bf2158d11ef7af78af95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?=
Date: Tue, 11 Feb 2020 22:36:29 +0100
Subject: [PATCH 5/5] Removed BMI2

---
 .../ServerInfrastructure/StringUtilities.cs | 40 ++-----------------
 1 file changed, 4 insertions(+), 36 deletions(-)

diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index 38caadeaca0e..7c2dcbc6ba26 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -123,21 +123,13 @@ out Unsafe.AsRef<Vector<short>>(output),
                 return false;
             }
 
-#if USE_BMI2
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
-                ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
-            }
-#else
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
             if (Sse2.X64.IsSupported)
             {
                 Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
                 Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Sse2.Store((ulong*)output, vecWide);
             }
-#endif
             else
             {
                 output[0] = (char)input[0];
@@ -367,21 +359,13 @@ ref Unsafe.Add(ref str, offset),
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
         {
-#if USE_BMI2
-            if (Bmi2.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-            }
-#else
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
             if (Sse2.X64.IsSupported)
             {
                 Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
                 Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                 Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
             }
-#endif
             else
             {
                 output[0] = (char)input[0];
@@ -403,22 +387,14 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta
                 return false;
             }
 
-#if USE_BMI2
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
-            }
-#else
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
            if (Sse2.X64.IsSupported)
            {
                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
                    Sse2.X64.ConvertToUInt64(vecWide);
            }
-#endif
            else
            {
                if (BitConverter.IsLittleEndian)
@@ -450,22 +426,14 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar
                 return false;
             }
 
-#if USE_BMI2
-            if (Bmi2.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
-            }
-#else
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
            if (Sse2.IsSupported)
            {
                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
                Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
                    Sse2.ConvertToUInt32(vecWide);
            }
-#endif
            else
            {
                if (BitConverter.IsLittleEndian)
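For illustration only (not part of the patch series): the sketch below shows the shape of the SSE2 compare path that remains after patch 5. Four ASCII bytes are widened and checked against four existing chars with a single 64-bit comparison. The type name and driver are invented for the example, and it assumes a little-endian x64 CPU with SSE2, which is the only situation in which this code path runs.

// ---- example sketch (not part of the patches) ----
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class CompareSketch
{
    // Widens 4 ASCII bytes and compares them against 4 chars in one 64-bit compare,
    // the same shape as WidenFourAsciiBytesToUtf16AndCompareToChars in the final file.
    private static bool FourBytesEqualFourChars(ref char charStart, uint value)
    {
        if (!Sse2.X64.IsSupported)
        {
            throw new PlatformNotSupportedException("This sketch only covers the SSE2 x64 path.");
        }

        Vector128<byte> narrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
        Vector128<ulong> wide = Sse2.UnpackLow(narrow, Vector128<byte>.Zero).AsUInt64();
        return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
            Sse2.X64.ConvertToUInt64(wide);
    }

    private static void Main()
    {
        char[] chars = "Host".ToCharArray();
        uint packed = (uint)('H' | ('o' << 8) | ('s' << 16) | ('t' << 24)); // "Host" as 4 ASCII bytes
        Console.WriteLine(FourBytesEqualFourChars(ref chars[0], packed));        // True
        Console.WriteLine(FourBytesEqualFourChars(ref chars[0], packed ^ 0x20)); // 'h' vs 'H' -> False
    }
}
// ---- end of example sketch ----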