Use the custom implementation of multipliedFullWidth on arm64_32 (#37905)

stephentyrone · web-flow · commit 99560979272a · 2021-06-14T21:38:51.000-04:00
Previously we were falling back on the generic implementation for 64-bit integers, which resulted in the following codegen:

    00000008 asr x8, x0, #32
    0000000c asr x9, x0, #63
    00000010 cmp x0, #0x0
    00000014 cinv w10, w0, lt
    00000018 eor w9, w10, w9
    0000001c asr x10, x1, #32
    00000020 asr x11, x1, #63
    00000024 cmp x1, #0x0
    00000028 cinv w12, w1, lt
    0000002c eor w11, w12, w11
    00000030 umull x12, w11, w9
    00000034 mul x11, x11, x8
    00000038 add x11, x11, x12, lsr #32
    0000003c asr x12, x11, #63
    00000040 cmp x11, #0x0
    00000044 cinv w13, w11, lt
    00000048 eor w12, w13, w12
    0000004c madd x9, x9, x10, x12
    00000050 mul x8, x10, x8
    00000054 add x8, x8, x11, asr #32
    00000058 add x0, x8, x9, asr #32
    0000005c ret

Instead, we should use the 64-bit implementation when targeting arm64_32, which allows us to generate:

    00000008 smulh x0, x1, x0
    0000000c ret

Unsurprisingly, this is considerably faster.
diff --git a/stdlib/public/core/IntegerTypes.swift.gyb b/stdlib/public/core/IntegerTypes.swift.gyb
@@ -1514,15 +1514,23 @@ ${assignmentOperatorComment(x.operator, True)}
 % end
 
 %   dbits = bits*2
-%   if bits <= word_bits:
+%   if bits == 64:
+  #if !(arch(arm) || arch(i386) || arch(wasm32))
+  //  On 32b architectures we fall back on the generic implementation,
+  //  because LLVM doesn't know how to codegen the 128b multiply we use.
+  //
+  //  Note that arm64_32 is a 64b architecture for the purposes of this
+  //  check, because we have a 64x64 -> 128 multiply there (the actual
+  //  ISA is AArch64).
+%   end
   /// Returns a tuple containing the high and low parts of the result of
   /// multiplying this value by the given value.
   ///
   /// Use this method to calculate the full result of a product that would
   /// otherwise overflow. Unlike traditional truncating multiplication, the
-  /// `multipliedFullWidth(by:)` method returns a tuple
-  /// containing both the `high` and `low` parts of the product of this value and
-  /// `other`. The following example uses this method to multiply two `UInt8`
+  /// `multipliedFullWidth(by:)` method returns a tuple containing both the
+  /// `high` and `low` parts of the product of this value and `other`.
+  /// The following example uses this method to multiply two `UInt8`
   /// values that normally overflow when multiplied:
   ///
   ///     let x: UInt8 = 100
@@ -1557,6 +1565,8 @@ ${assignmentOperatorComment(x.operator, True)}
     let high = ${Self}(Builtin.truncOrBitCast_Int${dbits}_Int${bits}(shifted))
     return (high: high, low: low)
   }
+%   if bits == 64:
+  #endif
 %   end
 
   /// Returns a tuple containing the quotient and remainder of dividing the