Skip to content

Commit 2781abf

Browse files
committed
Make GeLU 10x faster
This change makes whisperfile and models like Gemma much faster by using an approximation of tanhf() from ARM Limited that GCC 14 vectorizes very well. Model output is slightly different, but it might counterintuitively be an improvement: the people who trained the model used this GeLU approximation, so it would not be surprising if they also used an approximation of tanhf() — possibly this very one. With whisperfile, this change reduces overall encode time by eleven percent, a very noticeable user-facing improvement in performance.
1 parent 8ba17ea commit 2781abf

File tree

1 file changed

+71
-2
lines changed

1 file changed

+71
-2
lines changed

llama.cpp/ggml-vector.inc

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,14 +1276,83 @@ void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
12761276
#endif
12771277
}
12781278

1279+
/* Helper routine for calculating exp(x) - 1.
1280+
Copied from expm1f_1u6.c, with several simplifications:
1281+
- No special-case handling for tiny or special values, instead return early
1282+
from the main routine.
1283+
- No special handling for large values:
1284+
- No early return for infinity.
1285+
- Simpler combination of p and t in final stage of algorithm.
1286+
- |i| < 27, so can calculate t by simpler shift-and-add, instead of ldexpf.
1287+
From Optimized Routines by Arm Limited. */
1288+
static inline float
1289+
Expm1f(float x)
1290+
{
1291+
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
1292+
float Shift = 0x1.8p23f;
1293+
float j = fmaf(0x1.715476p+0f, x, Shift) - Shift;
1294+
int i = j;
1295+
float f = fmaf(j, -0x1.62e4p-1f, x);
1296+
f = fmaf(j, -0x1.7f7d1cp-20f, f);
1297+
1298+
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
1299+
Uses Estrin scheme, where the main expm1f routine uses Horner. */
1300+
float f2 = f * f;
1301+
float p_01 = fmaf(f, 0x1.5554aep-3, 0x1.fffffep-2);
1302+
float p_23 = fmaf(f, 0x1.12287cp-7, 0x1.555736p-5);
1303+
float p = fmaf(f2, p_23, p_01);
1304+
p = fmaf(f2 * f2, 0x1.6b55a2p-10, p);
1305+
p = fmaf(f2, p, f);
1306+
1307+
/* t = 2^i. */
1308+
union
1309+
{
1310+
unsigned i;
1311+
float f;
1312+
} u = { (i + 127) << 23 };
1313+
float t = u.f;
1314+
1315+
/* expm1(x) ~= p * t + (t - 1). */
1316+
return fmaf(p, t, t - 1);
1317+
}
1318+
1319+
/* Single-precision tanh(x) approximation.
1320+
The maximum error is 2.58 ULP.
1321+
Designed by Arm Limited. */
1322+
static inline float
1323+
Tanhf(float x)
1324+
{
1325+
union
1326+
{
1327+
float f;
1328+
unsigned i;
1329+
} u = { x };
1330+
unsigned iax = u.i & 0x7fffffff;
1331+
unsigned sign = u.i & ~0x7fffffff;
1332+
1333+
/* Above 0x1.205966p+3 tanhf rounds to 1 (or -1 for negative). */
1334+
if (iax > 0x41102cb3) {
1335+
if (iax > 0x7f800000)
1336+
return (x - x) / (x - x);
1337+
u.i = 0x3f800000 | sign;
1338+
return u.f;
1339+
}
1340+
if (iax < 0x34000000)
1341+
return x;
1342+
1343+
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
1344+
float q = Expm1f(2 * x);
1345+
return q / (q + 2);
1346+
}
1347+
12791348
// Euclidean (L2) norm: *s = sqrt(dot(x, x)) over n elements.
void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
    *s = sqrtf(*s);
}
12801349
// Element-wise square: y[i] = x[i]^2.
void ggml_vec_sqr_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        y[i] = v * v;
    }
}
12811350
void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
12821351
void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
12831352
// Element-wise absolute value: y[i] = |x[i]|.
void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = fabsf(x[i]);
    }
}
12841353
// Element-wise sign: 1 for positive, -1 for negative, 0 otherwise
// (zero and NaN both fall through to 0).
void ggml_vec_sgn_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        if (v > 0.f) {
            y[i] = 1.f;
        } else if (v < 0.f) {
            y[i] = -1.f;
        } else {
            y[i] = 0.f;
        }
    }
}
12851354
// Element-wise Heaviside step: 1 where x[i] > 0, else 0.
void ggml_vec_step_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        if (x[i] > 0.f) {
            y[i] = 1.f;
        } else {
            y[i] = 0.f;
        }
    }
}
// Element-wise hyperbolic tangent using the vectorization-friendly
// Tanhf() approximation (max error 2.58 ULP) instead of libm tanhf().
void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = Tanhf(x[i]);
    }
}
12871356
void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
12881357
// Element-wise ReLU: max(x[i], 0), with negatives (and NaN) mapped to 0.
void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        if (v > 0.f) {
            y[i] = v;
        } else {
            y[i] = 0.f;
        }
    }
}
12891358
// Element-wise leaky ReLU: x for positive inputs, ns * x for negative ones.
// Written as a sum of the positive and scaled negative parts so the result
// matches the original branch-free formulation exactly.
void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        const float pos = (v > 0.f)  ? v : 0.f;
        const float neg = (v < 0.0f) ? v : 0.f;
        y[i] = pos + ns * neg;
    }
}
@@ -1331,7 +1400,7 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
13311400

13321401
static inline float ggml_gelu_f32(float x) {
13331402
// GeLU approximation that goes slower and we seem to be stuck with.
1334-
return .5f * x * (1.f + tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
1403+
return .5f * x * (1.f + Tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
13351404
}
13361405

13371406
void ggml_vec_gelu_f32(const int n, float * y, const float * x) {

0 commit comments

Comments
 (0)