Skip to content

Commit 2781abf

Browse files
committed
Make GeLU 10x faster
This change makes whisperfile and models like Gemma much faster by using an approximation of tanhf() from ARM Limited that GCC 14 vectorizes very well. Model output is slightly different, but it might counterintuitively be an improvement: the people who trained the model used this GeLU approximation, so it would not be surprising if they also used an approximation of tanhf() — possibly this very one. With whisperfile, this change reduces overall encode time by eleven percent, a very noticeable user-facing improvement in performance.
1 parent 8ba17ea commit 2781abf

File tree

1 file changed

+71
-2
lines changed

1 file changed

+71
-2
lines changed

llama.cpp/ggml-vector.inc

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,14 +1276,83 @@ void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
12761276
#endif
12771277
}
12781278

1279+
/* Helper routine for calculating exp(x) - 1.
1280+
Copied from expm1f_1u6.c, with several simplifications:
1281+
- No special-case handling for tiny or special values, instead return early
1282+
from the main routine.
1283+
- No special handling for large values:
1284+
- No early return for infinity.
1285+
- Simpler combination of p and t in final stage of algorithm.
1286+
- |i| < 27, so can calculate t by simpler shift-and-add, instead of ldexpf.
1287+
From Optimized Routines by Arm Limited. */
1288+
static inline float
1289+
Expm1f(float x)
1290+
{
1291+
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
1292+
float Shift = 0x1.8p23f;
1293+
float j = fmaf(0x1.715476p+0f, x, Shift) - Shift;
1294+
int i = j;
1295+
float f = fmaf(j, -0x1.62e4p-1f, x);
1296+
f = fmaf(j, -0x1.7f7d1cp-20f, f);
1297+
1298+
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
1299+
Uses Estrin scheme, where the main expm1f routine uses Horner. */
1300+
float f2 = f * f;
1301+
float p_01 = fmaf(f, 0x1.5554aep-3, 0x1.fffffep-2);
1302+
float p_23 = fmaf(f, 0x1.12287cp-7, 0x1.555736p-5);
1303+
float p = fmaf(f2, p_23, p_01);
1304+
p = fmaf(f2 * f2, 0x1.6b55a2p-10, p);
1305+
p = fmaf(f2, p, f);
1306+
1307+
/* t = 2^i. */
1308+
union
1309+
{
1310+
unsigned i;
1311+
float f;
1312+
} u = { (i + 127) << 23 };
1313+
float t = u.f;
1314+
1315+
/* expm1(x) ~= p * t + (t - 1). */
1316+
return fmaf(p, t, t - 1);
1317+
}
1318+
1319+
/* Single-precision tanh(x) approximation.
1320+
The maximum error is 2.58 ULP.
1321+
Designed by Arm Limited. */
1322+
static inline float
1323+
Tanhf(float x)
1324+
{
1325+
union
1326+
{
1327+
float f;
1328+
unsigned i;
1329+
} u = { x };
1330+
unsigned iax = u.i & 0x7fffffff;
1331+
unsigned sign = u.i & ~0x7fffffff;
1332+
1333+
/* Above 0x1.205966p+3 tanhf rounds to 1 (or -1 for negative). */
1334+
if (iax > 0x41102cb3) {
1335+
if (iax > 0x7f800000)
1336+
return (x - x) / (x - x);
1337+
u.i = 0x3f800000 | sign;
1338+
return u.f;
1339+
}
1340+
if (iax < 0x34000000)
1341+
return x;
1342+
1343+
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
1344+
float q = Expm1f(2 * x);
1345+
return q / (q + 2);
1346+
}
1347+
12791348
// Euclidean (L2) norm: *s = sqrt(dot(x, x)) over n elements.
void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
    *s = sqrtf(*s);
}
12801349
// Element-wise square: y[i] = x[i]^2.
void ggml_vec_sqr_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        y[i] = v * v;
    }
}
12811350
void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
12821351
void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
12831352
// Element-wise absolute value: y[i] = |x[i]|.
void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = fabsf(x[i]);
    }
}
12841353
// Element-wise sign: 1 for positive, -1 for negative, 0 otherwise
// (zero and NaN both fall through to 0).
void ggml_vec_sgn_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        if (v > 0.f) {
            y[i] = 1.f;
        } else if (v < 0.f) {
            y[i] = -1.f;
        } else {
            y[i] = 0.f;
        }
    }
}
12851354
// Element-wise Heaviside step: 1 where x[i] > 0, else 0.
void ggml_vec_step_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        if (x[i] > 0.f) {
            y[i] = 1.f;
        } else {
            y[i] = 0.f;
        }
    }
}
// Element-wise hyperbolic tangent using the vectorization-friendly
// Tanhf() approximation (max error 2.58 ULP) instead of libm tanhf().
void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = Tanhf(x[i]);
    }
}
12871356
void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
12881357
// Element-wise ReLU: max(x[i], 0), with negatives (and NaN) mapped to 0.
void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        if (v > 0.f) {
            y[i] = v;
        } else {
            y[i] = 0.f;
        }
    }
}
12891358
// Element-wise leaky ReLU: x for positive inputs, ns * x for negative ones.
// Written as a sum of the positive and scaled negative parts so the result
// matches the original branch-free formulation exactly.
void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        const float pos = (v > 0.f)  ? v : 0.f;
        const float neg = (v < 0.0f) ? v : 0.f;
        y[i] = pos + ns * neg;
    }
}
@@ -1331,7 +1400,7 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
13311400

13321401
static inline float ggml_gelu_f32(float x) {
13331402
// GeLU approximation that goes slower and we seem to be stuck with.
1334-
return .5f * x * (1.f + tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
1403+
return .5f * x * (1.f + Tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
13351404
}
13361405

13371406
void ggml_vec_gelu_f32(const int n, float * y, const float * x) {

0 commit comments

Comments
 (0)