@@ -1276,14 +1276,83 @@ void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
1276
1276
#endif
1277
1277
}
1278
1278
1279
+ /* Helper routine for calculating exp(x) - 1.
1280
+ Copied from expm1f_1u6.c, with several simplifications:
1281
+ - No special-case handling for tiny or special values, instead return early
1282
+ from the main routine.
1283
+ - No special handling for large values:
1284
+ - No early return for infinity.
1285
+ - Simpler combination of p and t in final stage of algorithm.
1286
+ - |i| < 27, so can calculate t by simpler shift-and-add, instead of ldexpf.
1287
+ From Optimized Routines by Arm Limited. */
1288
+ static inline float
1289
+ Expm1f (float x )
1290
+ {
1291
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
1292
+ float Shift = 0x1.8p23f ;
1293
+ float j = fmaf (0x1.715476p+0f , x , Shift ) - Shift ;
1294
+ int i = j ;
1295
+ float f = fmaf (j , -0x1.62e4p-1f , x );
1296
+ f = fmaf (j , -0x1.7f7d1cp-20f , f );
1297
+
1298
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
1299
+ Uses Estrin scheme, where the main expm1f routine uses Horner. */
1300
+ float f2 = f * f ;
1301
+ float p_01 = fmaf (f , 0x1.5554aep-3 , 0x1.fffffep-2 );
1302
+ float p_23 = fmaf (f , 0x1.12287cp-7 , 0x1.555736p-5 );
1303
+ float p = fmaf (f2 , p_23 , p_01 );
1304
+ p = fmaf (f2 * f2 , 0x1.6b55a2p-10 , p );
1305
+ p = fmaf (f2 , p , f );
1306
+
1307
+ /* t = 2^i. */
1308
+ union
1309
+ {
1310
+ unsigned i ;
1311
+ float f ;
1312
+ } u = { (i + 127 ) << 23 };
1313
+ float t = u .f ;
1314
+
1315
+ /* expm1(x) ~= p * t + (t - 1). */
1316
+ return fmaf (p , t , t - 1 );
1317
+ }
1318
+
1319
+ /* Single-precision tanh(x) approximation.
1320
+ The maximum error is 2.58 ULP.
1321
+ Designed by Arm Limited. */
1322
+ static inline float
1323
+ Tanhf (float x )
1324
+ {
1325
+ union
1326
+ {
1327
+ float f ;
1328
+ unsigned i ;
1329
+ } u = { x };
1330
+ unsigned iax = u .i & 0x7fffffff ;
1331
+ unsigned sign = u .i & ~0x7fffffff ;
1332
+
1333
+ /* Above 0x1.205966p+3 tanhf rounds to 1 (or -1 for negative). */
1334
+ if (iax > 0x41102cb3 ) {
1335
+ if (iax > 0x7f800000 )
1336
+ return (x - x ) / (x - x );
1337
+ u .i = 0x3f800000 | sign ;
1338
+ return u .f ;
1339
+ }
1340
+ if (iax < 0x34000000 )
1341
+ return x ;
1342
+
1343
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
1344
+ float q = Expm1f (2 * x );
1345
+ return q / (q + 2 );
1346
+ }
1347
+
1279
1348
void ggml_vec_norm_f32 (const int n , float * s , const float * x ) { ggml_vec_dot_f32 (n , s , 0 , x , 0 , x , 0 , 1 ); * s = sqrtf (* s ); }
1280
1349
void ggml_vec_sqr_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = x [i ]* x [i ]; }
1281
1350
void ggml_vec_sqrt_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = sqrtf (x [i ]); }
1282
1351
void ggml_vec_log_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = logf (x [i ]); }
1283
1352
void ggml_vec_abs_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = fabsf (x [i ]); }
1284
1353
void ggml_vec_sgn_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? 1.f : ((x [i ] < 0.f ) ? -1.f : 0.f ); }
1285
1354
void ggml_vec_step_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? 1.f : 0.f ; }
1286
- void ggml_vec_tanh_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = tanhf (x [i ]); }
1355
+ void ggml_vec_tanh_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = Tanhf (x [i ]); }
1287
1356
void ggml_vec_elu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : expf (x [i ])- 1 ; }
1288
1357
void ggml_vec_relu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : 0.f ; }
1289
1358
void ggml_vec_leaky_relu_f32 (const int n , float * y , const float * x , const float ns ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = ((x [i ] > 0.f ) ? x [i ] : 0.f ) + ns * ((x [i ] < 0.0f ) ? x [i ] : 0.f ); }
@@ -1331,7 +1400,7 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
1331
1400
1332
1401
static inline float ggml_gelu_f32 (float x ) {
1333
1402
// GeLU approximation that goes slower and we seem to be stuck with.
1334
- return .5f * x * (1.f + tanhf (sqrtf (M_2_PI ) * (x + .044715f * x * x * x )));
1403
+ return .5f * x * (1.f + Tanhf (sqrtf (M_2_PI ) * (x + .044715f * x * x * x )));
1335
1404
}
1336
1405
1337
1406
void ggml_vec_gelu_f32 (const int n , float * y , const float * x ) {
0 commit comments