77
88#include " ggml-fp8.h"
99
10- /*
11- make clean
12- make -j8
13- # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
14- ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
15- ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
16- ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
17-
18- rm -rf build
19- cmake -B build
20- cmake --build build --config Release -j $(nproc)
21- ./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
22- ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
23- ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
24-
25- # la CI local:
26- rm -rf tmp
27- mkdir tmp
28- bash ./ci/run.sh ./tmp/results ./tmp/mnt
29-
30- # HIP legacy target?
31- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
32- cmake --build build2 --config Release -j $(nproc) -v
33-
34- */
35-
36- template <int N> constexpr float EXP2 () {
37- if constexpr (N==0 ) return 1 ;
38- if constexpr (N>0 ) return EXP2<N-1 >()*2 ;
39- if constexpr (N<0 ) return EXP2<N+1 >()/2 ;
10+ template <int N>
11+ constexpr float exp_p2 () {
12+ return exp_p2<N-1 >()*2 ;
4013}
41-
42- // 2^N avec N>0 en entier
43- template <int N> constexpr int EXP_I2 () {
44- if constexpr (N==0 ) return 1 ;
45- if constexpr (N>0 ) return EXP_I2<N-1 >()*2 ;
14+ template <int N>
15+ constexpr float exp_m2 () {
16+ return exp_m2<N+1 >()/2 ;
17+ }
18+ template <int N>
19+ constexpr int exp_i2 () {
20+ return 1 << N;
4621}
22+ template <> constexpr float exp_p2<0 >() { return 1 ;}
23+ template <> constexpr float exp_m2<0 >() { return 1 ;}
4724
48- template <int _E > // , int M=7-E> 1.7 bits!
25+ template <int E > // , int M=7-E> 1.7 bits!
4926struct FP8 {
5027 uint8_t bits;
51- using type = FP8<_E >;
52- static constexpr int E=_E;
53- static constexpr int M= 7 -_E;
54- static constexpr int E_BIAS=EXP2<_E -1 >()-1 ;
55- static constexpr float MAX () { return (2 -EXP2 <-M+1 >())*EXP2<EXP_I2<_E -1 >()>(); }
56- static constexpr float MIN () { return EXP2 <-M>()*EXP2 <2 -EXP_I2<_E -1 >()>(); }
28+ using type = FP8<E >;
29+ // static constexpr int E=_E;
30+ static constexpr int M () { return 7 -E; }
31+ static constexpr int E_BIAS () { return exp_i2< E -1 >()-1 ; }
32+ static constexpr float MAX () { return (2 -exp_m2 <-M () +1 >())*exp_p2<exp_i2< E -1 >()>(); }
33+ static constexpr float MIN () { return exp_m2 <-M () >()*exp_m2 <2 -exp_i2< E -1 >()>(); }
5734 // =============================================
5835
5936#ifdef GGML_USE_OPENMP_SIMD
@@ -64,19 +41,19 @@ struct FP8 {
6441 float f;
6542 uint32_t bits;
6643 } in = {value};
67- // le signe:
44+ // the sign:
6845 bits = (in.bits >> 24 ) & 0x80 ;
69- // la valeur sans la signe!
46+ // the value without the sign!
7047 in.bits &= 0x7fffffff ;
7148 // GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
7249 if (in.f >= MAX ()) {
7350 bits |= 0x7E ;
7451 } else if (in.f <MIN ()) { // => 0.
7552 // OK: S.0000000
7653 } else {
77- in.f *= EXP2 <E_BIAS-127 >();
78- in.bits += 1 <<(22 -M); // for rounding
79- bits |= (in.bits >> (23 -M)) & 0x7F ;
54+ in.f *= exp_m2 <E_BIAS () -127 >();
55+ in.bits += 1 <<(22 -M () ); // for rounding
56+ bits |= (in.bits >> (23 -M () )) & 0x7F ;
8057 }
8158 }
8259
@@ -88,13 +65,12 @@ struct FP8 {
8865 float f;
8966 uint32_t bits;
9067 } out = {0 };
91- // le signe:
9268 out.bits = bits & 0x80 ;
9369 out.bits <<= 24 ;
9470 uint32_t _bits = bits & 0x7F ;
95- _bits <<= (23 -M);
71+ _bits <<= (23 -M () );
9672 out.bits |= _bits;
97- out.f *= EXP2 <127 -E_BIAS>();
73+ out.f *= exp_p2 <127 -E_BIAS () >();
9874 return out.f ;
9975 }
10076};
@@ -156,7 +132,7 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
156132 for (int64_t q=0 ; q<qk_size; ++q) {
157133 float m = 0 ;
158134#ifdef GGML_USE_OPENMP_SIMD
159- // not work on macos and warn.
135+ // does not work on macOS and triggers a warning.
160136 // #pragma omp simd reduction(max:m)
161137#endif
162138 for (int64_t i=0 ; i<QK; i++) {
0 commit comments