A few fixes that make it compile on MinGW-w64.

lapo-luchini · lapo-luchini · commit 5addd62978c4 · 2023-03-16T23:55:51.000+01:00
Mostly taken from ggml-org/llama.cpp#22. Some of the changes might be unnecessary; this is the first version I managed to run.
diff --git a/Makefile b/Makefile
@@ -32,7 +32,7 @@ endif
 
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
+LDFLAGS  = -static
 
 # OS specific
 # TODO: support Windows
@@ -48,6 +48,10 @@ ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
diff --git a/ggml.c b/ggml.c
@@ -2,7 +2,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 
@@ -364,7 +364,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #if __AVX2__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-inline __m256i bytesFromNibbles( const uint8_t* rsi )
+static inline __m256i bytesFromNibbles( const uint8_t* rsi )
 {
     // Load 16 bytes from memory
     __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
@@ -381,7 +381,7 @@ inline __m256i bytesFromNibbles( const uint8_t* rsi )
     return bytes;
 }
 
-inline __m128i packNibbles( __m256i bytes )
+static inline __m128i packNibbles( __m256i bytes )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
     const __m256i lowByte = _mm256_set1_epi16( 0xFF );
@@ -407,8 +407,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;
 
-    uint8_t * restrict pd = (uint8_t *) (y + 0*bs);
-    uint8_t * restrict pb = (uint8_t *) (y + 0*bs + sizeof(float));
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
 
     uint8_t pp[QK/2];
 
@@ -654,8 +654,8 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;
 
-    const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
-    const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
 
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -1301,11 +1301,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 
     const size_t bs = sizeof(float) + QK/2;
 
-    const uint8_t * restrict pd0 = (const uint8_t *) (x + 0*bs);
-    const uint8_t * restrict pd1 = (const uint8_t *) (y + 0*bs);
+    const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
 
-    const uint8_t * restrict pb0 = (const uint8_t *) (x + 0*bs + sizeof(float));
-    const uint8_t * restrict pb1 = (const uint8_t *) (y + 0*bs + sizeof(float));
+    const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float));
 
     float sumf = 0.0;
 
@@ -1731,8 +1731,8 @@ inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * res
     const int nb = n / QK;
     const size_t bs = sizeof(float) + QK/2;
 
-    const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
-    const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
 
 #if __ARM_NEON
 #if QK == 32
diff --git a/main.cpp b/main.cpp
@@ -11,6 +11,13 @@
 #include <string>
 #include <vector>
 
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#include <signal.h>
+#endif
+
 struct bloom_hparams {
     int32_t n_vocab = 32000;
     int32_t n_ctx   = 512;   // this is provided as user input?
@@ -212,8 +219,8 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab
     // create the ggml context
     {
         struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
         };
 
         model.ctx = ggml_init(params);
@@ -566,7 +573,8 @@ bool bloom_eval(
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = { .n_threads = n_threads };
+    ggml_cgraph gf = {};
+    gf.n_threads = n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -763,6 +771,7 @@ bool bloom_eval(
 }
 
 int main(int argc, char ** argv) {
+    ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();
 
     gpt_params params;
diff --git a/utils.cpp b/utils.cpp
@@ -4,6 +4,16 @@
 #include <cstring>
 #include <fstream>
 #include <regex>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <math.h>
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+ #include <alloca.h>
+ #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     for (int i = 1; i < argc; i++) {
@@ -487,7 +497,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
 
     assert(k % qk == 0);
 
-    uint8_t pp[qk/2];
+    const size_t pp_size = qk / 2;
+    uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
 
     char * pdst = (char *) dst;
 
@@ -526,7 +537,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
                     pp[l/2] = vi0 | (vi1 << 4);
                 }
 
-                memcpy(pb, pp, sizeof(pp));
+                memcpy(pb, pp, pp_size);
                 pb += bs;
             }
         }
@@ -541,7 +552,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
 
     assert(k % qk == 0);
 
-    uint8_t pp[qk/2];
+    const size_t pp_size = qk / 2;
+    uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
 
     char * pdst = (char *) dst;
 
@@ -585,7 +597,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
                     pp[l/2] = vi0 | (vi1 << 4);
                 }
 
-                memcpy(pb + i*qk/2, pp, sizeof(pp));
+                memcpy(pb + i*qk/2, pp, pp_size);
             }
         }
     }