ggml : fix 32-bit ARM compatibility

ggerganov · ggerganov · commit db1093e34b83 · 2023-11-03T20:08:44.000+02:00
diff --git a/ggml-quants.c b/ggml-quants.c
@@ -270,6 +270,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 // 64-bit compatibility
 
+// vaddvq_s16
+// vpaddq_s16
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+
 inline static int32_t vaddvq_s16(int16x8_t v) {
     return
         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
@@ -309,6 +316,82 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     return res;
 }
 
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+struct int16x8x2_t {
+    int16x8_t val[2];
+};
+
+inline static int16x8x2_t vld1q_s16_x2(const int16_t * ptr) {
+    int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+struct uint8x16x2_t {
+    uint8x16_t val[2];
+};
+
+inline static uint8x16x2_t vld1q_u8_x2(const uint8_t * ptr) {
+    uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+struct uint8x16x4_t {
+    uint8x16_t val[4];
+};
+
+inline static uint8x16x4_t vld1q_u8_x4(const uint8_t * ptr) {
+    uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+struct int8x16x2_t {
+    int8x16_t val[2];
+};
+
+inline static int8x16x2_t vld1q_s8_x2(const int8_t * ptr) {
+    int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+struct int8x16x4_t {
+    int8x16_t val[4];
+};
+
+inline static int8x16x4_t vld1q_s8_x4(const int8_t * ptr) {
+    int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
 #endif
 #endif
 
diff --git a/ggml.c b/ggml.c
@@ -610,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
+#if defined(__ARM_NEON)
+#if !defined(__aarch64__)
+
+// 64-bit compatibility
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+#endif
+#endif
+
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros