Skip to content

Commit db1093e

Browse files
committed
ggml : fix 32-bit ARM compatibility
1 parent 72c8697 commit db1093e

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed

ggml-quants.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
270270

271271
// 64-bit compatibility
272272

273+
// vaddvq_s16
274+
// vpaddq_s16
275+
// vaddvq_s32
276+
// vaddvq_f32
277+
// vmaxvq_f32
278+
// vcvtnq_s32_f32
279+
273280
inline static int32_t vaddvq_s16(int16x8_t v) {
274281
return
275282
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
@@ -309,6 +316,82 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
309316
return res;
310317
}
311318

319+
// vld1q_s16_x2
320+
// vld1q_u8_x2
321+
// vld1q_u8_x4
322+
// vld1q_s8_x2
323+
// vld1q_s8_x4
324+
// TODO: double-check these work correctly
325+
326+
struct int16x8x2_t {
327+
int16x8_t val[2];
328+
};
329+
330+
inline static int16x8x2_t vld1q_s16_x2(const int16_t * ptr) {
331+
int16x8x2_t res;
332+
333+
res.val[0] = vld1q_s16(ptr + 0);
334+
res.val[1] = vld1q_s16(ptr + 8);
335+
336+
return res;
337+
}
338+
339+
struct uint8x16x2_t {
340+
uint8x16_t val[2];
341+
};
342+
343+
inline static uint8x16x2_t vld1q_u8_x2(const uint8_t * ptr) {
344+
uint8x16x2_t res;
345+
346+
res.val[0] = vld1q_u8(ptr + 0);
347+
res.val[1] = vld1q_u8(ptr + 16);
348+
349+
return res;
350+
}
351+
352+
struct uint8x16x4_t {
353+
uint8x16_t val[4];
354+
};
355+
356+
inline static uint8x16x4_t vld1q_u8_x4(const uint8_t * ptr) {
357+
uint8x16x4_t res;
358+
359+
res.val[0] = vld1q_u8(ptr + 0);
360+
res.val[1] = vld1q_u8(ptr + 16);
361+
res.val[2] = vld1q_u8(ptr + 32);
362+
res.val[3] = vld1q_u8(ptr + 48);
363+
364+
return res;
365+
}
366+
367+
struct int8x16x2_t {
368+
int8x16_t val[2];
369+
};
370+
371+
inline static int8x16x2_t vld1q_s8_x2(const int8_t * ptr) {
372+
int8x16x2_t res;
373+
374+
res.val[0] = vld1q_s8(ptr + 0);
375+
res.val[1] = vld1q_s8(ptr + 16);
376+
377+
return res;
378+
}
379+
380+
struct int8x16x4_t {
381+
int8x16_t val[4];
382+
};
383+
384+
inline static int8x16x4_t vld1q_s8_x4(const int8_t * ptr) {
385+
int8x16x4_t res;
386+
387+
res.val[0] = vld1q_s8(ptr + 0);
388+
res.val[1] = vld1q_s8(ptr + 16);
389+
res.val[2] = vld1q_s8(ptr + 32);
390+
res.val[3] = vld1q_s8(ptr + 48);
391+
392+
return res;
393+
}
394+
312395
#endif
313396
#endif
314397

ggml.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
610610
// simd mappings
611611
//
612612

613+
#if defined(__ARM_NEON)
614+
#if !defined(__aarch64__)
615+
616+
// 64-bit compatibility
617+
618+
inline static float vaddvq_f32(float32x4_t v) {
619+
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620+
}
621+
622+
#endif
623+
#endif
624+
613625
// we define a common set of C macros which map to specific intrinsics based on the current architecture
614626
// we then implement the fundamental computation operations below using only these macros
615627
// adding support for new architectures requires to define the corresponding SIMD macros

0 commit comments

Comments
 (0)