
Commit 1673245
wip : rpi4 support
1 parent ce1fe95

File tree

  Makefile
  ggml.c
  ggml.h
  whisper.cpp

4 files changed: +151 −21 lines

Makefile

Lines changed: 16 additions & 7 deletions
@@ -15,25 +15,34 @@ CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
-    CFLAGS += -pthread
+    CFLAGS   += -pthread
+    CXXFLAGS += -pthread
 endif
 ifeq ($(UNAME_S),Darwin)
-    CFLAGS += -pthread
+    CFLAGS   += -pthread
+    CXXFLAGS += -pthread
 endif

 # Architecture specific
 ifeq ($(UNAME_P),x86_64)
     CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
 ifneq ($(filter arm%,$(UNAME_P)),)
-    CFLAGS += -mfpu=neon
+    # Mac M1
 endif
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-    CFLAGS += -mfpu=neon
+ifneq ($(filter aarch64%,$(UNAME_P)),)
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, 2, 3
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
-ifneq ($(filter armv%,$(UNAME_M)),)
+ifneq ($(filter armv7%,$(UNAME_M)),)
     # Raspberry Pi 4
-    CFLAGS += -mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

 #
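Note on the guards: UNAME_P and UNAME_M are set earlier in this Makefile from $(shell uname -p) and $(shell uname -m). A 32-bit Raspberry Pi OS userland reports an armv6l/armv7l machine string (matched by the new armv6%/armv7% filters), a 64-bit one reports aarch64, and uname -p on Apple Silicon reports arm, hence the "Mac M1" comment. Dropping -mcpu=cortex-a72 -mfloat-abi=hard keeps the armv7 flags usable beyond the Pi 4.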

ggml.c

Lines changed: 123 additions & 7 deletions
@@ -1,5 +1,6 @@
 #include "ggml.h"

+#include <alloca.h>
 #include <assert.h>
 #include <time.h>
 #include <math.h>
@@ -12,7 +13,12 @@
 #include <pthread.h>

 #define GGML_DEBUG 0
-#define GGML_MEM_ALIGN 16
+
+#if UINTPTR_MAX == 0xFFFFFFFF
+#define GGML_MEM_ALIGN 4
+#else
+#define GGML_MEM_ALIGN 16
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
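On a 32-bit target UINTPTR_MAX equals 0xFFFFFFFF, so the allocator alignment drops from 16 to 4 bytes to match the native pointer size. A minimal sketch of how a constant like GGML_MEM_ALIGN is consumed when carving objects out of the context arena; the helper name is hypothetical, not part of this diff:

    #include <stddef.h>

    #define GGML_MEM_ALIGN 4 // 32-bit build, per the #if above

    // round an arena offset up to the next GGML_MEM_ALIGN boundary
    static inline size_t ggml_align_offset(size_t offs) {
        return (offs + GGML_MEM_ALIGN - 1) & ~((size_t)(GGML_MEM_ALIGN - 1));
    }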
@@ -305,6 +311,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 #ifdef __ARM_NEON
     const int n32 = (n & ~31);

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     float16x8_t sum0 = vdupq_n_f16(0);
     float16x8_t sum1 = vdupq_n_f16(0);
     float16x8_t sum2 = vdupq_n_f16(0);
@@ -344,6 +351,61 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t

     float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
     sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+#else
+    float32x4_t sum0 = vdupq_n_f32(0);
+    float32x4_t sum1 = vdupq_n_f32(0);
+    float32x4_t sum2 = vdupq_n_f32(0);
+    float32x4_t sum3 = vdupq_n_f32(0);
+    float32x4_t sum4 = vdupq_n_f32(0);
+    float32x4_t sum5 = vdupq_n_f32(0);
+    float32x4_t sum6 = vdupq_n_f32(0);
+    float32x4_t sum7 = vdupq_n_f32(0);
+
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
+
+    for (int i = 0; i < n32; i += 32) {
+        x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
+        x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
+        x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
+        x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
+        x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
+        x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
+        x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
+        x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
+
+        y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
+        y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
+        y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
+        y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
+        y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
+        y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
+        y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
+        y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
+
+        sum0 = vfmaq_f32(sum0, x0, y0);
+        sum1 = vfmaq_f32(sum1, x1, y1);
+        sum2 = vfmaq_f32(sum2, x2, y2);
+        sum3 = vfmaq_f32(sum3, x3, y3);
+        sum4 = vfmaq_f32(sum4, x4, y4);
+        sum5 = vfmaq_f32(sum5, x5, y5);
+        sum6 = vfmaq_f32(sum6, x6, y6);
+        sum7 = vfmaq_f32(sum7, x7, y7);
+    }
+
+    // reduce sum0..sum7 to sum0
+    sum0 = vaddq_f32(sum0, sum1);
+    sum2 = vaddq_f32(sum2, sum3);
+    sum4 = vaddq_f32(sum4, sum5);
+    sum6 = vaddq_f32(sum6, sum7);
+    sum0 = vaddq_f32(sum0, sum2);
+    sum4 = vaddq_f32(sum4, sum6);
+    sum0 = vaddq_f32(sum0, sum4);
+
+    // reduce sum0 to sumf
+    float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
+    sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+#endif

     // leftovers
     for (int i = n32; i < n; ++i) {
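The new #else branch serves NEON targets without native fp16 arithmetic, such as the Pi 4's Cortex-A72 (ARMv8.0, where __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is not defined): inputs are widened with vcvt_f32_f16 and accumulated with fp32 FMAs instead of fp16 ones. A scalar sketch of what the block computes, using ggml.c's existing conversion helper (the _ref name is hypothetical):

    // reference semantics of the vector loop: fp16 inputs, fp32 accumulation
    static float ggml_vec_dot_f16_ref(const int n, const ggml_fp16_t * x, const ggml_fp16_t * y) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
        }
        return sum;
    }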
@@ -486,6 +548,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // NEON 128-bit
     const int n32 = (n & ~31);

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     const float16x8_t v8 = vdupq_n_f16(v);

     float16x8_t x0, x1, x2, x3;
@@ -512,6 +575,51 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         vst1q_f16(y + i + 16, y2);
         vst1q_f16(y + i + 24, y3);
     }
+#else
+    const float32x4_t v40 = vdupq_n_f32(v);
+    const float32x4_t v41 = vdupq_n_f32(v);
+
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
+
+    for (int i = 0; i < n32; i += 32) {
+        y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
+        y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
+        y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
+        y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
+        y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
+        y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
+        y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
+        y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
+
+        x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
+        x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
+        x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
+        x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
+        x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
+        x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
+        x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
+        x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
+
+        y0 = vfmaq_f32(y0, x0, v40);
+        y1 = vfmaq_f32(y1, x1, v40);
+        y2 = vfmaq_f32(y2, x2, v40);
+        y3 = vfmaq_f32(y3, x3, v40);
+        y4 = vfmaq_f32(y4, x4, v41);
+        y5 = vfmaq_f32(y5, x5, v41);
+        y6 = vfmaq_f32(y6, x6, v41);
+        y7 = vfmaq_f32(y7, x7, v41);
+
+        vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
+        vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
+        vst1_f16(y + i + 8 , vcvt_f16_f32(y2));
+        vst1_f16(y + i + 12, vcvt_f16_f32(y3));
+        vst1_f16(y + i + 16, vcvt_f16_f32(y4));
+        vst1_f16(y + i + 20, vcvt_f16_f32(y5));
+        vst1_f16(y + i + 24, vcvt_f16_f32(y6));
+        vst1_f16(y + i + 28, vcvt_f16_f32(y7));
+    }
+#endif

     // leftovers
     for (int i = n32; i < n; ++i) {
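Same treatment for the axpy-style kernel: widen, compute y += x*v with vfmaq_f32, narrow back on store. v40 and v41 hold the same scalar; keeping two copies presumably gives the compiler separate registers for the two halves of the unrolled loop. Scalar reference under the same assumptions (hypothetical name):

    // reference semantics: y[i] += x[i]*v, fp16 storage, fp32 arithmetic
    static void ggml_vec_mad_f16_ref(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float v) {
        for (int i = 0; i < n; ++i) {
            y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
        }
    }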
@@ -911,16 +1019,18 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (is_first_call) {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

+        ggml_fp16_t ii;
         for (int i = 0; i < (1 << 16); ++i) {
-            uint16_t ii = (uint16_t) i;
-            const float f = ggml_fp16_to_fp32(*(ggml_fp16_t *)(&ii));
+            uint16_t ui = i;
+            memcpy(&ii, &ui, sizeof(ii));
+            const float f = ggml_fp16_to_fp32(ii);
             table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
             table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
         }

         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

-        GGML_PRINT_DEBUG("%s: GELU table initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);

         is_first_call = false;
     }
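The memcpy here replaces a pointer-cast type pun: dereferencing a uint16_t through a ggml_fp16_t * (or vice versa) is undefined behavior under C's strict-aliasing rules and can miscompile at -O2, and on builds using -mno-unaligned-access a cast can also produce faulting unaligned loads. A fixed-size memcpy is the well-defined bit-cast and optimizes to a single move. Minimal sketch, assuming ggml.h is in scope for ggml_fp16_t:

    #include <stdint.h>
    #include <string.h>
    #include "ggml.h"

    // well-defined reinterpretation of a 16-bit pattern as fp16
    static inline ggml_fp16_t bits_to_fp16(uint16_t u) {
        ggml_fp16_t h;
        memcpy(&h, &u, sizeof(h)); // replaces *(ggml_fp16_t *)(&u)
        return h;
    }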
@@ -4427,13 +4537,15 @@ void ggml_compute_forward_soft_max_f32(

         ggml_float sum = 0.0;

+        uint16_t ss;
         for (int i = 0; i < nc; i++) {
             if (p[i] == -INFINITY) {
                 p[i] = 0.0;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
                 ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
-                const float val = ggml_fp16_to_fp32(table_exp_f16[*(uint16_t *) &s]);
+                memcpy(&ss, &s, sizeof(ss));
+                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
                 sum += val;
                 p[i] = val;
             }
@@ -5234,13 +5346,15 @@ void ggml_compute_forward_flash_attn_f32(

     ggml_float sum = 0.0;

+    uint16_t ss;
     for (int i = 0; i < M; i++) {
         if (S[i] == -INFINITY) {
             S[i] = 0.0;
         } else {
             //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
             ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
-            const float val = ggml_fp16_to_fp32(table_exp_f16[*(uint16_t *) &s]);
+            memcpy(&ss, &s, sizeof(ss));
+            const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
             sum += val;
             S[i] = val;
         }
@@ -5413,13 +5527,15 @@ void ggml_compute_forward_flash_attn_f16(

     ggml_float sum = 0.0;

+    uint16_t ss;
     for (int i = 0; i < M; i++) {
         if (S[i] == -INFINITY) {
             S[i] = 0.0;
         } else {
             //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
             ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
-            const float val = ggml_fp16_to_fp32(table_exp_f16[*(uint16_t *) &s]);
+            memcpy(&ss, &s, sizeof(ss));
+            const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
             sum += val;
             S[i] = val;
         }
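For context on these three identical edits: table_exp_f16, filled once in ggml_init above, maps every 16-bit fp16 pattern to exp() of that value, so the softmax and attention inner loops trade a libm call for one fp32→fp16 conversion plus a table load. A sketch of the lookup as now written, assuming it lives in ggml.c where the table and helpers are visible:

    // exp(x) via the 65536-entry fp16 table; mirrors the loop bodies above
    static inline float table_exp(float x) {
        const ggml_fp16_t s = ggml_fp32_to_fp16(x); // quantize the argument to fp16
        uint16_t ss;
        memcpy(&ss, &s, sizeof(ss));                // bit pattern as table index
        return ggml_fp16_to_fp32(table_exp_f16[ss]);
    }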

ggml.h

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ struct ggml_tensor {
     int64_t perf_time_us;

     void * data;
-    char pad[8];
+    char padding[8];
 };

 // computation graph

whisper.cpp

Lines changed: 11 additions & 6 deletions
@@ -1291,7 +1291,8 @@ bool whisper_encode(
     struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);

     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, inpO);
         ggml_graph_compute       (ctxL, &gf);
@@ -1327,7 +1328,8 @@ bool whisper_encode(

     // run the computation
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute       (ctx0, &gf);
@@ -1351,7 +1353,8 @@ bool whisper_encode(

     // pre-compute cross-attention memory
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         // TODO: hack to disconnect the encoded features from the previous graph
         cur->op = GGML_OP_NONE;
@@ -1461,7 +1464,8 @@ bool whisper_decode(
     };

     struct ggml_context * ctxL = ggml_init(paramsL);
-    struct ggml_cgraph gf = { .n_threads = n_threads };
+    struct ggml_cgraph gf = {};
+    gf.n_threads = n_threads;

     // norm
     {
@@ -1744,7 +1748,8 @@ bool whisper_decode(

     // run the computation
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute       (ctx0, &gf);
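Note on the repeated gf change: { .n_threads = n_threads } is a C99 designated initializer, which standard C++ only gained (in restricted form) with C++20, so some C++ compilers reject or warn on it when building whisper.cpp. Zero-initializing with {} and assigning n_threads afterwards is equivalent here and portable.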
@@ -2334,7 +2339,7 @@ int whisper_full(
         }
     }

-    if (seek >= whisper_n_len(ctx)) {
+    if (seek + 100 >= whisper_n_len(ctx)) {
         break;
     }
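Note: whisper's mel frames are 10 ms apiece, so seek + 100 >= whisper_n_len(ctx) ends the main loop once less than one second of audio remains, presumably to avoid spending a full decode pass on a trailing sliver.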
