Commit 837811b

add intel amx isa detection
add amx kernel for gemm
add vnni kernel for gemv cases
add vnni and amx kernel support for block_q8_0
code cleanup
fix packing B issue
enable openmp
fine tune amx kernel
switch to aten parallel pattern
add error message for nested parallelism
code cleanup
add f16 support in ggml-amx
add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS
1 parent 081fe43 commit 837811b
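
The headline change is runtime detection of the Intel AMX ISA. The detection code itself is outside this excerpt, so as orientation only, here is a minimal C++ sketch of how AMX-INT8 support is typically probed on x86-64. The helper name is hypothetical; the CPUID facts (leaf 7, sub-leaf 0, EDX bit 24 for AMX-TILE, bit 25 for AMX-INT8) are architectural:

// Hypothetical sketch, not the commit's code: probe CPUID for AMX support.
// Requires GCC or Clang on x86-64 for <cpuid.h>.
#include <cpuid.h>
#include <cstdio>

static bool cpu_has_amx_int8(void) {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    // Leaf 7, sub-leaf 0: structured extended feature flags.
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        return false;
    }
    const bool amx_tile = (edx >> 24) & 1; // AMX-TILE
    const bool amx_int8 = (edx >> 25) & 1; // AMX-INT8
    return amx_tile && amx_int8;
}

int main(void) {
    printf("AMX-INT8: %s\n", cpu_has_amx_int8() ? "yes" : "no");
    return 0;
}

Note that on Linux a process must additionally request the AMX tile state from the kernel (arch_prctl with ARCH_REQ_XCOMP_PERM) before executing tile instructions; detection alone is not sufficient.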

7 files changed (+2643, -5 lines)

Makefile

Lines changed: 20 additions & 5 deletions
@@ -92,11 +92,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif

-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1

@@ -350,6 +345,12 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 MK_LDFLAGS += -fsanitize=undefined -g
 endif

+ifdef LLAMA_OPENMP
+    MK_CPPFLAGS += -fopenmp
+    MK_CFLAGS   += -fopenmp
+    MK_CXXFLAGS += -fopenmp
+endif
+
 ifdef LLAMA_SERVER_VERBOSE
 MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

@@ -567,6 +568,11 @@ ifndef GGML_NO_LLAMAFILE
 OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif

+ifndef GGML_NO_AMX
+    MK_CPPFLAGS += -DGGML_USE_AMX
+    OBJ_GGML    += ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
 OBJ_GGML += ggml/src/ggml-rpc.o

@@ -1026,6 +1032,14 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE

+ifndef GGML_NO_AMX
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \

@@ -1144,6 +1158,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
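
The Makefile changes wire up two things: plain -fopenmp flags under LLAMA_OPENMP (replacing the old deprecation shim) and an optional ggml-amx/mmq.o object guarded by GGML_NO_AMX. The commit message also mentions switching to an ATen-style parallel pattern with an error on nested parallelism; mmq.cpp is not shown here, but a sketch of that pattern over OpenMP could look like the following (all names are illustrative, not the commit's API):

// Illustrative ATen-style parallel_for over OpenMP; names are not from the commit.
#include <omp.h>
#include <cassert>
#include <cstdio>

template <typename F>
static void parallel_for(int n, const F & f) {
    // Mirror the commit's "error message for nested parallelism":
    // refuse to launch a parallel region from inside another one.
    assert(!omp_in_parallel() && "nested parallelism is not supported");
    #pragma omp parallel
    {
        const int nth   = omp_get_num_threads();
        const int ith   = omp_get_thread_num();
        const int chunk = (n + nth - 1) / nth;  // ceil(n / nth)
        const int begin = ith * chunk;
        const int end   = begin + chunk < n ? begin + chunk : n;
        if (begin < end) {
            f(begin, end);                      // this thread's slice of [0, n)
        }
    }
}

int main(void) {
    parallel_for(100, [](int begin, int end) {
        printf("rows %d..%d on thread %d\n", begin, end - 1, omp_get_thread_num());
    });
    return 0;
}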

common/common.cpp

Lines changed: 7 additions & 0 deletions
@@ -75,6 +75,10 @@
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 #endif // LLAMA_USE_CURL

+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
 using json = nlohmann::ordered_json;

 //

@@ -1709,6 +1713,9 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#ifdef _OPENMP
+    os << " (omp_num_threads = " << omp_get_max_threads() << ")";
+#endif
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();

     return os.str();
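
These hunks are self-contained: omp.h is included only when the compiler defines _OPENMP, and the system-info string gains one extra segment. With, say, OMP_NUM_THREADS=8 in the environment, omp_get_max_threads() returns 8 and the fragment appended by this hunk would read " (omp_num_threads = 8)"; without OpenMP the output is unchanged.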

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
@@ -2380,6 +2380,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
     GGML_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_API int ggml_cpu_has_amx_int8 (void);
     GGML_API int ggml_cpu_has_fma (void);
     GGML_API int ggml_cpu_has_neon (void);
     GGML_API int ggml_cpu_has_sve (void);
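
The new predicate sits alongside the other ggml_cpu_has_* queries and can be called the same way. A minimal usage sketch (compile and link against ggml; the fallback wording in the strings is illustrative, not the library's):

// Minimal usage sketch for the new predicate; link against ggml.
#include "ggml.h"
#include <cstdio>

int main(void) {
    // Non-zero when ggml reports AMX-INT8 support for this build/CPU.
    if (ggml_cpu_has_amx_int8()) {
        printf("AMX-INT8 kernels available\n");
    } else {
        printf("AMX-INT8 not available; other CPU paths will be used\n");
    }
    return 0;
}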
