From 4f598dd9731497b60a2d4e372fc896636eb34b8c Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 12:58:07 +0200 Subject: [PATCH 01/43] Initial working stuff --- .gitmodules | 3 + CMakeLists.txt | 18 ++ CMakeLists.txt.user | 454 ++++++++++++++++++++++++++++++++++++++++++++ ggml-vulkan.cpp | 151 +++++++++++++++ ggml-vulkan.h | 13 ++ ggml.c | 4 +- kompute | 1 + 7 files changed, 643 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 100644 CMakeLists.txt.user create mode 100644 ggml-vulkan.cpp create mode 100644 ggml-vulkan.h create mode 160000 kompute diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..4a068a6982090 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/KomputeProject/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index cc7560a7ae54e..cae41110944e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -309,6 +310,22 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_KOMPUTE) + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + + add_subdirectory(kompute) + + set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ggml-vulkan.h) + + add_compile_definitions(GGML_USE_KOMPUTE) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + else() + message(WARNING "Kompute not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -466,6 +483,7 @@ add_library(ggml OBJECT ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} + 
${GGML_SOURCES_KOMPUTE} ${GGML_SOURCES_METAL} ${GGML_SOURCES_EXTRA} ) diff --git a/CMakeLists.txt.user b/CMakeLists.txt.user new file mode 100644 index 0000000000000..e7d3738278eb2 --- /dev/null +++ b/CMakeLists.txt.user @@ -0,0 +1,454 @@ + + + + + + EnvironmentId + {f3929b0b-3d39-4fa3-8d2d-2b329b63b30c} + + + ProjectExplorer.Project.ActiveTarget + 0 + + + ProjectExplorer.Project.EditorSettings + + true + false + true + + Cpp + + CppGlobal + + + + QmlJS + + QmlJSGlobal + + + 2 + UTF-8 + false + 4 + false + 80 + true + true + 1 + false + true + false + 0 + true + true + 0 + 8 + true + false + 1 + true + true + true + *.md, *.MD, Makefile + false + true + true + + + + ProjectExplorer.Project.PluginSettings + + + true + false + true + true + true + true + + + 0 + true + + true + true + Builtin.DefaultTidyAndClazy + 6 + + + + true + + + true + + + + + ProjectExplorer.Project.Target.0 + + Desktop + Clang + Clang + {913660d6-ca1c-4b66-a4da-64108a3258a2} + 0 + 0 + 0 + + Release + false + + -DCMAKE_GENERATOR:STRING=Unix Makefiles +-DCMAKE_BUILD_TYPE:STRING=Release +-DQT_QMAKE_EXECUTABLE:STRING=%{Qt:qmakeExecutable} +-DCMAKE_PREFIX_PATH:STRING=%{Qt:QT_INSTALL_PREFIX} +-DCMAKE_C_COMPILER:STRING=%{Compiler:Executable:C} +-DCMAKE_CXX_COMPILER:STRING=%{Compiler:Executable:Cxx} +-DCMAKE_CXX_FLAGS_INIT:STRING=%{Qt:QML_DEBUG_FLAG} + /mnt/hhdd/Programme/OSS/llama.cpp/../build-llama.cpp-Clang-Release + + + + + all + + false + + true + Build + CMakeProjectManager.MakeStep + + 1 + Build + Build + ProjectExplorer.BuildSteps.Build + + + + + + clean + + false + + true + Build + CMakeProjectManager.MakeStep + + 1 + Clean + Clean + ProjectExplorer.BuildSteps.Clean + + 2 + false + + false + + Release + CMakeProjectManager.CMakeBuildConfiguration + + 1 + + + 0 + Deploy + Deploy + ProjectExplorer.BuildSteps.Deploy + + 1 + + false + ProjectExplorer.DefaultDeployConfiguration + + 1 + + true + true + true + + 2 + + baby-llama + CMakeProjectManager.CMakeRunConfiguration.baby-llama + baby-llama + 
false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + benchmark + CMakeProjectManager.CMakeRunConfiguration.benchmark + benchmark + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-quantize-perf + CMakeProjectManager.CMakeRunConfiguration.test-quantize-perf + test-quantize-perf + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-sampling + CMakeProjectManager.CMakeRunConfiguration.test-sampling + test-sampling + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-tokenizer-0 + CMakeProjectManager.CMakeRunConfiguration.test-tokenizer-0 + test-tokenizer-0 + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + train-text-from-scratch + CMakeProjectManager.CMakeRunConfiguration.train-text-from-scratch + train-text-from-scratch + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + vdot + CMakeProjectManager.CMakeRunConfiguration.vdot + vdot + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + simple + CMakeProjectManager.CMakeRunConfiguration.simple + simple + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + embedding + CMakeProjectManager.CMakeRunConfiguration.embedding + embedding + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + main + CMakeProjectManager.CMakeRunConfiguration.main + main + false + true + true + false + true + 
/mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + perplexity + CMakeProjectManager.CMakeRunConfiguration.perplexity + perplexity + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + q8dot + CMakeProjectManager.CMakeRunConfiguration.q8dot + q8dot + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + quantize + CMakeProjectManager.CMakeRunConfiguration.quantize + quantize + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + quantize-stats + CMakeProjectManager.CMakeRunConfiguration.quantize-stats + quantize-stats + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + save-load-state + CMakeProjectManager.CMakeRunConfiguration.save-load-state + save-load-state + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-quantize-fns + CMakeProjectManager.CMakeRunConfiguration.test-quantize-fns + test-quantize-fns + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + 16 + + + + ProjectExplorer.Project.TargetCount + 1 + + + ProjectExplorer.Project.Updater.FileVersion + 22 + + + Version + 22 + + diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp new file mode 100644 index 0000000000000..45f426a2fadd0 --- /dev/null +++ b/ggml-vulkan.cpp @@ -0,0 +1,151 @@ +#include "ggml-vulkan.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +typedef ggml_fp16_t half; + +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +#define STRINGIFY(x) STRINGIFY2(x) +#define STRINGIFY2(x) #x + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +typedef struct { + half d; + uint8_t qs[QK4_0 / 2]; +} block_q4_0; + +typedef struct { + half d; + half m; + uint8_t qs[QK4_1 / 2]; +} block_q4_1; + + +kp::Manager mgr; + + + +static const std::string program_source_head = R"( +#version 450 +#extension GL_EXT_shader_explicit_arithmetic_types_float16: enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8: enable +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 +layout (local_size_x = 1) in; +)"; + + +static const std::string kernel_dequantize_row_q4_0 = + program_source_head+'\n'+MULTILINE_QUOTE( +// Tensors +layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 2) buffer tensorY { float y[]; }; + +// Push constants +layout(push_constant) uniform PushConstants { + int k; +} pcs; + +void main() { + const int qk = QK4_0; + + const int i = int(gl_GlobalInvocationID.x); + const int j = int(gl_GlobalInvocationID.y); + + const float16_t d = x_d[i]; + const uint8_t qs = x_qs[i * (QK4_0 / 2) + j]; + + const int x0 = (qs & 0x0F) - 8; + const int x1 = (qs >> 4) - 8; + + y[i*qk + j + 0 ] = float16_t(x0)*d; + y[i*qk + j + qk/2] = float16_t(x1)*d; +} +); + + +std::vector compileSource(const std::string& source) { + //FIXME: Terrible solution!!!! 
+ std::ofstream fileOut("tmp_kp_shader.comp"); + fileOut << source; + fileOut.close(); + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + throw std::runtime_error("Error running glslangValidator command"); + std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); + std::vector buffer; + buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); + return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; +} + +void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { + static const int qk = QK4_0; + static const unsigned nb = k / qk; + static const unsigned y_size = qk*2*nb; + const static auto spirv = compileSource(kernel_dequantize_row_q4_0); + + const auto x = reinterpret_cast(x_); + + auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { + std::vector fres; + fres.reserve(nb); + for (unsigned it = 0; it != nb; it++) { + fres.push_back(x[it].d); + } + return fres; + }; + auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { + std::vector fres; + fres.resize(nb*(qk/2)); + for (unsigned x_it = 0; x_it != nb; x_it++) { + for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { + fres.push_back(x[x_it].qs[qs_it]); + } + } + return fres; + }; + + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); + const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); + + struct PushConsts { + int k; + } pushConsts { + k + }; + + mgr.sequence() + ->record({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0}, {}, {0}), std::vector{pushConsts}) + ->record({tensorY}) + ->eval(); + + std::memcpy(y, tensorY->data(), tensorY->size()); +} + + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eFloat; +} + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + 
return TensorDataTypes::eUnsignedInt; +} diff --git a/ggml-vulkan.h b/ggml-vulkan.h new file mode 100644 index 0000000000000..34e6d46b3dbfa --- /dev/null +++ b/ggml-vulkan.h @@ -0,0 +1,13 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_vk_init(void); + +void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 4319683f5186e..151b9eefbf4b7 100644 --- a/ggml.c +++ b/ggml.c @@ -161,6 +161,8 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include +#elif defined(GGML_USE_KOMPUTE) +#include "ggml-vulkan.h" #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -1548,7 +1550,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, diff --git a/kompute b/kompute new file mode 160000 index 0000000000000..63567a72be6b2 --- /dev/null +++ b/kompute @@ -0,0 +1 @@ +Subproject commit 63567a72be6b26f79da92becaffa7cd55f46642b From 2f3fe0c0a45b6c5130ce6b3dd2cf82dcd9eb8a2e Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 12:58:33 +0200 Subject: [PATCH 02/43] Updated gitignore --- .gitignore | 2 + CMakeLists.txt.user | 454 -------------------------------------------- 2 files changed, 2 insertions(+), 454 deletions(-) delete mode 100644 CMakeLists.txt.user diff --git a/.gitignore b/.gitignore index e7bfd52e3d63c..e88b7d83ad2f8 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,5 @@ qnt-*.txt perf-*.txt examples/jeopardy/results.txt + +CMakeLists.txt.user* diff --git a/CMakeLists.txt.user 
b/CMakeLists.txt.user deleted file mode 100644 index e7d3738278eb2..0000000000000 --- a/CMakeLists.txt.user +++ /dev/null @@ -1,454 +0,0 @@ - - - - - - EnvironmentId - {f3929b0b-3d39-4fa3-8d2d-2b329b63b30c} - - - ProjectExplorer.Project.ActiveTarget - 0 - - - ProjectExplorer.Project.EditorSettings - - true - false - true - - Cpp - - CppGlobal - - - - QmlJS - - QmlJSGlobal - - - 2 - UTF-8 - false - 4 - false - 80 - true - true - 1 - false - true - false - 0 - true - true - 0 - 8 - true - false - 1 - true - true - true - *.md, *.MD, Makefile - false - true - true - - - - ProjectExplorer.Project.PluginSettings - - - true - false - true - true - true - true - - - 0 - true - - true - true - Builtin.DefaultTidyAndClazy - 6 - - - - true - - - true - - - - - ProjectExplorer.Project.Target.0 - - Desktop - Clang - Clang - {913660d6-ca1c-4b66-a4da-64108a3258a2} - 0 - 0 - 0 - - Release - false - - -DCMAKE_GENERATOR:STRING=Unix Makefiles --DCMAKE_BUILD_TYPE:STRING=Release --DQT_QMAKE_EXECUTABLE:STRING=%{Qt:qmakeExecutable} --DCMAKE_PREFIX_PATH:STRING=%{Qt:QT_INSTALL_PREFIX} --DCMAKE_C_COMPILER:STRING=%{Compiler:Executable:C} --DCMAKE_CXX_COMPILER:STRING=%{Compiler:Executable:Cxx} --DCMAKE_CXX_FLAGS_INIT:STRING=%{Qt:QML_DEBUG_FLAG} - /mnt/hhdd/Programme/OSS/llama.cpp/../build-llama.cpp-Clang-Release - - - - - all - - false - - true - Build - CMakeProjectManager.MakeStep - - 1 - Build - Build - ProjectExplorer.BuildSteps.Build - - - - - - clean - - false - - true - Build - CMakeProjectManager.MakeStep - - 1 - Clean - Clean - ProjectExplorer.BuildSteps.Clean - - 2 - false - - false - - Release - CMakeProjectManager.CMakeBuildConfiguration - - 1 - - - 0 - Deploy - Deploy - ProjectExplorer.BuildSteps.Deploy - - 1 - - false - ProjectExplorer.DefaultDeployConfiguration - - 1 - - true - true - true - - 2 - - baby-llama - CMakeProjectManager.CMakeRunConfiguration.baby-llama - baby-llama - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - 
- true - true - true - - 2 - - benchmark - CMakeProjectManager.CMakeRunConfiguration.benchmark - benchmark - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-quantize-perf - CMakeProjectManager.CMakeRunConfiguration.test-quantize-perf - test-quantize-perf - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-sampling - CMakeProjectManager.CMakeRunConfiguration.test-sampling - test-sampling - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-tokenizer-0 - CMakeProjectManager.CMakeRunConfiguration.test-tokenizer-0 - test-tokenizer-0 - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - train-text-from-scratch - CMakeProjectManager.CMakeRunConfiguration.train-text-from-scratch - train-text-from-scratch - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - vdot - CMakeProjectManager.CMakeRunConfiguration.vdot - vdot - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - simple - CMakeProjectManager.CMakeRunConfiguration.simple - simple - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - embedding - CMakeProjectManager.CMakeRunConfiguration.embedding - embedding - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - main - CMakeProjectManager.CMakeRunConfiguration.main - main - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - perplexity - 
CMakeProjectManager.CMakeRunConfiguration.perplexity - perplexity - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - q8dot - CMakeProjectManager.CMakeRunConfiguration.q8dot - q8dot - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - quantize - CMakeProjectManager.CMakeRunConfiguration.quantize - quantize - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - quantize-stats - CMakeProjectManager.CMakeRunConfiguration.quantize-stats - quantize-stats - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - save-load-state - CMakeProjectManager.CMakeRunConfiguration.save-load-state - save-load-state - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-quantize-fns - CMakeProjectManager.CMakeRunConfiguration.test-quantize-fns - test-quantize-fns - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - 16 - - - - ProjectExplorer.Project.TargetCount - 1 - - - ProjectExplorer.Project.Updater.FileVersion - 22 - - - Version - 22 - - From 3b3d30e4ade98185b47dd781a7c7b2e82b0353a7 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 13:55:25 +0200 Subject: [PATCH 03/43] Cleanups --- ggml-vulkan.cpp | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 45f426a2fadd0..706a0ffeedd6c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -9,6 +9,10 @@ #include #include +#ifndef __STDC_IEC_559__ +#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. 
+#endif + typedef ggml_fp16_t half; #define MULTILINE_QUOTE(...) #__VA_ARGS__ @@ -53,25 +57,20 @@ layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; -// Push constants -layout(push_constant) uniform PushConstants { - int k; -} pcs; - void main() { const int qk = QK4_0; const int i = int(gl_GlobalInvocationID.x); const int j = int(gl_GlobalInvocationID.y); - const float16_t d = x_d[i]; - const uint8_t qs = x_qs[i * (QK4_0 / 2) + j]; + const float d = float(x_d[i]); + const uint8_t qs = x_qs[i * (qk / 2) + j]; const int x0 = (qs & 0x0F) - 8; const int x1 = (qs >> 4) - 8; - y[i*qk + j + 0 ] = float16_t(x0)*d; - y[i*qk + j + qk/2] = float16_t(x1)*d; + y[i*qk + j + 0 ] = float(x0)*d; + y[i*qk + j + qk/2] = float(x1)*d; } ); @@ -97,20 +96,20 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); + assert(k % qk == 0); + auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { - std::vector fres; - fres.reserve(nb); + std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { - fres.push_back(x[it].d); + fres[it] = x[it].d; } return fres; }; auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { - std::vector fres; - fres.resize(nb*(qk/2)); + std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { - fres.push_back(x[x_it].qs[qs_it]); + fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; } } return fres; @@ -120,15 +119,9 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); - struct PushConsts { - int k; - } pushConsts { - k - }; - mgr.sequence() ->record({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}) - ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, 
qk/2, 0}, {}, {0}), std::vector{pushConsts}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0})) ->record({tensorY}) ->eval(); From b0f11fa9c181e90c9294f83c16004874db682329 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 16:05:56 +0200 Subject: [PATCH 04/43] More code cleanups --- ggml-vulkan.cpp | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 706a0ffeedd6c..b0a84942e91ff 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #ifndef __STDC_IEC_559__ @@ -39,6 +40,20 @@ kp::Manager mgr; +std::vector compileSource(const std::string& source) { + //FIXME: Terrible solution!!!! + std::ofstream fileOut("tmp_kp_shader.comp"); + fileOut << source; + fileOut.close(); + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + throw std::runtime_error("Error running glslangValidator command"); + std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); + std::vector buffer; + buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); + return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; +} + + static const std::string program_source_head = R"( #version 450 #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable @@ -50,9 +65,8 @@ layout (local_size_x = 1) in; )"; -static const std::string kernel_dequantize_row_q4_0 = +static const std::string program_dequantize_row_q4_0 = program_source_head+'\n'+MULTILINE_QUOTE( -// Tensors layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; @@ -75,37 +89,24 @@ void main() { ); -std::vector compileSource(const std::string& source) { - //FIXME: Terrible solution!!!! 
- std::ofstream fileOut("tmp_kp_shader.comp"); - fileOut << source; - fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) - throw std::runtime_error("Error running glslangValidator command"); - std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); - std::vector buffer; - buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); - return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; -} - void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; - static const unsigned nb = k / qk; - static const unsigned y_size = qk*2*nb; - const static auto spirv = compileSource(kernel_dequantize_row_q4_0); + const unsigned nb = k / qk; + const unsigned y_size = nb*qk; + const static auto spirv = compileSource(program_dequantize_row_q4_0); const auto x = reinterpret_cast(x_); assert(k % qk == 0); - auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { + auto getVecBlockQ4_0D = [x, nb] () { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].d; } return fres; }; - auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { + auto getVecBlockQ4_0QS = [x, nb] () { std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { @@ -115,8 +116,8 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { return fres; }; - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D()); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS()); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -125,7 +126,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { ->record({tensorY}) ->eval(); - std::memcpy(y, tensorY->data(), tensorY->size()); + std::memcpy(y, 
tensorY->data(), tensorY->size()*sizeof(*y)); } From 9cdaea9240c8ea21f4eed8ab7f7248ac19844022 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 16:30:36 +0200 Subject: [PATCH 05/43] Implemented dequantize_row_q4_1 --- ggml-vulkan.cpp | 101 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 20 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b0a84942e91ff..c722609a906c0 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -53,6 +53,35 @@ std::vector compileSource(const std::string& source) { return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; } +template +std::vector getVecBlockQ4_0D(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].d; + } + return fres; +} + +template +std::vector getVecBlockQ4_0M(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].m; + } + return fres; +} + +template +std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { + std::vector fres(nb*(qk/2)); + for (unsigned x_it = 0; x_it != nb; x_it++) { + for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { + fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; + } + } + return fres; +}; + static const std::string program_source_head = R"( #version 450 @@ -88,7 +117,6 @@ void main() { } ); - void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; @@ -99,25 +127,8 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { assert(k % qk == 0); - auto getVecBlockQ4_0D = [x, nb] () { - std::vector fres(nb); - for (unsigned it = 0; it != nb; it++) { - fres[it] = x[it].d; - } - return fres; - }; - auto getVecBlockQ4_0QS = [x, nb] () { - std::vector fres(nb*(qk/2)); - for (unsigned x_it = 0; x_it != nb; x_it++) { - for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { - fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; - } - } 
- return fres; - }; - - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D()); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS()); + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -130,6 +141,56 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { } +static const std::string program_dequantize_row_q4_1 = + program_source_head+'\n'+MULTILINE_QUOTE( +layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; +layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 3) buffer tensorY { float y[]; }; + +void main() { + const int qk = QK4_1; + + const int i = int(gl_GlobalInvocationID.x); + const int j = int(gl_GlobalInvocationID.y); + + const float d = float(x_d[i]); + const float m = float(x_m[i]); + const uint8_t qs = x_qs[i * (qk / 2) + j]; + + const int x0 = (qs & 0x0F); + const int x1 = (qs >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} +); + +void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { + static const int qk = QK4_1; + const unsigned nb = k / qk; + const unsigned y_size = nb*qk; + const static auto spirv = compileSource(program_dequantize_row_q4_1); + + const auto x = reinterpret_cast(x_); + + assert(k % qk == 0); + + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); + + mgr.sequence() + ->record({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0})) + 
->record({tensorY}) + ->eval(); + + std::memcpy(y, tensorY->data(), tensorY->size()*sizeof(*y)); +} + + template<> kp::Tensor::TensorDataTypes kp::TensorT::dataType() From 339bc36cdda3014a80c45051ca89bf982e76f750 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 11:50:30 +0200 Subject: [PATCH 06/43] Added more functions from Metal --- ggml-vulkan.cpp | 142 ++++++++++++++++++++++++++++++++++++++++++++++-- ggml-vulkan.h | 26 ++++++++- llama.cpp | 60 ++++++++++++++++++++ 3 files changed, 222 insertions(+), 6 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c722609a906c0..b7e70e221a04c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5,8 +5,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include @@ -14,8 +16,6 @@ #error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. #endif -typedef ggml_fp16_t half; - #define MULTILINE_QUOTE(...) #__VA_ARGS__ #define STRINGIFY(x) STRINGIFY2(x) #define STRINGIFY2(x) #x @@ -24,6 +24,10 @@ typedef ggml_fp16_t half; #define QR4_0 2 #define QK4_1 32 + +typedef ggml_fp16_t half; +enum class byte : unsigned char {}; + typedef struct { half d; uint8_t qs[QK4_0 / 2]; @@ -35,12 +39,82 @@ typedef struct { uint8_t qs[QK4_1 / 2]; } block_q4_1; +struct ggml_kompute_context { + std::unordered_map> buffers; + std::unordered_map> tensors; +}; + kp::Manager mgr; +ggml_kompute_context *ggml_vk_init() { + return new ggml_kompute_context; +} + +void ggml_metal_free(struct ggml_kompute_context * ctx) { + delete ctx; +} + + +bool ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size) { + try { + std::vector vec(max_size); + std::memcpy(vec.data(), data, std::max(size, max_size)); + auto tensor = mgr.tensorT(vec); + ctx->buffers.emplace(name, std::move(tensor)); + } catch (const std::exception & e) { + fprintf(stderr, "ggml_vk: failed to add buffer '%s': %s\n", name, 
e.what()); + return false; + } + return true; +} + +std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { + auto res = ctx->buffers.find(name); + if (res == ctx->buffers.end()) return nullptr; + return res->second; +} + + +void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + if (t->backend != GGML_BACKEND_GPU) { + return; + } + + auto data = t->data; + auto size = ggml_nbytes(t); + + std::vector vec(size); + memcpy(vec.data(), data, size); + + auto tensor = mgr.tensorT(vec); + mgr.sequence()->eval({tensor}); + ctx->tensors.emplace(t, std::move(tensor)); +} + +void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + if (t->backend != GGML_BACKEND_GPU) { + return; + } + + auto data = t->data; + auto size = ggml_nbytes(t); + + auto res = ctx->tensors.find(t); + + auto tensor = res->second; + mgr.sequence()->eval({tensor}); + memcpy(data, tensor->data(), size); +} + -std::vector compileSource(const std::string& source) { +static std::vector compileSource(const std::string& source) { //FIXME: Terrible solution!!!! 
std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; @@ -53,6 +127,7 @@ std::vector compileSource(const std::string& source) { return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; } + template std::vector getVecBlockQ4_0D(T *x, unsigned nb) { std::vector fres(nb); @@ -90,12 +165,12 @@ static const std::string program_source_head = R"( #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 -layout (local_size_x = 1) in; )"; static const std::string program_dequantize_row_q4_0 = program_source_head+'\n'+MULTILINE_QUOTE( +layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; @@ -143,6 +218,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = program_source_head+'\n'+MULTILINE_QUOTE( +layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; @@ -191,6 +267,55 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { } +static const std::string program_abmath = + program_source_head+'\n'+MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float inA[]; }; +layout(binding = 1) buffer tensorInB { float inB[]; }; +layout(binding = 2) buffer tensorout { float out[]; }; + + +void main() { + const int i = int(gl_GlobalInvocationID.x); + + out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i]; +} +); + +template +void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + std::shared_ptr& out, uint32_t 
outOff) { + const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + } pushConsts { + inAOff, inBOff, outOff + }; + + mgr.sequence() + ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts})); +} + +template +void ggml_vk_add(Args&&... args) { + return ggml_vk_abmath<'+'>(std::forward(args)...); +} + +template +void ggml_vk_mul(Args&&... args) { + return ggml_vk_abmath<'*'>(std::forward(args)...); +} + + template<> kp::Tensor::TensorDataTypes kp::TensorT::dataType() @@ -204,3 +329,10 @@ kp::TensorT::dataType() { return TensorDataTypes::eUnsignedInt; } + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eUnsignedInt; +} diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 34e6d46b3dbfa..649c34b537c28 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -1,12 +1,36 @@ #pragma once +#include + #ifdef __cplusplus extern "C" { #endif -void ggml_vk_init(void); +struct ggml_kompute_context; + + +ggml_kompute_context * ggml_vk_init(void); +void ggml_metal_free(struct ggml_kompute_context * ctx); + +// creates a mapping between a host memory buffer and a device memory buffer +// - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute +// - the mapping is used during computation to determine the arguments of the compute kernels +// - you don't need to keep the host memory buffer allocated as it is never accessed by Vulkan +// - max_size specifies the maximum size of a tensor and is used to create shared views such +// that it is guaranteed that the tensor will fit in at least one of the views +// +bool ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size); + +void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_get_tensor(struct 
ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index e597f5048234b..824ed6121ce1d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14,6 +14,8 @@ #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" +#elif defined(GGML_USE_KOMPUTE) +#include "ggml-vulkan.h" #endif #ifdef GGML_USE_METAL @@ -280,6 +282,8 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; +#elif defined(GGML_USE_KOMPUTE) + ggml_kompute_context * ctx_kompute = NULL; #endif int buf_last = 0; @@ -1701,6 +1705,26 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } + ggml_graph_compute(ctx0, &gf); + } +#elif defined(GGML_USE_KOMPUTE) + if (lctx.ctx_kompute && N == 1) { + ggml_vk_graph_compute(lctx.ctx_kompute, &gf); + ggml_vk_get_tensor (lctx.ctx_kompute, cur); + } else { + // IMPORTANT: + // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla + // ggml_graph_compute(). + // + // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. + // But for now, we have focused only on Matrix x Vector Metal multiplication. 
+ // + if (lctx.ctx_kompute) { + // We need to sync the GPU KV cache with the CPU KV cache + ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v); + } + ggml_graph_compute(ctx0, &gf); } #else @@ -2743,6 +2767,42 @@ struct llama_context * llama_init_from_file( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); #undef LLAMA_METAL_CHECK_BUF } +#elif defined(GGML_USE_KOMPUTE) + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers + ctx->ctx_kompute = ggml_vk_init(); + + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + fprintf(stderr, "%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_METAL_CHECK_BUF + } #endif return ctx; From 9d643755a62075bad0570c54e82d87d4228d06ab Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 11:51:25 
+0200 Subject: [PATCH 07/43] Fixed compile error --- ggml-vulkan.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 649c34b537c28..a3bc781d7799c 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -1,15 +1,16 @@ #pragma once -#include - #ifdef __cplusplus +#include extern "C" { +#else +#include #endif struct ggml_kompute_context; -ggml_kompute_context * ggml_vk_init(void); +struct ggml_kompute_context * ggml_vk_init(void); void ggml_metal_free(struct ggml_kompute_context * ctx); // creates a mapping between a host memory buffer and a device memory buffer From b8a4594f8930a53a099d91b0d77c7dd6242ee2af Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 12:19:33 +0200 Subject: [PATCH 08/43] More fixes... --- ggml-vulkan.cpp | 4 ++-- ggml-vulkan.h | 1 + ggml.c | 2 +- llama.cpp | 4 ++-- llama.h | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b7e70e221a04c..7879a59379b87 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -64,8 +64,8 @@ bool ggml_vk_add_buffer( size_t size, size_t max_size) { try { - std::vector vec(max_size); - std::memcpy(vec.data(), data, std::max(size, max_size)); + std::vector vec(std::max(size, max_size)); + std::memcpy(vec.data(), data, size); auto tensor = mgr.tensorT(vec); ctx->buffers.emplace(name, std::move(tensor)); } catch (const std::exception & e) { diff --git a/ggml-vulkan.h b/ggml-vulkan.h index a3bc781d7799c..b7f7371cb5ce2 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -31,6 +31,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); +void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); #ifdef __cplusplus diff --git 
a/ggml.c b/ggml.c index 151b9eefbf4b7..34f0870002a41 100644 --- a/ggml.c +++ b/ggml.c @@ -1558,7 +1558,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, diff --git a/llama.cpp b/llama.cpp index 824ed6121ce1d..85acd4e052ec2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1269,7 +1269,7 @@ static void llama_model_load_internal( } } #endif // GGML_USE_CUBLAS -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_KOMPUTE) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -1707,7 +1707,7 @@ static bool llama_eval_internal( ggml_graph_compute(ctx0, &gf); } -#elif defined(GGML_USE_KOMPUTE) +#elif defined(GGML_USE_KOMPUTE_TODO) if (lctx.ctx_kompute && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, &gf); ggml_vk_get_tensor (lctx.ctx_kompute, cur); diff --git a/llama.h b/llama.h index 0de530d456932..446dd49b94de1 100644 --- a/llama.h +++ b/llama.h @@ -38,7 +38,7 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
#define LLAMA_SUPPORTS_GPU_OFFLOAD #endif @@ -71,7 +71,7 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { int seed; // RNG seed, -1 for random int n_ctx; // text context int n_batch; // prompt processing batch size From d53924799631f93f9207c7be511cda5e75b33066 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:03:33 +0200 Subject: [PATCH 09/43] Began implementing ggml_graph_compute --- ggml-vulkan.cpp | 95 ++++++++++++++++++++++++++++++++++++++++++++++--- ggml-vulkan.h | 6 ++-- llama.cpp | 8 ++--- 3 files changed, 97 insertions(+), 12 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7879a59379b87..d8cc9f1fa4ce3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ bool ggml_vk_add_buffer( return true; } +static std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; @@ -82,7 +84,7 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx } -void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { if (t->backend != GGML_BACKEND_GPU) { return; } @@ -98,7 +100,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors.emplace(t, std::move(tensor)); } -void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { if (t->backend != GGML_BACKEND_GPU) { return; } @@ -107,12 +109,23 @@ void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto size = ggml_nbytes(t); auto res = ctx->tensors.find(t); + assert(res != ctx->tensors.end()); auto 
tensor = res->second; mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); } +static +const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + assert(t->backend != GGML_BACKEND_GPU); + + auto res = ctx->tensors.find(t); + assert(res != ctx->tensors.end()); + + return res->second; +} + static std::vector compileSource(const std::string& source) { //FIXME: Terrible solution!!!! @@ -302,17 +315,89 @@ void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, }; mgr.sequence() - ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts})); + ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); } template void ggml_vk_add(Args&&... args) { - return ggml_vk_abmath<'+'>(std::forward(args)...); + return ggml_vk_abmath<'+'>(std::forward(args)...); } template void ggml_vk_mul(Args&&... args) { - return ggml_vk_abmath<'*'>(std::forward(args)...); + return ggml_vk_abmath<'*'>(std::forward(args)...); +} + + +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { + printf("%s: evaluating graph\n", __func__); + + const int n_seq = gf->n_threads; + + std::vector sequences(n_seq); + + std::vector threads(n_seq); + + for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { + const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; + + threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () { + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; + + auto& seq = sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = (seq_idx == n_seq - 1) ? 
gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + + for (int i = node_start; i < node_end; ++i) { + printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; + std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; + std::shared_ptr id_dst = dst ? 
ggml_vk_get_tensor(ctx, dst) : nullptr; + } + }); + } } diff --git a/ggml-vulkan.h b/ggml-vulkan.h index b7f7371cb5ce2..19aaec949d126 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -27,12 +27,12 @@ bool ggml_vk_add_buffer( size_t size, size_t max_size); -void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); -void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); -void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index 85acd4e052ec2..89c7fa6560ca5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1707,10 +1707,10 @@ static bool llama_eval_internal( ggml_graph_compute(ctx0, &gf); } -#elif defined(GGML_USE_KOMPUTE_TODO) +#elif defined(GGML_USE_KOMPUTE) if (lctx.ctx_kompute && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, &gf); - ggml_vk_get_tensor (lctx.ctx_kompute, cur); + ggml_vk_d2h_tensor (lctx.ctx_kompute, cur); } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1721,8 +1721,8 @@ static bool llama_eval_internal( // if (lctx.ctx_kompute) { // We need to sync the GPU KV cache with the CPU KV cache - ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k); - ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); } ggml_graph_compute(ctx0, &gf); From 18d6f7f8da06788788b4ec99c3dd7c90f52162e9 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:08:45 +0200 
Subject: [PATCH 10/43] More progress... --- ggml-vulkan.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d8cc9f1fa4ce3..3e7fe30a68ce6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -395,6 +395,20 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; std::shared_ptr id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr; + + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + } break; + } } }); } From b6264542b7bd3dc0461b08949c438771be47eca8 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:19:31 +0200 Subject: [PATCH 11/43] Added vk_mul to ggml_vk_graph_compute --- ggml-vulkan.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 3e7fe30a68ce6..57e1ebf6fe1dc 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -286,6 +286,7 @@ layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; uint outOff; + uint row; } pcs; @@ -298,20 +299,23 @@ layout(binding = 2) buffer tensorout { float out[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); - out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i]; + out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } ); template void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, - std::shared_ptr& out, uint32_t outOff) { - const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath); + std::shared_ptr& out, uint32_t outOff, + uint32_t row = 0) { + const 
static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+"\n" + "#define ROW_OP "+(row?"% pcs.row":"")+"\n" + +program_abmath); struct PushConstants { - uint32_t inAOff, inBOff, outOff; + uint32_t inAOff, inBOff, outOff, row; } pushConsts { - inAOff, inBOff, outOff + inAOff, inBOff, outOff, row }; mgr.sequence() @@ -334,7 +338,11 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const int n_seq = gf->n_threads; - std::vector sequences(n_seq); + std::vector> sequences(n_seq); + + for (auto& sequence : sequences) { + sequence = mgr.sequence(); + } std::vector threads(n_seq); @@ -346,7 +354,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph size_t offs_src1 = 0; size_t offs_dst = 0; - auto& seq = sequences[seq_idx]; + auto& seq = *sequences[seq_idx]; const int node_start = (seq_idx + 0) * n_nodes_per_seq; const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; @@ -408,6 +416,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); + } else { + ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + } + } break; } } }); From 5e9403342b953adce49cf585ca6a6574af4c6c61 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:01:09 +0200 Subject: [PATCH 12/43] Minor fixes --- ggml-vulkan.cpp | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 57e1ebf6fe1dc..af697b2214cb2 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -18,8 +18,6 @@ #endif #define MULTILINE_QUOTE(...) 
#__VA_ARGS__ -#define STRINGIFY(x) STRINGIFY2(x) -#define STRINGIFY2(x) #x #define QK4_0 32 #define QR4_0 2 @@ -182,7 +180,7 @@ static const std::string program_source_head = R"( static const std::string program_dequantize_row_q4_0 = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; @@ -209,7 +207,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_dequantize_row_q4_0); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0); const auto x = reinterpret_cast(x_); @@ -230,7 +228,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; @@ -259,7 +257,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_dequantize_row_q4_1); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1); const auto x = reinterpret_cast(x_); @@ -281,7 +279,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const std::string program_abmath = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; @@ -293,24 +291,25 @@ layout(push_constant) uniform PushConstants { layout(local_size_x = 1) in; layout(binding = 0) 
buffer tensorInA { float inA[]; }; layout(binding = 1) buffer tensorInB { float inB[]; }; -layout(binding = 2) buffer tensorout { float out[]; }; +layout(binding = 2) buffer tensorOut { float out_[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); - out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; + out_[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } ); template void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, - const std::shared_ptr& inB, uint32_t inBOff, - std::shared_ptr& out, uint32_t outOff, - uint32_t row = 0) { - const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+"\n" - "#define ROW_OP "+(row?"% pcs.row":"")+"\n" - +program_abmath); + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t row = 0) { + const static auto spirv = compileSource(program_source_head+ + "#define MATH_OP "+std::string(1, mathOP)+"\n" + "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ + program_abmath); struct PushConstants { uint32_t inAOff, inBOff, outOff, row; From e830264c9240bda2c2976b4f36a7f028f765f550 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:10:24 +0200 Subject: [PATCH 13/43] Share sequence to functions and add scale() --- ggml-vulkan.cpp | 50 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index af697b2214cb2..45502ab5a5838 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -287,13 +287,11 @@ layout(push_constant) uniform PushConstants { uint row; } pcs; - layout(local_size_x = 1) in; layout(binding = 0) buffer tensorInA { float inA[]; }; layout(binding = 1) buffer tensorInB { float inB[]; }; layout(binding = 2) buffer tensorOut { float out_[]; }; - void main() { const int i = int(gl_GlobalInvocationID.x); @@ -302,7 +300,8 @@ void main() { ); template -void ggml_vk_abmath(const std::shared_ptr& inA, 
uint32_t inAOff, +void ggml_vk_abmath(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, uint32_t row = 0) { @@ -317,8 +316,7 @@ void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, inAOff, inBOff, outOff, row }; - mgr.sequence() - ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); } template @@ -332,6 +330,42 @@ void ggml_vk_mul(Args&&... args) { } +static const std::string program_scale = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; + float scale; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const int i = int(gl_GlobalInvocationID.x); + + out_[pcs.outOff+i] = in_[pcs.inOff+i] * pcs.scale; +} +); + +void ggml_vk_scale(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + float scale) { + const static auto spirv = compileSource(program_source_head+program_scale); + + struct PushConstants { + uint32_t inOff, outOff; + float scale; + } pushConsts { + inOff, outOff, scale + }; + + seq.record(mgr.algorithm({in, out}, spirv, {in->size()-inOff}, {}, {pushConsts})); +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -413,15 +447,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } break; case GGML_OP_MUL: { if (ggml_nelements(src1) == 
ne10) { // src1 is a row - ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); } else { - ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } } break; } From 5c0d8dd0f23170d62893e67a931c689b672d80a6 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:58:13 +0200 Subject: [PATCH 14/43] Specify program output size --- ggml-vulkan.cpp | 74 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 45502ab5a5838..ed6e704f47df3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -293,7 +293,7 @@ layout(binding = 1) buffer tensorInB { float inB[]; }; layout(binding = 2) buffer tensorOut { float out_[]; }; void main() { - const int i = int(gl_GlobalInvocationID.x); + const uint i = gl_GlobalInvocationID.x; out_[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } @@ -304,7 +304,7 @@ void ggml_vk_abmath(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, - uint32_t row = 0) { + uint32_t size, uint32_t row = 0) { const static auto spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ @@ -316,7 +316,7 @@ void ggml_vk_abmath(kp::Sequence& seq, inAOff, inBOff, outOff, row }; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } template @@ -343,7 +343,7 @@ layout(binding = 0) buffer tensorInA { float in_[]; }; layout(binding = 1) buffer tensorOut { float out_[]; }; void main() { - const int i = 
int(gl_GlobalInvocationID.x); + const uint i = gl_GlobalInvocationID.x; out_[pcs.outOff+i] = in_[pcs.inOff+i] * pcs.scale; } @@ -352,7 +352,7 @@ void main() { void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, - float scale) { + uint32_t size, float scale) { const static auto spirv = compileSource(program_source_head+program_scale); struct PushConstants { @@ -362,7 +362,42 @@ void ggml_vk_scale(kp::Sequence& seq, inOff, outOff, scale }; - seq.record(mgr.algorithm({in, out}, spirv, {in->size()-inOff}, {}, {pushConsts})); + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + + +static const std::string program_silu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + const float x = in_[pcs.inOff+i]; + + out_[pcs.outOff+i] = x / (1.0f + exp(-x)); +} +); + +void ggml_vk_silu(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + const static auto spirv = compileSource(program_source_head+program_silu); + + struct PushConstants { + uint32_t inOff, outOff; + } pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); } @@ -447,17 +482,26 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } break; case GGML_OP_MUL: - { - if (ggml_nelements(src1) == ne10) { - // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); - } 
else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); - } - } break; + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + } else { + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_SILU: + { + ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; } } }); From 2589cb0c704189da7f8bf92fef276489f75cb548 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:02:49 +0200 Subject: [PATCH 15/43] Prevent compileSource race --- ggml-vulkan.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index ed6e704f47df3..5e1d206bdf76c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,8 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte static std::vector compileSource(const std::string& source) { + static std::mutex mutex; + std::lock_guard L(mutex); //FIXME: Terrible solution!!!! 
std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; @@ -176,6 +179,8 @@ static const std::string program_source_head = R"( #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 +#define GELU_COEF_A 0.044715; +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876; )"; From 09b0b3a49ba757ef44b5e8387eab3a9af4521c0d Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:13:32 +0200 Subject: [PATCH 16/43] Wait for all threads to finish --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5e1d206bdf76c..c35509bd8b223 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -511,6 +511,12 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } }); } + + // Wait for all threads to finish + for (auto& thread : threads) { + if (thread.joinable()) + thread.join(); + } } From 98e588c6eb54e7fe9726ad46c98d30bac5473a8b Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:50:37 +0200 Subject: [PATCH 17/43] Fix ggml_vk_h2d_tensor throwing on second call --- ggml-vulkan.cpp | 18 +++++++++++++----- llama.cpp | 16 ++++++++-------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c35509bd8b223..c260c59c21402 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -91,12 +91,20 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); - std::vector vec(size); - memcpy(vec.data(), data, size); + auto res = ctx->tensors.find(t); + + if (res != ctx->tensors.end()) { + assert(res->second->size() != size); + res->second->setRawData(data); + mgr.sequence()->eval({res->second}); + } else { + std::vector vec(size); + memcpy(vec.data(), data, size); - auto tensor = mgr.tensorT(vec); - mgr.sequence()->eval({tensor}); - ctx->tensors.emplace(t, std::move(tensor)); + auto tensor = mgr.tensorT(vec); + mgr.sequence()->eval({tensor}); + 
ctx->tensors.emplace(t, std::move(tensor)); + } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { diff --git a/llama.cpp b/llama.cpp index 89c7fa6560ca5..cbe285afb743a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2769,7 +2769,7 @@ struct llama_context * llama_init_from_file( } #elif defined(GGML_USE_KOMPUTE) if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers + // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); void * data_ptr = NULL; @@ -2787,21 +2787,21 @@ struct llama_context * llama_init_from_file( printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); -#define LLAMA_METAL_CHECK_BUF(result) \ +#define LLAMA_VK_CHECK_BUF(result) \ if (!(result)) { \ fprintf(stderr, "%s: failed to add buffer\n", __func__); \ llama_free(ctx); \ return NULL; \ } - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + 
LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_VK_CHECK_BUF } #endif From 46f577bfc1e29b4397e6958bc4400b326456c314 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:10:45 +0200 Subject: [PATCH 18/43] h2d tensors during loadup --- ggml-vulkan.cpp | 86 +++++++++++++++++++++++++++++++++++++++++++------ llama.cpp | 12 +++++-- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c260c59c21402..0f454c899cd94 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -378,6 +378,19 @@ void ggml_vk_scale(kp::Sequence& seq, seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); } +void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + struct PushConstants { + uint32_t inOff, outOff; + } pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + static const std::string program_silu = MULTILINE_QUOTE( @@ -398,19 +411,64 @@ void main() { } ); -void ggml_vk_silu(kp::Sequence& seq, - const std::shared_ptr& in, uint32_t inOff, - const std::shared_ptr& out, uint32_t outOff, - uint32_t size) { +template +void ggml_vk_silu(Args&&... 
args) { const static auto spirv = compileSource(program_source_head+program_silu); - struct PushConstants { - uint32_t inOff, outOff; - } pushConsts { - inOff, outOff - }; + ggml_vk_xxlu(spirv, std::forward(args)...); +} - seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); + +static const std::string program_relu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + + out_[pcs.outOff+i] = max(0.0, in_[pcs.inOff+i]); +} +); + +template +void ggml_vk_relu(Args&&... args) { + const static auto spirv = compileSource(program_source_head+program_relu); + + ggml_vk_xxlu(spirv, std::forward(args)...); +} + + +static const std::string program_gelu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + const float x = in_[pcs.inOff+i]; + + out_[pcs.outOff+i] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); +} +); + +template +void ggml_vk_gelu(Args&&... 
args) { + const static auto spirv = compileSource(program_source_head+program_gelu); + + ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -515,6 +573,14 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + case GGML_OP_RELU: + { + ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_GELU: + { + ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; } } }); diff --git a/llama.cpp b/llama.cpp index cbe285afb743a..be4b5ca6872a8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -753,7 +753,7 @@ struct llama_model_loader { } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(llama_context & lctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t prefetch_size = 0; size_t lock_size = 0; @@ -810,6 +810,14 @@ struct llama_model_loader { free(lt.data); } break; +#elif defined(GGML_USE_KOMPUTE) + case GGML_BACKEND_GPU: + lt.ggml_tensor->data = lt.data; + ggml_vk_h2d_tensor(lctx.ctx_kompute, lt.ggml_tensor); + if (!use_mmap) { + free(lt.data); + } + break; #endif default: continue; @@ -1315,7 +1323,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(lctx, progress_callback, progress_callback_user_data, use_mlock ? 
&lctx.model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); From 1a6819540856b8bf78958918f5b0279e080b99dc Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:46:09 +0200 Subject: [PATCH 19/43] Add mutexes for gpu tensors --- ggml-vulkan.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 0f454c899cd94..f8b24f706b4ca 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -42,6 +42,7 @@ typedef struct { struct ggml_kompute_context { std::unordered_map> buffers; std::unordered_map> tensors; + std::mutex tensors_mutex; }; @@ -63,6 +64,8 @@ bool ggml_vk_add_buffer( void * data, size_t size, size_t max_size) { + printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); + try { std::vector vec(std::max(size, max_size)); std::memcpy(vec.data(), data, size); @@ -77,6 +80,8 @@ bool ggml_vk_add_buffer( static std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { + printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); + auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; return res->second; @@ -84,6 +89,8 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + if (t->backend != GGML_BACKEND_GPU) { return; } @@ -91,7 +98,9 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); if (res != ctx->tensors.end()) { assert(res->second->size() != size); @@ -103,11 +112,15 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = mgr.tensorT(vec); mgr.sequence()->eval({tensor}); + ctx->tensors_mutex.lock(); 
ctx->tensors.emplace(t, std::move(tensor)); + ctx->tensors_mutex.unlock(); } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + if (t->backend != GGML_BACKEND_GPU) { return; } @@ -115,7 +128,9 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); assert(res != ctx->tensors.end()); auto tensor = res->second; @@ -125,9 +140,13 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + assert(t->backend != GGML_BACKEND_GPU); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); assert(res != ctx->tensors.end()); return res->second; From e6da9bd96b3444941421e71a0962976d9931a773 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:57:09 +0200 Subject: [PATCH 20/43] Added ggml_vk_mem_used() --- ggml-vulkan.cpp | 17 ++++++++++++++++- ggml-vulkan.h | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f8b24f706b4ca..12ed52fed6fc9 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -53,11 +54,25 @@ ggml_kompute_context *ggml_vk_init() { return new ggml_kompute_context; } -void ggml_metal_free(struct ggml_kompute_context * ctx) { +void ggml_vk_free(struct ggml_kompute_context * ctx) { delete ctx; } +size_t ggml_vk_mem_used(struct ggml_kompute_context * ctx) { + size_t fres = 0; + ctx->tensors_mutex.lock(); + for (const auto& tensor : ctx->tensors) { + fres += tensor.second->size(); + } + ctx->tensors_mutex.unlock(); + for (const 
auto& buffer : ctx->buffers) { + fres += buffer.second->size(); + } + return fres; +} + + bool ggml_vk_add_buffer( struct ggml_kompute_context * ctx, const char * name, diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 19aaec949d126..5ec392782e0cd 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -11,7 +11,9 @@ struct ggml_kompute_context; struct ggml_kompute_context * ggml_vk_init(void); -void ggml_metal_free(struct ggml_kompute_context * ctx); +void ggml_vk_free(struct ggml_kompute_context * ctx); + +size_t ggml_vk_mem_used(struct ggml_kompute_context * ctx); // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute From 40621ea0ec038bb0a360d1579999f6c8a3f73f88 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 18:26:21 +0200 Subject: [PATCH 21/43] Added more debugging --- ggml-vulkan.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 12ed52fed6fc9..9e422430929fe 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -118,7 +118,7 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.unlock(); if (res != ctx->tensors.end()) { - assert(res->second->size() != size); + GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); } else { @@ -146,7 +146,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - assert(res != ctx->tensors.end()); + GGML_ASSERT(res != ctx->tensors.end()); auto tensor = res->second; mgr.sequence()->eval({tensor}); @@ -157,12 +157,12 @@ static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - assert(t->backend != 
GGML_BACKEND_GPU); + GGML_ASSERT(t->backend != GGML_BACKEND_GPU); ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - assert(res != ctx->tensors.end()); + GGML_ASSERT(res != ctx->tensors.end()); return res->second; } @@ -258,7 +258,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); - assert(k % qk == 0); + GGML_ASSERT(k % qk == 0); const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); @@ -308,7 +308,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); - assert(k % qk == 0); + GGML_ASSERT(k % qk == 0); const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); @@ -615,8 +615,14 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); } } + + // Evaluate sequence + seq.eval(); }); } From 4b267e88b683f91e9a292ae4fc53f497835b9421 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 18:40:58 +0200 Subject: [PATCH 22/43] Temporarily care for all layers --- ggml-vulkan.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 9e422430929fe..2b2e3378c4727 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -106,10 +106,6 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - if (t->backend != GGML_BACKEND_GPU) { - return; - } - auto data = t->data; auto size = ggml_nbytes(t); 
@@ -121,6 +117,7 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); + printf("%s: Updating Host->GPU tensor: %p\n", __func__, t); } else { std::vector vec(size); memcpy(vec.data(), data, size); @@ -130,16 +127,13 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.lock(); ctx->tensors.emplace(t, std::move(tensor)); ctx->tensors_mutex.unlock(); + printf("%s: Creating Host->GPU tensor: %p\n", __func__, t); } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - if (t->backend != GGML_BACKEND_GPU) { - return; - } - auto data = t->data; auto size = ggml_nbytes(t); @@ -151,18 +145,21 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = res->second; mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); + printf("%s: Updating GPU->Host tensor: %p\n", __func__, t); } static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - GGML_ASSERT(t->backend != GGML_BACKEND_GPU); - ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - GGML_ASSERT(res != ctx->tensors.end()); + + if (res == ctx->tensors.end()) { + ggml_vk_h2d_tensor(ctx, t); + return ggml_vk_get_tensor(ctx, t); + } return res->second; } From 55815b67f41998ce32e152569f730286480a8068 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 19:58:41 +0200 Subject: [PATCH 23/43] Improved memory safety --- ggml-vulkan.cpp | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 2b2e3378c4727..5007629103af1 100644 --- a/ggml-vulkan.cpp +++ 
b/ggml-vulkan.cpp @@ -94,26 +94,26 @@ bool ggml_vk_add_buffer( } static -std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { +kp::Tensor* ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); - auto res = ctx->buffers.find(name); + const auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; - return res->second; + return res->second.get(); } void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - auto data = t->data; - auto size = ggml_nbytes(t); + const auto data = t->data; + const auto size = ggml_nbytes(t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); - ctx->tensors_mutex.unlock(); + const auto res = ctx->tensors.find(t); if (res != ctx->tensors.end()) { + ctx->tensors_mutex.unlock(); GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); @@ -124,7 +124,6 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = mgr.tensorT(vec); mgr.sequence()->eval({tensor}); - ctx->tensors_mutex.lock(); ctx->tensors.emplace(t, std::move(tensor)); ctx->tensors_mutex.unlock(); printf("%s: Creating Host->GPU tensor: %p\n", __func__, t); @@ -134,15 +133,15 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - auto data = t->data; - auto size = ggml_nbytes(t); + const auto data = t->data; + const auto size = ggml_nbytes(t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); + const auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); GGML_ASSERT(res != ctx->tensors.end()); - auto tensor = res->second; + auto& tensor = res->second; 
mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); printf("%s: Updating GPU->Host tensor: %p\n", __func__, t); @@ -153,10 +152,11 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); + const auto res = ctx->tensors.find(t); + const auto end = ctx->tensors.end(); ctx->tensors_mutex.unlock(); - if (res == ctx->tensors.end()) { + if (res == end) { ggml_vk_h2d_tensor(ctx, t); return ggml_vk_get_tensor(ctx, t); } @@ -356,7 +356,7 @@ void ggml_vk_abmath(kp::Sequence& seq, struct PushConstants { uint32_t inAOff, inBOff, outOff, row; - } pushConsts { + } const pushConsts { inAOff, inBOff, outOff, row }; @@ -370,6 +370,7 @@ void ggml_vk_add(Args&&... args) { template void ggml_vk_mul(Args&&... args) { + printf("%s: multiplying...\n", __func__); return ggml_vk_abmath<'*'>(std::forward(args)...); } @@ -377,13 +378,13 @@ void ggml_vk_mul(Args&&... 
args) { static const std::string program_scale = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; float scale; } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 0) buffer tensorIn { float in_[]; }; layout(binding = 1) buffer tensorOut { float out_[]; }; void main() { @@ -402,7 +403,7 @@ void ggml_vk_scale(kp::Sequence& seq, struct PushConstants { uint32_t inOff, outOff; float scale; - } pushConsts { + } const pushConsts { inOff, outOff, scale }; @@ -415,7 +416,7 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, uint32_t size) { struct PushConstants { uint32_t inOff, outOff; - } pushConsts { + } const pushConsts { inOff, outOff }; @@ -426,8 +427,8 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, static const std::string program_silu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -614,7 +615,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + //GGML_ASSERT(false); } } From e0814f86a25d1f6e56db9e369b688bd8167e941c Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 20:02:46 +0200 Subject: [PATCH 24/43] Free vk context --- llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama.cpp b/llama.cpp index be4b5ca6872a8..740726445465a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2817,6 +2817,9 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_KOMPUTE + ggml_vk_free(ctx->ctx_kompute); +#endif delete ctx; } From 5d5f66d1d914d520a3f40099881571024088a072 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 20:37:58 +0200 Subject: [PATCH 25/43] More little fixes and stuff --- 
ggml-vulkan.cpp | 231 ++++++++++++++++++++++++------------------------ 1 file changed, 114 insertions(+), 117 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5007629103af1..4d4f31e77a6ab 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -165,14 +165,15 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte } -static std::vector compileSource(const std::string& source) { +static std::vector compileSource(const std::string& source, const char *debug_name) { + printf("%s: Compiling compute program: %s\n", __func__, debug_name); static std::mutex mutex; std::lock_guard L(mutex); //FIXME: Terrible solution!!!! std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null").c_str())) throw std::runtime_error("Error running glslangValidator command"); std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; @@ -251,7 +252,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0, __func__); const auto x = reinterpret_cast(x_); @@ -301,7 +302,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1, __func__); const auto x = reinterpret_cast(x_); @@ -352,7 +353,7 @@ void ggml_vk_abmath(kp::Sequence& seq, const static auto 
spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ - program_abmath); + program_abmath, __func__); struct PushConstants { uint32_t inAOff, inBOff, outOff, row; @@ -370,7 +371,6 @@ void ggml_vk_add(Args&&... args) { template void ggml_vk_mul(Args&&... args) { - printf("%s: multiplying...\n", __func__); return ggml_vk_abmath<'*'>(std::forward(args)...); } @@ -398,7 +398,7 @@ void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, float scale) { - const static auto spirv = compileSource(program_source_head+program_scale); + const static auto spirv = compileSource(program_source_head+program_scale, __func__); struct PushConstants { uint32_t inOff, outOff; @@ -445,7 +445,7 @@ void main() { template void ggml_vk_silu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_silu); + const static auto spirv = compileSource(program_source_head+program_silu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -471,7 +471,7 @@ void main() { template void ggml_vk_relu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_relu); + const static auto spirv = compileSource(program_source_head+program_relu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -498,7 +498,7 @@ void main() { template void ggml_vk_gelu(Args&&... 
args) { - const static auto spirv = compileSource(program_source_head+program_gelu); + const static auto spirv = compileSource(program_source_head+program_gelu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -514,120 +514,117 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph for (auto& sequence : sequences) { sequence = mgr.sequence(); } - - std::vector threads(n_seq); - for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; - threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () { - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_dst = 0; - - auto& seq = *sequences[seq_idx]; - - const int node_start = (seq_idx + 0) * n_nodes_per_seq; - const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; - - for (int i = node_start; i < node_end; ++i) { - printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; - struct ggml_tensor * dst = gf->nodes[i]; - - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; - - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? src0->nb[3] : 0; - - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; - - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; - - const int64_t ne0 = dst ? 
dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; - - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; - std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; - std::shared_ptr id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr; - - switch (dst->op) { - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop - } break; - case GGML_OP_ADD: - { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_MUL: - { - if (ggml_nelements(src1) == ne10) { - // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); - } else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); - } - } break; - case GGML_OP_SCALE: - { - const float scale = *(const float *) src1->data; - ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); - } break; - case GGML_OP_SILU: - { - ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_RELU: - { - ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_GELU: - { - ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - default: - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - 
//GGML_ASSERT(false); - } + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; + + auto& seq = *sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + + for (int i = node_start; i < node_end; ++i) { + printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? 
dst->type : GGML_TYPE_COUNT; + + + const static std::shared_ptr nullTensor = nullptr; + const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullTensor; + const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullTensor; + const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullTensor; + + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + } else { + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_SILU: + { + ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_RELU: + { + ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_GELU: + { + ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + //default: + //fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + //GGML_ASSERT(false); } + } - // Evaluate sequence - seq.eval(); - }); + // Evaluate sequence + seq.evalAsync(); } - // Wait for all threads to finish - for (auto& thread : threads) { - if (thread.joinable()) - thread.join(); + // Wait for all sequences to finish + for (auto& sequence : sequences) { + if (sequence->isRunning()) + sequence->evalAwait(); } } From acb7d90398980ffe8ea41785b43430672ca8e7f8 Mon Sep 17 00:00:00 2001 
From: niansa Date: Fri, 23 Jun 2023 20:39:32 +0200 Subject: [PATCH 26/43] Reenabled unknown op message --- ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4d4f31e77a6ab..056dd9244477e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -611,8 +611,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; - //default: - //fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); } } From 072007b1e8118164c10f126a3b9de8763646d034 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 21:21:16 +0200 Subject: [PATCH 27/43] Add buffer qualifiers --- ggml-vulkan.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 056dd9244477e..b5b2dc5fcb5ef 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -227,9 +227,9 @@ static const std::string program_source_head = R"( static const std::string program_dequantize_row_q4_0 = MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; -layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; -layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; -layout(binding = 2) buffer tensorY { float y[]; }; +layout(binding = 0) buffer restrict readonly tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer restrict readonly tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 2) buffer restrict writeonly tensorY { float y[]; }; void main() { const int qk = QK4_0; @@ -275,10 +275,10 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = MULTILINE_QUOTE( layout(local_size_x 
= 1, local_size_y = 1) in; -layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; -layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; -layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; -layout(binding = 3) buffer tensorY { float y[]; }; +layout(binding = 0) buffer restrict readonly tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer restrict readonly tensorBlockQ4_0M { float16_t x_m[]; }; +layout(binding = 2) buffer restrict readonly tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 3) buffer restrict writeonly tensorY { float y[]; }; void main() { const int qk = QK4_1; @@ -333,9 +333,9 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float inA[]; }; -layout(binding = 1) buffer tensorInB { float inB[]; }; -layout(binding = 2) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -384,8 +384,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorIn { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -432,8 +432,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i 
= gl_GlobalInvocationID.x; @@ -459,8 +459,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -485,8 +485,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; From ed14f0764ad94d5550016226e3c14c1c6e87ce35 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 10:15:23 +0200 Subject: [PATCH 28/43] Fixed ggml_vk_abmath row argument --- ggml-vulkan.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b5b2dc5fcb5ef..15433d544bdfb 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -344,15 +344,17 @@ void main() { } ); -template +template void ggml_vk_abmath(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, uint32_t row = 0) { + GGML_ASSERT(with_row?row:!row); + const static auto spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" - "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ + "#define ROW_OP "+(with_row?"% pcs.row":"")+'\n'+ program_abmath, __func__); struct PushConstants { @@ -369,9 +371,9 @@ void ggml_vk_add(Args&&... args) { return ggml_vk_abmath<'+'>(std::forward(args)...); } -template +template void ggml_vk_mul(Args&&... 
args) { - return ggml_vk_abmath<'*'>(std::forward(args)...); + return ggml_vk_abmath<'*', with_row>(std::forward(args)...); } @@ -589,7 +591,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (ggml_nelements(src1) == ne10) { // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); } else { ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } From e2b721db654129f6d1a4c55dbd51bb503406104b Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 10:19:18 +0200 Subject: [PATCH 29/43] Allow vk add row --- ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 15433d544bdfb..3c7beeddeea93 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -366,9 +366,9 @@ void ggml_vk_abmath(kp::Sequence& seq, seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } -template +template void ggml_vk_add(Args&&... 
args) { - return ggml_vk_abmath<'+'>(std::forward(args)...); + return ggml_vk_abmath<'+', with_row>(std::forward(args)...); } template From de7d1823ed7e6c9054e10368ebe34e0c666af7b2 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 12:48:41 +0200 Subject: [PATCH 30/43] Implemented ggml_vk_soft_max --- ggml-vulkan.cpp | 144 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 127 insertions(+), 17 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 3c7beeddeea93..1cc54d06f2636 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -212,15 +212,28 @@ std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { }; -static const std::string program_source_head = R"( -#version 450 +static const std::string program_source_head = R"(#version 450 + #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable #extension GL_EXT_shader_explicit_arithmetic_types_int8: enable +#extension GL_EXT_shader_explicit_arithmetic_types_int64: enable + #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 + #define GELU_COEF_A 0.044715; #define SQRT_2_OVER_PI 0.79788456080286535587989211986876; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif )"; @@ -366,16 +379,6 @@ void ggml_vk_abmath(kp::Sequence& seq, seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } -template -void ggml_vk_add(Args&&... args) { - return ggml_vk_abmath<'+', with_row>(std::forward(args)...); -} - -template -void ggml_vk_mul(Args&&... args) { - return ggml_vk_abmath<'*', with_row>(std::forward(args)...); -} - static const std::string program_scale = MULTILINE_QUOTE( @@ -456,8 +459,8 @@ void ggml_vk_silu(Args&&... args) { static const std::string program_relu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -482,8 +485,8 @@ void ggml_vk_relu(Args&&... 
args) { static const std::string program_gelu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -506,6 +509,109 @@ void ggml_vk_gelu(Args&&... args) { } +static const std::string program_soft_max = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t ne01; + uint64_t ne02; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = nth) in; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +shared float buf[nth]; + +void main() { + const uint64_t i03 = uint64_t(gl_GlobalInvocationID.z); + const uint64_t i02 = uint64_t(gl_GlobalInvocationID.y); + const uint64_t i01 = uint64_t(gl_GlobalInvocationID.x); + + const uint extra_off = uint(i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00); + const uint in_off = pcs.inOff + extra_off; + const uint out_off = pcs.outOff + extra_off; + + // parallel max + buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[in_off + i00]); + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); + } + barrier(); + memoryBarrierShared(); + } + + // broadcast (no effect?) + if (gl_LocalInvocationID.x == 0) { + buf[0] = buf[0]; // ??? 
+ } + + barrier(); + memoryBarrierShared(); + + const float max_ = buf[0]; + + // parallel sum + buf[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] += exp(in_[in_off + i00] - max_); + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast (no effect?) + if (gl_LocalInvocationID.x == 0) { + buf[0] = buf[0]; // ??? + } + + barrier(); + memoryBarrierShared(); + + const float sum = buf[0]; + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[out_off + i00] = exp(in_[in_off + i00] - max_) / sum; + } +} +); + +void ggml_vk_soft_max(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03) { + const static unsigned nth = 32; + const static auto spirv = compileSource(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); + + struct PushConstants { + int64_t ne00, ne01, ne02; + uint32_t inOff, outOff; + } pushConsts { + ne00, ne01, ne02, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts})); +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -585,15 +691,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + ggml_vk_abmath<'+'>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } break; case GGML_OP_MUL: { if (ggml_nelements(src1) == ne10) { // src1 is a 
row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); + ggml_vk_abmath<'*', true>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); } else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + ggml_vk_abmath<'*'>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } } break; case GGML_OP_SCALE: @@ -613,6 +719,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + case GGML_OP_SOFT_MAX: + { + ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From 5ac68ccacb7b86037a402a8792ffa1fb102f4394 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 29 Jun 2023 11:14:21 +0200 Subject: [PATCH 31/43] Cleanups --- ggml-vulkan.h | 2 -- ggml.c | 6 ++---- llama.cpp | 12 ++---------- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 5ec392782e0cd..361d8b5e2c94e 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -32,8 +32,6 @@ bool ggml_vk_add_buffer( void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); -void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); -void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus diff --git a/ggml.c b/ggml.c index 34f0870002a41..416a205769414 100644 --- a/ggml.c +++ b/ggml.c @@ -161,8 +161,6 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include 
-#elif defined(GGML_USE_KOMPUTE) -#include "ggml-vulkan.h" #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -1550,7 +1548,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -1558,7 +1556,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, diff --git a/llama.cpp b/llama.cpp index 740726445465a..40e3a4a7d11bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -753,7 +753,7 @@ struct llama_model_loader { } } - void load_all_data(llama_context & lctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t prefetch_size = 0; size_t lock_size = 0; @@ -810,14 +810,6 @@ struct llama_model_loader { free(lt.data); } break; -#elif defined(GGML_USE_KOMPUTE) - case GGML_BACKEND_GPU: - lt.ggml_tensor->data = lt.data; - ggml_vk_h2d_tensor(lctx.ctx_kompute, lt.ggml_tensor); - if (!use_mmap) { - free(lt.data); - } - break; #endif default: continue; @@ -1323,7 +1315,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(lctx, progress_callback, progress_callback_user_data, 
use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); From 749d6179a84b2d09eab3367c4619480cf6f75ff3 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 29 Jun 2023 14:23:00 +0200 Subject: [PATCH 32/43] Snake case all functions --- ggml-vulkan.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 1cc54d06f2636..70247a40d08ac 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -165,7 +165,7 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte } -static std::vector compileSource(const std::string& source, const char *debug_name) { +static std::vector glsl_compile_source(const std::string& source, const char *debug_name) { printf("%s: Compiling compute program: %s\n", __func__, debug_name); static std::mutex mutex; std::lock_guard L(mutex); @@ -183,7 +183,7 @@ static std::vector compileSource(const std::string& source, const char template -std::vector getVecBlockQ4_0D(T *x, unsigned nb) { +std::vector get_vec_block_Q4_0D(T *x, unsigned nb) { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].d; @@ -192,7 +192,7 @@ std::vector getVecBlockQ4_0D(T *x, unsigned nb) { } template -std::vector getVecBlockQ4_0M(T *x, unsigned nb) { +std::vector get_vec_block_Q4_0M(T *x, unsigned nb) { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].m; @@ -201,7 +201,7 @@ std::vector getVecBlockQ4_0M(T *x, unsigned nb) { } template -std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { +std::vector get_vec_block_Q4_0QS(T *x, unsigned nb, unsigned qk) { std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { @@ -265,14 +265,14 @@ void ggml_vk_dequantize_row_q4_0(const 
void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_dequantize_row_q4_0, __func__); const auto x = reinterpret_cast(x_); GGML_ASSERT(k % qk == 0); - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorBlockQ4_0D = mgr.tensorT(get_vec_block_Q4_0D(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(get_vec_block_Q4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -315,15 +315,15 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_dequantize_row_q4_1, __func__); const auto x = reinterpret_cast(x_); GGML_ASSERT(k % qk == 0); - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); - const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorBlockQ4_0D = mgr.tensorT(get_vec_block_Q4_0D(x, nb)); + const auto tensorBlockQ4_0M = mgr.tensorT(get_vec_block_Q4_0M(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(get_vec_block_Q4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -365,7 +365,7 @@ void ggml_vk_abmath(kp::Sequence& seq, uint32_t size, uint32_t row = 0) { GGML_ASSERT(with_row?row:!row); - const static auto spirv = compileSource(program_source_head+ + const static auto spirv = glsl_compile_source(program_source_head+ "#define 
MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(with_row?"% pcs.row":"")+'\n'+ program_abmath, __func__); @@ -403,7 +403,7 @@ void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, float scale) { - const static auto spirv = compileSource(program_source_head+program_scale, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_scale, __func__); struct PushConstants { uint32_t inOff, outOff; @@ -450,7 +450,7 @@ void main() { template void ggml_vk_silu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_silu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_silu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -476,7 +476,7 @@ void main() { template void ggml_vk_relu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_relu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_relu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -503,7 +503,7 @@ void main() { template void ggml_vk_gelu(Args&&... 
args) { - const static auto spirv = compileSource(program_source_head+program_gelu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_gelu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -599,7 +599,7 @@ void ggml_vk_soft_max(kp::Sequence& seq, const std::shared_ptr& out, uint32_t outOff, int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03) { const static unsigned nth = 32; - const static auto spirv = compileSource(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); struct PushConstants { int64_t ne00, ne01, ne02; From 964fe8c546dba2e88e13d6f6d09a62c45008ac61 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 11:47:10 +0200 Subject: [PATCH 33/43] Added mul_mat (needs fixes) --- ggml-vulkan.cpp | 357 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 343 insertions(+), 14 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 70247a40d08ac..d6b99aa1fe365 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -217,6 +217,7 @@ static const std::string program_source_head = R"(#version 450 #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable #extension GL_EXT_shader_explicit_arithmetic_types_int8: enable #extension GL_EXT_shader_explicit_arithmetic_types_int64: enable +#extension GL_EXT_control_flow_attributes: enable #define QK4_0 32 #define QR4_0 2 @@ -336,6 +337,44 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { } +static const std::string program_fpx_to_fpx = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint row; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { IN_TYPE in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { OUT_TYPE out_[]; }; + +void main() { 
+ const uint i = gl_GlobalInvocationID.x; + + out_[pcs.outOff + i] = OUT_TYPE(in_[pcs.inOff + i]); +} +); + +void ggml_vk_fp32_to_fp16_row(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + const static auto spirv = glsl_compile_source(program_source_head+ + "#define IN_TYPE float\n" + "#define OUT_TYPE float16_t\n"+ + program_fpx_to_fpx, __func__); + + struct PushConstants { + uint32_t inOff, outOff; + } const pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + + static const std::string program_abmath = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -535,24 +574,24 @@ void main() { const uint out_off = pcs.outOff + extra_off; // parallel max - buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[in_off + i00]); + buf[gl_WorkGroupID.x] = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_WorkGroupID.x] = max(buf[gl_WorkGroupID.x], in_[in_off + i00]); } // reduce barrier(); memoryBarrierShared(); - for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_WorkGroupID.x < i) { + buf[gl_WorkGroupID.x] = max(buf[gl_WorkGroupID.x], buf[gl_WorkGroupID.x + i]); } barrier(); memoryBarrierShared(); } // broadcast (no effect?) - if (gl_LocalInvocationID.x == 0) { + if (gl_WorkGroupID.x == 0) { buf[0] = buf[0]; // ??? 
} @@ -562,24 +601,24 @@ void main() { const float max_ = buf[0]; // parallel sum - buf[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] += exp(in_[in_off + i00] - max_); + buf[gl_WorkGroupID.x] = 0.0; + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_WorkGroupID.x] += exp(in_[in_off + i00] - max_); } // reduce barrier(); memoryBarrierShared(); for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; + if (gl_WorkGroupID.x < i) { + buf[gl_WorkGroupID.x] += buf[gl_WorkGroupID.x + i]; } barrier(); memoryBarrierShared(); } // broadcast (no effect?) - if (gl_LocalInvocationID.x == 0) { + if (gl_WorkGroupID.x == 0) { buf[0] = buf[0]; // ??? } @@ -588,7 +627,7 @@ void main() { const float sum = buf[0]; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { out_[out_off + i00] = exp(in_[in_off + i00] - max_) / sum; } } @@ -612,6 +651,285 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +static const std::string program_mul_mat_f16 = R"( +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 +)" MULTILINE_QUOTE( +layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +shared float16_t bufA[BM * (BK+1)]; +shared float16_t bufB[BN * (BK+1)]; + +void main() { + const int ir = int(gl_WorkGroupID.x); + const int ic = int(gl_WorkGroupID.y); + + const int rstride = BM / TM; + + const 
int lr = int(gl_LocalInvocationID.x % rstride); + const int lc = int(gl_LocalInvocationID.x / rstride); + + const int loadr = int(gl_LocalInvocationID.x % BK); + const int loadc = int(gl_LocalInvocationID.x / BK); + + const int loadstride = int(gl_WorkGroupSize.x); + + int posA = ir * BM * pcs.inAStride; + int posB = ic * BN * pcs.inBStride; + + float sums[TM * TN]; + float16_t cacheA[TM]; + float16_t cacheB[TN]; + + [[unroll]] for (int i = 0; i < TM*TN; i++) { + sums[i] = 0.0hf; + } + + [[unroll]] for (int block = 0; block < pcs.K; block += BK) { + [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr]; + } + [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr]; + } + + barrier(); + + posA += BK; + posB += BK; + + [[unroll]] for (int i = 0; i < BK; i++) { + // Load from shared into cache + [[unroll]] for (int j = 0; j < BM; j++) { + cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; + } + [[unroll]] for (int j = 0; j < TN; j++) { + cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; + } + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + sums[cc * TM + cr] += float(cacheA[cr]) * float(cacheB[cc]); + } + } + } + + barrier(); + } + + const int dr = ir * BM + lr; + const int dc = ic * BN + lc * TN; + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + out_[(dc + cc) * pcs.outStride + dr + cr*rstride] = sums[cc * TM + cr]; + } + } +} +); + +void ggml_vk_mul_mat_f16(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, + 
int64_t ne10, int64_t ne11, + int nb10, int nb11, int nb12, int nb13, + int nb2, int nb3) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); + + const bool inB_cont_rows = nb10 == sizeof(float); + const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); + + struct PushConstants { + int32_t M, N, K, inAStride, inBStride, outStride; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + inAOff, inBOff, outOff + }; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + auto tmp = mgr.tensorT(std::vector(ne10*ne11)); + + if (inB_cont_rows) { + if (inB_cont_cols) { + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12)/sizeof(float), tmp, 0, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11)/sizeof(float), tmp, i01*ne10, ne10); + } + } + } else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // Extremely slow because of single shader invocation + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)/sizeof(float), tmp, i01*ne10 + i00, 1); + } + } + } + + seq.record(mgr.algorithm({inA, tmp, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + } + } +} + + +static const std::string program_mul_mat_f32 = R"( +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 +)" MULTILINE_QUOTE( +layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; 
+ uint outOff; +} pcs; + +shared float bufA[BM * (BK+1)]; +shared float bufB[BN * (BK+1)]; + +void main() { + const int ir = int(gl_WorkGroupID.x); + const int ic = int(gl_WorkGroupID.y); + + const int rstride = BM / TM; + + const int lr = int(gl_WorkGroupID.x % rstride); + const int lc = int(gl_WorkGroupID.x / rstride); + + const int loadr = int(gl_WorkGroupID.x % BK); + const int loadc = int(gl_WorkGroupID.x / BK); + + const int loadstride = int(gl_WorkGroupSize.x); + + int posA = ir * BM * pcs.inAStride; + int posB = ic * BN * pcs.inBStride; + + float sums[TM * TN]; + float cacheA[TM]; + float cacheB[TN]; + + [[unroll]] for (int i = 0; i < TM*TN; i++) { + sums[i] = 0.0f; + } + + [[unroll]] for (int block = 0; block < pcs.K; block += BK) { + [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr + pcs.inAOff]; + } + [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr + pcs.inBOff]; + } + + barrier(); + memoryBarrierShared(); + + posA += BK; + posB += BK; + + [[unroll]] for (int i = 0; i < BK; i++) { + // Load from shared into cache + [[unroll]] for (int j = 0; j < BM; j++) { + cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; + } + [[unroll]] for (int j = 0; j < TN; j++) { + cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; + } + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + sums[cc * TM + cr] += cacheA[cr] * cacheB[cc]; + } + } + } + + barrier(); + } + + const int dr = ir * BM + lr; + const int dc = ic * BN + lc * TN; + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + out_[(dc + cc) * pcs.outStride + dr + cr*rstride + pcs.outOff] = sums[cc * TM + cr]; + } + } +} +); + 
+void ggml_vk_mul_mat_f32(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, + int64_t ne10, int64_t ne11, + int nb2, int nb3) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f32, __func__); + + struct PushConstants { + int32_t M, N, K, inAStride, inBStride, outStride; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + inAOff, inBOff, outOff + }; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + auto off = i02*nb2 + i03*nb3; + pushConsts.inAOff = inAOff + off; + pushConsts.inBOff = inBOff + off; + pushConsts.outOff = outOff + off; + seq.record(mgr.algorithm({inA, inB, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + } + } +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -723,6 +1041,17 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); } break; + case GGML_OP_MUL_MAT: + { + if (src0->type == GGML_TYPE_F32 + && src1->type == GGML_TYPE_F32) { + ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); + break; + } else if (src0->type == GGML_TYPE_F32 + && src1->type == GGML_TYPE_F16) { + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + } + } default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From f093bf2e5e8fc650d1274d3920cb20a8d82c1c35 Mon Sep 17 00:00:00 2001 From: 
niansa Date: Fri, 30 Jun 2023 12:19:29 +0200 Subject: [PATCH 34/43] Minor MUL_MAT fix and implemented DIAG_MASK_INF --- ggml-vulkan.cpp | 84 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d6b99aa1fe365..7b92a7bac071a 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -223,8 +223,8 @@ static const std::string program_source_head = R"(#version 450 #define QR4_0 2 #define QK4_1 32 -#define GELU_COEF_A 0.044715; -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876; +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 #ifndef QK_K #define QK_K 256 @@ -235,6 +235,12 @@ static const std::string program_source_head = R"(#version 450 #else #define K_SCALE_SIZE 4 #endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 )"; @@ -651,13 +657,56 @@ void ggml_vk_soft_max(kp::Sequence& seq, } -static const std::string program_mul_mat_f16 = R"( -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 -)" MULTILINE_QUOTE( +static const std::string program_diag_mask_inf = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t ne01; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +void main() { + const uint64_t i02 = uint64_t(gl_GlobalInvocationID.z); + const uint64_t i01 = uint64_t(gl_GlobalInvocationID.y); + const uint64_t i00 = uint64_t(gl_GlobalInvocationID.x); + + const int n_past = inB[pcs.inBOff]; + + if (i00 > n_past + i01) { + out_[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + pcs.outOff)] = uintBitsToFloat(0xFF800000); + } else { + out_[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + 
pcs.outOff)] = inA[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + pcs.inAOff)]; + } +} +); + +void ggml_vk_diag_mask_inf(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02) { + const static auto spirv = glsl_compile_source(program_source_head+program_diag_mask_inf, __func__); + + struct PushConstants { + int64_t ne00, ne01; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + ne00, ne01, inAOff, inBOff, outOff + }; + + seq.record(mgr.algorithm({inA, inB, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts})); +} + + +static const std::string program_mul_mat_f16 = + MULTILINE_QUOTE( layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; @@ -800,13 +849,8 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } -static const std::string program_mul_mat_f32 = R"( -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 -)" MULTILINE_QUOTE( +static const std::string program_mul_mat_f32 = + MULTILINE_QUOTE( layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { float inA[]; }; @@ -1041,14 +1085,18 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_vk_diag_mask_inf(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02); + } break; case GGML_OP_MUL_MAT: { if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); break; - } else if (src0->type == GGML_TYPE_F32 - && src1->type == 
GGML_TYPE_F16) { + } else if (src0->type == GGML_TYPE_F16 + && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); } } From 0dc5f2f2bad7c34b4caff0bd27b274e474335918 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 12:31:13 +0200 Subject: [PATCH 35/43] Fixed mul mat dispatch size --- ggml-vulkan.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7b92a7bac071a..18c7ba8fa0631 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -843,7 +843,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } } - seq.record(mgr.algorithm({inA, tmp, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, tmp, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); } } } @@ -968,7 +968,7 @@ void ggml_vk_mul_mat_f32(kp::Sequence& seq, pushConsts.inAOff = inAOff + off; pushConsts.inBOff = inBOff + off; pushConsts.outOff = outOff + off; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); } } } @@ -1037,7 +1037,6 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - const static std::shared_ptr nullTensor = nullptr; const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullTensor; const std::shared_ptr& id_src1 = src1 ? 
ggml_vk_get_tensor(ctx, src1) : nullTensor; From 8fa60134b17c0de70e6f5a55c2ec6f241bd2915b Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 12:47:17 +0200 Subject: [PATCH 36/43] Added missing break to mul_mat_f16 case --- ggml-vulkan.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 18c7ba8fa0631..4d9c458dfc6cd 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1097,6 +1097,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + break; } } default: From d1f84db4b6c970001c7a171155c7a4204a18aa35 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 15:18:10 +0200 Subject: [PATCH 37/43] Implemented GGML_OP_NORM --- ggml-vulkan.cpp | 109 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4d9c458dfc6cd..29c67e7768226 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -482,7 +482,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -509,7 +509,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -535,7 +535,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) 
buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -565,7 +565,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = nth) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; shared float buf[nth]; @@ -657,6 +657,104 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +static const std::string program_norm = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t nb01; + float eps; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +shared float sum[nth]; + +void main() { + const uint x = gl_GlobalInvocationID.x; // Based from in_ + // MEAN + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + sum[gl_GlobalInvocationID.y] += in_[x+i00]; + } + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float mean = sum[0]; + + // recenter + const uint y = gl_GlobalInvocationID.x; // Based from out_ + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] - mean; + } + + // VARIANCE + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; 
i00 += nth) { + sum[gl_GlobalInvocationID.y] += out_[y+i00] * out_[y+i00]; + } + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float variance = sum[0]; + + const float scale = 1.0/sqrt(variance + pcs.eps); + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] *= scale; + } +} +); + +void ggml_vk_norm(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + const static unsigned nth = 256; + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_norm, __func__); + + struct PushConstants { + uint64_t ne00, nb01; + float eps; + uint32_t inOff, outOff; + } pushConsts { + (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); +} + static const std::string program_diag_mask_inf = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -1100,6 +1198,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph break; } } + case GGML_OP_NORM: { + ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From f0e1429d7fd56483ee8352c6bca40344a653f01a Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 16:01:08 +0200 Subject: [PATCH 38/43] Implemented RMS_NORM --- ggml-vulkan.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 
insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 29c67e7768226..35d31157b3f52 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -657,6 +657,23 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +void ggml_vk_norm(kp::Sequence& seq, std::vector spirv, unsigned nth, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + struct PushConstants { + uint64_t ne00, nb01; + float eps; + uint32_t inOff, outOff; + } pushConsts { + (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); +} + + static const std::string program_norm = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -681,6 +698,7 @@ void main() { for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { sum[gl_GlobalInvocationID.y] += in_[x+i00]; } + // reduce barrier(); memoryBarrierShared(); @@ -691,6 +709,7 @@ void main() { barrier(); memoryBarrierShared(); } + // broadcast if (gl_GlobalInvocationID.y == 0) { sum[0] /= float(pcs.ne00); @@ -711,6 +730,7 @@ void main() { for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { sum[gl_GlobalInvocationID.y] += out_[y+i00] * out_[y+i00]; } + // reduce barrier(); memoryBarrierShared(); @@ -721,6 +741,7 @@ void main() { barrier(); memoryBarrierShared(); } + // broadcast if (gl_GlobalInvocationID.y == 0) { sum[0] /= float(pcs.ne00); @@ -744,16 +765,73 @@ void ggml_vk_norm(kp::Sequence& seq, const static unsigned nth = 256; const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_norm, __func__); - struct PushConstants { - uint64_t ne00, nb01; - float eps; - uint32_t inOff, outOff; - } pushConsts { - (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff - }; + ggml_vk_norm(seq, spirv, nth, in, inOff, out, outOff, ne00, ne01, nrows); +} - 
seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); + +static const std::string program_rms_norm = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t nb01; + float eps; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +shared float sum[nth]; + +void main() { + const uint x = gl_GlobalInvocationID.x; // Based from in_ + + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + sum[gl_GlobalInvocationID.y] += in_[x+i00] * in_[x+i00]; + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + + const float scale = 1.0f/sqrt(sum[0] + pcs.eps); + + const uint y = gl_GlobalInvocationID.x; // Based from out_ + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] * scale; + } } +); + +void ggml_vk_rms_norm(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + const static unsigned nth = 256; + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_rms_norm, __func__); + + ggml_vk_norm(seq, spirv, nth, in, inOff, out, outOff, ne00, ne01, nrows); +} + static const std::string program_diag_mask_inf = MULTILINE_QUOTE( @@ -1201,6 +1279,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_NORM: { 
ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); } break; + case GGML_OP_RMS_NORM: { + ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From 2fc8249ba371bea3fd710819c30ce08930dbef73 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 10:59:38 +0200 Subject: [PATCH 39/43] Simple mul_mat_f16 for speed and removal of unused mul_mat_f32 --- ggml-vulkan.cpp | 169 +++++++++++------------------------------------- 1 file changed, 38 insertions(+), 131 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 35d31157b3f52..517b98135f588 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -976,6 +976,42 @@ void main() { } ); +static const std::string program_fast_mul_mat_f16 = + MULTILINE_QUOTE( +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +void main() { + int row = int(gl_GlobalInvocationID.x); + int col = int(gl_GlobalInvocationID.y); + + if (row < pcs.M && col < pcs.N) { + float sum = 0.0f; + + for (int i = 0; i < pcs.K; i++) { + sum += float(inA[row * pcs.inAStride + i]) * float(inB[col * pcs.inBStride + i]); + } + + out_[col * pcs.outStride + row] = sum; + } +} +); + void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, @@ -984,7 +1020,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, int64_t ne10, int64_t ne11, int nb10, int nb11, int nb12, int nb13, int 
nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_fast_mul_mat_f16, __func__); const bool inB_cont_rows = nb10 == sizeof(float); const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); @@ -1025,131 +1061,6 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } -static const std::string program_mul_mat_f32 = - MULTILINE_QUOTE( -layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; - uint inAOff; - uint inBOff; - uint outOff; -} pcs; - -shared float bufA[BM * (BK+1)]; -shared float bufB[BN * (BK+1)]; - -void main() { - const int ir = int(gl_WorkGroupID.x); - const int ic = int(gl_WorkGroupID.y); - - const int rstride = BM / TM; - - const int lr = int(gl_WorkGroupID.x % rstride); - const int lc = int(gl_WorkGroupID.x / rstride); - - const int loadr = int(gl_WorkGroupID.x % BK); - const int loadc = int(gl_WorkGroupID.x / BK); - - const int loadstride = int(gl_WorkGroupSize.x); - - int posA = ir * BM * pcs.inAStride; - int posB = ic * BN * pcs.inBStride; - - float sums[TM * TN]; - float cacheA[TM]; - float cacheB[TN]; - - [[unroll]] for (int i = 0; i < TM*TN; i++) { - sums[i] = 0.0f; - } - - [[unroll]] for (int block = 0; block < pcs.K; block += BK) { - [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr + pcs.inAOff]; - } - [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l 
/ BK; - bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr + pcs.inBOff]; - } - - barrier(); - memoryBarrierShared(); - - posA += BK; - posB += BK; - - [[unroll]] for (int i = 0; i < BK; i++) { - // Load from shared into cache - [[unroll]] for (int j = 0; j < BM; j++) { - cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; - } - [[unroll]] for (int j = 0; j < TN; j++) { - cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; - } - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - sums[cc * TM + cr] += cacheA[cr] * cacheB[cc]; - } - } - } - - barrier(); - } - - const int dr = ir * BM + lr; - const int dc = ic * BN + lc * TN; - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - out_[(dc + cc) * pcs.outStride + dr + cr*rstride + pcs.outOff] = sums[cc * TM + cr]; - } - } -} -); - -void ggml_vk_mul_mat_f32(kp::Sequence& seq, - const std::shared_ptr& inA, uint32_t inAOff, - const std::shared_ptr& inB, uint32_t inBOff, - const std::shared_ptr& out, uint32_t outOff, - int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, - int64_t ne10, int64_t ne11, - int nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f32, __func__); - - struct PushConstants { - int32_t M, N, K, inAStride, inBStride, outStride; - uint32_t inAOff, inBOff, outOff; - } pushConsts { - (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, - inAOff, inBOff, outOff - }; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - auto off = i02*nb2 + i03*nb3; - pushConsts.inAOff = inAOff + off; - pushConsts.inBOff = inBOff + off; - pushConsts.outOff = outOff + off; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); - } - } -} - - void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: 
evaluating graph\n", __func__); @@ -1266,11 +1177,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_MUL_MAT: { - if (src0->type == GGML_TYPE_F32 - && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); - break; - } else if (src0->type == GGML_TYPE_F16 + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); break; From 6be93e607149e94550c1ba2fa273cdbaa64f2815 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 13:28:40 +0200 Subject: [PATCH 40/43] Ported mat mul from Metal --- ggml-vulkan.cpp | 188 ++++++++++++------------------------------------ 1 file changed, 47 insertions(+), 141 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 517b98135f588..5f1b8d43a753e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -173,7 +173,7 @@ static std::vector glsl_compile_source(const std::string& source, cons std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null").c_str())) + if (system("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null")) throw std::runtime_error("Error running glslangValidator command"); std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; @@ -883,131 +883,59 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq, static const std::string program_mul_mat_f16 = MULTILINE_QUOTE( -layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; +layout(local_size_x = 64) in; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float16_t 
inB[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; + int64_t ne00; + int64_t ne01; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + int64_t ne10; + int64_t ne11; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + int64_t ne0; + int64_t ne1; uint inAOff; uint inBOff; uint outOff; } pcs; -shared float16_t bufA[BM * (BK+1)]; -shared float16_t bufB[BN * (BK+1)]; +shared float sum[gl_WorkGroupSize.x]; void main() { - const int ir = int(gl_WorkGroupID.x); - const int ic = int(gl_WorkGroupID.y); - - const int rstride = BM / TM; - - const int lr = int(gl_LocalInvocationID.x % rstride); - const int lc = int(gl_LocalInvocationID.x / rstride); - - const int loadr = int(gl_LocalInvocationID.x % BK); - const int loadc = int(gl_LocalInvocationID.x / BK); + const int64_t r0 = gl_GlobalInvocationID.x; + const int64_t r1 = gl_GlobalInvocationID.y; + const int64_t im = gl_GlobalInvocationID.z; - const int loadstride = int(gl_WorkGroupSize.x); + const uint x = uint((r0*pcs.nb01 + im*pcs.nb02) / 2); // Based from inA + const uint y = uint((r1*pcs.nb11 + im*pcs.nb12) / 4); // based from inB - int posA = ir * BM * pcs.inAStride; - int posB = ic * BN * pcs.inBStride; + sum[gl_LocalInvocationID.x] = 0.0f; - float sums[TM * TN]; - float16_t cacheA[TM]; - float16_t cacheB[TN]; - - [[unroll]] for (int i = 0; i < TM*TN; i++) { - sums[i] = 0.0hf; + for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { + sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]); } - [[unroll]] for (int block = 0; block < pcs.K; block += BK) { - [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr]; - } - [[unroll]] for (int l = 0; 
l < BN * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr]; - } - - barrier(); - - posA += BK; - posB += BK; - - [[unroll]] for (int i = 0; i < BK; i++) { - // Load from shared into cache - [[unroll]] for (int j = 0; j < BM; j++) { - cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; - } - [[unroll]] for (int j = 0; j < TN; j++) { - cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; - } - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - sums[cc * TM + cr] += float(cacheA[cr]) * float(cacheB[cc]); - } - } + // accumulate the sum from all threads in the threadgroup + barrier(); + memoryBarrierShared(); + for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; } - barrier(); + memoryBarrierShared(); } - const int dr = ir * BM + lr; - const int dc = ic * BN + lc * TN; - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - out_[(dc + cc) * pcs.outStride + dr + cr*rstride] = sums[cc * TM + cr]; - } - } -} -); - -static const std::string program_fast_mul_mat_f16 = - MULTILINE_QUOTE( -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; - uint inAOff; - uint inBOff; - uint outOff; -} pcs; - -void main() { - int row = int(gl_GlobalInvocationID.x); - int col = int(gl_GlobalInvocationID.y); - - if (row < pcs.M && col < pcs.N) { - float sum = 0.0f; - - for (int i = 0; i < pcs.K; i++) { - sum += float(inA[row * pcs.inAStride + i]) * float(inB[col * 
pcs.inBStride + i]); - } - - out_[col * pcs.outStride + row] = sum; + if (gl_LocalInvocationID.x == 0) { + out_[uint(im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0)] = sum[0]; } } ); @@ -1016,48 +944,26 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, - int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, - int64_t ne10, int64_t ne11, - int nb10, int nb11, int nb12, int nb13, - int nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_fast_mul_mat_f16, __func__); - - const bool inB_cont_rows = nb10 == sizeof(float); - const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); + int64_t ne00, int64_t ne01, + uint64_t nb00, uint64_t nb01, uint64_t nb02, + int64_t ne10, int64_t ne11, int64_t ne12, + uint64_t nb10, uint64_t nb11, uint64_t nb12, + int64_t ne0, int64_t ne1) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); struct PushConstants { - int32_t M, N, K, inAStride, inBStride, outStride; + int64_t ne00, ne01; + uint64_t nb00, nb01, nb02; + int64_t ne10, ne11; + uint64_t nb10, nb11, nb12; + int64_t ne0, ne1; uint32_t inAOff, inBOff, outOff; } pushConsts { - (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, ne0, ne1, inAOff, inBOff, outOff }; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - auto tmp = mgr.tensorT(std::vector(ne10*ne11)); - - if (inB_cont_rows) { - if (inB_cont_cols) { - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12)/sizeof(float), tmp, 0, ne10*ne11); - } - else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11)/sizeof(float), tmp, i01*ne10, ne10); - } - } - } else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - for (int64_t i00 = 0; i00 < 
ne10; i00++) { - // Extremely slow because of single shader invocation - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)/sizeof(float), tmp, i01*ne10 + i00, 1); - } - } - } - - seq.record(mgr.algorithm({inA, tmp, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); - } - } + seq.record(mgr.algorithm({inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts})); } @@ -1179,7 +1085,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1); break; } } From 856b7589e9661507ff256b401f93c95da3173f2e Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 13:34:01 +0200 Subject: [PATCH 41/43] Optimized ggml_vk_mul_mat_f16 argument count --- ggml-vulkan.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5f1b8d43a753e..6aab3ddaed0d4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -891,13 +891,8 @@ layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; layout (push_constant) uniform parameter { int64_t ne00; - int64_t ne01; - uint64_t nb00; uint64_t nb01; uint64_t nb02; - int64_t ne10; - int64_t ne11; - uint64_t nb10; uint64_t nb11; uint64_t nb12; int64_t ne0; @@ -945,21 +940,20 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, int64_t ne00, int64_t ne01, - uint64_t nb00, uint64_t nb01, uint64_t nb02, - int64_t ne10, int64_t ne11, int64_t ne12, - uint64_t nb10, uint64_t nb11, uint64_t nb12, + 
uint64_t nb01, uint64_t nb02, + int64_t ne11, int64_t ne12, + uint64_t nb11, uint64_t nb12, int64_t ne0, int64_t ne1) { const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); struct PushConstants { - int64_t ne00, ne01; - uint64_t nb00, nb01, nb02; - int64_t ne10, ne11; - uint64_t nb10, nb11, nb12; + int64_t ne00; + uint64_t nb01, nb02; + uint64_t nb11, nb12; int64_t ne0, ne1; uint32_t inAOff, inBOff, outOff; } pushConsts { - ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, ne0, ne1, + ne00, nb01, nb02, nb11, nb12, ne0, ne1, inAOff, inBOff, outOff }; @@ -1085,7 +1079,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1); + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; } } From 77ebe46966b9173706b9e58df2ec02711003aced Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 14:21:16 +0200 Subject: [PATCH 42/43] Fixed case order in ggml_vk_graph_compute --- ggml-vulkan.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 6aab3ddaed0d4..232109762d937 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1075,21 +1075,25 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_diag_mask_inf(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02); } break; + case GGML_OP_NORM: + { + ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; + case GGML_OP_RMS_NORM: + { + ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; case 
GGML_OP_MUL_MAT: { if (src0->type == GGML_TYPE_F16 - && src1->type == GGML_TYPE_F32) { + && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; + } else { + printf("Unsupported quantization: %u/%u\n", src0->type, src1->type); } } - case GGML_OP_NORM: { - ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); - } break; - case GGML_OP_RMS_NORM: { - ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); - } break; - default: + default: {} fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); } From 44d214c04034b52e098f0df595341b50cab5248e Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 14:34:18 +0200 Subject: [PATCH 43/43] Only warn if __STDC_IEC_559__ isn't defined --- ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 232109762d937..06c0434608ce0 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -16,7 +16,7 @@ #include #ifndef __STDC_IEC_559__ -#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. +#warning Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. #endif #define MULTILINE_QUOTE(...) #__VA_ARGS__